Unicode changes
							parent
							
								
									f00cfd653a
								
							
						
					
					
						commit
						d1aba5effe
					
				| 
						 | 
				
			
			@ -1,6 +1,6 @@
 | 
			
		|||
USING: unicode kernel math const combinators splitting
 | 
			
		||||
sequences math.parser io.files io assocs arrays namespaces
 | 
			
		||||
;
 | 
			
		||||
math.ranges unicode.normalize unicode.syntax ;
 | 
			
		||||
IN: unicode.breaks
 | 
			
		||||
 | 
			
		||||
ENUM: Any L V T Extend Control CR LF graphemes ;
 | 
			
		||||
| 
						 | 
				
			
			@ -25,7 +25,7 @@ CATEGORY: grapheme-control Zl Zp Cc Cf ;
 | 
			
		|||
: process-other-extend ( lines -- set )
 | 
			
		||||
    [ "#" split1 drop ";" split1 drop trim-blank ] map
 | 
			
		||||
    [ empty? not ] subset
 | 
			
		||||
    [ ".." split1 [ dup ] unless* [ hex> ] 2apply range ] map
 | 
			
		||||
    [ ".." split1 [ dup ] unless* [ hex> ] 2apply [a,b] ] map
 | 
			
		||||
    concat >set ;
 | 
			
		||||
 | 
			
		||||
: other-extend-lines ( -- lines )
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -0,0 +1,111 @@
 | 
			
		|||
USING: kernel unicode.load sequences sequences.next namespaces assocs.lib
 | 
			
		||||
unicode.normalize math unicode combinators assocs ;
 | 
			
		||||
IN: unicode.case
 | 
			
		||||
 | 
			
		||||
: ch>lower ( ch -- lower ) simple-lower at-default ;
 | 
			
		||||
: ch>upper ( ch -- upper ) simple-upper at-default ;
 | 
			
		||||
: ch>title ( ch -- title ) simple-title at-default ;
 | 
			
		||||
 | 
			
		||||
SYMBOL: locale ! Just casing locale, or overall?
 | 
			
		||||
 | 
			
		||||
: i-dot? ( -- ? )
 | 
			
		||||
    locale get { "tr" "az" } member? ;
 | 
			
		||||
 | 
			
		||||
: lithuanian? ( -- ? ) locale get "lt" = ;
 | 
			
		||||
 | 
			
		||||
: dot-over ( -- ch ) CHAR: \u0307 ;
 | 
			
		||||
 | 
			
		||||
: lithuanian-ch>upper ( ? next ch -- ? )
 | 
			
		||||
    rot [ 2drop f ]
 | 
			
		||||
    [ swap dot-over = over "ij" member? and swap , ] if ;
 | 
			
		||||
 | 
			
		||||
: lithuanian>upper ( string -- lower )
 | 
			
		||||
    [ f swap [ lithuanian-ch>upper ] each-next drop ] "" make* ;
 | 
			
		||||
 | 
			
		||||
: mark-above? ( ch -- ? )
 | 
			
		||||
    combining-class 230 = ;
 | 
			
		||||
 | 
			
		||||
: lithuanian-ch>lower ( next ch -- )
 | 
			
		||||
    ! This fails to add a dot above in certain edge cases
 | 
			
		||||
    ! where there is a non-above combining mark before an above one
 | 
			
		||||
    ! in Lithuanian
 | 
			
		||||
    dup , "IJ" member? swap mark-above? and [ dot-over , ] when ;
 | 
			
		||||
 | 
			
		||||
: lithuanian>lower ( string -- lower )
 | 
			
		||||
    [ [ lithuanian-ch>lower ] each-next ] "" make* ;
 | 
			
		||||
 | 
			
		||||
: turk-ch>upper ( ch -- )
 | 
			
		||||
    dup CHAR: i = 
 | 
			
		||||
    [ drop CHAR: I , dot-over , ] [ , ] if ;
 | 
			
		||||
 | 
			
		||||
: turk>upper ( string -- upper-i )
 | 
			
		||||
    [ [ turk-ch>upper ] each ] "" make* ;
 | 
			
		||||
 | 
			
		||||
: turk-ch>lower ( ? next ch -- ? )
 | 
			
		||||
    {
 | 
			
		||||
        { [ rot ] [ 2drop f ] }
 | 
			
		||||
        { [ dup CHAR: I = ] [
 | 
			
		||||
            drop dot-over =
 | 
			
		||||
            dup CHAR: i CHAR: \u0131 ? ,
 | 
			
		||||
        ] }
 | 
			
		||||
        { [ t ] [ , drop f ] }
 | 
			
		||||
    } cond ;
 | 
			
		||||
 | 
			
		||||
: turk>lower ( string -- lower-i )
 | 
			
		||||
    [ f swap [ turk-ch>lower ] each-next drop ] "" make* ;
 | 
			
		||||
 | 
			
		||||
: word-boundary ( prev char -- new ? )
 | 
			
		||||
    dup non-starter? [ drop dup ] when
 | 
			
		||||
    swap uncased? ;
 | 
			
		||||
 | 
			
		||||
: sigma-map ( string -- string )
 | 
			
		||||
    [
 | 
			
		||||
        swap [ uncased? ] keep not or
 | 
			
		||||
        [ drop HEX: 3C2 ] when
 | 
			
		||||
    ] map-next ;
 | 
			
		||||
 | 
			
		||||
: final-sigma ( string -- string )
 | 
			
		||||
    HEX: 3A3 over member? [ sigma-map ] when ;
 | 
			
		||||
 | 
			
		||||
: map-case ( string string-quot char-quot -- case )
 | 
			
		||||
    [
 | 
			
		||||
        rot [
 | 
			
		||||
            -rot [
 | 
			
		||||
                rot dup special-casing at
 | 
			
		||||
                [ -rot drop call % ]
 | 
			
		||||
                [ -rot nip call , ] ?if
 | 
			
		||||
            ] 2keep
 | 
			
		||||
        ] each 2drop
 | 
			
		||||
    ] "" make* ; inline
 | 
			
		||||
 | 
			
		||||
: >lower ( string -- lower )
 | 
			
		||||
    i-dot? [ turk>lower ] when
 | 
			
		||||
    final-sigma [ code-point-lower ] [ ch>lower ] map-case ;
 | 
			
		||||
 | 
			
		||||
: >upper ( string -- upper )
 | 
			
		||||
    i-dot? [ turk>upper ] when
 | 
			
		||||
    [ code-point-upper ] [ ch>upper ] map-case ;
 | 
			
		||||
 | 
			
		||||
: >title ( string -- title )
 | 
			
		||||
    final-sigma
 | 
			
		||||
    CHAR: \s swap
 | 
			
		||||
    [ tuck word-boundary swapd
 | 
			
		||||
        [ code-point-title ] [ code-point-lower ] if ]
 | 
			
		||||
    [ tuck word-boundary swapd 
 | 
			
		||||
        [ ch>title ] [ ch>lower ] if ]
 | 
			
		||||
    map-case nip ;
 | 
			
		||||
 | 
			
		||||
: >case-fold ( string -- fold )
 | 
			
		||||
    >upper >lower ;
 | 
			
		||||
 | 
			
		||||
: insensitive= ( str1 str2 -- ? )
 | 
			
		||||
    [ >case-fold ] 2apply = ;
 | 
			
		||||
 | 
			
		||||
: lower? ( string -- ? )
 | 
			
		||||
    dup >lower = ;
 | 
			
		||||
: upper? ( string -- ? )
 | 
			
		||||
    dup >lower = ;
 | 
			
		||||
: title? ( string -- ? )
 | 
			
		||||
    dup >title = ;
 | 
			
		||||
: case-fold? ( string -- ? )
 | 
			
		||||
    dup >case-fold = ;
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,143 @@
 | 
			
		|||
USING: assocs math kernel sequences io.files hashtables quotations
 | 
			
		||||
splitting arrays math.parser combinators.lib hash2 byte-arrays words
 | 
			
		||||
namespaces ;
 | 
			
		||||
IN: unicode.load
 | 
			
		||||
 | 
			
		||||
! Convenience functions
 | 
			
		||||
: 1+* ( n/f _ -- n+1 )
 | 
			
		||||
    drop [ 1+ ] [ 0 ] if* ;
 | 
			
		||||
 | 
			
		||||
: define-value ( value word -- )
 | 
			
		||||
    swap 1quotation define-compound ;
 | 
			
		||||
 | 
			
		||||
: ?between? ( n/f from to -- ? )
 | 
			
		||||
    pick [ between? ] [ 3drop f ] if ;
 | 
			
		||||
 | 
			
		||||
! Remove this soon
 | 
			
		||||
USE: parser
 | 
			
		||||
DEFER: >>
 | 
			
		||||
: << \ >> parse-until >quotation call ; parsing
 | 
			
		||||
 | 
			
		||||
! Loading data from UnicodeData.txt
 | 
			
		||||
 | 
			
		||||
: data ( filename -- data )
 | 
			
		||||
    file-lines [ ";" split ] map ;
 | 
			
		||||
 | 
			
		||||
: load-data ( -- data )
 | 
			
		||||
    "extra/unicode/UnicodeData.txt" resource-path data ;
 | 
			
		||||
 | 
			
		||||
: (process-data) ( index data -- newdata )
 | 
			
		||||
    [ [ nth ] keep first swap 2array ] curry* map
 | 
			
		||||
    [ second empty? not ] subset
 | 
			
		||||
    [ >r hex> r> ] assoc-map ;
 | 
			
		||||
 | 
			
		||||
: process-data ( index data -- hash )
 | 
			
		||||
    (process-data) [ hex> ] assoc-map >hashtable ;
 | 
			
		||||
 | 
			
		||||
: (chain-decomposed) ( hash value -- newvalue )
 | 
			
		||||
    [
 | 
			
		||||
        2dup swap at
 | 
			
		||||
        [ (chain-decomposed) ] [ 1array nip ] ?if
 | 
			
		||||
    ] curry* map concat ;
 | 
			
		||||
 | 
			
		||||
: chain-decomposed ( hash -- newhash )
 | 
			
		||||
    dup [ swap (chain-decomposed) ] curry assoc-map ;
 | 
			
		||||
 | 
			
		||||
: first* ( seq -- ? )
 | 
			
		||||
    second [ empty? ] [ first ] either ;
 | 
			
		||||
 | 
			
		||||
: (process-decomposed) ( data -- alist )
 | 
			
		||||
    5 swap (process-data)
 | 
			
		||||
    [ " " split [ hex> ] map ] assoc-map ;
 | 
			
		||||
 | 
			
		||||
: process-canonical ( data -- hash2 hash )
 | 
			
		||||
    (process-decomposed) [ first* ] subset
 | 
			
		||||
    [
 | 
			
		||||
        [ second length 2 = ] subset
 | 
			
		||||
        ! using 1009 as the size, the maximum load is 4
 | 
			
		||||
        [ first2 first2 rot 3array ] map 1009 alist>hash2
 | 
			
		||||
    ] keep
 | 
			
		||||
    >hashtable chain-decomposed ;
 | 
			
		||||
 | 
			
		||||
: process-compat ( data -- hash )
 | 
			
		||||
    (process-decomposed)
 | 
			
		||||
    [ dup first* [ first2 1 tail 2array ] unless ] map
 | 
			
		||||
    >hashtable chain-decomposed ;
 | 
			
		||||
 | 
			
		||||
: process-combining ( data -- hash )
 | 
			
		||||
    3 swap (process-data)
 | 
			
		||||
    [ string>number ] assoc-map
 | 
			
		||||
    [ nip 0 = not ] assoc-subset
 | 
			
		||||
    >hashtable ;
 | 
			
		||||
 | 
			
		||||
: categories ( -- names )
 | 
			
		||||
    ! For non-existent characters, use Cn
 | 
			
		||||
    { "Lu" "Ll" "Lt" "Lm" "Lo"
 | 
			
		||||
      "Mn" "Mc" "Me"
 | 
			
		||||
      "Nd" "Nl" "No"
 | 
			
		||||
      "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
 | 
			
		||||
      "Sm" "Sc" "Sk" "So"
 | 
			
		||||
      "Zs" "Zl" "Zp"
 | 
			
		||||
      "Cc" "Cf" "Cs" "Co" "Cn" } ;
 | 
			
		||||
 | 
			
		||||
: unicode-chars HEX: 2FA1E ;
 | 
			
		||||
! the maximum unicode char in the first 3 planes
 | 
			
		||||
 | 
			
		||||
: process-category ( data -- category-listing )
 | 
			
		||||
    2 swap (process-data)
 | 
			
		||||
    unicode-chars <byte-array> swap dupd swap [
 | 
			
		||||
        >r over unicode-chars >= [ r> 3drop ]
 | 
			
		||||
        [ categories index swap r> set-nth ] if
 | 
			
		||||
    ] curry assoc-each ;
 | 
			
		||||
 | 
			
		||||
: ascii-lower ( string -- lower )
 | 
			
		||||
    [ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ;
 | 
			
		||||
 | 
			
		||||
: replace ( seq old new -- newseq )
 | 
			
		||||
    swap rot [ 2dup = [ drop over ] when ] map 2nip ;
 | 
			
		||||
 | 
			
		||||
: process-names ( data -- names-hash )
 | 
			
		||||
    1 swap (process-data)
 | 
			
		||||
    [ ascii-lower CHAR: \s CHAR: - replace swap ] assoc-map
 | 
			
		||||
    >hashtable ;
 | 
			
		||||
 | 
			
		||||
: multihex ( hexstring -- string )
 | 
			
		||||
    " " split [ hex> ] map [ ] subset ;
 | 
			
		||||
 | 
			
		||||
TUPLE: code-point lower title upper ;
 | 
			
		||||
 | 
			
		||||
C: <code-point> code-point
 | 
			
		||||
 | 
			
		||||
: set-code-point ( seq -- )
 | 
			
		||||
    4 head [ multihex ] map first4
 | 
			
		||||
    <code-point> swap first set ;
 | 
			
		||||
 | 
			
		||||
: load-special-casing ( -- special-casing )
 | 
			
		||||
    "extra/unicode/SpecialCasing.txt" resource-path data
 | 
			
		||||
    [ length 5 = ] subset
 | 
			
		||||
    [ [ set-code-point ] each ] H{ } make-assoc ;
 | 
			
		||||
 | 
			
		||||
DEFER: simple-lower
 | 
			
		||||
DEFER: simple-upper
 | 
			
		||||
DEFER: simple-title
 | 
			
		||||
DEFER: canonical-map
 | 
			
		||||
DEFER: combine-map
 | 
			
		||||
DEFER: class-map
 | 
			
		||||
DEFER: compat-map
 | 
			
		||||
DEFER: category-map
 | 
			
		||||
DEFER: name-map
 | 
			
		||||
DEFER: special-casing
 | 
			
		||||
 | 
			
		||||
<<
 | 
			
		||||
    load-data
 | 
			
		||||
    dup process-names \ name-map define-value
 | 
			
		||||
    13 over process-data \ simple-lower define-value
 | 
			
		||||
    12 over process-data tuck \ simple-upper define-value
 | 
			
		||||
    14 over process-data swapd union \ simple-title define-value
 | 
			
		||||
    dup process-combining \ class-map define-value
 | 
			
		||||
    dup process-canonical \ canonical-map define-value
 | 
			
		||||
        \ combine-map define-value
 | 
			
		||||
    dup process-compat \ compat-map define-value
 | 
			
		||||
    process-category \ category-map define-value
 | 
			
		||||
    load-special-casing \ special-casing define-value
 | 
			
		||||
>>
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,176 @@
 | 
			
		|||
USING: sequences namespaces unicode.load kernel combinators.lib math
 | 
			
		||||
unicode arrays ;
 | 
			
		||||
IN: unicode.normalize
 | 
			
		||||
 | 
			
		||||
! Utility word
 | 
			
		||||
: make* ( seq quot exemplar -- newseq )
 | 
			
		||||
    ! quot has access to original seq on stack
 | 
			
		||||
    ! this just makes the new-resizable the same length as seq
 | 
			
		||||
    [
 | 
			
		||||
        [
 | 
			
		||||
            pick length swap new-resizable
 | 
			
		||||
            [ building set call ] keep
 | 
			
		||||
        ] keep like
 | 
			
		||||
    ] with-scope ; inline
 | 
			
		||||
 | 
			
		||||
! Conjoining Jamo behavior
 | 
			
		||||
 | 
			
		||||
: hangul-base HEX: ac00 ; inline
 | 
			
		||||
: hangul-end HEX: D7AF ; inline
 | 
			
		||||
: initial-base HEX: 1100 ; inline
 | 
			
		||||
: medial-base HEX: 1161 ; inline
 | 
			
		||||
: final-base HEX: 11a7 ; inline
 | 
			
		||||
 | 
			
		||||
: initial-count 19 ; inline
 | 
			
		||||
: medial-count 21 ; inline
 | 
			
		||||
: final-count 28 ; inline
 | 
			
		||||
 | 
			
		||||
: hangul? ( ch -- ? ) hangul-base hangul-end ?between? ;
 | 
			
		||||
: jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ;
 | 
			
		||||
 | 
			
		||||
! These numbers come from UAX 29
 | 
			
		||||
: initial? ( ch -- ? )
 | 
			
		||||
    [ HEX: 1100 HEX: 1159 ?between? ] [ HEX: 115F = ] either ;
 | 
			
		||||
: medial? ( ch -- ? ) HEX: 1160 HEX: 11A2 ?between? ;
 | 
			
		||||
: final? ( ch -- ? ) HEX: 11A8 HEX: 11F9 ?between? ;
 | 
			
		||||
 | 
			
		||||
: hangul>jamo ( hangul -- jamo-string )
 | 
			
		||||
    hangul-base - final-count /mod final-base +
 | 
			
		||||
    >r medial-count /mod medial-base +
 | 
			
		||||
    >r initial-base + r> r>
 | 
			
		||||
    dup zero? [ drop 2array ] [ 3array ] if ;
 | 
			
		||||
 | 
			
		||||
: jamo>hangul ( initial medial final -- hangul )
 | 
			
		||||
    >r >r initial-base - medial-count *
 | 
			
		||||
    r> medial-base - + final-count *
 | 
			
		||||
    r> final-base - + hangul-base + ;
 | 
			
		||||
 | 
			
		||||
! Normalization -- Decomposition 
 | 
			
		||||
 | 
			
		||||
: (insert) ( seq n quot -- )
 | 
			
		||||
    over 0 = [ 3drop ] [
 | 
			
		||||
        [ >r dup 1- rot [ nth ] curry 2apply r> 2apply > ] 3keep
 | 
			
		||||
        roll [ 3drop ]
 | 
			
		||||
        [ >r [ dup 1- rot exchange ] 2keep 1- r> (insert) ] if
 | 
			
		||||
    ] if ; inline
 | 
			
		||||
 | 
			
		||||
: insert ( seq quot elt n -- )
 | 
			
		||||
    swap rot >r -rot [ swap set-nth ] 2keep r> (insert) ; inline
 | 
			
		||||
 | 
			
		||||
: insertion-sort ( seq quot -- )
 | 
			
		||||
    ! quot is a transformation on elements
 | 
			
		||||
    over dup length
 | 
			
		||||
    [ >r >r 2dup r> r> insert ] 2each 2drop ; inline
 | 
			
		||||
 | 
			
		||||
: reorder-slice ( string start -- slice done? )
 | 
			
		||||
    2dup swap [ non-starter? not ] find* drop
 | 
			
		||||
    [ [ over length ] unless* rot <slice> ] keep not ;
 | 
			
		||||
 | 
			
		||||
: reorder-next ( string i -- new-i done? )
 | 
			
		||||
    over [ non-starter? ] find* drop [
 | 
			
		||||
        reorder-slice
 | 
			
		||||
        >r dup [ combining-class ] insertion-sort slice-to r>
 | 
			
		||||
    ] [ length t ] if* ;
 | 
			
		||||
 | 
			
		||||
: reorder-loop ( string start -- )
 | 
			
		||||
    dupd reorder-next [ 2drop ] [ reorder-loop ] if ;
 | 
			
		||||
 | 
			
		||||
: reorder ( string -- )
 | 
			
		||||
    0 reorder-loop ;
 | 
			
		||||
 | 
			
		||||
: reorder-back ( string i -- )
 | 
			
		||||
    over [ non-starter? not ] find-last* 1+* reorder-next 2drop ;
 | 
			
		||||
 | 
			
		||||
: decompose ( string quot -- decomposed )
 | 
			
		||||
    ! When there are 8 and 32-bit strings, this'll be
 | 
			
		||||
    ! equivalent to clone on 8 and the contents of the last
 | 
			
		||||
    ! main quotation on 32.
 | 
			
		||||
    over [ 127 < ] all? [ drop ] [
 | 
			
		||||
        swap [ [
 | 
			
		||||
            dup hangul? [ hangul>jamo % drop ]
 | 
			
		||||
            [ dup rot call [ % ] [ , ] ?if ] if
 | 
			
		||||
        ] curry* each ] "" make*
 | 
			
		||||
        dup reorder
 | 
			
		||||
    ] if ; inline
 | 
			
		||||
 | 
			
		||||
: nfd ( string -- string )
 | 
			
		||||
    [ canonical-entry ] decompose ;
 | 
			
		||||
 | 
			
		||||
: nfkd ( string -- string )
 | 
			
		||||
    [ compat-entry ] decompose ;
 | 
			
		||||
 | 
			
		||||
: string-append ( s1 s2 -- string )
 | 
			
		||||
    ! This could be more optimized,
 | 
			
		||||
    ! but in practice, it'll almost always just be append
 | 
			
		||||
    [ append ] keep
 | 
			
		||||
    0 over ?nth non-starter?
 | 
			
		||||
    [ length dupd reorder-back ] [ drop ] if ;
 | 
			
		||||
 | 
			
		||||
! Normalization -- Composition
 | 
			
		||||
SYMBOL: main-str
 | 
			
		||||
SYMBOL: ind
 | 
			
		||||
SYMBOL: after
 | 
			
		||||
SYMBOL: char
 | 
			
		||||
 | 
			
		||||
: get-str ( i -- ch ) ind get + main-str get ?nth ;
 | 
			
		||||
: current ( -- ch ) 0 get-str ;
 | 
			
		||||
: to ( -- ) ind inc ;
 | 
			
		||||
 | 
			
		||||
: initial-medial? ( -- ? )
 | 
			
		||||
    current initial? [ 1 get-str medial? ] [ f ] if ;
 | 
			
		||||
 | 
			
		||||
: --final? ( -- ? )
 | 
			
		||||
    2 get-str final? ;
 | 
			
		||||
 | 
			
		||||
: imf, ( -- )
 | 
			
		||||
    current to current to current jamo>hangul , ;
 | 
			
		||||
 | 
			
		||||
: im, ( -- )
 | 
			
		||||
    current to current 0 jamo>hangul , ;
 | 
			
		||||
 | 
			
		||||
: compose-jamo ( -- )
 | 
			
		||||
    initial-medial? [
 | 
			
		||||
        --final? [ imf, ] [ im, ] if
 | 
			
		||||
    ] when to current jamo? [ compose-jamo ] when ;
 | 
			
		||||
 | 
			
		||||
: pass-combining ( -- )
 | 
			
		||||
    current non-starter? [ current , to pass-combining ] when ;
 | 
			
		||||
 | 
			
		||||
: try-compose ( last-class char current-class -- )
 | 
			
		||||
    swapd = [ after get push ] [
 | 
			
		||||
        char get over combine-chars
 | 
			
		||||
        [ nip char set ] [ after get push ] if*
 | 
			
		||||
    ] if ;
 | 
			
		||||
 | 
			
		||||
: compose-iter ( n -- )
 | 
			
		||||
    current [
 | 
			
		||||
        dup combining-class dup
 | 
			
		||||
        [ [ try-compose ] keep to compose-iter ] [ 3drop ] if
 | 
			
		||||
    ] [ drop ] if* ;
 | 
			
		||||
 | 
			
		||||
: ?new-after ( -- )
 | 
			
		||||
    after [ dup empty? [ drop SBUF" " clone ] unless ] change ;
 | 
			
		||||
 | 
			
		||||
: (compose) ( -- )
 | 
			
		||||
    current [
 | 
			
		||||
        dup jamo? [ drop compose-jamo ] [
 | 
			
		||||
            char set to ?new-after
 | 
			
		||||
            0 compose-iter
 | 
			
		||||
            char get , after get %
 | 
			
		||||
            to
 | 
			
		||||
        ] if (compose)
 | 
			
		||||
    ] when* ;
 | 
			
		||||
 | 
			
		||||
: compose ( str -- comp )
 | 
			
		||||
    [
 | 
			
		||||
        main-str set
 | 
			
		||||
        0 ind set
 | 
			
		||||
        SBUF" " clone after set
 | 
			
		||||
        pass-combining (compose)
 | 
			
		||||
    ] "" make* ;
 | 
			
		||||
 | 
			
		||||
: nfc ( string -- nfc )
 | 
			
		||||
    nfd compose ;
 | 
			
		||||
 | 
			
		||||
: nfkc ( string -- nfkc )
 | 
			
		||||
    nfkc compose ;
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,56 @@
 | 
			
		|||
USING: unicode.load kernel math sequences parser bit-arrays namespaces 
 | 
			
		||||
sequences.private arrays quotations classes.predicate ;
 | 
			
		||||
IN: unicode.syntax
 | 
			
		||||
 | 
			
		||||
! Character classes (categories)
 | 
			
		||||
 | 
			
		||||
: category# ( char -- category )
 | 
			
		||||
    ! There are a few characters that should be Cn
 | 
			
		||||
    ! that this gives Cf or Mn
 | 
			
		||||
    ! Cf = 26; Mn = 5; Cn = 29
 | 
			
		||||
    ! Use a compressed array instead?
 | 
			
		||||
    dup category-map ?nth [ ] [
 | 
			
		||||
        dup HEX: E0001 HEX: E007F between?
 | 
			
		||||
        [ drop 26 ] [
 | 
			
		||||
            HEX: E0100 HEX: E01EF between?  5 29 ?
 | 
			
		||||
        ] if
 | 
			
		||||
    ] ?if ;
 | 
			
		||||
 | 
			
		||||
: category ( char -- category )
 | 
			
		||||
    category# categories nth ;
 | 
			
		||||
 | 
			
		||||
: >category-array ( categories -- bitarray )
 | 
			
		||||
    categories [ swap member? ] curry* map >bit-array ;
 | 
			
		||||
 | 
			
		||||
: as-string ( strings -- bit-array )
 | 
			
		||||
    concat "\"" tuck 3append parse first ;
 | 
			
		||||
 | 
			
		||||
: [category] ( categories -- quot )
 | 
			
		||||
    [
 | 
			
		||||
        [ [ categories member? not ] subset as-string ] keep 
 | 
			
		||||
        [ categories member? ] subset >category-array
 | 
			
		||||
        [ dup category# ] % , [ nth-unsafe [ drop t ] ] %
 | 
			
		||||
        \ member? 2array >quotation ,
 | 
			
		||||
        \ if ,
 | 
			
		||||
    ] [ ] make ;
 | 
			
		||||
 | 
			
		||||
: define-category ( word categories -- )
 | 
			
		||||
    [category] fixnum -rot define-predicate-class ;
 | 
			
		||||
 | 
			
		||||
: CATEGORY:
 | 
			
		||||
    CREATE ";" parse-tokens define-category ; parsing
 | 
			
		||||
 | 
			
		||||
: seq-minus ( seq1 seq2 -- diff )
 | 
			
		||||
    [ member? not ] curry subset ;
 | 
			
		||||
 | 
			
		||||
: CATEGORY-NOT:
 | 
			
		||||
    CREATE ";" parse-tokens
 | 
			
		||||
    categories swap seq-minus define-category ; parsing
 | 
			
		||||
 | 
			
		||||
TUPLE: code-point lower title upper ;
 | 
			
		||||
 | 
			
		||||
C: <code-point> code-point
 | 
			
		||||
 | 
			
		||||
: set-code-point ( seq -- )
 | 
			
		||||
    4 head [ multihex ] map first4
 | 
			
		||||
    <code-point> swap first set ;
 | 
			
		||||
| 
						 | 
				
			
			@ -1,188 +1,13 @@
 | 
			
		|||
USING: kernel hashtables sequences io arrays math 
 | 
			
		||||
    hash2 namespaces strings assocs words splitting sequences.next
 | 
			
		||||
    byte-arrays quotations sequences.private io.files bit-arrays
 | 
			
		||||
    combinators math.parser io.streams.lines parser classes
 | 
			
		||||
    classes.predicate ;
 | 
			
		||||
USING: unicode.syntax hash2 assocs unicode.load kernel parser ;
 | 
			
		||||
IN: unicode
 | 
			
		||||
 | 
			
		||||
! Convenience functions
 | 
			
		||||
: >set ( seq -- hash )
 | 
			
		||||
    [ dup ] H{ } map>assoc ;
 | 
			
		||||
 | 
			
		||||
: either ( object first second -- ? )
 | 
			
		||||
    >r over slip swap [ r> drop ] [ r> call ] ?if ; inline
 | 
			
		||||
 | 
			
		||||
: 1+* ( n/f _ -- n+1 )
 | 
			
		||||
    drop [ 1+ ] [ 0 ] if* ;
 | 
			
		||||
 | 
			
		||||
: define-value ( value word -- )
 | 
			
		||||
    swap 1quotation define-compound ;
 | 
			
		||||
 | 
			
		||||
: ?between? ( n/f from to -- ? )
 | 
			
		||||
    pick [ between? ] [ 3drop f ] if ;
 | 
			
		||||
 | 
			
		||||
: range ( from to -- seq )
 | 
			
		||||
    1+ over - [ + ] curry* map ;
 | 
			
		||||
 | 
			
		||||
! Loading data from UnicodeData.txt
 | 
			
		||||
 | 
			
		||||
: data ( filename -- data )
 | 
			
		||||
    <file-reader> lines [ ";" split ] map ;
 | 
			
		||||
 | 
			
		||||
: load-data ( -- data )
 | 
			
		||||
    "extra/unicode/UnicodeData.txt" resource-path data ;
 | 
			
		||||
 | 
			
		||||
: (process-data) ( index data -- newdata )
 | 
			
		||||
    [ [ nth ] keep first swap 2array ] curry* map
 | 
			
		||||
    [ second empty? not ] subset
 | 
			
		||||
    [ >r hex> r> ] assoc-map ;
 | 
			
		||||
 | 
			
		||||
: process-data ( index data -- hash )
 | 
			
		||||
    (process-data) [ hex> ] assoc-map >hashtable ;
 | 
			
		||||
 | 
			
		||||
: (chain-decomposed) ( hash value -- newvalue )
 | 
			
		||||
    [
 | 
			
		||||
        2dup swap at
 | 
			
		||||
        [ (chain-decomposed) ] [ 1array nip ] ?if
 | 
			
		||||
    ] curry* map concat ;
 | 
			
		||||
 | 
			
		||||
: chain-decomposed ( hash -- newhash )
 | 
			
		||||
    dup [ swap (chain-decomposed) ] curry assoc-map ;
 | 
			
		||||
 | 
			
		||||
: first* ( seq -- ? )
 | 
			
		||||
    second [ empty? ] [ first ] either ;
 | 
			
		||||
 | 
			
		||||
: (process-decomposed) ( data -- alist )
 | 
			
		||||
    5 swap (process-data)
 | 
			
		||||
    [ " " split [ hex> ] map ] assoc-map ;
 | 
			
		||||
 | 
			
		||||
: process-canonical ( data -- hash2 hash )
 | 
			
		||||
    (process-decomposed) [ first* ] subset
 | 
			
		||||
    [
 | 
			
		||||
        [ second length 2 = ] subset
 | 
			
		||||
        ! using 1009 as the size, the maximum load is 4
 | 
			
		||||
        [ first2 first2 rot 3array ] map 1009 alist>hash2
 | 
			
		||||
    ] keep
 | 
			
		||||
    >hashtable chain-decomposed ;
 | 
			
		||||
 | 
			
		||||
: process-compat ( data -- hash )
 | 
			
		||||
    (process-decomposed)
 | 
			
		||||
    [ dup first* [ first2 1 tail 2array ] unless ] map
 | 
			
		||||
    >hashtable chain-decomposed ;
 | 
			
		||||
 | 
			
		||||
: process-combining ( data -- hash )
 | 
			
		||||
    3 swap (process-data)
 | 
			
		||||
    [ string>number ] assoc-map
 | 
			
		||||
    [ nip 0 = not ] assoc-subset
 | 
			
		||||
    >hashtable ;
 | 
			
		||||
 | 
			
		||||
: categories ( -- names )
 | 
			
		||||
    ! For non-existent characters, use Cn
 | 
			
		||||
    { "Lu" "Ll" "Lt" "Lm" "Lo"
 | 
			
		||||
      "Mn" "Mc" "Me"
 | 
			
		||||
      "Nd" "Nl" "No"
 | 
			
		||||
      "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
 | 
			
		||||
      "Sm" "Sc" "Sk" "So"
 | 
			
		||||
      "Zs" "Zl" "Zp"
 | 
			
		||||
      "Cc" "Cf" "Cs" "Co" "Cn" } ;
 | 
			
		||||
 | 
			
		||||
: unicode-chars HEX: 2FA1E ;
 | 
			
		||||
! the maximum unicode char in the first 3 planes
 | 
			
		||||
 | 
			
		||||
: process-category ( data -- category-listing )
 | 
			
		||||
    2 swap (process-data)
 | 
			
		||||
    unicode-chars <byte-array> swap dupd swap [
 | 
			
		||||
        >r over unicode-chars >= [ r> 3drop ]
 | 
			
		||||
        [ categories index swap r> set-nth ] if
 | 
			
		||||
    ] curry assoc-each ;
 | 
			
		||||
 | 
			
		||||
: ascii-lower ( string -- lower )
 | 
			
		||||
    [ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ;
 | 
			
		||||
 | 
			
		||||
: replace ( seq old new -- newseq )
 | 
			
		||||
    swap rot [ 2dup = [ drop over ] when ] map 2nip ;
 | 
			
		||||
 | 
			
		||||
: process-names ( data -- names-hash )
 | 
			
		||||
    1 swap (process-data)
 | 
			
		||||
    [ ascii-lower CHAR: \s CHAR: - replace swap ] assoc-map
 | 
			
		||||
    >hashtable ;
 | 
			
		||||
 | 
			
		||||
DEFER: simple-lower
 | 
			
		||||
DEFER: simple-upper
 | 
			
		||||
DEFER: simple-title
 | 
			
		||||
DEFER: canonical-map
 | 
			
		||||
DEFER: combine-map
 | 
			
		||||
DEFER: class-map
 | 
			
		||||
DEFER: compat-map
 | 
			
		||||
DEFER: category-map
 | 
			
		||||
DEFER: name-map
 | 
			
		||||
 | 
			
		||||
: load-tables ( -- )
 | 
			
		||||
    load-data
 | 
			
		||||
    dup process-names \ name-map define-value
 | 
			
		||||
    13 over process-data \ simple-lower define-value
 | 
			
		||||
    12 over process-data tuck \ simple-upper define-value
 | 
			
		||||
    14 over process-data swapd union \ simple-title define-value
 | 
			
		||||
    dup process-combining \ class-map define-value
 | 
			
		||||
    dup process-canonical \ canonical-map define-value
 | 
			
		||||
    \ combine-map define-value
 | 
			
		||||
    dup process-compat \ compat-map define-value
 | 
			
		||||
    process-category \ category-map define-value ; parsing
 | 
			
		||||
load-tables
 | 
			
		||||
 | 
			
		||||
: canonical-entry ( char -- seq ) canonical-map at ;
 | 
			
		||||
: combine-chars ( a b -- char/f ) combine-map hash2 ;
 | 
			
		||||
: compat-entry ( char -- seq ) compat-map at  ;
 | 
			
		||||
: combining-class ( char -- n ) class-map at ;
 | 
			
		||||
: non-starter? ( char -- ? ) class-map key? ;
 | 
			
		||||
: name>char ( string -- char ) name-map at ;
 | 
			
		||||
 | 
			
		||||
: UNICHAR:
 | 
			
		||||
    scan name>char [ parsed ] [ "Invalid character" throw ] if* ; parsing
 | 
			
		||||
 | 
			
		||||
! Character classes (categories)
 | 
			
		||||
 | 
			
		||||
: category# ( char -- category )
 | 
			
		||||
    ! There are a few characters that should be Cn
 | 
			
		||||
    ! that this gives Cf or Mn
 | 
			
		||||
    ! Cf = 26; Mn = 5; Cn = 29
 | 
			
		||||
    dup category-map ?nth [ ] [
 | 
			
		||||
        dup HEX: E0001 HEX: E007F between?
 | 
			
		||||
        [ drop 26 ] [
 | 
			
		||||
            HEX: E0100 HEX: E01EF between?  5 29 ?
 | 
			
		||||
        ] if
 | 
			
		||||
    ] ?if ;
 | 
			
		||||
 | 
			
		||||
: category ( char -- category )
 | 
			
		||||
    category# categories nth ;
 | 
			
		||||
 | 
			
		||||
: >category-array ( categories -- bitarray )
 | 
			
		||||
    categories [ swap member? ] curry* map >bit-array ;
 | 
			
		||||
 | 
			
		||||
: as-string ( strings -- bit-array )
 | 
			
		||||
    concat "\"" tuck 3append parse first ;
 | 
			
		||||
 | 
			
		||||
: [category] ( categories -- quot )
 | 
			
		||||
    [
 | 
			
		||||
        [ [ categories member? not ] subset as-string ] keep 
 | 
			
		||||
        [ categories member? ] subset >category-array
 | 
			
		||||
        [ dup category# ] % , [ nth-unsafe [ drop t ] ] %
 | 
			
		||||
        \ member? 2array >quotation ,
 | 
			
		||||
        \ if ,
 | 
			
		||||
    ] [ ] make ;
 | 
			
		||||
 | 
			
		||||
: define-category ( word categories -- )
 | 
			
		||||
    [category] fixnum -rot define-predicate-class ;
 | 
			
		||||
 | 
			
		||||
: CATEGORY:
 | 
			
		||||
    CREATE ";" parse-tokens define-category ; parsing
 | 
			
		||||
 | 
			
		||||
: seq-minus ( seq1 seq2 -- diff )
 | 
			
		||||
    [ member? not ] curry subset ;
 | 
			
		||||
 | 
			
		||||
: CATEGORY-NOT:
 | 
			
		||||
    CREATE ";" parse-tokens
 | 
			
		||||
    categories swap seq-minus define-category ; parsing
 | 
			
		||||
: char>name ( char -- string ) name-map value-at ;
 | 
			
		||||
 | 
			
		||||
CATEGORY: blank Zs Zl Zp ;
 | 
			
		||||
CATEGORY: letter Ll ;
 | 
			
		||||
| 
						 | 
				
			
			@ -195,309 +20,6 @@ CATEGORY: control Cc ;
 | 
			
		|||
CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ; 
 | 
			
		||||
CATEGORY-NOT: character Cn ;
 | 
			
		||||
 | 
			
		||||
! Utility word
 | 
			
		||||
: make* ( seq quot exemplar -- newseq )
 | 
			
		||||
    ! quot has access to original seq on stack
 | 
			
		||||
    ! this just makes the new-resizable the same length as seq
 | 
			
		||||
    [
 | 
			
		||||
        [
 | 
			
		||||
            pick length swap new-resizable
 | 
			
		||||
            [ building set call ] keep
 | 
			
		||||
        ] keep like
 | 
			
		||||
    ] with-scope ; inline
 | 
			
		||||
 | 
			
		||||
! Case mapping
 | 
			
		||||
 | 
			
		||||
: hash-default ( key hash -- value/key )
 | 
			
		||||
    dupd at [ nip ] when* ;
 | 
			
		||||
 | 
			
		||||
: ch>lower ( ch -- lower ) simple-lower hash-default ;
 | 
			
		||||
: ch>upper ( ch -- upper ) simple-upper hash-default ;
 | 
			
		||||
: ch>title ( ch -- title ) simple-title hash-default ;
 | 
			
		||||
 | 
			
		||||
: load-special-data ( -- data )
 | 
			
		||||
    "extra/unicode/SpecialCasing.txt" resource-path data
 | 
			
		||||
    [ length 5 = ] subset ;
 | 
			
		||||
 | 
			
		||||
: multihex ( hexstring -- string )
 | 
			
		||||
    " " split [ hex> ] map [ ] subset ;
 | 
			
		||||
 | 
			
		||||
TUPLE: code-point lower title upper ;
 | 
			
		||||
 | 
			
		||||
C: <code-point> code-point
 | 
			
		||||
 | 
			
		||||
: set-code-point ( seq -- )
 | 
			
		||||
    4 head [ multihex ] map first4
 | 
			
		||||
    <code-point> swap first set ;
 | 
			
		||||
 | 
			
		||||
DEFER: special-casing
 | 
			
		||||
: load-special-casing
 | 
			
		||||
    load-special-data [ [ set-code-point ] each ] H{ } make-assoc
 | 
			
		||||
    \ special-casing define-value ; parsing
 | 
			
		||||
load-special-casing
 | 
			
		||||
 | 
			
		||||
SYMBOL: locale ! Just casing locale, or overall?
 | 
			
		||||
 | 
			
		||||
: i-dot? ( -- ? )
 | 
			
		||||
    locale get { "tr" "az" } member? ;
 | 
			
		||||
 | 
			
		||||
: lithuanian? ( -- ? ) locale get "lt" = ;
 | 
			
		||||
 | 
			
		||||
: dot-over ( -- ch ) CHAR: \u0307 ;
 | 
			
		||||
 | 
			
		||||
: lithuanian-ch>upper ( ? next ch -- ? )
 | 
			
		||||
    rot [ 2drop f ]
 | 
			
		||||
    [ swap dot-over = over "ij" member? and swap , ] if ;
 | 
			
		||||
 | 
			
		||||
: lithuanian>upper ( string -- lower )
 | 
			
		||||
    [ f swap [ lithuanian-ch>upper ] each-next drop ] "" make* ;
 | 
			
		||||
 | 
			
		||||
: mark-above? ( ch -- ? )
 | 
			
		||||
    combining-class 230 = ;
 | 
			
		||||
 | 
			
		||||
: lithuanian-ch>lower ( next ch -- )
 | 
			
		||||
    ! This fails to add a dot above in certain edge cases
 | 
			
		||||
    ! where there is a non-above combining mark before an above one
 | 
			
		||||
    ! in Lithuanian
 | 
			
		||||
    dup , "IJ" member? swap mark-above? and [ dot-over , ] when ;
 | 
			
		||||
 | 
			
		||||
: lithuanian>lower ( string -- lower )
 | 
			
		||||
    [ [ lithuanian-ch>lower ] each-next ] "" make* ;
 | 
			
		||||
 | 
			
		||||
: turk-ch>upper ( ch -- )
 | 
			
		||||
    dup CHAR: i = 
 | 
			
		||||
    [ drop CHAR: I , dot-over , ] [ , ] if ;
 | 
			
		||||
 | 
			
		||||
: turk>upper ( string -- upper-i )
 | 
			
		||||
    [ [ turk-ch>upper ] each ] "" make* ;
 | 
			
		||||
 | 
			
		||||
: turk-ch>lower ( ? next ch -- ? )
 | 
			
		||||
    {
 | 
			
		||||
        { [ rot ] [ 2drop f ] }
 | 
			
		||||
        { [ dup CHAR: I = ] [
 | 
			
		||||
            drop dot-over =
 | 
			
		||||
            dup CHAR: i CHAR: \u0131 ? ,
 | 
			
		||||
        ] }
 | 
			
		||||
        { [ t ] [ , drop f ] }
 | 
			
		||||
    } cond ;
 | 
			
		||||
 | 
			
		||||
: turk>lower ( string -- lower-i )
 | 
			
		||||
    [ f swap [ turk-ch>lower ] each-next drop ] "" make* ;
 | 
			
		||||
 | 
			
		||||
: word-boundary ( prev char -- new ? )
 | 
			
		||||
    dup non-starter? [ drop dup ] when
 | 
			
		||||
    swap uncased? ;
 | 
			
		||||
 | 
			
		||||
: sigma-map ( string -- string )
 | 
			
		||||
    [
 | 
			
		||||
        swap [ uncased? ] keep not or
 | 
			
		||||
        [ drop HEX: 3C2 ] when
 | 
			
		||||
    ] map-next ;
 | 
			
		||||
 | 
			
		||||
: final-sigma ( string -- string )
 | 
			
		||||
    HEX: 3A3 over member? [ sigma-map ] when ;
 | 
			
		||||
 | 
			
		||||
: map-case ( string string-quot char-quot -- case )
 | 
			
		||||
    [
 | 
			
		||||
        rot [
 | 
			
		||||
            -rot [
 | 
			
		||||
                rot dup special-casing at
 | 
			
		||||
                [ -rot drop call % ]
 | 
			
		||||
                [ -rot nip call , ] ?if
 | 
			
		||||
            ] 2keep
 | 
			
		||||
        ] each 2drop
 | 
			
		||||
    ] "" make* ; inline
 | 
			
		||||
 | 
			
		||||
: >lower ( string -- lower )
 | 
			
		||||
    i-dot? [ turk>lower ] when
 | 
			
		||||
    final-sigma [ code-point-lower ] [ ch>lower ] map-case ;
 | 
			
		||||
 | 
			
		||||
: >upper ( string -- upper )
 | 
			
		||||
    i-dot? [ turk>upper ] when
 | 
			
		||||
    [ code-point-upper ] [ ch>upper ] map-case ;
 | 
			
		||||
 | 
			
		||||
: >title ( string -- title )
 | 
			
		||||
    final-sigma
 | 
			
		||||
    CHAR: \s swap
 | 
			
		||||
    [ tuck word-boundary swapd
 | 
			
		||||
        [ code-point-title ] [ code-point-lower ] if ]
 | 
			
		||||
    [ tuck word-boundary swapd 
 | 
			
		||||
        [ ch>title ] [ ch>lower ] if ]
 | 
			
		||||
    map-case nip ;
 | 
			
		||||
 | 
			
		||||
: >case-fold ( string -- fold )
 | 
			
		||||
    >upper >lower ;
 | 
			
		||||
 | 
			
		||||
: insensitive= ( str1 str2 -- ? )
 | 
			
		||||
    [ >case-fold ] 2apply = ;
 | 
			
		||||
 | 
			
		||||
: lower? ( string -- ? )
 | 
			
		||||
    dup >lower = ;
 | 
			
		||||
: upper? ( string -- ? )
 | 
			
		||||
    dup >lower = ;
 | 
			
		||||
: title? ( string -- ? )
 | 
			
		||||
    dup >title = ;
 | 
			
		||||
: case-fold? ( string -- ? )
 | 
			
		||||
    dup >case-fold = ;
 | 
			
		||||
 | 
			
		||||
! Conjoining Jamo behavior
 | 
			
		||||
 | 
			
		||||
: hangul-base HEX: ac00 ; inline
 | 
			
		||||
: hangul-end HEX: D7AF ; inline
 | 
			
		||||
: initial-base HEX: 1100 ; inline
 | 
			
		||||
: medial-base HEX: 1161 ; inline
 | 
			
		||||
: final-base HEX: 11a7 ; inline
 | 
			
		||||
 | 
			
		||||
: initial-count 19 ; inline
 | 
			
		||||
: medial-count 21 ; inline
 | 
			
		||||
: final-count 28 ; inline
 | 
			
		||||
 | 
			
		||||
: hangul? ( ch -- ? ) hangul-base hangul-end ?between? ;
 | 
			
		||||
: jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ;
 | 
			
		||||
 | 
			
		||||
! These numbers come from UAX 29
 | 
			
		||||
: initial? ( ch -- ? )
 | 
			
		||||
    [ HEX: 1100 HEX: 1159 ?between? ] [ HEX: 115F = ] either ;
 | 
			
		||||
: medial? ( ch -- ? ) HEX: 1160 HEX: 11A2 ?between? ;
 | 
			
		||||
: final? ( ch -- ? ) HEX: 11A8 HEX: 11F9 ?between? ;
 | 
			
		||||
 | 
			
		||||
: hangul>jamo ( hangul -- jamo-string )
 | 
			
		||||
    hangul-base - final-count /mod final-base +
 | 
			
		||||
    >r medial-count /mod medial-base +
 | 
			
		||||
    >r initial-base + r> r>
 | 
			
		||||
    dup zero? [ drop 2array ] [ 3array ] if ;
 | 
			
		||||
 | 
			
		||||
: jamo>hangul ( initial medial final -- hangul )
 | 
			
		||||
    >r >r initial-base - medial-count *
 | 
			
		||||
    r> medial-base - + final-count *
 | 
			
		||||
    r> final-base - + hangul-base + ;
 | 
			
		||||
 | 
			
		||||
! Normalization -- Decomposition 
 | 
			
		||||
 | 
			
		||||
: (insert) ( seq n quot -- )
 | 
			
		||||
    over 0 = [ 3drop ] [
 | 
			
		||||
        [ >r dup 1- rot [ nth ] curry 2apply r> 2apply > ] 3keep
 | 
			
		||||
        roll [ 3drop ]
 | 
			
		||||
        [ >r [ dup 1- rot exchange ] 2keep 1- r> (insert) ] if
 | 
			
		||||
    ] if ; inline
 | 
			
		||||
 | 
			
		||||
: insert ( seq quot elt n -- )
 | 
			
		||||
    swap rot >r -rot [ swap set-nth ] 2keep r> (insert) ; inline
 | 
			
		||||
 | 
			
		||||
: insertion-sort ( seq quot -- )
 | 
			
		||||
    ! quot is a transformation on elements
 | 
			
		||||
    over dup length
 | 
			
		||||
    [ >r >r 2dup r> r> insert ] 2each 2drop ; inline
 | 
			
		||||
 | 
			
		||||
: reorder-slice ( string start -- slice done? )
 | 
			
		||||
    2dup swap [ non-starter? not ] find* drop
 | 
			
		||||
    [ [ over length ] unless* rot <slice> ] keep not ;
 | 
			
		||||
 | 
			
		||||
: reorder-next ( string i -- new-i done? )
 | 
			
		||||
    over [ non-starter? ] find* drop [
 | 
			
		||||
        reorder-slice
 | 
			
		||||
        >r dup [ combining-class ] insertion-sort slice-to r>
 | 
			
		||||
    ] [ length t ] if* ;
 | 
			
		||||
 | 
			
		||||
: reorder-loop ( string start -- )
 | 
			
		||||
    dupd reorder-next [ 2drop ] [ reorder-loop ] if ;
 | 
			
		||||
 | 
			
		||||
: reorder ( string -- )
 | 
			
		||||
    0 reorder-loop ;
 | 
			
		||||
 | 
			
		||||
: reorder-back ( string i -- )
 | 
			
		||||
    over [ non-starter? not ] find-last* 1+* reorder-next 2drop ;
 | 
			
		||||
 | 
			
		||||
: decompose ( string quot -- decomposed )
 | 
			
		||||
    ! When there are 8 and 32-bit strings, this'll be
 | 
			
		||||
    ! equivalent to clone on 8 and the contents of the last
 | 
			
		||||
    ! main quotation on 32.
 | 
			
		||||
    over [ 127 < ] all? [ drop ] [
 | 
			
		||||
        swap [ [
 | 
			
		||||
            dup hangul? [ hangul>jamo % drop ]
 | 
			
		||||
            [ dup rot call [ % ] [ , ] ?if ] if
 | 
			
		||||
        ] curry* each ] "" make*
 | 
			
		||||
        dup reorder
 | 
			
		||||
    ] if ; inline
 | 
			
		||||
 | 
			
		||||
: nfd ( string -- string )
 | 
			
		||||
    [ canonical-entry ] decompose ;
 | 
			
		||||
 | 
			
		||||
: nfkd ( string -- string )
 | 
			
		||||
    [ compat-entry ] decompose ;
 | 
			
		||||
 | 
			
		||||
: string-append ( s1 s2 -- string )
 | 
			
		||||
    ! This could be more optimized,
 | 
			
		||||
    ! but in practice, it'll almost always just be append
 | 
			
		||||
    [ append ] keep
 | 
			
		||||
    0 over ?nth non-starter?
 | 
			
		||||
    [ length dupd reorder-back ] [ drop ] if ;
 | 
			
		||||
 | 
			
		||||
! Normalization -- Composition
 | 
			
		||||
SYMBOL: main-str
 | 
			
		||||
SYMBOL: ind
 | 
			
		||||
SYMBOL: after
 | 
			
		||||
SYMBOL: char
 | 
			
		||||
 | 
			
		||||
: get-str ( i -- ch ) ind get + main-str get ?nth ;
 | 
			
		||||
: current ( -- ch ) 0 get-str ;
 | 
			
		||||
: to ( -- ) ind inc ;
 | 
			
		||||
 | 
			
		||||
: initial-medial? ( -- ? )
 | 
			
		||||
    current initial? [ 1 get-str medial? ] [ f ] if ;
 | 
			
		||||
 | 
			
		||||
: --final? ( -- ? )
 | 
			
		||||
    2 get-str final? ;
 | 
			
		||||
 | 
			
		||||
: imf, ( -- )
 | 
			
		||||
    current to current to current jamo>hangul , ;
 | 
			
		||||
 | 
			
		||||
: im, ( -- )
 | 
			
		||||
    current to current 0 jamo>hangul , ;
 | 
			
		||||
 | 
			
		||||
: compose-jamo ( -- )
 | 
			
		||||
    initial-medial? [
 | 
			
		||||
        --final? [ imf, ] [ im, ] if
 | 
			
		||||
    ] when to current jamo? [ compose-jamo ] when ;
 | 
			
		||||
 | 
			
		||||
: pass-combining ( -- )
 | 
			
		||||
    current non-starter? [ current , to pass-combining ] when ;
 | 
			
		||||
 | 
			
		||||
: try-compose ( last-class char current-class -- )
 | 
			
		||||
    swapd = [ after get push ] [
 | 
			
		||||
        char get over combine-chars
 | 
			
		||||
        [ nip char set ] [ after get push ] if*
 | 
			
		||||
    ] if ;
 | 
			
		||||
 | 
			
		||||
: compose-iter ( n -- )
 | 
			
		||||
    current [
 | 
			
		||||
        dup combining-class dup
 | 
			
		||||
        [ [ try-compose ] keep to compose-iter ] [ 3drop ] if
 | 
			
		||||
    ] [ drop ] if* ;
 | 
			
		||||
 | 
			
		||||
: ?new-after ( -- )
 | 
			
		||||
    after [ dup empty? [ drop SBUF" " clone ] unless ] change ;
 | 
			
		||||
 | 
			
		||||
: (compose) ( -- )
 | 
			
		||||
    current [
 | 
			
		||||
        dup jamo? [ drop compose-jamo ] [
 | 
			
		||||
            char set to ?new-after
 | 
			
		||||
            0 compose-iter
 | 
			
		||||
            char get , after get %
 | 
			
		||||
            to
 | 
			
		||||
        ] if (compose)
 | 
			
		||||
    ] when* ;
 | 
			
		||||
 | 
			
		||||
: compose ( str -- comp )
 | 
			
		||||
    [
 | 
			
		||||
        main-str set
 | 
			
		||||
        0 ind set
 | 
			
		||||
        SBUF" " clone after set
 | 
			
		||||
        pass-combining (compose)
 | 
			
		||||
    ] "" make* ;
 | 
			
		||||
 | 
			
		||||
: nfc ( string -- nfc )
 | 
			
		||||
    nfd compose ;
 | 
			
		||||
 | 
			
		||||
: nfkc ( string -- nfkc )
 | 
			
		||||
    nfkc compose ;
 | 
			
		||||
: UNICHAR:
 | 
			
		||||
    ! This should be part of CHAR:
 | 
			
		||||
    scan name>char [ parsed ] [ "Invalid character" throw ] if* ; parsing
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue