diff --git a/extra/unicode/breaks/breaks.factor b/extra/unicode/breaks/breaks.factor index 5e0a1feab9..781090496e 100644 --- a/extra/unicode/breaks/breaks.factor +++ b/extra/unicode/breaks/breaks.factor @@ -1,6 +1,6 @@ USING: unicode kernel math const combinators splitting sequences math.parser io.files io assocs arrays namespaces -; +math.ranges unicode.normalize unicode.syntax ; IN: unicode.breaks ENUM: Any L V T Extend Control CR LF graphemes ; @@ -25,7 +25,7 @@ CATEGORY: grapheme-control Zl Zp Cc Cf ; : process-other-extend ( lines -- set ) [ "#" split1 drop ";" split1 drop trim-blank ] map [ empty? not ] subset - [ ".." split1 [ dup ] unless* [ hex> ] 2apply range ] map + [ ".." split1 [ dup ] unless* [ hex> ] 2apply [a,b] ] map concat >set ; : other-extend-lines ( -- lines ) diff --git a/extra/unicode/case/case.factor b/extra/unicode/case/case.factor new file mode 100644 index 0000000000..0c5abf208f --- /dev/null +++ b/extra/unicode/case/case.factor @@ -0,0 +1,111 @@ +USING: kernel unicode.load sequences sequences.next namespaces assocs.lib +unicode.normalize math unicode combinators assocs ; +IN: unicode.case + +: ch>lower ( ch -- lower ) simple-lower at-default ; +: ch>upper ( ch -- upper ) simple-upper at-default ; +: ch>title ( ch -- title ) simple-title at-default ; + +SYMBOL: locale ! Just casing locale, or overall? + +: i-dot? ( -- ? ) + locale get { "tr" "az" } member? ; + +: lithuanian? ( -- ? ) locale get "lt" = ; + +: dot-over ( -- ch ) CHAR: \u0307 ; + +: lithuanian-ch>upper ( ? next ch -- ? ) + rot [ 2drop f ] + [ swap dot-over = over "ij" member? and swap , ] if ; + +: lithuanian>upper ( string -- lower ) + [ f swap [ lithuanian-ch>upper ] each-next drop ] "" make* ; + +: mark-above? ( ch -- ? ) + combining-class 230 = ; + +: lithuanian-ch>lower ( next ch -- ) + ! This fails to add a dot above in certain edge cases + ! where there is a non-above combining mark before an above one + ! in Lithuanian + dup , "IJ" member? swap mark-above? and [ dot-over , ] when ; + +: lithuanian>lower ( string -- lower ) + [ [ lithuanian-ch>lower ] each-next ] "" make* ; + +: turk-ch>upper ( ch -- ) + dup CHAR: i = + [ drop CHAR: I , dot-over , ] [ , ] if ; + +: turk>upper ( string -- upper-i ) + [ [ turk-ch>upper ] each ] "" make* ; + +: turk-ch>lower ( ? next ch -- ? ) + { + { [ rot ] [ 2drop f ] } + { [ dup CHAR: I = ] [ + drop dot-over = + dup CHAR: i CHAR: \u0131 ? , + ] } + { [ t ] [ , drop f ] } + } cond ; + +: turk>lower ( string -- lower-i ) + [ f swap [ turk-ch>lower ] each-next drop ] "" make* ; + +: word-boundary ( prev char -- new ? ) + dup non-starter? [ drop dup ] when + swap uncased? ; + +: sigma-map ( string -- string ) + [ + swap [ uncased? ] keep not or + [ drop HEX: 3C2 ] when + ] map-next ; + +: final-sigma ( string -- string ) + HEX: 3A3 over member? [ sigma-map ] when ; + +: map-case ( string string-quot char-quot -- case ) + [ + rot [ + -rot [ + rot dup special-casing at + [ -rot drop call % ] + [ -rot nip call , ] ?if + ] 2keep + ] each 2drop + ] "" make* ; inline + +: >lower ( string -- lower ) + i-dot? [ turk>lower ] when + final-sigma [ code-point-lower ] [ ch>lower ] map-case ; + +: >upper ( string -- upper ) + i-dot? 
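! i-dot? is true when the casing locale (the locale symbol above) is "tr" or
! "az"; in that case turk>upper first rewrites each i as I followed by a
! combining dot above, and map-case then applies the SpecialCasing entries
! and the simple-upper table to every character.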
[ turk>upper ] when + [ code-point-upper ] [ ch>upper ] map-case ; + +: >title ( string -- title ) + final-sigma + CHAR: \s swap + [ tuck word-boundary swapd + [ code-point-title ] [ code-point-lower ] if ] + [ tuck word-boundary swapd + [ ch>title ] [ ch>lower ] if ] + map-case nip ; + +: >case-fold ( string -- fold ) + >upper >lower ; + +: insensitive= ( str1 str2 -- ? ) + [ >case-fold ] 2apply = ; + +: lower? ( string -- ? ) + dup >lower = ; +: upper? ( string -- ? ) + dup >lower = ; +: title? ( string -- ? ) + dup >title = ; +: case-fold? ( string -- ? ) + dup >case-fold = ; diff --git a/extra/unicode/load/load.factor b/extra/unicode/load/load.factor new file mode 100644 index 0000000000..1249677cd4 --- /dev/null +++ b/extra/unicode/load/load.factor @@ -0,0 +1,143 @@ +USING: assocs math kernel sequences io.files hashtables quotations +splitting arrays math.parser combinators.lib hash2 byte-arrays words +namespaces ; +IN: unicode.load + +! Convenience functions +: 1+* ( n/f _ -- n+1 ) + drop [ 1+ ] [ 0 ] if* ; + +: define-value ( value word -- ) + swap 1quotation define-compound ; + +: ?between? ( n/f from to -- ? ) + pick [ between? ] [ 3drop f ] if ; + +! Remove this soon +USE: parser +DEFER: >> +: << \ >> parse-until >quotation call ; parsing + +! Loading data from UnicodeData.txt + +: data ( filename -- data ) + file-lines [ ";" split ] map ; + +: load-data ( -- data ) + "extra/unicode/UnicodeData.txt" resource-path data ; + +: (process-data) ( index data -- newdata ) + [ [ nth ] keep first swap 2array ] curry* map + [ second empty? not ] subset + [ >r hex> r> ] assoc-map ; + +: process-data ( index data -- hash ) + (process-data) [ hex> ] assoc-map >hashtable ; + +: (chain-decomposed) ( hash value -- newvalue ) + [ + 2dup swap at + [ (chain-decomposed) ] [ 1array nip ] ?if + ] curry* map concat ; + +: chain-decomposed ( hash -- newhash ) + dup [ swap (chain-decomposed) ] curry assoc-map ; + +: first* ( seq -- ? ) + second [ empty? ] [ first ] either ; + +: (process-decomposed) ( data -- alist ) + 5 swap (process-data) + [ " " split [ hex> ] map ] assoc-map ; + +: process-canonical ( data -- hash2 hash ) + (process-decomposed) [ first* ] subset + [ + [ second length 2 = ] subset + ! using 1009 as the size, the maximum load is 4 + [ first2 first2 rot 3array ] map 1009 alist>hash2 + ] keep + >hashtable chain-decomposed ; + +: process-compat ( data -- hash ) + (process-decomposed) + [ dup first* [ first2 1 tail 2array ] unless ] map + >hashtable chain-decomposed ; + +: process-combining ( data -- hash ) + 3 swap (process-data) + [ string>number ] assoc-map + [ nip 0 = not ] assoc-subset + >hashtable ; + +: categories ( -- names ) + ! For non-existent characters, use Cn + { "Lu" "Ll" "Lt" "Lm" "Lo" + "Mn" "Mc" "Me" + "Nd" "Nl" "No" + "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po" + "Sm" "Sc" "Sk" "So" + "Zs" "Zl" "Zp" + "Cc" "Cf" "Cs" "Co" "Cn" } ; + +: unicode-chars HEX: 2FA1E ; +! the maximum unicode char in the first 3 planes + +: process-category ( data -- category-listing ) + 2 swap (process-data) + unicode-chars swap dupd swap [ + >r over unicode-chars >= [ r> 3drop ] + [ categories index swap r> set-nth ] if + ] curry assoc-each ; + +: ascii-lower ( string -- lower ) + [ dup CHAR: A CHAR: Z between? 
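! ASCII upper- and lower-case letters differ by HEX: 20, so characters in the
! A-Z range have HEX: 20 added; this word only normalizes character names
! read from UnicodeData.txt (see process-names below) and is not a general
! case-conversion word.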
[ HEX: 20 + ] when ] map ; + +: replace ( seq old new -- newseq ) + swap rot [ 2dup = [ drop over ] when ] map 2nip ; + +: process-names ( data -- names-hash ) + 1 swap (process-data) + [ ascii-lower CHAR: \s CHAR: - replace swap ] assoc-map + >hashtable ; + +: multihex ( hexstring -- string ) + " " split [ hex> ] map [ ] subset ; + +TUPLE: code-point lower title upper ; + +C: code-point + +: set-code-point ( seq -- ) + 4 head [ multihex ] map first4 + swap first set ; + +: load-special-casing ( -- special-casing ) + "extra/unicode/SpecialCasing.txt" resource-path data + [ length 5 = ] subset + [ [ set-code-point ] each ] H{ } make-assoc ; + +DEFER: simple-lower +DEFER: simple-upper +DEFER: simple-title +DEFER: canonical-map +DEFER: combine-map +DEFER: class-map +DEFER: compat-map +DEFER: category-map +DEFER: name-map +DEFER: special-casing + +<< + load-data + dup process-names \ name-map define-value + 13 over process-data \ simple-lower define-value + 12 over process-data tuck \ simple-upper define-value + 14 over process-data swapd union \ simple-title define-value + dup process-combining \ class-map define-value + dup process-canonical \ canonical-map define-value + \ combine-map define-value + dup process-compat \ compat-map define-value + process-category \ category-map define-value + load-special-casing \ special-casing define-value +>> diff --git a/extra/unicode/normalize/normalize.factor b/extra/unicode/normalize/normalize.factor new file mode 100644 index 0000000000..44c0b194b6 --- /dev/null +++ b/extra/unicode/normalize/normalize.factor @@ -0,0 +1,176 @@ +USING: sequences namespaces unicode.load kernel combinators.lib math +unicode arrays ; +IN: unicode.normalize + +! Utility word +: make* ( seq quot exemplar -- newseq ) + ! quot has access to original seq on stack + ! this just makes the new-resizable the same length as seq + [ + [ + pick length swap new-resizable + [ building set call ] keep + ] keep like + ] with-scope ; inline + +! Conjoining Jamo behavior + +: hangul-base HEX: ac00 ; inline +: hangul-end HEX: D7AF ; inline +: initial-base HEX: 1100 ; inline +: medial-base HEX: 1161 ; inline +: final-base HEX: 11a7 ; inline + +: initial-count 19 ; inline +: medial-count 21 ; inline +: final-count 28 ; inline + +: hangul? ( ch -- ? ) hangul-base hangul-end ?between? ; +: jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ; + +! These numbers come from UAX 29 +: initial? ( ch -- ? ) + [ HEX: 1100 HEX: 1159 ?between? ] [ HEX: 115F = ] either ; +: medial? ( ch -- ? ) HEX: 1160 HEX: 11A2 ?between? ; +: final? ( ch -- ? ) HEX: 11A8 HEX: 11F9 ?between? ; + +: hangul>jamo ( hangul -- jamo-string ) + hangul-base - final-count /mod final-base + + >r medial-count /mod medial-base + + >r initial-base + r> r> + dup zero? [ drop 2array ] [ 3array ] if ; + +: jamo>hangul ( initial medial final -- hangul ) + >r >r initial-base - medial-count * + r> medial-base - + final-count * + r> final-base - + hangul-base + ; + +! Normalization -- Decomposition + +: (insert) ( seq n quot -- ) + over 0 = [ 3drop ] [ + [ >r dup 1- rot [ nth ] curry 2apply r> 2apply > ] 3keep + roll [ 3drop ] + [ >r [ dup 1- rot exchange ] 2keep 1- r> (insert) ] if + ] if ; inline + +: insert ( seq quot elt n -- ) + swap rot >r -rot [ swap set-nth ] 2keep r> (insert) ; inline + +: insertion-sort ( seq quot -- ) + ! quot is a transformation on elements + over dup length + [ >r >r 2dup r> r> insert ] 2each 2drop ; inline + +: reorder-slice ( string start -- slice done? ) + 2dup swap [ non-starter? 
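! Scan forward from the start index for the next starter, i.e. a character
! with no class-map entry (combining class 0); the run of combining marks
! before it is the slice that reorder-next insertion-sorts by combining
! class to restore canonical ordering.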
not ] find* drop + [ [ over length ] unless* rot ] keep not ; + +: reorder-next ( string i -- new-i done? ) + over [ non-starter? ] find* drop [ + reorder-slice + >r dup [ combining-class ] insertion-sort slice-to r> + ] [ length t ] if* ; + +: reorder-loop ( string start -- ) + dupd reorder-next [ 2drop ] [ reorder-loop ] if ; + +: reorder ( string -- ) + 0 reorder-loop ; + +: reorder-back ( string i -- ) + over [ non-starter? not ] find-last* 1+* reorder-next 2drop ; + +: decompose ( string quot -- decomposed ) + ! When there are 8 and 32-bit strings, this'll be + ! equivalent to clone on 8 and the contents of the last + ! main quotation on 32. + over [ 127 < ] all? [ drop ] [ + swap [ [ + dup hangul? [ hangul>jamo % drop ] + [ dup rot call [ % ] [ , ] ?if ] if + ] curry* each ] "" make* + dup reorder + ] if ; inline + +: nfd ( string -- string ) + [ canonical-entry ] decompose ; + +: nfkd ( string -- string ) + [ compat-entry ] decompose ; + +: string-append ( s1 s2 -- string ) + ! This could be more optimized, + ! but in practice, it'll almost always just be append + [ append ] keep + 0 over ?nth non-starter? + [ length dupd reorder-back ] [ drop ] if ; + +! Normalization -- Composition +SYMBOL: main-str +SYMBOL: ind +SYMBOL: after +SYMBOL: char + +: get-str ( i -- ch ) ind get + main-str get ?nth ; +: current ( -- ch ) 0 get-str ; +: to ( -- ) ind inc ; + +: initial-medial? ( -- ? ) + current initial? [ 1 get-str medial? ] [ f ] if ; + +: --final? ( -- ? ) + 2 get-str final? ; + +: imf, ( -- ) + current to current to current jamo>hangul , ; + +: im, ( -- ) + current to current 0 jamo>hangul , ; + +: compose-jamo ( -- ) + initial-medial? [ + --final? [ imf, ] [ im, ] if + ] when to current jamo? [ compose-jamo ] when ; + +: pass-combining ( -- ) + current non-starter? [ current , to pass-combining ] when ; + +: try-compose ( last-class char current-class -- ) + swapd = [ after get push ] [ + char get over combine-chars + [ nip char set ] [ after get push ] if* + ] if ; + +: compose-iter ( n -- ) + current [ + dup combining-class dup + [ [ try-compose ] keep to compose-iter ] [ 3drop ] if + ] [ drop ] if* ; + +: ?new-after ( -- ) + after [ dup empty? [ drop SBUF" " clone ] unless ] change ; + +: (compose) ( -- ) + current [ + dup jamo? [ drop compose-jamo ] [ + char set to ?new-after + 0 compose-iter + char get , after get % + to + ] if (compose) + ] when* ; + +: compose ( str -- comp ) + [ + main-str set + 0 ind set + SBUF" " clone after set + pass-combining (compose) + ] "" make* ; + +: nfc ( string -- nfc ) + nfd compose ; + +: nfkc ( string -- nfkc ) + nfkc compose ; diff --git a/extra/unicode/syntax/syntax.factor b/extra/unicode/syntax/syntax.factor new file mode 100644 index 0000000000..c2acf00212 --- /dev/null +++ b/extra/unicode/syntax/syntax.factor @@ -0,0 +1,56 @@ +USING: unicode.load kernel math sequences parser bit-arrays namespaces +sequences.private arrays quotations classes.predicate ; +IN: unicode.syntax + +! Character classes (categories) + +: category# ( char -- category ) + ! There are a few characters that should be Cn + ! that this gives Cf or Mn + ! Cf = 26; Mn = 5; Cn = 29 + ! Use a compressed array instead? + dup category-map ?nth [ ] [ + dup HEX: E0001 HEX: E007F between? + [ drop 26 ] [ + HEX: E0100 HEX: E01EF between? 5 29 ? + ] if + ] ?if ; + +: category ( char -- category ) + category# categories nth ; + +: >category-array ( categories -- bitarray ) + categories [ swap member? 
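! One flag per general category (30 in all): t when that category name was
! listed after CATEGORY:. [category] compiles the flags into a bit-array
! indexed by category#, so the generated predicate is a table lookup, with
! member? as a fallback for any literal characters mixed into the list.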
] curry* map >bit-array ; + +: as-string ( strings -- bit-array ) + concat "\"" tuck 3append parse first ; + +: [category] ( categories -- quot ) + [ + [ [ categories member? not ] subset as-string ] keep + [ categories member? ] subset >category-array + [ dup category# ] % , [ nth-unsafe [ drop t ] ] % + \ member? 2array >quotation , + \ if , + ] [ ] make ; + +: define-category ( word categories -- ) + [category] fixnum -rot define-predicate-class ; + +: CATEGORY: + CREATE ";" parse-tokens define-category ; parsing + +: seq-minus ( seq1 seq2 -- diff ) + [ member? not ] curry subset ; + +: CATEGORY-NOT: + CREATE ";" parse-tokens + categories swap seq-minus define-category ; parsing + +TUPLE: code-point lower title upper ; + +C: code-point + +: set-code-point ( seq -- ) + 4 head [ multihex ] map first4 + swap first set ; diff --git a/extra/unicode/unicode.factor b/extra/unicode/unicode.factor index bac768b84c..6b5f2ac5f4 100644 --- a/extra/unicode/unicode.factor +++ b/extra/unicode/unicode.factor @@ -1,188 +1,13 @@ -USING: kernel hashtables sequences io arrays math - hash2 namespaces strings assocs words splitting sequences.next - byte-arrays quotations sequences.private io.files bit-arrays - combinators math.parser io.streams.lines parser classes - classes.predicate ; +USING: unicode.syntax hash2 assocs unicode.load kernel parser ; IN: unicode -! Convenience functions -: >set ( seq -- hash ) - [ dup ] H{ } map>assoc ; - -: either ( object first second -- ? ) - >r over slip swap [ r> drop ] [ r> call ] ?if ; inline - -: 1+* ( n/f _ -- n+1 ) - drop [ 1+ ] [ 0 ] if* ; - -: define-value ( value word -- ) - swap 1quotation define-compound ; - -: ?between? ( n/f from to -- ? ) - pick [ between? ] [ 3drop f ] if ; - -: range ( from to -- seq ) - 1+ over - [ + ] curry* map ; - -! Loading data from UnicodeData.txt - -: data ( filename -- data ) - lines [ ";" split ] map ; - -: load-data ( -- data ) - "extra/unicode/UnicodeData.txt" resource-path data ; - -: (process-data) ( index data -- newdata ) - [ [ nth ] keep first swap 2array ] curry* map - [ second empty? not ] subset - [ >r hex> r> ] assoc-map ; - -: process-data ( index data -- hash ) - (process-data) [ hex> ] assoc-map >hashtable ; - -: (chain-decomposed) ( hash value -- newvalue ) - [ - 2dup swap at - [ (chain-decomposed) ] [ 1array nip ] ?if - ] curry* map concat ; - -: chain-decomposed ( hash -- newhash ) - dup [ swap (chain-decomposed) ] curry assoc-map ; - -: first* ( seq -- ? ) - second [ empty? ] [ first ] either ; - -: (process-decomposed) ( data -- alist ) - 5 swap (process-data) - [ " " split [ hex> ] map ] assoc-map ; - -: process-canonical ( data -- hash2 hash ) - (process-decomposed) [ first* ] subset - [ - [ second length 2 = ] subset - ! using 1009 as the size, the maximum load is 4 - [ first2 first2 rot 3array ] map 1009 alist>hash2 - ] keep - >hashtable chain-decomposed ; - -: process-compat ( data -- hash ) - (process-decomposed) - [ dup first* [ first2 1 tail 2array ] unless ] map - >hashtable chain-decomposed ; - -: process-combining ( data -- hash ) - 3 swap (process-data) - [ string>number ] assoc-map - [ nip 0 = not ] assoc-subset - >hashtable ; - -: categories ( -- names ) - ! For non-existent characters, use Cn - { "Lu" "Ll" "Lt" "Lm" "Lo" - "Mn" "Mc" "Me" - "Nd" "Nl" "No" - "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po" - "Sm" "Sc" "Sk" "So" - "Zs" "Zl" "Zp" - "Cc" "Cf" "Cs" "Co" "Cn" } ; - -: unicode-chars HEX: 2FA1E ; -! 
the maximum unicode char in the first 3 planes - -: process-category ( data -- category-listing ) - 2 swap (process-data) - unicode-chars swap dupd swap [ - >r over unicode-chars >= [ r> 3drop ] - [ categories index swap r> set-nth ] if - ] curry assoc-each ; - -: ascii-lower ( string -- lower ) - [ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ; - -: replace ( seq old new -- newseq ) - swap rot [ 2dup = [ drop over ] when ] map 2nip ; - -: process-names ( data -- names-hash ) - 1 swap (process-data) - [ ascii-lower CHAR: \s CHAR: - replace swap ] assoc-map - >hashtable ; - -DEFER: simple-lower -DEFER: simple-upper -DEFER: simple-title -DEFER: canonical-map -DEFER: combine-map -DEFER: class-map -DEFER: compat-map -DEFER: category-map -DEFER: name-map - -: load-tables ( -- ) - load-data - dup process-names \ name-map define-value - 13 over process-data \ simple-lower define-value - 12 over process-data tuck \ simple-upper define-value - 14 over process-data swapd union \ simple-title define-value - dup process-combining \ class-map define-value - dup process-canonical \ canonical-map define-value - \ combine-map define-value - dup process-compat \ compat-map define-value - process-category \ category-map define-value ; parsing -load-tables - : canonical-entry ( char -- seq ) canonical-map at ; : combine-chars ( a b -- char/f ) combine-map hash2 ; : compat-entry ( char -- seq ) compat-map at ; : combining-class ( char -- n ) class-map at ; : non-starter? ( char -- ? ) class-map key? ; : name>char ( string -- char ) name-map at ; - -: UNICHAR: - scan name>char [ parsed ] [ "Invalid character" throw ] if* ; parsing - -! Character classes (categories) - -: category# ( char -- category ) - ! There are a few characters that should be Cn - ! that this gives Cf or Mn - ! Cf = 26; Mn = 5; Cn = 29 - dup category-map ?nth [ ] [ - dup HEX: E0001 HEX: E007F between? - [ drop 26 ] [ - HEX: E0100 HEX: E01EF between? 5 29 ? - ] if - ] ?if ; - -: category ( char -- category ) - category# categories nth ; - -: >category-array ( categories -- bitarray ) - categories [ swap member? ] curry* map >bit-array ; - -: as-string ( strings -- bit-array ) - concat "\"" tuck 3append parse first ; - -: [category] ( categories -- quot ) - [ - [ [ categories member? not ] subset as-string ] keep - [ categories member? ] subset >category-array - [ dup category# ] % , [ nth-unsafe [ drop t ] ] % - \ member? 2array >quotation , - \ if , - ] [ ] make ; - -: define-category ( word categories -- ) - [category] fixnum -rot define-predicate-class ; - -: CATEGORY: - CREATE ";" parse-tokens define-category ; parsing - -: seq-minus ( seq1 seq2 -- diff ) - [ member? not ] curry subset ; - -: CATEGORY-NOT: - CREATE ";" parse-tokens - categories swap seq-minus define-category ; parsing +: char>name ( char -- string ) name-map value-at ; CATEGORY: blank Zs Zl Zp ; CATEGORY: letter Ll ; @@ -195,309 +20,6 @@ CATEGORY: control Cc ; CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ; CATEGORY-NOT: character Cn ; -! Utility word -: make* ( seq quot exemplar -- newseq ) - ! quot has access to original seq on stack - ! this just makes the new-resizable the same length as seq - [ - [ - pick length swap new-resizable - [ building set call ] keep - ] keep like - ] with-scope ; inline - -! 
Case mapping - -: hash-default ( key hash -- value/key ) - dupd at [ nip ] when* ; - -: ch>lower ( ch -- lower ) simple-lower hash-default ; -: ch>upper ( ch -- upper ) simple-upper hash-default ; -: ch>title ( ch -- title ) simple-title hash-default ; - -: load-special-data ( -- data ) - "extra/unicode/SpecialCasing.txt" resource-path data - [ length 5 = ] subset ; - -: multihex ( hexstring -- string ) - " " split [ hex> ] map [ ] subset ; - -TUPLE: code-point lower title upper ; - -C: code-point - -: set-code-point ( seq -- ) - 4 head [ multihex ] map first4 - swap first set ; - -DEFER: special-casing -: load-special-casing - load-special-data [ [ set-code-point ] each ] H{ } make-assoc - \ special-casing define-value ; parsing -load-special-casing - -SYMBOL: locale ! Just casing locale, or overall? - -: i-dot? ( -- ? ) - locale get { "tr" "az" } member? ; - -: lithuanian? ( -- ? ) locale get "lt" = ; - -: dot-over ( -- ch ) CHAR: \u0307 ; - -: lithuanian-ch>upper ( ? next ch -- ? ) - rot [ 2drop f ] - [ swap dot-over = over "ij" member? and swap , ] if ; - -: lithuanian>upper ( string -- lower ) - [ f swap [ lithuanian-ch>upper ] each-next drop ] "" make* ; - -: mark-above? ( ch -- ? ) - combining-class 230 = ; - -: lithuanian-ch>lower ( next ch -- ) - ! This fails to add a dot above in certain edge cases - ! where there is a non-above combining mark before an above one - ! in Lithuanian - dup , "IJ" member? swap mark-above? and [ dot-over , ] when ; - -: lithuanian>lower ( string -- lower ) - [ [ lithuanian-ch>lower ] each-next ] "" make* ; - -: turk-ch>upper ( ch -- ) - dup CHAR: i = - [ drop CHAR: I , dot-over , ] [ , ] if ; - -: turk>upper ( string -- upper-i ) - [ [ turk-ch>upper ] each ] "" make* ; - -: turk-ch>lower ( ? next ch -- ? ) - { - { [ rot ] [ 2drop f ] } - { [ dup CHAR: I = ] [ - drop dot-over = - dup CHAR: i CHAR: \u0131 ? , - ] } - { [ t ] [ , drop f ] } - } cond ; - -: turk>lower ( string -- lower-i ) - [ f swap [ turk-ch>lower ] each-next drop ] "" make* ; - -: word-boundary ( prev char -- new ? ) - dup non-starter? [ drop dup ] when - swap uncased? ; - -: sigma-map ( string -- string ) - [ - swap [ uncased? ] keep not or - [ drop HEX: 3C2 ] when - ] map-next ; - -: final-sigma ( string -- string ) - HEX: 3A3 over member? [ sigma-map ] when ; - -: map-case ( string string-quot char-quot -- case ) - [ - rot [ - -rot [ - rot dup special-casing at - [ -rot drop call % ] - [ -rot nip call , ] ?if - ] 2keep - ] each 2drop - ] "" make* ; inline - -: >lower ( string -- lower ) - i-dot? [ turk>lower ] when - final-sigma [ code-point-lower ] [ ch>lower ] map-case ; - -: >upper ( string -- upper ) - i-dot? [ turk>upper ] when - [ code-point-upper ] [ ch>upper ] map-case ; - -: >title ( string -- title ) - final-sigma - CHAR: \s swap - [ tuck word-boundary swapd - [ code-point-title ] [ code-point-lower ] if ] - [ tuck word-boundary swapd - [ ch>title ] [ ch>lower ] if ] - map-case nip ; - -: >case-fold ( string -- fold ) - >upper >lower ; - -: insensitive= ( str1 str2 -- ? ) - [ >case-fold ] 2apply = ; - -: lower? ( string -- ? ) - dup >lower = ; -: upper? ( string -- ? ) - dup >lower = ; -: title? ( string -- ? ) - dup >title = ; -: case-fold? ( string -- ? ) - dup >case-fold = ; - -! Conjoining Jamo behavior - -: hangul-base HEX: ac00 ; inline -: hangul-end HEX: D7AF ; inline -: initial-base HEX: 1100 ; inline -: medial-base HEX: 1161 ; inline -: final-base HEX: 11a7 ; inline - -: initial-count 19 ; inline -: medial-count 21 ; inline -: final-count 28 ; inline - -: hangul? 
( ch -- ? ) hangul-base hangul-end ?between? ; -: jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ; - -! These numbers come from UAX 29 -: initial? ( ch -- ? ) - [ HEX: 1100 HEX: 1159 ?between? ] [ HEX: 115F = ] either ; -: medial? ( ch -- ? ) HEX: 1160 HEX: 11A2 ?between? ; -: final? ( ch -- ? ) HEX: 11A8 HEX: 11F9 ?between? ; - -: hangul>jamo ( hangul -- jamo-string ) - hangul-base - final-count /mod final-base + - >r medial-count /mod medial-base + - >r initial-base + r> r> - dup zero? [ drop 2array ] [ 3array ] if ; - -: jamo>hangul ( initial medial final -- hangul ) - >r >r initial-base - medial-count * - r> medial-base - + final-count * - r> final-base - + hangul-base + ; - -! Normalization -- Decomposition - -: (insert) ( seq n quot -- ) - over 0 = [ 3drop ] [ - [ >r dup 1- rot [ nth ] curry 2apply r> 2apply > ] 3keep - roll [ 3drop ] - [ >r [ dup 1- rot exchange ] 2keep 1- r> (insert) ] if - ] if ; inline - -: insert ( seq quot elt n -- ) - swap rot >r -rot [ swap set-nth ] 2keep r> (insert) ; inline - -: insertion-sort ( seq quot -- ) - ! quot is a transformation on elements - over dup length - [ >r >r 2dup r> r> insert ] 2each 2drop ; inline - -: reorder-slice ( string start -- slice done? ) - 2dup swap [ non-starter? not ] find* drop - [ [ over length ] unless* rot ] keep not ; - -: reorder-next ( string i -- new-i done? ) - over [ non-starter? ] find* drop [ - reorder-slice - >r dup [ combining-class ] insertion-sort slice-to r> - ] [ length t ] if* ; - -: reorder-loop ( string start -- ) - dupd reorder-next [ 2drop ] [ reorder-loop ] if ; - -: reorder ( string -- ) - 0 reorder-loop ; - -: reorder-back ( string i -- ) - over [ non-starter? not ] find-last* 1+* reorder-next 2drop ; - -: decompose ( string quot -- decomposed ) - ! When there are 8 and 32-bit strings, this'll be - ! equivalent to clone on 8 and the contents of the last - ! main quotation on 32. - over [ 127 < ] all? [ drop ] [ - swap [ [ - dup hangul? [ hangul>jamo % drop ] - [ dup rot call [ % ] [ , ] ?if ] if - ] curry* each ] "" make* - dup reorder - ] if ; inline - -: nfd ( string -- string ) - [ canonical-entry ] decompose ; - -: nfkd ( string -- string ) - [ compat-entry ] decompose ; - -: string-append ( s1 s2 -- string ) - ! This could be more optimized, - ! but in practice, it'll almost always just be append - [ append ] keep - 0 over ?nth non-starter? - [ length dupd reorder-back ] [ drop ] if ; - -! Normalization -- Composition -SYMBOL: main-str -SYMBOL: ind -SYMBOL: after -SYMBOL: char - -: get-str ( i -- ch ) ind get + main-str get ?nth ; -: current ( -- ch ) 0 get-str ; -: to ( -- ) ind inc ; - -: initial-medial? ( -- ? ) - current initial? [ 1 get-str medial? ] [ f ] if ; - -: --final? ( -- ? ) - 2 get-str final? ; - -: imf, ( -- ) - current to current to current jamo>hangul , ; - -: im, ( -- ) - current to current 0 jamo>hangul , ; - -: compose-jamo ( -- ) - initial-medial? [ - --final? [ imf, ] [ im, ] if - ] when to current jamo? [ compose-jamo ] when ; - -: pass-combining ( -- ) - current non-starter? [ current , to pass-combining ] when ; - -: try-compose ( last-class char current-class -- ) - swapd = [ after get push ] [ - char get over combine-chars - [ nip char set ] [ after get push ] if* - ] if ; - -: compose-iter ( n -- ) - current [ - dup combining-class dup - [ [ try-compose ] keep to compose-iter ] [ 3drop ] if - ] [ drop ] if* ; - -: ?new-after ( -- ) - after [ dup empty? [ drop SBUF" " clone ] unless ] change ; - -: (compose) ( -- ) - current [ - dup jamo? 
[ drop compose-jamo ] [ - char set to ?new-after - 0 compose-iter - char get , after get % - to - ] if (compose) - ] when* ; - -: compose ( str -- comp ) - [ - main-str set - 0 ind set - SBUF" " clone after set - pass-combining (compose) - ] "" make* ; - -: nfc ( string -- nfc ) - nfd compose ; - -: nfkc ( string -- nfkc ) - nfkc compose ; +: UNICHAR: + ! This should be part of CHAR: + scan name>char [ parsed ] [ "Invalid character" throw ] if* ; parsing
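A rough usage sketch of the reorganized vocabularies, assuming 2008-era Factor syntax and that the UnicodeData.txt/SpecialCasing.txt tables load as above; the sample strings and results are illustrative, not part of the patch:

    USING: namespaces unicode unicode.case unicode.normalize ;
    "hello world" >upper .   ! "HELLO WORLD", via the simple-upper table
    "tr" locale set
    "istanbul" >upper .      ! the i becomes I followed by U+0307 (turk>upper)
    CHAR: a letter? .        ! t -- letter is the Ll-only predicate from CATEGORY:
    ! nfc/nfd map Hangul through jamo>hangul, which computes
    ! hangul-base + ((initial * 21) + medial) * 28 + final, each index taken
    ! relative to its jamo base; e.g. U+1100 U+1161 U+11A8 compose to U+AC01.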