commit 3546f5150d
@@ -24,3 +24,8 @@ IN: io.crc32
    >r HEX: ffffffff dup r> [ (crc32) ] each bitxor ;

: file-crc32 ( path -- n ) file-contents crc32 ;

: lines-crc32 ( seq -- n )
    HEX: ffffffff tuck [
        [ (crc32) ] each CHAR: \n (crc32)
    ] reduce bitxor ;
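The new lines-crc32 word folds a newline into the checksum after each line, so checksumming a sequence of lines should agree with checksumming the equivalent text that has a trailing newline per line. A hedged sketch of that relationship (not part of the commit; assumes io.crc32 and tools.test are loaded):

USING: io.crc32 tools.test ;
! Each line is followed by CHAR: \n in the fold, so { "a" "b" }
! hashes like the text "a\nb\n".
[ t ] [ { "a" "b" } lines-crc32 "a\nb\n" crc32 = ] unit-test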
@@ -33,8 +33,8 @@ uses definitions ;
    dup source-file-path ?resource-path file-modified
    swap set-source-file-modified ;

: record-checksum ( contents source-file -- )
    >r crc32 r> set-source-file-checksum ;
: record-checksum ( lines source-file -- )
    swap lines-crc32 swap set-source-file-checksum ;

: (xref-source) ( source-file -- pathname uses )
    dup source-file-path <pathname> swap source-file-uses

@@ -68,7 +68,7 @@ uses definitions ;
: reset-checksums ( -- )
    source-files get [
        swap ?resource-path dup exists?
        [ file-contents record-checksum ] [ 2drop ] if
        [ file-lines record-checksum ] [ 2drop ] if
    ] assoc-each ;

M: pathname where pathname-string 1 2array ;
@@ -0,0 +1 @@
Non-core assoc words
@@ -37,9 +37,9 @@ HINTS: do-line vector string ;
    ] with-stream ;

: reverse-complement-main ( -- )
    "reverse-complement-in.txt"
    "reverse-complement-out.txt"
    [ home swap path+ ] 2apply
    "extra/benchmark/reverse-complement/reverse-complement-test-in.txt"
    "extra/benchmark/reverse-complement/reverse-complement-test-out.txt"
    [ resource-path ] 2apply
    reverse-complement ;

MAIN: reverse-complement-main
@@ -166,3 +166,6 @@ MACRO: construct-slots ( assoc tuple-class -- tuple )
    [ construct-empty ] curry swap [
        [ dip ] curry swap 1quotation [ keep ] curry compose
    ] { } assoc>map concat compose ;

: either ( object first second -- ? )
    >r over slip swap [ r> drop ] [ r> call ] ?if ; inline
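The new either combinator applies the first quotation to the object and, if that yields f, falls back to applying the second; it is used later in this commit as [ empty? ] [ first ] either and in the jamo range checks. A hedged illustration, not from the commit itself:

! 5 is not greater than 10, so the second test runs; 5 is odd, so t.
5 [ 10 > ] [ odd? ] either .    ! prints t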
@@ -1,7 +1,6 @@
! Copyright (C) 2007 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: parser generic kernel classes words slots io definitions
sequences sequences.private assocs prettyprint.sections arrays ;
USING: parser generic kernel classes words slots assocs sequences arrays ;
IN: delegate

: define-protocol ( wordlist protocol -- )

@@ -37,31 +36,6 @@ M: tuple-class group-words
: CONSULT:
    scan-word scan-word parse-definition swapd define-consult ; parsing

PROTOCOL: sequence-protocol
    clone clone-like like new new-resizable nth nth-unsafe
    set-nth set-nth-unsafe length set-length lengthen ;

PROTOCOL: assoc-protocol
    at* assoc-size >alist set-at assoc-clone-like
    delete-at clear-assoc new-assoc assoc-like ;
    ! assoc-find excluded because GENERIC# 1
    ! everything should work, just slower (with >alist)

PROTOCOL: stream-protocol
    stream-close stream-read1 stream-read stream-read-until
    stream-flush stream-write1 stream-write stream-format
    stream-nl make-span-stream make-block-stream stream-readln
    make-cell-stream stream-write-table set-timeout ;

PROTOCOL: definition-protocol
    where set-where forget uses redefined*
    synopsis* definer definition ;

PROTOCOL: prettyprint-section-protocol
    section-fits? indent-section? unindent-first-line?
    newline-after? short-section? short-section long-section
    <section> delegate>block add-section ;

: define-mimic ( group mimicker mimicked -- )
    >r >r group-words r> r> [
        pick "methods" word-prop at dup
@@ -0,0 +1,32 @@
! Copyright (C) 2007 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: delegate sequences.private sequences assocs prettyprint.sections
io definitions kernel ;
IN: delegate.protocols

PROTOCOL: sequence-protocol
    clone clone-like like new new-resizable nth nth-unsafe
    set-nth set-nth-unsafe length set-length lengthen ;

PROTOCOL: assoc-protocol
    at* assoc-size >alist set-at assoc-clone-like
    delete-at clear-assoc new-assoc assoc-like ;
    ! assoc-find excluded because GENERIC# 1
    ! everything should work, just slower (with >alist)

PROTOCOL: stream-protocol
    stream-close stream-read1 stream-read stream-read-until
    stream-flush stream-write1 stream-write stream-format
    stream-nl make-span-stream make-block-stream stream-readln
    make-cell-stream stream-write-table set-timeout ;

PROTOCOL: definition-protocol
    where set-where forget uses redefined*
    synopsis* definer definition ;

PROTOCOL: prettyprint-section-protocol
    section-fits? indent-section? unindent-first-line?
    newline-after? short-section? short-section long-section
    <section> delegate>block add-section ;
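These protocols group the generic words a delegating class must answer. A hedged sketch of how one of them might be consulted with the CONSULT: word shown above (not part of the commit; the tuple and its slot are hypothetical):

USING: delegate delegate.protocols ;
! A wrapper that forwards all sequence-protocol generics to the
! sequence stored in its underlying slot.
TUPLE: wrapper underlying ;
CONSULT: sequence-protocol wrapper wrapper-underlying ;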
@@ -1,4 +1,5 @@
USING: hash2 help.syntax help.markup ;
USING: help.syntax help.markup ;
IN: hash2

ARTICLE: { "hash2" "intro" }
"The hash2 vocabulary specifies a simple minimal datastructure for hash tables with two integers as keys. These hash tables are fixed size and do not conform to the associative mapping protocol. Words used in creating and manipulating these hash tables include:"
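The article describes a fixed-size table keyed by pairs of integers. A hedged sketch of the two hash2 words that appear elsewhere in this commit, alist>hash2 and hash2 (not part of the diff; the sample keys, values, and size are made up):

USING: hash2 prettyprint ;
! Build a table from { key1 key2 value } triples, then look a pair up;
! hash2 yields the stored value, or f when the pair is absent.
{ { 1 2 "twelve" } { 3 4 "thirtyfour" } } 5 alist>hash2
1 2 rot hash2 .    ! expected to print "twelve"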
@@ -1 +0,0 @@
Eduardo Cavazos
@@ -1 +0,0 @@
Non-core hashtable words
@@ -1 +0,0 @@
collections
@@ -1,6 +1,7 @@
USING: unicode kernel math const combinators splitting
USING: unicode.categories kernel math const combinators splitting
sequences math.parser io.files io assocs arrays namespaces
;
combinators.lib assocs.lib math.ranges unicode.normalize
unicode.syntax unicode.data ;
IN: unicode.breaks

ENUM: Any L V T Extend Control CR LF graphemes ;

@@ -25,17 +26,14 @@ CATEGORY: grapheme-control Zl Zp Cc Cf ;
: process-other-extend ( lines -- set )
    [ "#" split1 drop ";" split1 drop trim-blank ] map
    [ empty? not ] subset
    [ ".." split1 [ dup ] unless* [ hex> ] 2apply range ] map
    [ ".." split1 [ dup ] unless* [ hex> ] 2apply [a,b] ] map
    concat >set ;

: other-extend-lines ( -- lines )
    "extra/unicode/PropList.txt" resource-path <file-reader> lines ;

DEFER: other-extend
: load-other-extend
    other-extend-lines process-other-extend
    \ other-extend define-value ; parsing
load-other-extend
<< other-extend-lines process-other-extend \ other-extend define-value >>

CATEGORY: (extend) Me Mn ;
: extend? ( ch -- ? )

@@ -81,11 +79,11 @@ SYMBOL: table
    graphemes Extend connect-after ;

DEFER: grapheme-table
: load-grapheme-table
<<
    init-grapheme-table table
    [ make-grapheme-table finish-table ] with-variable
    \ grapheme-table define-value ; parsing
load-grapheme-table
    \ grapheme-table define-value
>>

: grapheme-break? ( class1 class2 -- ? )
    grapheme-table nth nth not ;
@@ -0,0 +1,14 @@
USING: unicode.case tools.test namespaces ;

[ "Hello How Are You? I'M Good" ] [ "hEllo how ARE yOU? I'm good" >title ] unit-test
[ "FUSS" ] [ "Fu\u00DF" >upper ] unit-test
[ "\u03C3\u03C2" ] [ "\u03A3\u03A3" >lower ] unit-test
[ t ] [ "hello how are you?" lower? ] unit-test
[
    "tr" locale set
    [ "i\u0131i \u0131jj" ] [ "i\u0131I\u0307 IJj" >lower ] unit-test
    ! [ "I\u307\u0131i Ijj" ] [ "i\u0131I\u0307 IJj" >title ] unit-test
    [ "I\u0307II\u0307 IJJ" ] [ "i\u0131I\u0307 IJj" >upper ] unit-test
    "lt" locale set
    ! Lithuanian casing tests
] with-scope
@@ -0,0 +1,111 @@
USING: kernel unicode.data sequences sequences.next namespaces assocs.lib
unicode.normalize math unicode.categories combinators assocs ;
IN: unicode.case

: ch>lower ( ch -- lower ) simple-lower at-default ;
: ch>upper ( ch -- upper ) simple-upper at-default ;
: ch>title ( ch -- title ) simple-title at-default ;

SYMBOL: locale ! Just casing locale, or overall?

: i-dot? ( -- ? )
    locale get { "tr" "az" } member? ;

: lithuanian? ( -- ? ) locale get "lt" = ;

: dot-over ( -- ch ) CHAR: \u0307 ;

: lithuanian-ch>upper ( ? next ch -- ? )
    rot [ 2drop f ]
    [ swap dot-over = over "ij" member? and swap , ] if ;

: lithuanian>upper ( string -- lower )
    [ f swap [ lithuanian-ch>upper ] each-next drop ] "" make* ;

: mark-above? ( ch -- ? )
    combining-class 230 = ;

: lithuanian-ch>lower ( next ch -- )
    ! This fails to add a dot above in certain edge cases
    ! where there is a non-above combining mark before an above one
    ! in Lithuanian
    dup , "IJ" member? swap mark-above? and [ dot-over , ] when ;

: lithuanian>lower ( string -- lower )
    [ [ lithuanian-ch>lower ] each-next ] "" make* ;

: turk-ch>upper ( ch -- )
    dup CHAR: i =
    [ drop CHAR: I , dot-over , ] [ , ] if ;

: turk>upper ( string -- upper-i )
    [ [ turk-ch>upper ] each ] "" make* ;

: turk-ch>lower ( ? next ch -- ? )
    {
        { [ rot ] [ 2drop f ] }
        { [ dup CHAR: I = ] [
            drop dot-over =
            dup CHAR: i CHAR: \u0131 ? ,
        ] }
        { [ t ] [ , drop f ] }
    } cond ;

: turk>lower ( string -- lower-i )
    [ f swap [ turk-ch>lower ] each-next drop ] "" make* ;

: word-boundary ( prev char -- new ? )
    dup non-starter? [ drop dup ] when
    swap uncased? ;

: sigma-map ( string -- string )
    [
        swap [ uncased? ] keep not or
        [ drop HEX: 3C2 ] when
    ] map-next ;

: final-sigma ( string -- string )
    HEX: 3A3 over member? [ sigma-map ] when ;

: map-case ( string string-quot char-quot -- case )
    [
        rot [
            -rot [
                rot dup special-casing at
                [ -rot drop call % ]
                [ -rot nip call , ] ?if
            ] 2keep
        ] each 2drop
    ] "" make* ; inline

: >lower ( string -- lower )
    i-dot? [ turk>lower ] when
    final-sigma [ code-point-lower ] [ ch>lower ] map-case ;

: >upper ( string -- upper )
    i-dot? [ turk>upper ] when
    [ code-point-upper ] [ ch>upper ] map-case ;

: >title ( string -- title )
    final-sigma
    CHAR: \s swap
    [ tuck word-boundary swapd
        [ code-point-title ] [ code-point-lower ] if ]
    [ tuck word-boundary swapd
        [ ch>title ] [ ch>lower ] if ]
    map-case nip ;

: >case-fold ( string -- fold )
    >upper >lower ;

: insensitive= ( str1 str2 -- ? )
    [ >case-fold ] 2apply = ;

: lower? ( string -- ? )
    dup >lower = ;
: upper? ( string -- ? )
    dup >lower = ;
: title? ( string -- ? )
    dup >title = ;
: case-fold? ( string -- ? )
    dup >case-fold = ;
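>case-fold normalizes a string by upper- then lower-casing it, which makes insensitive= a full case folding rather than a plain ASCII comparison. A hedged example (not in the commit), reusing the sharp-s case from the test file above:

USING: unicode.case tools.test ;
! "Fu\u00DF" >upper is "FUSS" (per the test file), so both sides
! fold to "fuss" and compare equal.
[ t ] [ "Fu\u00DF" "FUSS" insensitive= ] unit-test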
@@ -0,0 +1,7 @@
USING: tools.test kernel unicode.categories words sequences unicode.syntax ;

[ { f f t t f t t f f t } ] [ CHAR: A {
    blank? letter? LETTER? Letter? digit?
    printable? alpha? control? uncased? character?
} [ execute ] curry* map ] unit-test
[ "Nd" ] [ CHAR: 3 category ] unit-test
@@ -0,0 +1,13 @@
USING: unicode.syntax ;
IN: unicode.categories

CATEGORY: blank Zs Zl Zp ;
CATEGORY: letter Ll ;
CATEGORY: LETTER Lu ;
CATEGORY: Letter Lu Ll Lt Lm Lo ;
CATEGORY: digit Nd Nl No ;
CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No ;
CATEGORY: control Cc ;
CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ;
CATEGORY-NOT: character Cn ;
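Each CATEGORY: line defines a predicate over the listed Unicode general categories, so digit? is true exactly for Nd, Nl and No characters. A hedged check (not in the commit), consistent with the "Nd" test in the previous file:

USING: unicode.categories tools.test ;
! CHAR: 3 has general category Nd, which digit? includes.
[ t ] [ CHAR: 3 digit? ] unit-test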
@@ -0,0 +1,149 @@
USING: assocs math kernel sequences io.files hashtables quotations
splitting arrays math.parser combinators.lib hash2 byte-arrays words
namespaces words ;
IN: unicode.data

! Convenience functions
: 1+* ( n/f _ -- n+1 )
    drop [ 1+ ] [ 0 ] if* ;

: define-value ( value word -- )
    swap 1quotation define ;

: ?between? ( n/f from to -- ? )
    pick [ between? ] [ 3drop f ] if ;

! Loading data from UnicodeData.txt

: data ( filename -- data )
    file-lines [ ";" split ] map ;

: load-data ( -- data )
    "extra/unicode/UnicodeData.txt" resource-path data ;

: (process-data) ( index data -- newdata )
    [ [ nth ] keep first swap 2array ] with map
    [ second empty? not ] subset
    [ >r hex> r> ] assoc-map ;

: process-data ( index data -- hash )
    (process-data) [ hex> ] assoc-map >hashtable ;

: (chain-decomposed) ( hash value -- newvalue )
    [
        2dup swap at
        [ (chain-decomposed) ] [ 1array nip ] ?if
    ] with map concat ;

: chain-decomposed ( hash -- newhash )
    dup [ swap (chain-decomposed) ] curry assoc-map ;

: first* ( seq -- ? )
    second [ empty? ] [ first ] either ;

: (process-decomposed) ( data -- alist )
    5 swap (process-data)
    [ " " split [ hex> ] map ] assoc-map ;

: process-canonical ( data -- hash2 hash )
    (process-decomposed) [ first* ] subset
    [
        [ second length 2 = ] subset
        ! using 1009 as the size, the maximum load is 4
        [ first2 first2 rot 3array ] map 1009 alist>hash2
    ] keep
    >hashtable chain-decomposed ;

: process-compat ( data -- hash )
    (process-decomposed)
    [ dup first* [ first2 1 tail 2array ] unless ] map
    >hashtable chain-decomposed ;

: process-combining ( data -- hash )
    3 swap (process-data)
    [ string>number ] assoc-map
    [ nip 0 = not ] assoc-subset
    >hashtable ;

: categories ( -- names )
    ! For non-existent characters, use Cn
    { "Lu" "Ll" "Lt" "Lm" "Lo"
      "Mn" "Mc" "Me"
      "Nd" "Nl" "No"
      "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
      "Sm" "Sc" "Sk" "So"
      "Zs" "Zl" "Zp"
      "Cc" "Cf" "Cs" "Co" "Cn" } ;

: unicode-chars HEX: 2FA1E ;
! the maximum unicode char in the first 3 planes

: process-category ( data -- category-listing )
    2 swap (process-data)
    unicode-chars <byte-array> swap dupd swap [
        >r over unicode-chars >= [ r> 3drop ]
        [ categories index swap r> set-nth ] if
    ] curry assoc-each ;

: ascii-lower ( string -- lower )
    [ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ;

: replace ( seq old new -- newseq )
    swap rot [ 2dup = [ drop over ] when ] map 2nip ;

: process-names ( data -- names-hash )
    1 swap (process-data)
    [ ascii-lower CHAR: \s CHAR: - replace swap ] assoc-map
    >hashtable ;

: multihex ( hexstring -- string )
    " " split [ hex> ] map [ ] subset ;

TUPLE: code-point lower title upper ;

C: <code-point> code-point

: set-code-point ( seq -- )
    4 head [ multihex ] map first4
    <code-point> swap first set ;

DEFER: simple-lower
DEFER: simple-upper
DEFER: simple-title
DEFER: canonical-map
DEFER: combine-map
DEFER: class-map
DEFER: compat-map
DEFER: category-map
DEFER: name-map

<<
load-data
dup process-names \ name-map define-value
13 over process-data \ simple-lower define-value
12 over process-data tuck \ simple-upper define-value
14 over process-data swapd union \ simple-title define-value
dup process-combining \ class-map define-value
dup process-canonical \ canonical-map define-value
\ combine-map define-value
dup process-compat \ compat-map define-value
process-category \ category-map define-value
>>

: canonical-entry ( char -- seq ) canonical-map at ;
: combine-chars ( a b -- char/f ) combine-map hash2 ;
: compat-entry ( char -- seq ) compat-map at ;
: combining-class ( char -- n ) class-map at ;
: non-starter? ( char -- ? ) class-map key? ;
: name>char ( string -- char ) name-map at ;
: char>name ( char -- string ) name-map value-at ;

! Special casing data
: load-special-casing ( -- special-casing )
    "extra/unicode/SpecialCasing.txt" resource-path data
    [ length 5 = ] subset
    [ [ set-code-point ] each ] H{ } make-assoc ;

DEFER: special-casing

<< load-special-casing \ special-casing define-value >>
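The load stage above builds the lookup tables at parse time; the words after it are thin accessors over them. A hedged illustration (not part of the commit) using U+00E9, e with acute, whose canonical decomposition in UnicodeData.txt is e followed by the combining acute accent U+0301:

USING: unicode.data sequences tools.test ;
! canonical-entry yields the decomposition sequence; combine-chars
! recombines a starter and a combining mark via the hash2 table.
[ t ] [ HEX: E9 canonical-entry >array { CHAR: e HEX: 301 } = ] unit-test
[ HEX: E9 ] [ CHAR: e HEX: 301 combine-chars ] unit-test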
@@ -0,0 +1,18 @@
USING: unicode.normalize kernel tools.test sequences ;

[ "ab\u0323\u0302cd" ] [ "ab\u0302" "\u0323cd" string-append ] unit-test

[ "ab\u064b\u034d\u034e\u0347\u0346" ] [ "ab\u0346\u0347\u064b\u034e\u034d" dup reorder ] unit-test
[ "hello" "hello" ] [ "hello" [ nfd ] keep nfkd ] unit-test
[ "\uFB012\u2075\u017F\u0323\u0307" "fi25s\u0323\u0307" ]
[ "\uFB012\u2075\u1E9B\u0323" [ nfd ] keep nfkd ] unit-test

[ "\u1E69" "s\u0323\u0307" ] [ "\u1E69" [ nfc ] keep nfd ] unit-test
[ "\u1E0D\u0307" ] [ "\u1E0B\u0323" nfc ] unit-test

[ 54620 ] [ 4370 4449 4523 jamo>hangul ] unit-test
[ 4370 4449 4523 ] [ 54620 hangul>jamo first3 ] unit-test
[ t ] [ 54620 hangul? ] unit-test
[ f ] [ 0 hangul? ] unit-test
[ "\u1112\u1161\u11ab" ] [ "\ud55c" nfd ] unit-test
[ "\ud55c" ] [ "\u1112\u1161\u11ab" nfc ] unit-test
@@ -0,0 +1,176 @@
USING: sequences namespaces unicode.data kernel combinators.lib math
unicode arrays ;
IN: unicode.normalize

! Utility word
: make* ( seq quot exemplar -- newseq )
    ! quot has access to original seq on stack
    ! this just makes the new-resizable the same length as seq
    [
        [
            pick length swap new-resizable
            [ building set call ] keep
        ] keep like
    ] with-scope ; inline

! Conjoining Jamo behavior

: hangul-base HEX: ac00 ; inline
: hangul-end HEX: D7AF ; inline
: initial-base HEX: 1100 ; inline
: medial-base HEX: 1161 ; inline
: final-base HEX: 11a7 ; inline

: initial-count 19 ; inline
: medial-count 21 ; inline
: final-count 28 ; inline

: hangul? ( ch -- ? ) hangul-base hangul-end ?between? ;
: jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ;

! These numbers come from UAX 29
: initial? ( ch -- ? )
    [ HEX: 1100 HEX: 1159 ?between? ] [ HEX: 115F = ] either ;
: medial? ( ch -- ? ) HEX: 1160 HEX: 11A2 ?between? ;
: final? ( ch -- ? ) HEX: 11A8 HEX: 11F9 ?between? ;

: hangul>jamo ( hangul -- jamo-string )
    hangul-base - final-count /mod final-base +
    >r medial-count /mod medial-base +
    >r initial-base + r> r>
    dup zero? [ drop 2array ] [ 3array ] if ;

: jamo>hangul ( initial medial final -- hangul )
    >r >r initial-base - medial-count *
    r> medial-base - + final-count *
    r> final-base - + hangul-base + ;

! Normalization -- Decomposition

: (insert) ( seq n quot -- )
    over 0 = [ 3drop ] [
        [ >r dup 1- rot [ nth ] curry 2apply r> 2apply > ] 3keep
        roll [ 3drop ]
        [ >r [ dup 1- rot exchange ] 2keep 1- r> (insert) ] if
    ] if ; inline

: insert ( seq quot elt n -- )
    swap rot >r -rot [ swap set-nth ] 2keep r> (insert) ; inline

: insertion-sort ( seq quot -- )
    ! quot is a transformation on elements
    over dup length
    [ >r >r 2dup r> r> insert ] 2each 2drop ; inline

: reorder-slice ( string start -- slice done? )
    2dup swap [ non-starter? not ] find* drop
    [ [ over length ] unless* rot <slice> ] keep not ;

: reorder-next ( string i -- new-i done? )
    over [ non-starter? ] find* drop [
        reorder-slice
        >r dup [ combining-class ] insertion-sort slice-to r>
    ] [ length t ] if* ;

: reorder-loop ( string start -- )
    dupd reorder-next [ 2drop ] [ reorder-loop ] if ;

: reorder ( string -- )
    0 reorder-loop ;

: reorder-back ( string i -- )
    over [ non-starter? not ] find-last* 1+* reorder-next 2drop ;

: decompose ( string quot -- decomposed )
    ! When there are 8 and 32-bit strings, this'll be
    ! equivalent to clone on 8 and the contents of the last
    ! main quotation on 32.
    over [ 127 < ] all? [ drop ] [
        swap [ [
            dup hangul? [ hangul>jamo % drop ]
            [ dup rot call [ % ] [ , ] ?if ] if
        ] curry* each ] "" make*
        dup reorder
    ] if ; inline

: nfd ( string -- string )
    [ canonical-entry ] decompose ;

: nfkd ( string -- string )
    [ compat-entry ] decompose ;

: string-append ( s1 s2 -- string )
    ! This could be more optimized,
    ! but in practice, it'll almost always just be append
    [ append ] keep
    0 over ?nth non-starter?
    [ length dupd reorder-back ] [ drop ] if ;

! Normalization -- Composition
SYMBOL: main-str
SYMBOL: ind
SYMBOL: after
SYMBOL: char

: get-str ( i -- ch ) ind get + main-str get ?nth ;
: current ( -- ch ) 0 get-str ;
: to ( -- ) ind inc ;

: initial-medial? ( -- ? )
    current initial? [ 1 get-str medial? ] [ f ] if ;

: --final? ( -- ? )
    2 get-str final? ;

: imf, ( -- )
    current to current to current jamo>hangul , ;

: im, ( -- )
    current to current 0 jamo>hangul , ;

: compose-jamo ( -- )
    initial-medial? [
        --final? [ imf, ] [ im, ] if
    ] when to current jamo? [ compose-jamo ] when ;

: pass-combining ( -- )
    current non-starter? [ current , to pass-combining ] when ;

: try-compose ( last-class char current-class -- )
    swapd = [ after get push ] [
        char get over combine-chars
        [ nip char set ] [ after get push ] if*
    ] if ;

: compose-iter ( n -- )
    current [
        dup combining-class dup
        [ [ try-compose ] keep to compose-iter ] [ 3drop ] if
    ] [ drop ] if* ;

: ?new-after ( -- )
    after [ dup empty? [ drop SBUF" " clone ] unless ] change ;

: (compose) ( -- )
    current [
        dup jamo? [ drop compose-jamo ] [
            char set to ?new-after
            0 compose-iter
            char get , after get %
            to
        ] if (compose)
    ] when* ;

: compose ( str -- comp )
    [
        main-str set
        0 ind set
        SBUF" " clone after set
        pass-combining (compose)
    ] "" make* ;

: nfc ( string -- nfc )
    nfd compose ;

: nfkc ( string -- nfkc )
    nfkc compose ;
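The make* utility at the top of this file runs its quotation with the original sequence still on the stack and collects whatever the quotation emits with , into a builder of the exemplar's class. A hedged illustration (not in the commit):

USING: unicode.normalize sequences namespaces kernel math tools.test ;
! The quotation consumes the input sequence and emits one incremented
! element per input element; the { } exemplar turns the result back
! into an array.
[ { 2 3 4 } ] [ { 1 2 3 } [ [ 1+ , ] each ] { } make* ] unit-test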
@@ -0,0 +1,4 @@
USING: unicode.syntax tools.test ;

[ CHAR: ! ] [ UNICHAR: exclamation-mark ] unit-test
! Write a test for CATEGORY and CATEGORY-NOT
@@ -0,0 +1,60 @@
USING: unicode.data kernel math sequences parser bit-arrays namespaces
sequences.private arrays quotations classes.predicate ;
IN: unicode.syntax

! Character classes (categories)

: category# ( char -- category )
    ! There are a few characters that should be Cn
    ! that this gives Cf or Mn
    ! Cf = 26; Mn = 5; Cn = 29
    ! Use a compressed array instead?
    dup category-map ?nth [ ] [
        dup HEX: E0001 HEX: E007F between?
        [ drop 26 ] [
            HEX: E0100 HEX: E01EF between? 5 29 ?
        ] if
    ] ?if ;

: category ( char -- category )
    category# categories nth ;

: >category-array ( categories -- bitarray )
    categories [ swap member? ] curry* map >bit-array ;

: as-string ( strings -- bit-array )
    concat "\"" tuck 3append parse first ;

: [category] ( categories -- quot )
    [
        [ [ categories member? not ] subset as-string ] keep
        [ categories member? ] subset >category-array
        [ dup category# ] % , [ nth-unsafe [ drop t ] ] %
        \ member? 2array >quotation ,
        \ if ,
    ] [ ] make ;

: define-category ( word categories -- )
    [category] fixnum -rot define-predicate-class ;

: CATEGORY:
    CREATE ";" parse-tokens define-category ; parsing

: seq-minus ( seq1 seq2 -- diff )
    [ member? not ] curry subset ;

: CATEGORY-NOT:
    CREATE ";" parse-tokens
    categories swap seq-minus define-category ; parsing

TUPLE: code-point lower title upper ;

C: <code-point> code-point

: set-code-point ( seq -- )
    4 head [ multihex ] map first4
    <code-point> swap first set ;

: UNICHAR:
    ! This should be part of CHAR:
    scan name>char [ parsed ] [ "Invalid character" throw ] if* ; parsing
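CATEGORY: is a parsing word: it reads category names (and optional literal characters) up to the ; and defines a predicate class whose instances are fixnums in those general categories. A hedged sketch of defining one outside this commit (the name is made up):

USING: unicode.syntax ;
! Characters whose general category is Sm (math symbols), e.g. CHAR: + ;
! CHAR: + math-symbol? would then yield t.
CATEGORY: math-symbol Sm ;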
@@ -1,37 +0,0 @@
USING: unicode kernel tools.test words sequences namespaces ;

[ "Hello How Are You? I'M Good" ] [ "hEllo how ARE yOU? I'm good" >title ] unit-test
[ "FUSS" ] [ "Fu\u00DF" >upper ] unit-test
[ "\u03C3\u03C2" ] [ "\u03A3\u03A3" >lower ] unit-test
[ t ] [ "hello how are you?" lower? ] unit-test
[
    "tr" locale set
    [ "i\u0131i \u0131jj" ] [ "i\u0131I\u0307 IJj" >lower ] unit-test
    ! [ "I\u307\u0131i Ijj" ] [ "i\u0131I\u0307 IJj" >title ] unit-test
    [ "I\u0307II\u0307 IJJ" ] [ "i\u0131I\u0307 IJj" >upper ] unit-test
    "lt" locale set
    ! Lithuanian casing tests
] with-scope

[ { f f t t f t t f f t } ] [ CHAR: A {
    blank? letter? LETTER? Letter? digit?
    printable? alpha? control? uncased? character?
} [ execute ] with map ] unit-test
[ "Nd" ] [ CHAR: 3 category ] unit-test
[ CHAR: ! ] [ UNICHAR: exclamation-mark ] unit-test
[ "ab\u0323\u0302cd" ] [ "ab\u0302" "\u0323cd" string-append ] unit-test

[ "ab\u064b\u034d\u034e\u0347\u0346" ] [ "ab\u0346\u0347\u064b\u034e\u034d" dup reorder ] unit-test
[ "hello" "hello" ] [ "hello" [ nfd ] keep nfkd ] unit-test
[ "\uFB012\u2075\u017F\u0323\u0307" "fi25s\u0323\u0307" ]
[ "\uFB012\u2075\u1E9B\u0323" [ nfd ] keep nfkd ] unit-test

[ "\u1E69" "s\u0323\u0307" ] [ "\u1E69" [ nfc ] keep nfd ] unit-test
[ "\u1E0D\u0307" ] [ "\u1E0B\u0323" nfc ] unit-test

[ 54620 ] [ 4370 4449 4523 jamo>hangul ] unit-test
[ 4370 4449 4523 ] [ 54620 hangul>jamo first3 ] unit-test
[ t ] [ 54620 hangul? ] unit-test
[ f ] [ 0 hangul? ] unit-test
[ "\u1112\u1161\u11ab" ] [ "\ud55c" nfd ] unit-test
[ "\ud55c" ] [ "\u1112\u1161\u11ab" nfc ] unit-test
@@ -1,503 +1,5 @@
USING: kernel hashtables sequences io arrays math
hash2 namespaces strings assocs words splitting sequences.next
byte-arrays quotations sequences.private io.files bit-arrays
combinators math.parser io.streams.lines parser classes
classes.predicate ;
USING: unicode.syntax unicode.data unicode.breaks unicode.normalize
unicode.case unicode.categories ;
IN: unicode

! Convenience functions
: >set ( seq -- hash )
    [ dup ] H{ } map>assoc ;

: either ( object first second -- ? )
    >r over slip swap [ r> drop ] [ r> call ] ?if ; inline

: 1+* ( n/f _ -- n+1 )
    drop [ 1+ ] [ 0 ] if* ;

: define-value ( value word -- )
    swap 1quotation define-compound ;

: ?between? ( n/f from to -- ? )
    pick [ between? ] [ 3drop f ] if ;

: range ( from to -- seq )
    1+ over - [ + ] with map ;

! Loading data from UnicodeData.txt

: data ( filename -- data )
    <file-reader> lines [ ";" split ] map ;

: load-data ( -- data )
    "extra/unicode/UnicodeData.txt" resource-path data ;

: (process-data) ( index data -- newdata )
    [ [ nth ] keep first swap 2array ] with map
    [ second empty? not ] subset
    [ >r hex> r> ] assoc-map ;

: process-data ( index data -- hash )
    (process-data) [ hex> ] assoc-map >hashtable ;

: (chain-decomposed) ( hash value -- newvalue )
    [
        2dup swap at
        [ (chain-decomposed) ] [ 1array nip ] ?if
    ] with map concat ;

: chain-decomposed ( hash -- newhash )
    dup [ swap (chain-decomposed) ] curry assoc-map ;

: first* ( seq -- ? )
    second [ empty? ] [ first ] either ;

: (process-decomposed) ( data -- alist )
    5 swap (process-data)
    [ " " split [ hex> ] map ] assoc-map ;

: process-canonical ( data -- hash2 hash )
    (process-decomposed) [ first* ] subset
    [
        [ second length 2 = ] subset
        ! using 1009 as the size, the maximum load is 4
        [ first2 first2 rot 3array ] map 1009 alist>hash2
    ] keep
    >hashtable chain-decomposed ;

: process-compat ( data -- hash )
    (process-decomposed)
    [ dup first* [ first2 1 tail 2array ] unless ] map
    >hashtable chain-decomposed ;

: process-combining ( data -- hash )
    3 swap (process-data)
    [ string>number ] assoc-map
    [ nip 0 = not ] assoc-subset
    >hashtable ;

: categories ( -- names )
    ! For non-existent characters, use Cn
    { "Lu" "Ll" "Lt" "Lm" "Lo"
      "Mn" "Mc" "Me"
      "Nd" "Nl" "No"
      "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
      "Sm" "Sc" "Sk" "So"
      "Zs" "Zl" "Zp"
      "Cc" "Cf" "Cs" "Co" "Cn" } ;

: unicode-chars HEX: 2FA1E ;
! the maximum unicode char in the first 3 planes

: process-category ( data -- category-listing )
    2 swap (process-data)
    unicode-chars <byte-array> swap dupd swap [
        >r over unicode-chars >= [ r> 3drop ]
        [ categories index swap r> set-nth ] if
    ] curry assoc-each ;

: ascii-lower ( string -- lower )
    [ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ;

: replace ( seq old new -- newseq )
    spin [ 2dup = [ drop over ] when ] map 2nip ;

: process-names ( data -- names-hash )
    1 swap (process-data)
    [ ascii-lower CHAR: \s CHAR: - replace swap ] assoc-map
    >hashtable ;

DEFER: simple-lower
DEFER: simple-upper
DEFER: simple-title
DEFER: canonical-map
DEFER: combine-map
DEFER: class-map
DEFER: compat-map
DEFER: category-map
DEFER: name-map

: load-tables ( -- )
    load-data
    dup process-names \ name-map define-value
    13 over process-data \ simple-lower define-value
    12 over process-data tuck \ simple-upper define-value
    14 over process-data swapd union \ simple-title define-value
    dup process-combining \ class-map define-value
    dup process-canonical \ canonical-map define-value
    \ combine-map define-value
    dup process-compat \ compat-map define-value
    process-category \ category-map define-value ; parsing
load-tables

: canonical-entry ( char -- seq ) canonical-map at ;
: combine-chars ( a b -- char/f ) combine-map hash2 ;
: compat-entry ( char -- seq ) compat-map at ;
: combining-class ( char -- n ) class-map at ;
: non-starter? ( char -- ? ) class-map key? ;
: name>char ( string -- char ) name-map at ;

: UNICHAR:
    scan name>char [ parsed ] [ "Invalid character" throw ] if* ; parsing

! Character classes (categories)

: category# ( char -- category )
    ! There are a few characters that should be Cn
    ! that this gives Cf or Mn
    ! Cf = 26; Mn = 5; Cn = 29
    dup category-map ?nth [ ] [
        dup HEX: E0001 HEX: E007F between?
        [ drop 26 ] [
            HEX: E0100 HEX: E01EF between? 5 29 ?
        ] if
    ] ?if ;

: category ( char -- category )
    category# categories nth ;

: >category-array ( categories -- bitarray )
    categories [ swap member? ] with map >bit-array ;

: as-string ( strings -- bit-array )
    concat "\"" tuck 3append parse first ;

: [category] ( categories -- quot )
    [
        [ [ categories member? not ] subset as-string ] keep
        [ categories member? ] subset >category-array
        [ dup category# ] % , [ nth-unsafe [ drop t ] ] %
        \ member? 2array >quotation ,
        \ if ,
    ] [ ] make ;

: define-category ( word categories -- )
    [category] fixnum -rot define-predicate-class ;

: CATEGORY:
    CREATE ";" parse-tokens define-category ; parsing

: seq-minus ( seq1 seq2 -- diff )
    [ member? not ] curry subset ;

: CATEGORY-NOT:
    CREATE ";" parse-tokens
    categories swap seq-minus define-category ; parsing

CATEGORY: blank Zs Zl Zp ;
CATEGORY: letter Ll ;
CATEGORY: LETTER Lu ;
CATEGORY: Letter Lu Ll Lt Lm Lo ;
CATEGORY: digit Nd Nl No ;
CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No ;
CATEGORY: control Cc ;
CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ;
CATEGORY-NOT: character Cn ;

! Utility word
: make* ( seq quot exemplar -- newseq )
    ! quot has access to original seq on stack
    ! this just makes the new-resizable the same length as seq
    [
        [
            pick length swap new-resizable
            [ building set call ] keep
        ] keep like
    ] with-scope ; inline

! Case mapping

: hash-default ( key hash -- value/key )
    dupd at [ nip ] when* ;

: ch>lower ( ch -- lower ) simple-lower hash-default ;
: ch>upper ( ch -- upper ) simple-upper hash-default ;
: ch>title ( ch -- title ) simple-title hash-default ;

: load-special-data ( -- data )
    "extra/unicode/SpecialCasing.txt" resource-path data
    [ length 5 = ] subset ;

: multihex ( hexstring -- string )
    " " split [ hex> ] map [ ] subset ;

TUPLE: code-point lower title upper ;

C: <code-point> code-point

: set-code-point ( seq -- )
    4 head [ multihex ] map first4
    <code-point> swap first set ;

DEFER: special-casing
: load-special-casing
    load-special-data [ [ set-code-point ] each ] H{ } make-assoc
    \ special-casing define-value ; parsing
load-special-casing

SYMBOL: locale ! Just casing locale, or overall?

: i-dot? ( -- ? )
    locale get { "tr" "az" } member? ;

: lithuanian? ( -- ? ) locale get "lt" = ;

: dot-over ( -- ch ) CHAR: \u0307 ;

: lithuanian-ch>upper ( ? next ch -- ? )
    rot [ 2drop f ]
    [ swap dot-over = over "ij" member? and swap , ] if ;

: lithuanian>upper ( string -- lower )
    [ f swap [ lithuanian-ch>upper ] each-next drop ] "" make* ;

: mark-above? ( ch -- ? )
    combining-class 230 = ;

: lithuanian-ch>lower ( next ch -- )
    ! This fails to add a dot above in certain edge cases
    ! where there is a non-above combining mark before an above one
    ! in Lithuanian
    dup , "IJ" member? swap mark-above? and [ dot-over , ] when ;

: lithuanian>lower ( string -- lower )
    [ [ lithuanian-ch>lower ] each-next ] "" make* ;

: turk-ch>upper ( ch -- )
    dup CHAR: i =
    [ drop CHAR: I , dot-over , ] [ , ] if ;

: turk>upper ( string -- upper-i )
    [ [ turk-ch>upper ] each ] "" make* ;

: turk-ch>lower ( ? next ch -- ? )
    {
        { [ rot ] [ 2drop f ] }
        { [ dup CHAR: I = ] [
            drop dot-over =
            dup CHAR: i CHAR: \u0131 ? ,
        ] }
        { [ t ] [ , drop f ] }
    } cond ;

: turk>lower ( string -- lower-i )
    [ f swap [ turk-ch>lower ] each-next drop ] "" make* ;

: word-boundary ( prev char -- new ? )
    dup non-starter? [ drop dup ] when
    swap uncased? ;

: sigma-map ( string -- string )
    [
        swap [ uncased? ] keep not or
        [ drop HEX: 3C2 ] when
    ] map-next ;

: final-sigma ( string -- string )
    HEX: 3A3 over member? [ sigma-map ] when ;

: map-case ( string string-quot char-quot -- case )
    [
        rot [
            -rot [
                rot dup special-casing at
                [ -rot drop call % ]
                [ -rot nip call , ] ?if
            ] 2keep
        ] each 2drop
    ] "" make* ; inline

: >lower ( string -- lower )
    i-dot? [ turk>lower ] when
    final-sigma [ code-point-lower ] [ ch>lower ] map-case ;

: >upper ( string -- upper )
    i-dot? [ turk>upper ] when
    [ code-point-upper ] [ ch>upper ] map-case ;

: >title ( string -- title )
    final-sigma
    CHAR: \s swap
    [ tuck word-boundary swapd
        [ code-point-title ] [ code-point-lower ] if ]
    [ tuck word-boundary swapd
        [ ch>title ] [ ch>lower ] if ]
    map-case nip ;

: >case-fold ( string -- fold )
    >upper >lower ;

: insensitive= ( str1 str2 -- ? )
    [ >case-fold ] 2apply = ;

: lower? ( string -- ? )
    dup >lower = ;
: upper? ( string -- ? )
    dup >lower = ;
: title? ( string -- ? )
    dup >title = ;
: case-fold? ( string -- ? )
    dup >case-fold = ;

! Conjoining Jamo behavior

: hangul-base HEX: ac00 ; inline
: hangul-end HEX: D7AF ; inline
: initial-base HEX: 1100 ; inline
: medial-base HEX: 1161 ; inline
: final-base HEX: 11a7 ; inline

: initial-count 19 ; inline
: medial-count 21 ; inline
: final-count 28 ; inline

: hangul? ( ch -- ? ) hangul-base hangul-end ?between? ;
: jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ;

! These numbers come from UAX 29
: initial? ( ch -- ? )
    [ HEX: 1100 HEX: 1159 ?between? ] [ HEX: 115F = ] either ;
: medial? ( ch -- ? ) HEX: 1160 HEX: 11A2 ?between? ;
: final? ( ch -- ? ) HEX: 11A8 HEX: 11F9 ?between? ;

: hangul>jamo ( hangul -- jamo-string )
    hangul-base - final-count /mod final-base +
    >r medial-count /mod medial-base +
    >r initial-base + r> r>
    dup zero? [ drop 2array ] [ 3array ] if ;

: jamo>hangul ( initial medial final -- hangul )
    >r >r initial-base - medial-count *
    r> medial-base - + final-count *
    r> final-base - + hangul-base + ;

! Normalization -- Decomposition

: (insert) ( seq n quot -- )
    over 0 = [ 3drop ] [
        [ >r dup 1- rot [ nth ] curry 2apply r> 2apply > ] 3keep
        roll [ 3drop ]
        [ >r [ dup 1- rot exchange ] 2keep 1- r> (insert) ] if
    ] if ; inline

: insert ( seq quot elt n -- )
    spin >r -rot [ swap set-nth ] 2keep r> (insert) ; inline

: insertion-sort ( seq quot -- )
    ! quot is a transformation on elements
    over dup length
    [ >r >r 2dup r> r> insert ] 2each 2drop ; inline

: reorder-slice ( string start -- slice done? )
    2dup swap [ non-starter? not ] find* drop
    [ [ over length ] unless* rot <slice> ] keep not ;

: reorder-next ( string i -- new-i done? )
    over [ non-starter? ] find* drop [
        reorder-slice
        >r dup [ combining-class ] insertion-sort slice-to r>
    ] [ length t ] if* ;

: reorder-loop ( string start -- )
    dupd reorder-next [ 2drop ] [ reorder-loop ] if ;

: reorder ( string -- )
    0 reorder-loop ;

: reorder-back ( string i -- )
    over [ non-starter? not ] find-last* 1+* reorder-next 2drop ;

: decompose ( string quot -- decomposed )
    ! When there are 8 and 32-bit strings, this'll be
    ! equivalent to clone on 8 and the contents of the last
    ! main quotation on 32.
    over [ 127 < ] all? [ drop ] [
        swap [ [
            dup hangul? [ hangul>jamo % drop ]
            [ dup rot call [ % ] [ , ] ?if ] if
        ] with each ] "" make*
        dup reorder
    ] if ; inline

: nfd ( string -- string )
    [ canonical-entry ] decompose ;

: nfkd ( string -- string )
    [ compat-entry ] decompose ;

: string-append ( s1 s2 -- string )
    ! This could be more optimized,
    ! but in practice, it'll almost always just be append
    [ append ] keep
    0 over ?nth non-starter?
    [ length dupd reorder-back ] [ drop ] if ;

! Normalization -- Composition
SYMBOL: main-str
SYMBOL: ind
SYMBOL: after
SYMBOL: char

: get-str ( i -- ch ) ind get + main-str get ?nth ;
: current ( -- ch ) 0 get-str ;
: to ( -- ) ind inc ;

: initial-medial? ( -- ? )
    current initial? [ 1 get-str medial? ] [ f ] if ;

: --final? ( -- ? )
    2 get-str final? ;

: imf, ( -- )
    current to current to current jamo>hangul , ;

: im, ( -- )
    current to current 0 jamo>hangul , ;

: compose-jamo ( -- )
    initial-medial? [
        --final? [ imf, ] [ im, ] if
    ] when to current jamo? [ compose-jamo ] when ;

: pass-combining ( -- )
    current non-starter? [ current , to pass-combining ] when ;

: try-compose ( last-class char current-class -- )
    swapd = [ after get push ] [
        char get over combine-chars
        [ nip char set ] [ after get push ] if*
    ] if ;

: compose-iter ( n -- )
    current [
        dup combining-class dup
        [ [ try-compose ] keep to compose-iter ] [ 3drop ] if
    ] [ drop ] if* ;

: ?new-after ( -- )
    after [ dup empty? [ drop SBUF" " clone ] unless ] change ;

: (compose) ( -- )
    current [
        dup jamo? [ drop compose-jamo ] [
            char set to ?new-after
            0 compose-iter
            char get , after get %
            to
        ] if (compose)
    ] when* ;

: compose ( str -- comp )
    [
        main-str set
        0 ind set
        SBUF" " clone after set
        pass-combining (compose)
    ] "" make* ;

: nfc ( string -- nfc )
    nfd compose ;

: nfkc ( string -- nfkc )
    nfkc compose ;
! For now: convenience to load all Unicode vocabs
@@ -1,6 +1,6 @@
! Copyright (C) 2005, 2007 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: kernel sequences unicode math ;
USING: kernel sequences unicode.syntax math ;
IN: xml.char-classes

CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u0559\u06E5\u06E6_ ;
@@ -1,6 +1,7 @@
! Copyright (C) 2005, 2006 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: kernel sequences sequences.private assocs arrays delegate vectors ;
USING: kernel sequences sequences.private assocs arrays
delegate.protocols delegate vectors ;
IN: xml.data

TUPLE: name space tag url ;