Doug Coleman 2008-01-09 22:18:14 -10:00
commit 3546f5150d
28 changed files with 616 additions and 587 deletions

View File

@ -24,3 +24,8 @@ IN: io.crc32
>r HEX: ffffffff dup r> [ (crc32) ] each bitxor ;
: file-crc32 ( path -- n ) file-contents crc32 ;
: lines-crc32 ( seq -- n )
HEX: ffffffff tuck [
[ (crc32) ] each CHAR: \n (crc32)
] reduce bitxor ;

0
core/parser/parser.factor Executable file → Normal file
View File

View File

@ -33,8 +33,8 @@ uses definitions ;
dup source-file-path ?resource-path file-modified
swap set-source-file-modified ;
: record-checksum ( contents source-file -- )
>r crc32 r> set-source-file-checksum ;
: record-checksum ( lines source-file -- )
swap lines-crc32 swap set-source-file-checksum ;
: (xref-source) ( source-file -- pathname uses )
dup source-file-path <pathname> swap source-file-uses
@ -68,7 +68,7 @@ uses definitions ;
: reset-checksums ( -- )
source-files get [
swap ?resource-path dup exists?
[ file-contents record-checksum ] [ 2drop ] if
[ file-lines record-checksum ] [ 2drop ] if
] assoc-each ;
M: pathname where pathname-string 1 2array ;

View File

@ -0,0 +1 @@
Non-core assoc words

View File

@ -37,9 +37,9 @@ HINTS: do-line vector string ;
] with-stream ;
: reverse-complement-main ( -- )
"reverse-complement-in.txt"
"reverse-complement-out.txt"
[ home swap path+ ] 2apply
"extra/benchmark/reverse-complement/reverse-complement-test-in.txt"
"extra/benchmark/reverse-complement/reverse-complement-test-out.txt"
[ resource-path ] 2apply
reverse-complement ;
MAIN: reverse-complement-main

View File

@ -166,3 +166,6 @@ MACRO: construct-slots ( assoc tuple-class -- tuple )
[ construct-empty ] curry swap [
[ dip ] curry swap 1quotation [ keep ] curry compose
] { } assoc>map concat compose ;
: either ( object first second -- ? )
>r over slip swap [ r> drop ] [ r> call ] ?if ; inline

View File

@ -1,7 +1,6 @@
! Copyright (C) 2007 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: parser generic kernel classes words slots io definitions
sequences sequences.private assocs prettyprint.sections arrays ;
USING: parser generic kernel classes words slots assocs sequences arrays ;
IN: delegate
: define-protocol ( wordlist protocol -- )
@ -37,31 +36,6 @@ M: tuple-class group-words
: CONSULT:
scan-word scan-word parse-definition swapd define-consult ; parsing
PROTOCOL: sequence-protocol
clone clone-like like new new-resizable nth nth-unsafe
set-nth set-nth-unsafe length set-length lengthen ;
PROTOCOL: assoc-protocol
at* assoc-size >alist set-at assoc-clone-like
delete-at clear-assoc new-assoc assoc-like ;
! assoc-find excluded because GENERIC# 1
! everything should work, just slower (with >alist)
PROTOCOL: stream-protocol
stream-close stream-read1 stream-read stream-read-until
stream-flush stream-write1 stream-write stream-format
stream-nl make-span-stream make-block-stream stream-readln
make-cell-stream stream-write-table set-timeout ;
PROTOCOL: definition-protocol
where set-where forget uses redefined*
synopsis* definer definition ;
PROTOCOL: prettyprint-section-protocol
section-fits? indent-section? unindent-first-line?
newline-after? short-section? short-section long-section
<section> delegate>block add-section ;
: define-mimic ( group mimicker mimicked -- )
>r >r group-words r> r> [
pick "methods" word-prop at dup

View File

@ -0,0 +1,32 @@
! Copyright (C) 2007 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: delegate sequences.private sequences assocs prettyprint.sections
io definitions kernel ;
IN: delegate.protocols
PROTOCOL: sequence-protocol
clone clone-like like new new-resizable nth nth-unsafe
set-nth set-nth-unsafe length set-length lengthen ;
PROTOCOL: assoc-protocol
at* assoc-size >alist set-at assoc-clone-like
delete-at clear-assoc new-assoc assoc-like ;
! assoc-find excluded because GENERIC# 1
! everything should work, just slower (with >alist)
PROTOCOL: stream-protocol
stream-close stream-read1 stream-read stream-read-until
stream-flush stream-write1 stream-write stream-format
stream-nl make-span-stream make-block-stream stream-readln
make-cell-stream stream-write-table set-timeout ;
PROTOCOL: definition-protocol
where set-where forget uses redefined*
synopsis* definer definition ;
PROTOCOL: prettyprint-section-protocol
section-fits? indent-section? unindent-first-line?
newline-after? short-section? short-section long-section
<section> delegate>block add-section ;

View File

@ -1,4 +1,5 @@
USING: hash2 help.syntax help.markup ;
USING: help.syntax help.markup ;
IN: hash2
ARTICLE: { "hash2" "intro" }
"The hash2 vocabulary specifies a simple minimal datastructure for hash tables with two integers as keys. These hash tables are fixed size and do not conform to the associative mapping protocol. Words used in creating and manipulating these hash tables include:"

View File

@ -1 +0,0 @@
Eduardo Cavazos

View File

@ -1 +0,0 @@
Non-core hashtable words

View File

@ -1 +0,0 @@
collections

View File

@ -1,6 +1,7 @@
USING: unicode kernel math const combinators splitting
USING: unicode.categories kernel math const combinators splitting
sequences math.parser io.files io assocs arrays namespaces
;
combinators.lib assocs.lib math.ranges unicode.normalize
unicode.syntax unicode.data ;
IN: unicode.breaks
ENUM: Any L V T Extend Control CR LF graphemes ;
@ -25,17 +26,14 @@ CATEGORY: grapheme-control Zl Zp Cc Cf ;
: process-other-extend ( lines -- set )
[ "#" split1 drop ";" split1 drop trim-blank ] map
[ empty? not ] subset
[ ".." split1 [ dup ] unless* [ hex> ] 2apply range ] map
[ ".." split1 [ dup ] unless* [ hex> ] 2apply [a,b] ] map
concat >set ;
: other-extend-lines ( -- lines )
"extra/unicode/PropList.txt" resource-path <file-reader> lines ;
DEFER: other-extend
: load-other-extend
other-extend-lines process-other-extend
\ other-extend define-value ; parsing
load-other-extend
<< other-extend-lines process-other-extend \ other-extend define-value >>
CATEGORY: (extend) Me Mn ;
: extend? ( ch -- ? )
@ -81,11 +79,11 @@ SYMBOL: table
graphemes Extend connect-after ;
DEFER: grapheme-table
: load-grapheme-table
<<
init-grapheme-table table
[ make-grapheme-table finish-table ] with-variable
\ grapheme-table define-value ; parsing
load-grapheme-table
\ grapheme-table define-value
>>
: grapheme-break? ( class1 class2 -- ? )
grapheme-table nth nth not ;

View File

@ -0,0 +1,14 @@
USING: unicode.case tools.test namespaces ;
[ "Hello How Are You? I'M Good" ] [ "hEllo how ARE yOU? I'm good" >title ] unit-test
[ "FUSS" ] [ "Fu\u00DF" >upper ] unit-test
[ "\u03C3\u03C2" ] [ "\u03A3\u03A3" >lower ] unit-test
[ t ] [ "hello how are you?" lower? ] unit-test
[
"tr" locale set
[ "i\u0131i \u0131jj" ] [ "i\u0131I\u0307 IJj" >lower ] unit-test
! [ "I\u307\u0131i Ijj" ] [ "i\u0131I\u0307 IJj" >title ] unit-test
[ "I\u0307II\u0307 IJJ" ] [ "i\u0131I\u0307 IJj" >upper ] unit-test
"lt" locale set
! Lithuanian casing tests
] with-scope

View File

@ -0,0 +1,111 @@
USING: kernel unicode.data sequences sequences.next namespaces assocs.lib
unicode.normalize math unicode.categories combinators assocs ;
IN: unicode.case
: ch>lower ( ch -- lower ) simple-lower at-default ;
: ch>upper ( ch -- upper ) simple-upper at-default ;
: ch>title ( ch -- title ) simple-title at-default ;
SYMBOL: locale ! Just casing locale, or overall?
: i-dot? ( -- ? )
locale get { "tr" "az" } member? ;
: lithuanian? ( -- ? ) locale get "lt" = ;
: dot-over ( -- ch ) CHAR: \u0307 ;
: lithuanian-ch>upper ( ? next ch -- ? )
rot [ 2drop f ]
[ swap dot-over = over "ij" member? and swap , ] if ;
: lithuanian>upper ( string -- lower )
[ f swap [ lithuanian-ch>upper ] each-next drop ] "" make* ;
: mark-above? ( ch -- ? )
combining-class 230 = ;
: lithuanian-ch>lower ( next ch -- )
! This fails to add a dot above in certain edge cases
! where there is a non-above combining mark before an above one
! in Lithuanian
dup , "IJ" member? swap mark-above? and [ dot-over , ] when ;
: lithuanian>lower ( string -- lower )
[ [ lithuanian-ch>lower ] each-next ] "" make* ;
: turk-ch>upper ( ch -- )
dup CHAR: i =
[ drop CHAR: I , dot-over , ] [ , ] if ;
: turk>upper ( string -- upper-i )
[ [ turk-ch>upper ] each ] "" make* ;
: turk-ch>lower ( ? next ch -- ? )
{
{ [ rot ] [ 2drop f ] }
{ [ dup CHAR: I = ] [
drop dot-over =
dup CHAR: i CHAR: \u0131 ? ,
] }
{ [ t ] [ , drop f ] }
} cond ;
: turk>lower ( string -- lower-i )
[ f swap [ turk-ch>lower ] each-next drop ] "" make* ;
: word-boundary ( prev char -- new ? )
dup non-starter? [ drop dup ] when
swap uncased? ;
: sigma-map ( string -- string )
[
swap [ uncased? ] keep not or
[ drop HEX: 3C2 ] when
] map-next ;
: final-sigma ( string -- string )
HEX: 3A3 over member? [ sigma-map ] when ;
: map-case ( string string-quot char-quot -- case )
[
rot [
-rot [
rot dup special-casing at
[ -rot drop call % ]
[ -rot nip call , ] ?if
] 2keep
] each 2drop
] "" make* ; inline
: >lower ( string -- lower )
i-dot? [ turk>lower ] when
final-sigma [ code-point-lower ] [ ch>lower ] map-case ;
: >upper ( string -- upper )
i-dot? [ turk>upper ] when
[ code-point-upper ] [ ch>upper ] map-case ;
: >title ( string -- title )
final-sigma
CHAR: \s swap
[ tuck word-boundary swapd
[ code-point-title ] [ code-point-lower ] if ]
[ tuck word-boundary swapd
[ ch>title ] [ ch>lower ] if ]
map-case nip ;
: >case-fold ( string -- fold )
>upper >lower ;
: insensitive= ( str1 str2 -- ? )
[ >case-fold ] 2apply = ;
: lower? ( string -- ? )
dup >lower = ;
: upper? ( string -- ? )
dup >lower = ;
: title? ( string -- ? )
dup >title = ;
: case-fold? ( string -- ? )
dup >case-fold = ;

View File

@ -0,0 +1,7 @@
USING: tools.test kernel unicode.categories words sequences unicode.syntax ;
[ { f f t t f t t f f t } ] [ CHAR: A {
blank? letter? LETTER? Letter? digit?
printable? alpha? control? uncased? character?
} [ execute ] curry* map ] unit-test
[ "Nd" ] [ CHAR: 3 category ] unit-test

View File

@ -0,0 +1,13 @@
USING: unicode.syntax ;
IN: unicode.categories
CATEGORY: blank Zs Zl Zp ;
CATEGORY: letter Ll ;
CATEGORY: LETTER Lu ;
CATEGORY: Letter Lu Ll Lt Lm Lo ;
CATEGORY: digit Nd Nl No ;
CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No ;
CATEGORY: control Cc ;
CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ;
CATEGORY-NOT: character Cn ;

View File

@ -0,0 +1,149 @@
USING: assocs math kernel sequences io.files hashtables quotations
splitting arrays math.parser combinators.lib hash2 byte-arrays words
namespaces words ;
IN: unicode.data
! Convenience functions
: 1+* ( n/f _ -- n+1 )
drop [ 1+ ] [ 0 ] if* ;
: define-value ( value word -- )
swap 1quotation define ;
: ?between? ( n/f from to -- ? )
pick [ between? ] [ 3drop f ] if ;
! Loading data from UnicodeData.txt
: data ( filename -- data )
file-lines [ ";" split ] map ;
: load-data ( -- data )
"extra/unicode/UnicodeData.txt" resource-path data ;
: (process-data) ( index data -- newdata )
[ [ nth ] keep first swap 2array ] with map
[ second empty? not ] subset
[ >r hex> r> ] assoc-map ;
: process-data ( index data -- hash )
(process-data) [ hex> ] assoc-map >hashtable ;
: (chain-decomposed) ( hash value -- newvalue )
[
2dup swap at
[ (chain-decomposed) ] [ 1array nip ] ?if
] with map concat ;
: chain-decomposed ( hash -- newhash )
dup [ swap (chain-decomposed) ] curry assoc-map ;
: first* ( seq -- ? )
second [ empty? ] [ first ] either ;
: (process-decomposed) ( data -- alist )
5 swap (process-data)
[ " " split [ hex> ] map ] assoc-map ;
: process-canonical ( data -- hash2 hash )
(process-decomposed) [ first* ] subset
[
[ second length 2 = ] subset
! using 1009 as the size, the maximum load is 4
[ first2 first2 rot 3array ] map 1009 alist>hash2
] keep
>hashtable chain-decomposed ;
: process-compat ( data -- hash )
(process-decomposed)
[ dup first* [ first2 1 tail 2array ] unless ] map
>hashtable chain-decomposed ;
: process-combining ( data -- hash )
3 swap (process-data)
[ string>number ] assoc-map
[ nip 0 = not ] assoc-subset
>hashtable ;
: categories ( -- names )
! For non-existent characters, use Cn
{ "Lu" "Ll" "Lt" "Lm" "Lo"
"Mn" "Mc" "Me"
"Nd" "Nl" "No"
"Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
"Sm" "Sc" "Sk" "So"
"Zs" "Zl" "Zp"
"Cc" "Cf" "Cs" "Co" "Cn" } ;
: unicode-chars HEX: 2FA1E ;
! the maximum unicode char in the first 3 planes
: process-category ( data -- category-listing )
2 swap (process-data)
unicode-chars <byte-array> swap dupd swap [
>r over unicode-chars >= [ r> 3drop ]
[ categories index swap r> set-nth ] if
] curry assoc-each ;
: ascii-lower ( string -- lower )
[ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ;
: replace ( seq old new -- newseq )
swap rot [ 2dup = [ drop over ] when ] map 2nip ;
: process-names ( data -- names-hash )
1 swap (process-data)
[ ascii-lower CHAR: \s CHAR: - replace swap ] assoc-map
>hashtable ;
: multihex ( hexstring -- string )
" " split [ hex> ] map [ ] subset ;
TUPLE: code-point lower title upper ;
C: <code-point> code-point
: set-code-point ( seq -- )
4 head [ multihex ] map first4
<code-point> swap first set ;
DEFER: simple-lower
DEFER: simple-upper
DEFER: simple-title
DEFER: canonical-map
DEFER: combine-map
DEFER: class-map
DEFER: compat-map
DEFER: category-map
DEFER: name-map
<<
load-data
dup process-names \ name-map define-value
13 over process-data \ simple-lower define-value
12 over process-data tuck \ simple-upper define-value
14 over process-data swapd union \ simple-title define-value
dup process-combining \ class-map define-value
dup process-canonical \ canonical-map define-value
\ combine-map define-value
dup process-compat \ compat-map define-value
process-category \ category-map define-value
>>
: canonical-entry ( char -- seq ) canonical-map at ;
: combine-chars ( a b -- char/f ) combine-map hash2 ;
: compat-entry ( char -- seq ) compat-map at ;
: combining-class ( char -- n ) class-map at ;
: non-starter? ( char -- ? ) class-map key? ;
: name>char ( string -- char ) name-map at ;
: char>name ( char -- string ) name-map value-at ;
! Special casing data
: load-special-casing ( -- special-casing )
"extra/unicode/SpecialCasing.txt" resource-path data
[ length 5 = ] subset
[ [ set-code-point ] each ] H{ } make-assoc ;
DEFER: special-casing
<< load-special-casing \ special-casing define-value >>

View File

@ -0,0 +1,18 @@
USING: unicode.normalize kernel tools.test sequences ;
[ "ab\u0323\u0302cd" ] [ "ab\u0302" "\u0323cd" string-append ] unit-test
[ "ab\u064b\u034d\u034e\u0347\u0346" ] [ "ab\u0346\u0347\u064b\u034e\u034d" dup reorder ] unit-test
[ "hello" "hello" ] [ "hello" [ nfd ] keep nfkd ] unit-test
[ "\uFB012\u2075\u017F\u0323\u0307" "fi25s\u0323\u0307" ]
[ "\uFB012\u2075\u1E9B\u0323" [ nfd ] keep nfkd ] unit-test
[ "\u1E69" "s\u0323\u0307" ] [ "\u1E69" [ nfc ] keep nfd ] unit-test
[ "\u1E0D\u0307" ] [ "\u1E0B\u0323" nfc ] unit-test
[ 54620 ] [ 4370 4449 4523 jamo>hangul ] unit-test
[ 4370 4449 4523 ] [ 54620 hangul>jamo first3 ] unit-test
[ t ] [ 54620 hangul? ] unit-test
[ f ] [ 0 hangul? ] unit-test
[ "\u1112\u1161\u11ab" ] [ "\ud55c" nfd ] unit-test
[ "\ud55c" ] [ "\u1112\u1161\u11ab" nfc ] unit-test

View File

@ -0,0 +1,176 @@
USING: sequences namespaces unicode.data kernel combinators.lib math
unicode arrays ;
IN: unicode.normalize
! Utility word
: make* ( seq quot exemplar -- newseq )
! quot has access to original seq on stack
! this just makes the new-resizable the same length as seq
[
[
pick length swap new-resizable
[ building set call ] keep
] keep like
] with-scope ; inline
! Conjoining Jamo behavior
: hangul-base HEX: ac00 ; inline
: hangul-end HEX: D7AF ; inline
: initial-base HEX: 1100 ; inline
: medial-base HEX: 1161 ; inline
: final-base HEX: 11a7 ; inline
: initial-count 19 ; inline
: medial-count 21 ; inline
: final-count 28 ; inline
: hangul? ( ch -- ? ) hangul-base hangul-end ?between? ;
: jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ;
! These numbers come from UAX 29
: initial? ( ch -- ? )
[ HEX: 1100 HEX: 1159 ?between? ] [ HEX: 115F = ] either ;
: medial? ( ch -- ? ) HEX: 1160 HEX: 11A2 ?between? ;
: final? ( ch -- ? ) HEX: 11A8 HEX: 11F9 ?between? ;
: hangul>jamo ( hangul -- jamo-string )
hangul-base - final-count /mod final-base +
>r medial-count /mod medial-base +
>r initial-base + r> r>
dup zero? [ drop 2array ] [ 3array ] if ;
: jamo>hangul ( initial medial final -- hangul )
>r >r initial-base - medial-count *
r> medial-base - + final-count *
r> final-base - + hangul-base + ;
! Normalization -- Decomposition
: (insert) ( seq n quot -- )
over 0 = [ 3drop ] [
[ >r dup 1- rot [ nth ] curry 2apply r> 2apply > ] 3keep
roll [ 3drop ]
[ >r [ dup 1- rot exchange ] 2keep 1- r> (insert) ] if
] if ; inline
: insert ( seq quot elt n -- )
swap rot >r -rot [ swap set-nth ] 2keep r> (insert) ; inline
: insertion-sort ( seq quot -- )
! quot is a transformation on elements
over dup length
[ >r >r 2dup r> r> insert ] 2each 2drop ; inline
: reorder-slice ( string start -- slice done? )
2dup swap [ non-starter? not ] find* drop
[ [ over length ] unless* rot <slice> ] keep not ;
: reorder-next ( string i -- new-i done? )
over [ non-starter? ] find* drop [
reorder-slice
>r dup [ combining-class ] insertion-sort slice-to r>
] [ length t ] if* ;
: reorder-loop ( string start -- )
dupd reorder-next [ 2drop ] [ reorder-loop ] if ;
: reorder ( string -- )
0 reorder-loop ;
: reorder-back ( string i -- )
over [ non-starter? not ] find-last* 1+* reorder-next 2drop ;
: decompose ( string quot -- decomposed )
! When there are 8 and 32-bit strings, this'll be
! equivalent to clone on 8 and the contents of the last
! main quotation on 32.
over [ 127 < ] all? [ drop ] [
swap [ [
dup hangul? [ hangul>jamo % drop ]
[ dup rot call [ % ] [ , ] ?if ] if
] curry* each ] "" make*
dup reorder
] if ; inline
: nfd ( string -- string )
[ canonical-entry ] decompose ;
: nfkd ( string -- string )
[ compat-entry ] decompose ;
: string-append ( s1 s2 -- string )
! This could be more optimized,
! but in practice, it'll almost always just be append
[ append ] keep
0 over ?nth non-starter?
[ length dupd reorder-back ] [ drop ] if ;
! Normalization -- Composition
SYMBOL: main-str
SYMBOL: ind
SYMBOL: after
SYMBOL: char
: get-str ( i -- ch ) ind get + main-str get ?nth ;
: current ( -- ch ) 0 get-str ;
: to ( -- ) ind inc ;
: initial-medial? ( -- ? )
current initial? [ 1 get-str medial? ] [ f ] if ;
: --final? ( -- ? )
2 get-str final? ;
: imf, ( -- )
current to current to current jamo>hangul , ;
: im, ( -- )
current to current 0 jamo>hangul , ;
: compose-jamo ( -- )
initial-medial? [
--final? [ imf, ] [ im, ] if
] when to current jamo? [ compose-jamo ] when ;
: pass-combining ( -- )
current non-starter? [ current , to pass-combining ] when ;
: try-compose ( last-class char current-class -- )
swapd = [ after get push ] [
char get over combine-chars
[ nip char set ] [ after get push ] if*
] if ;
: compose-iter ( n -- )
current [
dup combining-class dup
[ [ try-compose ] keep to compose-iter ] [ 3drop ] if
] [ drop ] if* ;
: ?new-after ( -- )
after [ dup empty? [ drop SBUF" " clone ] unless ] change ;
: (compose) ( -- )
current [
dup jamo? [ drop compose-jamo ] [
char set to ?new-after
0 compose-iter
char get , after get %
to
] if (compose)
] when* ;
: compose ( str -- comp )
[
main-str set
0 ind set
SBUF" " clone after set
pass-combining (compose)
] "" make* ;
: nfc ( string -- nfc )
nfd compose ;
: nfkc ( string -- nfkc )
nfkc compose ;

View File

@ -0,0 +1,4 @@
USING: unicode.syntax tools.test ;
[ CHAR: ! ] [ UNICHAR: exclamation-mark ] unit-test
! Write a test for CATEGORY and CATEGORY-NOT

View File

@ -0,0 +1,60 @@
USING: unicode.data kernel math sequences parser bit-arrays namespaces
sequences.private arrays quotations classes.predicate ;
IN: unicode.syntax
! Character classes (categories)
: category# ( char -- category )
! There are a few characters that should be Cn
! that this gives Cf or Mn
! Cf = 26; Mn = 5; Cn = 29
! Use a compressed array instead?
dup category-map ?nth [ ] [
dup HEX: E0001 HEX: E007F between?
[ drop 26 ] [
HEX: E0100 HEX: E01EF between? 5 29 ?
] if
] ?if ;
: category ( char -- category )
category# categories nth ;
: >category-array ( categories -- bitarray )
categories [ swap member? ] curry* map >bit-array ;
: as-string ( strings -- bit-array )
concat "\"" tuck 3append parse first ;
: [category] ( categories -- quot )
[
[ [ categories member? not ] subset as-string ] keep
[ categories member? ] subset >category-array
[ dup category# ] % , [ nth-unsafe [ drop t ] ] %
\ member? 2array >quotation ,
\ if ,
] [ ] make ;
: define-category ( word categories -- )
[category] fixnum -rot define-predicate-class ;
: CATEGORY:
CREATE ";" parse-tokens define-category ; parsing
: seq-minus ( seq1 seq2 -- diff )
[ member? not ] curry subset ;
: CATEGORY-NOT:
CREATE ";" parse-tokens
categories swap seq-minus define-category ; parsing
TUPLE: code-point lower title upper ;
C: <code-point> code-point
: set-code-point ( seq -- )
4 head [ multihex ] map first4
<code-point> swap first set ;
: UNICHAR:
! This should be part of CHAR:
scan name>char [ parsed ] [ "Invalid character" throw ] if* ; parsing

View File

@ -1,37 +0,0 @@
USING: unicode kernel tools.test words sequences namespaces ;
[ "Hello How Are You? I'M Good" ] [ "hEllo how ARE yOU? I'm good" >title ] unit-test
[ "FUSS" ] [ "Fu\u00DF" >upper ] unit-test
[ "\u03C3\u03C2" ] [ "\u03A3\u03A3" >lower ] unit-test
[ t ] [ "hello how are you?" lower? ] unit-test
[
"tr" locale set
[ "i\u0131i \u0131jj" ] [ "i\u0131I\u0307 IJj" >lower ] unit-test
! [ "I\u307\u0131i Ijj" ] [ "i\u0131I\u0307 IJj" >title ] unit-test
[ "I\u0307II\u0307 IJJ" ] [ "i\u0131I\u0307 IJj" >upper ] unit-test
"lt" locale set
! Lithuanian casing tests
] with-scope
[ { f f t t f t t f f t } ] [ CHAR: A {
blank? letter? LETTER? Letter? digit?
printable? alpha? control? uncased? character?
} [ execute ] with map ] unit-test
[ "Nd" ] [ CHAR: 3 category ] unit-test
[ CHAR: ! ] [ UNICHAR: exclamation-mark ] unit-test
[ "ab\u0323\u0302cd" ] [ "ab\u0302" "\u0323cd" string-append ] unit-test
[ "ab\u064b\u034d\u034e\u0347\u0346" ] [ "ab\u0346\u0347\u064b\u034e\u034d" dup reorder ] unit-test
[ "hello" "hello" ] [ "hello" [ nfd ] keep nfkd ] unit-test
[ "\uFB012\u2075\u017F\u0323\u0307" "fi25s\u0323\u0307" ]
[ "\uFB012\u2075\u1E9B\u0323" [ nfd ] keep nfkd ] unit-test
[ "\u1E69" "s\u0323\u0307" ] [ "\u1E69" [ nfc ] keep nfd ] unit-test
[ "\u1E0D\u0307" ] [ "\u1E0B\u0323" nfc ] unit-test
[ 54620 ] [ 4370 4449 4523 jamo>hangul ] unit-test
[ 4370 4449 4523 ] [ 54620 hangul>jamo first3 ] unit-test
[ t ] [ 54620 hangul? ] unit-test
[ f ] [ 0 hangul? ] unit-test
[ "\u1112\u1161\u11ab" ] [ "\ud55c" nfd ] unit-test
[ "\ud55c" ] [ "\u1112\u1161\u11ab" nfc ] unit-test

View File

@ -1,503 +1,5 @@
USING: kernel hashtables sequences io arrays math
hash2 namespaces strings assocs words splitting sequences.next
byte-arrays quotations sequences.private io.files bit-arrays
combinators math.parser io.streams.lines parser classes
classes.predicate ;
USING: unicode.syntax unicode.data unicode.breaks unicode.normalize
unicode.case unicode.categories ;
IN: unicode
! Convenience functions
: >set ( seq -- hash )
[ dup ] H{ } map>assoc ;
: either ( object first second -- ? )
>r over slip swap [ r> drop ] [ r> call ] ?if ; inline
: 1+* ( n/f _ -- n+1 )
drop [ 1+ ] [ 0 ] if* ;
: define-value ( value word -- )
swap 1quotation define-compound ;
: ?between? ( n/f from to -- ? )
pick [ between? ] [ 3drop f ] if ;
: range ( from to -- seq )
1+ over - [ + ] with map ;
! Loading data from UnicodeData.txt
: data ( filename -- data )
<file-reader> lines [ ";" split ] map ;
: load-data ( -- data )
"extra/unicode/UnicodeData.txt" resource-path data ;
: (process-data) ( index data -- newdata )
[ [ nth ] keep first swap 2array ] with map
[ second empty? not ] subset
[ >r hex> r> ] assoc-map ;
: process-data ( index data -- hash )
(process-data) [ hex> ] assoc-map >hashtable ;
: (chain-decomposed) ( hash value -- newvalue )
[
2dup swap at
[ (chain-decomposed) ] [ 1array nip ] ?if
] with map concat ;
: chain-decomposed ( hash -- newhash )
dup [ swap (chain-decomposed) ] curry assoc-map ;
: first* ( seq -- ? )
second [ empty? ] [ first ] either ;
: (process-decomposed) ( data -- alist )
5 swap (process-data)
[ " " split [ hex> ] map ] assoc-map ;
: process-canonical ( data -- hash2 hash )
(process-decomposed) [ first* ] subset
[
[ second length 2 = ] subset
! using 1009 as the size, the maximum load is 4
[ first2 first2 rot 3array ] map 1009 alist>hash2
] keep
>hashtable chain-decomposed ;
: process-compat ( data -- hash )
(process-decomposed)
[ dup first* [ first2 1 tail 2array ] unless ] map
>hashtable chain-decomposed ;
: process-combining ( data -- hash )
3 swap (process-data)
[ string>number ] assoc-map
[ nip 0 = not ] assoc-subset
>hashtable ;
: categories ( -- names )
! For non-existent characters, use Cn
{ "Lu" "Ll" "Lt" "Lm" "Lo"
"Mn" "Mc" "Me"
"Nd" "Nl" "No"
"Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
"Sm" "Sc" "Sk" "So"
"Zs" "Zl" "Zp"
"Cc" "Cf" "Cs" "Co" "Cn" } ;
: unicode-chars HEX: 2FA1E ;
! the maximum unicode char in the first 3 planes
: process-category ( data -- category-listing )
2 swap (process-data)
unicode-chars <byte-array> swap dupd swap [
>r over unicode-chars >= [ r> 3drop ]
[ categories index swap r> set-nth ] if
] curry assoc-each ;
: ascii-lower ( string -- lower )
[ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ;
: replace ( seq old new -- newseq )
spin [ 2dup = [ drop over ] when ] map 2nip ;
: process-names ( data -- names-hash )
1 swap (process-data)
[ ascii-lower CHAR: \s CHAR: - replace swap ] assoc-map
>hashtable ;
DEFER: simple-lower
DEFER: simple-upper
DEFER: simple-title
DEFER: canonical-map
DEFER: combine-map
DEFER: class-map
DEFER: compat-map
DEFER: category-map
DEFER: name-map
: load-tables ( -- )
load-data
dup process-names \ name-map define-value
13 over process-data \ simple-lower define-value
12 over process-data tuck \ simple-upper define-value
14 over process-data swapd union \ simple-title define-value
dup process-combining \ class-map define-value
dup process-canonical \ canonical-map define-value
\ combine-map define-value
dup process-compat \ compat-map define-value
process-category \ category-map define-value ; parsing
load-tables
: canonical-entry ( char -- seq ) canonical-map at ;
: combine-chars ( a b -- char/f ) combine-map hash2 ;
: compat-entry ( char -- seq ) compat-map at ;
: combining-class ( char -- n ) class-map at ;
: non-starter? ( char -- ? ) class-map key? ;
: name>char ( string -- char ) name-map at ;
: UNICHAR:
scan name>char [ parsed ] [ "Invalid character" throw ] if* ; parsing
! Character classes (categories)
: category# ( char -- category )
! There are a few characters that should be Cn
! that this gives Cf or Mn
! Cf = 26; Mn = 5; Cn = 29
dup category-map ?nth [ ] [
dup HEX: E0001 HEX: E007F between?
[ drop 26 ] [
HEX: E0100 HEX: E01EF between? 5 29 ?
] if
] ?if ;
: category ( char -- category )
category# categories nth ;
: >category-array ( categories -- bitarray )
categories [ swap member? ] with map >bit-array ;
: as-string ( strings -- bit-array )
concat "\"" tuck 3append parse first ;
: [category] ( categories -- quot )
[
[ [ categories member? not ] subset as-string ] keep
[ categories member? ] subset >category-array
[ dup category# ] % , [ nth-unsafe [ drop t ] ] %
\ member? 2array >quotation ,
\ if ,
] [ ] make ;
: define-category ( word categories -- )
[category] fixnum -rot define-predicate-class ;
: CATEGORY:
CREATE ";" parse-tokens define-category ; parsing
: seq-minus ( seq1 seq2 -- diff )
[ member? not ] curry subset ;
: CATEGORY-NOT:
CREATE ";" parse-tokens
categories swap seq-minus define-category ; parsing
CATEGORY: blank Zs Zl Zp ;
CATEGORY: letter Ll ;
CATEGORY: LETTER Lu ;
CATEGORY: Letter Lu Ll Lt Lm Lo ;
CATEGORY: digit Nd Nl No ;
CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No ;
CATEGORY: control Cc ;
CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ;
CATEGORY-NOT: character Cn ;
! Utility word
: make* ( seq quot exemplar -- newseq )
! quot has access to original seq on stack
! this just makes the new-resizable the same length as seq
[
[
pick length swap new-resizable
[ building set call ] keep
] keep like
] with-scope ; inline
! Case mapping
: hash-default ( key hash -- value/key )
dupd at [ nip ] when* ;
: ch>lower ( ch -- lower ) simple-lower hash-default ;
: ch>upper ( ch -- upper ) simple-upper hash-default ;
: ch>title ( ch -- title ) simple-title hash-default ;
: load-special-data ( -- data )
"extra/unicode/SpecialCasing.txt" resource-path data
[ length 5 = ] subset ;
: multihex ( hexstring -- string )
" " split [ hex> ] map [ ] subset ;
TUPLE: code-point lower title upper ;
C: <code-point> code-point
: set-code-point ( seq -- )
4 head [ multihex ] map first4
<code-point> swap first set ;
DEFER: special-casing
: load-special-casing
load-special-data [ [ set-code-point ] each ] H{ } make-assoc
\ special-casing define-value ; parsing
load-special-casing
SYMBOL: locale ! Just casing locale, or overall?
: i-dot? ( -- ? )
locale get { "tr" "az" } member? ;
: lithuanian? ( -- ? ) locale get "lt" = ;
: dot-over ( -- ch ) CHAR: \u0307 ;
: lithuanian-ch>upper ( ? next ch -- ? )
rot [ 2drop f ]
[ swap dot-over = over "ij" member? and swap , ] if ;
: lithuanian>upper ( string -- lower )
[ f swap [ lithuanian-ch>upper ] each-next drop ] "" make* ;
: mark-above? ( ch -- ? )
combining-class 230 = ;
: lithuanian-ch>lower ( next ch -- )
! This fails to add a dot above in certain edge cases
! where there is a non-above combining mark before an above one
! in Lithuanian
dup , "IJ" member? swap mark-above? and [ dot-over , ] when ;
: lithuanian>lower ( string -- lower )
[ [ lithuanian-ch>lower ] each-next ] "" make* ;
: turk-ch>upper ( ch -- )
dup CHAR: i =
[ drop CHAR: I , dot-over , ] [ , ] if ;
: turk>upper ( string -- upper-i )
[ [ turk-ch>upper ] each ] "" make* ;
: turk-ch>lower ( ? next ch -- ? )
{
{ [ rot ] [ 2drop f ] }
{ [ dup CHAR: I = ] [
drop dot-over =
dup CHAR: i CHAR: \u0131 ? ,
] }
{ [ t ] [ , drop f ] }
} cond ;
: turk>lower ( string -- lower-i )
[ f swap [ turk-ch>lower ] each-next drop ] "" make* ;
: word-boundary ( prev char -- new ? )
dup non-starter? [ drop dup ] when
swap uncased? ;
: sigma-map ( string -- string )
[
swap [ uncased? ] keep not or
[ drop HEX: 3C2 ] when
] map-next ;
: final-sigma ( string -- string )
HEX: 3A3 over member? [ sigma-map ] when ;
: map-case ( string string-quot char-quot -- case )
[
rot [
-rot [
rot dup special-casing at
[ -rot drop call % ]
[ -rot nip call , ] ?if
] 2keep
] each 2drop
] "" make* ; inline
: >lower ( string -- lower )
i-dot? [ turk>lower ] when
final-sigma [ code-point-lower ] [ ch>lower ] map-case ;
: >upper ( string -- upper )
i-dot? [ turk>upper ] when
[ code-point-upper ] [ ch>upper ] map-case ;
: >title ( string -- title )
final-sigma
CHAR: \s swap
[ tuck word-boundary swapd
[ code-point-title ] [ code-point-lower ] if ]
[ tuck word-boundary swapd
[ ch>title ] [ ch>lower ] if ]
map-case nip ;
: >case-fold ( string -- fold )
>upper >lower ;
: insensitive= ( str1 str2 -- ? )
[ >case-fold ] 2apply = ;
: lower? ( string -- ? )
dup >lower = ;
: upper? ( string -- ? )
dup >lower = ;
: title? ( string -- ? )
dup >title = ;
: case-fold? ( string -- ? )
dup >case-fold = ;
! Conjoining Jamo behavior
: hangul-base HEX: ac00 ; inline
: hangul-end HEX: D7AF ; inline
: initial-base HEX: 1100 ; inline
: medial-base HEX: 1161 ; inline
: final-base HEX: 11a7 ; inline
: initial-count 19 ; inline
: medial-count 21 ; inline
: final-count 28 ; inline
: hangul? ( ch -- ? ) hangul-base hangul-end ?between? ;
: jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ;
! These numbers come from UAX 29
: initial? ( ch -- ? )
[ HEX: 1100 HEX: 1159 ?between? ] [ HEX: 115F = ] either ;
: medial? ( ch -- ? ) HEX: 1160 HEX: 11A2 ?between? ;
: final? ( ch -- ? ) HEX: 11A8 HEX: 11F9 ?between? ;
: hangul>jamo ( hangul -- jamo-string )
hangul-base - final-count /mod final-base +
>r medial-count /mod medial-base +
>r initial-base + r> r>
dup zero? [ drop 2array ] [ 3array ] if ;
: jamo>hangul ( initial medial final -- hangul )
>r >r initial-base - medial-count *
r> medial-base - + final-count *
r> final-base - + hangul-base + ;
! Normalization -- Decomposition
: (insert) ( seq n quot -- )
over 0 = [ 3drop ] [
[ >r dup 1- rot [ nth ] curry 2apply r> 2apply > ] 3keep
roll [ 3drop ]
[ >r [ dup 1- rot exchange ] 2keep 1- r> (insert) ] if
] if ; inline
: insert ( seq quot elt n -- )
spin >r -rot [ swap set-nth ] 2keep r> (insert) ; inline
: insertion-sort ( seq quot -- )
! quot is a transformation on elements
over dup length
[ >r >r 2dup r> r> insert ] 2each 2drop ; inline
: reorder-slice ( string start -- slice done? )
2dup swap [ non-starter? not ] find* drop
[ [ over length ] unless* rot <slice> ] keep not ;
: reorder-next ( string i -- new-i done? )
over [ non-starter? ] find* drop [
reorder-slice
>r dup [ combining-class ] insertion-sort slice-to r>
] [ length t ] if* ;
: reorder-loop ( string start -- )
dupd reorder-next [ 2drop ] [ reorder-loop ] if ;
: reorder ( string -- )
0 reorder-loop ;
: reorder-back ( string i -- )
over [ non-starter? not ] find-last* 1+* reorder-next 2drop ;
: decompose ( string quot -- decomposed )
! When there are 8 and 32-bit strings, this'll be
! equivalent to clone on 8 and the contents of the last
! main quotation on 32.
over [ 127 < ] all? [ drop ] [
swap [ [
dup hangul? [ hangul>jamo % drop ]
[ dup rot call [ % ] [ , ] ?if ] if
] with each ] "" make*
dup reorder
] if ; inline
: nfd ( string -- string )
[ canonical-entry ] decompose ;
: nfkd ( string -- string )
[ compat-entry ] decompose ;
: string-append ( s1 s2 -- string )
! This could be more optimized,
! but in practice, it'll almost always just be append
[ append ] keep
0 over ?nth non-starter?
[ length dupd reorder-back ] [ drop ] if ;
! Normalization -- Composition
SYMBOL: main-str
SYMBOL: ind
SYMBOL: after
SYMBOL: char
: get-str ( i -- ch ) ind get + main-str get ?nth ;
: current ( -- ch ) 0 get-str ;
: to ( -- ) ind inc ;
: initial-medial? ( -- ? )
current initial? [ 1 get-str medial? ] [ f ] if ;
: --final? ( -- ? )
2 get-str final? ;
: imf, ( -- )
current to current to current jamo>hangul , ;
: im, ( -- )
current to current 0 jamo>hangul , ;
: compose-jamo ( -- )
initial-medial? [
--final? [ imf, ] [ im, ] if
] when to current jamo? [ compose-jamo ] when ;
: pass-combining ( -- )
current non-starter? [ current , to pass-combining ] when ;
: try-compose ( last-class char current-class -- )
swapd = [ after get push ] [
char get over combine-chars
[ nip char set ] [ after get push ] if*
] if ;
: compose-iter ( n -- )
current [
dup combining-class dup
[ [ try-compose ] keep to compose-iter ] [ 3drop ] if
] [ drop ] if* ;
: ?new-after ( -- )
after [ dup empty? [ drop SBUF" " clone ] unless ] change ;
: (compose) ( -- )
current [
dup jamo? [ drop compose-jamo ] [
char set to ?new-after
0 compose-iter
char get , after get %
to
] if (compose)
] when* ;
: compose ( str -- comp )
[
main-str set
0 ind set
SBUF" " clone after set
pass-combining (compose)
] "" make* ;
: nfc ( string -- nfc )
nfd compose ;
: nfkc ( string -- nfkc )
nfkc compose ;
! For now: convenience to load all Unicode vocabs

View File

@ -1,6 +1,6 @@
! Copyright (C) 2005, 2007 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: kernel sequences unicode math ;
USING: kernel sequences unicode.syntax math ;
IN: xml.char-classes
CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u0559\u06E5\u06E6_ ;

View File

@ -1,6 +1,7 @@
! Copyright (C) 2005, 2006 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: kernel sequences sequences.private assocs arrays delegate vectors ;
USING: kernel sequences sequences.private assocs arrays
delegate.protocols delegate vectors ;
IN: xml.data
TUPLE: name space tag url ;