Unicode changes
parent
e4de6fd8af
commit
71f6e7d4ba
|
@ -0,0 +1 @@
|
|||
Non-core assoc words
|
|
@ -1 +0,0 @@
|
|||
Eduardo Cavazos
|
|
@ -1 +0,0 @@
|
|||
Non-core hashtable words
|
|
@ -1 +0,0 @@
|
|||
collections
|
|
@ -1,6 +1,7 @@
|
|||
USING: unicode kernel math const combinators splitting
|
||||
USING: unicode.categories kernel math const combinators splitting
|
||||
sequences math.parser io.files io assocs arrays namespaces
|
||||
math.ranges unicode.normalize unicode.syntax ;
|
||||
combinators.lib assocs.lib math.ranges unicode.normalize
|
||||
unicode.syntax unicode.data ;
|
||||
IN: unicode.breaks
|
||||
|
||||
ENUM: Any L V T Extend Control CR LF graphemes ;
|
||||
|
@ -32,10 +33,7 @@ CATEGORY: grapheme-control Zl Zp Cc Cf ;
|
|||
"extra/unicode/PropList.txt" resource-path <file-reader> lines ;
|
||||
|
||||
DEFER: other-extend
|
||||
: load-other-extend
|
||||
other-extend-lines process-other-extend
|
||||
\ other-extend define-value ; parsing
|
||||
load-other-extend
|
||||
<< other-extend-lines process-other-extend \ other-extend define-value >>
|
||||
|
||||
CATEGORY: (extend) Me Mn ;
|
||||
: extend? ( ch -- ? )
|
||||
|
@ -81,11 +79,11 @@ SYMBOL: table
|
|||
graphemes Extend connect-after ;
|
||||
|
||||
DEFER: grapheme-table
|
||||
: load-grapheme-table
|
||||
<<
|
||||
init-grapheme-table table
|
||||
[ make-grapheme-table finish-table ] with-variable
|
||||
\ grapheme-table define-value ; parsing
|
||||
load-grapheme-table
|
||||
\ grapheme-table define-value
|
||||
>>
|
||||
|
||||
: grapheme-break? ( class1 class2 -- ? )
|
||||
grapheme-table nth nth not ;
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
USING: unicode.case tools.test namespaces ;
|
||||
|
||||
[ "Hello How Are You? I'M Good" ] [ "hEllo how ARE yOU? I'm good" >title ] unit-test
|
||||
[ "FUSS" ] [ "Fu\u00DF" >upper ] unit-test
|
||||
[ "\u03C3\u03C2" ] [ "\u03A3\u03A3" >lower ] unit-test
|
||||
[ t ] [ "hello how are you?" lower? ] unit-test
|
||||
[
|
||||
"tr" locale set
|
||||
[ "i\u0131i \u0131jj" ] [ "i\u0131I\u0307 IJj" >lower ] unit-test
|
||||
! [ "I\u307\u0131i Ijj" ] [ "i\u0131I\u0307 IJj" >title ] unit-test
|
||||
[ "I\u0307II\u0307 IJJ" ] [ "i\u0131I\u0307 IJj" >upper ] unit-test
|
||||
"lt" locale set
|
||||
! Lithuanian casing tests
|
||||
] with-scope
|
|
@ -1,5 +1,5 @@
|
|||
USING: kernel unicode.load sequences sequences.next namespaces assocs.lib
|
||||
unicode.normalize math unicode combinators assocs ;
|
||||
USING: kernel unicode.data sequences sequences.next namespaces assocs.lib
|
||||
unicode.normalize math unicode.categories combinators assocs ;
|
||||
IN: unicode.case
|
||||
|
||||
: ch>lower ( ch -- lower ) simple-lower at-default ;
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
USING: tools.test kernel unicode.categories words sequences unicode.syntax ;
|
||||
|
||||
[ { f f t t f t t f f t } ] [ CHAR: A {
|
||||
blank? letter? LETTER? Letter? digit?
|
||||
printable? alpha? control? uncased? character?
|
||||
} [ execute ] curry* map ] unit-test
|
||||
[ "Nd" ] [ CHAR: 3 category ] unit-test
|
|
@ -0,0 +1,13 @@
|
|||
USING: unicode.syntax ;
|
||||
IN: unicode.categories
|
||||
|
||||
CATEGORY: blank Zs Zl Zp ;
|
||||
CATEGORY: letter Ll ;
|
||||
CATEGORY: LETTER Lu ;
|
||||
CATEGORY: Letter Lu Ll Lt Lm Lo ;
|
||||
CATEGORY: digit Nd Nl No ;
|
||||
CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
|
||||
CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No ;
|
||||
CATEGORY: control Cc ;
|
||||
CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ;
|
||||
CATEGORY-NOT: character Cn ;
|
|
@ -1,7 +1,7 @@
|
|||
USING: assocs math kernel sequences io.files hashtables quotations
|
||||
splitting arrays math.parser combinators.lib hash2 byte-arrays words
|
||||
namespaces ;
|
||||
IN: unicode.load
|
||||
IN: unicode.data
|
||||
|
||||
! Convenience functions
|
||||
: 1+* ( n/f _ -- n+1 )
|
||||
|
@ -112,11 +112,6 @@ C: <code-point> code-point
|
|||
4 head [ multihex ] map first4
|
||||
<code-point> swap first set ;
|
||||
|
||||
: load-special-casing ( -- special-casing )
|
||||
"extra/unicode/SpecialCasing.txt" resource-path data
|
||||
[ length 5 = ] subset
|
||||
[ [ set-code-point ] each ] H{ } make-assoc ;
|
||||
|
||||
DEFER: simple-lower
|
||||
DEFER: simple-upper
|
||||
DEFER: simple-title
|
||||
|
@ -126,7 +121,6 @@ DEFER: class-map
|
|||
DEFER: compat-map
|
||||
DEFER: category-map
|
||||
DEFER: name-map
|
||||
DEFER: special-casing
|
||||
|
||||
<<
|
||||
load-data
|
||||
|
@ -139,5 +133,22 @@ DEFER: special-casing
|
|||
\ combine-map define-value
|
||||
dup process-compat \ compat-map define-value
|
||||
process-category \ category-map define-value
|
||||
load-special-casing \ special-casing define-value
|
||||
>>
|
||||
|
||||
: canonical-entry ( char -- seq ) canonical-map at ;
|
||||
: combine-chars ( a b -- char/f ) combine-map hash2 ;
|
||||
: compat-entry ( char -- seq ) compat-map at ;
|
||||
: combining-class ( char -- n ) class-map at ;
|
||||
: non-starter? ( char -- ? ) class-map key? ;
|
||||
: name>char ( string -- char ) name-map at ;
|
||||
: char>name ( char -- string ) name-map value-at ;
|
||||
|
||||
! Special casing data
|
||||
: load-special-casing ( -- special-casing )
|
||||
"extra/unicode/SpecialCasing.txt" resource-path data
|
||||
[ length 5 = ] subset
|
||||
[ [ set-code-point ] each ] H{ } make-assoc ;
|
||||
|
||||
DEFER: special-casing
|
||||
|
||||
<< load-special-casing \ special-casing define-value >>
|
|
@ -0,0 +1,18 @@
|
|||
USING: unicode.normalize kernel tools.test sequences ;
|
||||
|
||||
[ "ab\u0323\u0302cd" ] [ "ab\u0302" "\u0323cd" string-append ] unit-test
|
||||
|
||||
[ "ab\u064b\u034d\u034e\u0347\u0346" ] [ "ab\u0346\u0347\u064b\u034e\u034d" dup reorder ] unit-test
|
||||
[ "hello" "hello" ] [ "hello" [ nfd ] keep nfkd ] unit-test
|
||||
[ "\uFB012\u2075\u017F\u0323\u0307" "fi25s\u0323\u0307" ]
|
||||
[ "\uFB012\u2075\u1E9B\u0323" [ nfd ] keep nfkd ] unit-test
|
||||
|
||||
[ "\u1E69" "s\u0323\u0307" ] [ "\u1E69" [ nfc ] keep nfd ] unit-test
|
||||
[ "\u1E0D\u0307" ] [ "\u1E0B\u0323" nfc ] unit-test
|
||||
|
||||
[ 54620 ] [ 4370 4449 4523 jamo>hangul ] unit-test
|
||||
[ 4370 4449 4523 ] [ 54620 hangul>jamo first3 ] unit-test
|
||||
[ t ] [ 54620 hangul? ] unit-test
|
||||
[ f ] [ 0 hangul? ] unit-test
|
||||
[ "\u1112\u1161\u11ab" ] [ "\ud55c" nfd ] unit-test
|
||||
[ "\ud55c" ] [ "\u1112\u1161\u11ab" nfc ] unit-test
|
|
@ -1,4 +1,4 @@
|
|||
USING: sequences namespaces unicode.load kernel combinators.lib math
|
||||
USING: sequences namespaces unicode.data kernel combinators.lib math
|
||||
unicode arrays ;
|
||||
IN: unicode.normalize
|
||||
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
USING: unicode.syntax tools.test ;
|
||||
|
||||
[ CHAR: ! ] [ UNICHAR: exclamation-mark ] unit-test
|
||||
! Write a test for CATEGORY and CATEGORY-NOT
|
|
@ -1,4 +1,4 @@
|
|||
USING: unicode.load kernel math sequences parser bit-arrays namespaces
|
||||
USING: unicode.data kernel math sequences parser bit-arrays namespaces
|
||||
sequences.private arrays quotations classes.predicate ;
|
||||
IN: unicode.syntax
|
||||
|
||||
|
@ -54,3 +54,7 @@ C: <code-point> code-point
|
|||
: set-code-point ( seq -- )
|
||||
4 head [ multihex ] map first4
|
||||
<code-point> swap first set ;
|
||||
|
||||
: UNICHAR:
|
||||
! This should be part of CHAR:
|
||||
scan name>char [ parsed ] [ "Invalid character" throw ] if* ; parsing
|
||||
|
|
|
@ -1,37 +1 @@
|
|||
USING: unicode kernel tools.test words sequences namespaces ;
|
||||
|
||||
[ "Hello How Are You? I'M Good" ] [ "hEllo how ARE yOU? I'm good" >title ] unit-test
|
||||
[ "FUSS" ] [ "Fu\u00DF" >upper ] unit-test
|
||||
[ "\u03C3\u03C2" ] [ "\u03A3\u03A3" >lower ] unit-test
|
||||
[ t ] [ "hello how are you?" lower? ] unit-test
|
||||
[
|
||||
"tr" locale set
|
||||
[ "i\u0131i \u0131jj" ] [ "i\u0131I\u0307 IJj" >lower ] unit-test
|
||||
! [ "I\u307\u0131i Ijj" ] [ "i\u0131I\u0307 IJj" >title ] unit-test
|
||||
[ "I\u0307II\u0307 IJJ" ] [ "i\u0131I\u0307 IJj" >upper ] unit-test
|
||||
"lt" locale set
|
||||
! Lithuanian casing tests
|
||||
] with-scope
|
||||
|
||||
[ { f f t t f t t f f t } ] [ CHAR: A {
|
||||
blank? letter? LETTER? Letter? digit?
|
||||
printable? alpha? control? uncased? character?
|
||||
} [ execute ] curry* map ] unit-test
|
||||
[ "Nd" ] [ CHAR: 3 category ] unit-test
|
||||
[ CHAR: ! ] [ UNICHAR: exclamation-mark ] unit-test
|
||||
[ "ab\u0323\u0302cd" ] [ "ab\u0302" "\u0323cd" string-append ] unit-test
|
||||
|
||||
[ "ab\u064b\u034d\u034e\u0347\u0346" ] [ "ab\u0346\u0347\u064b\u034e\u034d" dup reorder ] unit-test
|
||||
[ "hello" "hello" ] [ "hello" [ nfd ] keep nfkd ] unit-test
|
||||
[ "\uFB012\u2075\u017F\u0323\u0307" "fi25s\u0323\u0307" ]
|
||||
[ "\uFB012\u2075\u1E9B\u0323" [ nfd ] keep nfkd ] unit-test
|
||||
|
||||
[ "\u1E69" "s\u0323\u0307" ] [ "\u1E69" [ nfc ] keep nfd ] unit-test
|
||||
[ "\u1E0D\u0307" ] [ "\u1E0B\u0323" nfc ] unit-test
|
||||
|
||||
[ 54620 ] [ 4370 4449 4523 jamo>hangul ] unit-test
|
||||
[ 4370 4449 4523 ] [ 54620 hangul>jamo first3 ] unit-test
|
||||
[ t ] [ 54620 hangul? ] unit-test
|
||||
[ f ] [ 0 hangul? ] unit-test
|
||||
[ "\u1112\u1161\u11ab" ] [ "\ud55c" nfd ] unit-test
|
||||
[ "\ud55c" ] [ "\u1112\u1161\u11ab" nfc ] unit-test
|
||||
|
|
|
@ -1,25 +1,5 @@
|
|||
USING: unicode.syntax hash2 assocs unicode.load kernel parser ;
|
||||
USING: unicode.syntax unicode.data unicode.breaks unicode.normalize
|
||||
unicode.case unicode.categories ;
|
||||
IN: unicode
|
||||
|
||||
: canonical-entry ( char -- seq ) canonical-map at ;
|
||||
: combine-chars ( a b -- char/f ) combine-map hash2 ;
|
||||
: compat-entry ( char -- seq ) compat-map at ;
|
||||
: combining-class ( char -- n ) class-map at ;
|
||||
: non-starter? ( char -- ? ) class-map key? ;
|
||||
: name>char ( string -- char ) name-map at ;
|
||||
: char>name ( char -- string ) name-map value-at ;
|
||||
|
||||
CATEGORY: blank Zs Zl Zp ;
|
||||
CATEGORY: letter Ll ;
|
||||
CATEGORY: LETTER Lu ;
|
||||
CATEGORY: Letter Lu Ll Lt Lm Lo ;
|
||||
CATEGORY: digit Nd Nl No ;
|
||||
CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
|
||||
CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No ;
|
||||
CATEGORY: control Cc ;
|
||||
CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ;
|
||||
CATEGORY-NOT: character Cn ;
|
||||
|
||||
: UNICHAR:
|
||||
! This should be part of CHAR:
|
||||
scan name>char [ parsed ] [ "Invalid character" throw ] if* ; parsing
|
||||
! For now: convenience to load all Unicode vocabs
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
! Copyright (C) 2005, 2007 Daniel Ehrenberg
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: kernel sequences unicode math ;
|
||||
USING: kernel sequences unicode.syntax math ;
|
||||
IN: xml.char-classes
|
||||
|
||||
CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u0559\u06E5\u06E6_ ;
|
||||
|
|
Loading…
Reference in New Issue