Unicode changes

db4
Daniel Ehrenberg 2008-01-09 17:13:26 -06:00
parent e4de6fd8af
commit 71f6e7d4ba
17 changed files with 95 additions and 84 deletions

View File

@@ -0,0 +1 @@
+Non-core assoc words

View File

@@ -1 +0,0 @@
-Eduardo Cavazos

View File

@@ -1 +0,0 @@
-Non-core hashtable words

View File

@@ -1 +0,0 @@
-collections

View File

@@ -1,6 +1,7 @@
-USING: unicode kernel math const combinators splitting
+USING: unicode.categories kernel math const combinators splitting
 sequences math.parser io.files io assocs arrays namespaces
-math.ranges unicode.normalize unicode.syntax ;
+combinators.lib assocs.lib math.ranges unicode.normalize
+unicode.syntax unicode.data ;
 IN: unicode.breaks
 ENUM: Any L V T Extend Control CR LF graphemes ;
@@ -32,10 +33,7 @@ CATEGORY: grapheme-control Zl Zp Cc Cf ;
 "extra/unicode/PropList.txt" resource-path <file-reader> lines ;
 DEFER: other-extend
-: load-other-extend
-other-extend-lines process-other-extend
-\ other-extend define-value ; parsing
-load-other-extend
+<< other-extend-lines process-other-extend \ other-extend define-value >>
 CATEGORY: (extend) Me Mn ;
 : extend? ( ch -- ? )
@@ -81,11 +79,11 @@ SYMBOL: table
 graphemes Extend connect-after ;
 DEFER: grapheme-table
-: load-grapheme-table
+<<
 init-grapheme-table table
 [ make-grapheme-table finish-table ] with-variable
-\ grapheme-table define-value ; parsing
-load-grapheme-table
+\ grapheme-table define-value
+>>
 : grapheme-break? ( class1 class2 -- ? )
 grapheme-table nth nth not ;

View File

@@ -0,0 +1,14 @@
+USING: unicode.case tools.test namespaces ;
+[ "Hello How Are You? I'M Good" ] [ "hEllo how ARE yOU? I'm good" >title ] unit-test
+[ "FUSS" ] [ "Fu\u00DF" >upper ] unit-test
+[ "\u03C3\u03C2" ] [ "\u03A3\u03A3" >lower ] unit-test
+[ t ] [ "hello how are you?" lower? ] unit-test
+[
+"tr" locale set
+[ "i\u0131i \u0131jj" ] [ "i\u0131I\u0307 IJj" >lower ] unit-test
+! [ "I\u307\u0131i Ijj" ] [ "i\u0131I\u0307 IJj" >title ] unit-test
+[ "I\u0307II\u0307 IJJ" ] [ "i\u0131I\u0307 IJj" >upper ] unit-test
+"lt" locale set
+! Lithuanian casing tests
+] with-scope

View File

@@ -1,5 +1,5 @@
-USING: kernel unicode.load sequences sequences.next namespaces assocs.lib
-unicode.normalize math unicode combinators assocs ;
+USING: kernel unicode.data sequences sequences.next namespaces assocs.lib
+unicode.normalize math unicode.categories combinators assocs ;
 IN: unicode.case
 : ch>lower ( ch -- lower ) simple-lower at-default ;

View File

@@ -0,0 +1,7 @@
+USING: tools.test kernel unicode.categories words sequences unicode.syntax ;
+[ { f f t t f t t f f t } ] [ CHAR: A {
+blank? letter? LETTER? Letter? digit?
+printable? alpha? control? uncased? character?
+} [ execute ] curry* map ] unit-test
+[ "Nd" ] [ CHAR: 3 category ] unit-test

View File

@@ -0,0 +1,13 @@
+USING: unicode.syntax ;
+IN: unicode.categories
+CATEGORY: blank Zs Zl Zp ;
+CATEGORY: letter Ll ;
+CATEGORY: LETTER Lu ;
+CATEGORY: Letter Lu Ll Lt Lm Lo ;
+CATEGORY: digit Nd Nl No ;
+CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
+CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No ;
+CATEGORY: control Cc ;
+CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ;
+CATEGORY-NOT: character Cn ;

View File

@@ -1,7 +1,7 @@
 USING: assocs math kernel sequences io.files hashtables quotations
 splitting arrays math.parser combinators.lib hash2 byte-arrays words
 namespaces ;
-IN: unicode.load
+IN: unicode.data
 ! Convenience functions
 : 1+* ( n/f _ -- n+1 )
@@ -112,11 +112,6 @@ C: <code-point> code-point
 4 head [ multihex ] map first4
 <code-point> swap first set ;
-: load-special-casing ( -- special-casing )
-"extra/unicode/SpecialCasing.txt" resource-path data
-[ length 5 = ] subset
-[ [ set-code-point ] each ] H{ } make-assoc ;
 DEFER: simple-lower
 DEFER: simple-upper
 DEFER: simple-title
@@ -126,7 +121,6 @@ DEFER: class-map
 DEFER: compat-map
 DEFER: category-map
 DEFER: name-map
-DEFER: special-casing
 <<
 load-data
@@ -139,5 +133,22 @@ DEFER: special-casing
 \ combine-map define-value
 dup process-compat \ compat-map define-value
 process-category \ category-map define-value
-load-special-casing \ special-casing define-value
 >>
+: canonical-entry ( char -- seq ) canonical-map at ;
+: combine-chars ( a b -- char/f ) combine-map hash2 ;
+: compat-entry ( char -- seq ) compat-map at ;
+: combining-class ( char -- n ) class-map at ;
+: non-starter? ( char -- ? ) class-map key? ;
+: name>char ( string -- char ) name-map at ;
+: char>name ( char -- string ) name-map value-at ;
+! Special casing data
+: load-special-casing ( -- special-casing )
+"extra/unicode/SpecialCasing.txt" resource-path data
+[ length 5 = ] subset
+[ [ set-code-point ] each ] H{ } make-assoc ;
+DEFER: special-casing
+<< load-special-casing \ special-casing define-value >>

View File

@@ -0,0 +1,18 @@
+USING: unicode.normalize kernel tools.test sequences ;
+[ "ab\u0323\u0302cd" ] [ "ab\u0302" "\u0323cd" string-append ] unit-test
+[ "ab\u064b\u034d\u034e\u0347\u0346" ] [ "ab\u0346\u0347\u064b\u034e\u034d" dup reorder ] unit-test
+[ "hello" "hello" ] [ "hello" [ nfd ] keep nfkd ] unit-test
+[ "\uFB012\u2075\u017F\u0323\u0307" "fi25s\u0323\u0307" ]
+[ "\uFB012\u2075\u1E9B\u0323" [ nfd ] keep nfkd ] unit-test
+[ "\u1E69" "s\u0323\u0307" ] [ "\u1E69" [ nfc ] keep nfd ] unit-test
+[ "\u1E0D\u0307" ] [ "\u1E0B\u0323" nfc ] unit-test
+[ 54620 ] [ 4370 4449 4523 jamo>hangul ] unit-test
+[ 4370 4449 4523 ] [ 54620 hangul>jamo first3 ] unit-test
+[ t ] [ 54620 hangul? ] unit-test
+[ f ] [ 0 hangul? ] unit-test
+[ "\u1112\u1161\u11ab" ] [ "\ud55c" nfd ] unit-test
+[ "\ud55c" ] [ "\u1112\u1161\u11ab" nfc ] unit-test

View File

@@ -1,4 +1,4 @@
-USING: sequences namespaces unicode.load kernel combinators.lib math
+USING: sequences namespaces unicode.data kernel combinators.lib math
 unicode arrays ;
 IN: unicode.normalize

View File

@@ -0,0 +1,4 @@
+USING: unicode.syntax tools.test ;
+[ CHAR: ! ] [ UNICHAR: exclamation-mark ] unit-test
+! Write a test for CATEGORY and CATEGORY-NOT

View File

@@ -1,4 +1,4 @@
-USING: unicode.load kernel math sequences parser bit-arrays namespaces
+USING: unicode.data kernel math sequences parser bit-arrays namespaces
 sequences.private arrays quotations classes.predicate ;
 IN: unicode.syntax
@@ -54,3 +54,7 @@ C: <code-point> code-point
 : set-code-point ( seq -- )
 4 head [ multihex ] map first4
 <code-point> swap first set ;
+: UNICHAR:
+! This should be part of CHAR:
+scan name>char [ parsed ] [ "Invalid character" throw ] if* ; parsing

View File

@@ -1,37 +1 @@
-USING: unicode kernel tools.test words sequences namespaces ;
-[ "Hello How Are You? I'M Good" ] [ "hEllo how ARE yOU? I'm good" >title ] unit-test
-[ "FUSS" ] [ "Fu\u00DF" >upper ] unit-test
-[ "\u03C3\u03C2" ] [ "\u03A3\u03A3" >lower ] unit-test
-[ t ] [ "hello how are you?" lower? ] unit-test
-[
-"tr" locale set
-[ "i\u0131i \u0131jj" ] [ "i\u0131I\u0307 IJj" >lower ] unit-test
-! [ "I\u307\u0131i Ijj" ] [ "i\u0131I\u0307 IJj" >title ] unit-test
-[ "I\u0307II\u0307 IJJ" ] [ "i\u0131I\u0307 IJj" >upper ] unit-test
-"lt" locale set
-! Lithuanian casing tests
-] with-scope
-[ { f f t t f t t f f t } ] [ CHAR: A {
-blank? letter? LETTER? Letter? digit?
-printable? alpha? control? uncased? character?
-} [ execute ] curry* map ] unit-test
-[ "Nd" ] [ CHAR: 3 category ] unit-test
-[ CHAR: ! ] [ UNICHAR: exclamation-mark ] unit-test
-[ "ab\u0323\u0302cd" ] [ "ab\u0302" "\u0323cd" string-append ] unit-test
-[ "ab\u064b\u034d\u034e\u0347\u0346" ] [ "ab\u0346\u0347\u064b\u034e\u034d" dup reorder ] unit-test
-[ "hello" "hello" ] [ "hello" [ nfd ] keep nfkd ] unit-test
-[ "\uFB012\u2075\u017F\u0323\u0307" "fi25s\u0323\u0307" ]
-[ "\uFB012\u2075\u1E9B\u0323" [ nfd ] keep nfkd ] unit-test
-[ "\u1E69" "s\u0323\u0307" ] [ "\u1E69" [ nfc ] keep nfd ] unit-test
-[ "\u1E0D\u0307" ] [ "\u1E0B\u0323" nfc ] unit-test
-[ 54620 ] [ 4370 4449 4523 jamo>hangul ] unit-test
-[ 4370 4449 4523 ] [ 54620 hangul>jamo first3 ] unit-test
-[ t ] [ 54620 hangul? ] unit-test
-[ f ] [ 0 hangul? ] unit-test
-[ "\u1112\u1161\u11ab" ] [ "\ud55c" nfd ] unit-test
-[ "\ud55c" ] [ "\u1112\u1161\u11ab" nfc ] unit-test

View File

@@ -1,25 +1,5 @@
-USING: unicode.syntax hash2 assocs unicode.load kernel parser ;
+USING: unicode.syntax unicode.data unicode.breaks unicode.normalize
+unicode.case unicode.categories ;
 IN: unicode
-: canonical-entry ( char -- seq ) canonical-map at ;
-: combine-chars ( a b -- char/f ) combine-map hash2 ;
-: compat-entry ( char -- seq ) compat-map at ;
-: combining-class ( char -- n ) class-map at ;
-: non-starter? ( char -- ? ) class-map key? ;
-: name>char ( string -- char ) name-map at ;
-: char>name ( char -- string ) name-map value-at ;
-CATEGORY: blank Zs Zl Zp ;
-CATEGORY: letter Ll ;
-CATEGORY: LETTER Lu ;
-CATEGORY: Letter Lu Ll Lt Lm Lo ;
-CATEGORY: digit Nd Nl No ;
-CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
-CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No ;
-CATEGORY: control Cc ;
-CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ;
-CATEGORY-NOT: character Cn ;
-: UNICHAR:
-! This should be part of CHAR:
-scan name>char [ parsed ] [ "Invalid character" throw ] if* ; parsing
+! For now: convenience to load all Unicode vocabs

View File

@@ -1,6 +1,6 @@
 ! Copyright (C) 2005, 2007 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
-USING: kernel sequences unicode math ;
+USING: kernel sequences unicode.syntax math ;
 IN: xml.char-classes
 CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u0559\u06E5\u06E6_ ;