Unicode changes

db4
Daniel Ehrenberg 2008-01-09 17:13:26 -06:00
parent e4de6fd8af
commit 71f6e7d4ba
17 changed files with 95 additions and 84 deletions

View File

@@ -0,0 +1 @@
+Non-core assoc words

View File

@@ -1 +0,0 @@
-Eduardo Cavazos

View File

@@ -1 +0,0 @@
-Non-core hashtable words

View File

@@ -1 +0,0 @@
-collections

View File

@@ -1,6 +1,7 @@
-USING: unicode kernel math const combinators splitting
+USING: unicode.categories kernel math const combinators splitting
 sequences math.parser io.files io assocs arrays namespaces
-math.ranges unicode.normalize unicode.syntax ;
+combinators.lib assocs.lib math.ranges unicode.normalize
+unicode.syntax unicode.data ;
 IN: unicode.breaks
 ENUM: Any L V T Extend Control CR LF graphemes ;
@@ -32,10 +33,7 @@ CATEGORY: grapheme-control Zl Zp Cc Cf ;
 "extra/unicode/PropList.txt" resource-path <file-reader> lines ;
 DEFER: other-extend
-: load-other-extend
-other-extend-lines process-other-extend
-\ other-extend define-value ; parsing
-load-other-extend
+<< other-extend-lines process-other-extend \ other-extend define-value >>
 CATEGORY: (extend) Me Mn ;
 : extend? ( ch -- ? )
@@ -81,11 +79,11 @@ SYMBOL: table
 graphemes Extend connect-after ;
 DEFER: grapheme-table
-: load-grapheme-table
+<<
 init-grapheme-table table
 [ make-grapheme-table finish-table ] with-variable
-\ grapheme-table define-value ; parsing
-load-grapheme-table
+\ grapheme-table define-value
+>>
 : grapheme-break? ( class1 class2 -- ? )
 grapheme-table nth nth not ;

View File

@@ -0,0 +1,14 @@
+USING: unicode.case tools.test namespaces ;
+[ "Hello How Are You? I'M Good" ] [ "hEllo how ARE yOU? I'm good" >title ] unit-test
+[ "FUSS" ] [ "Fu\u00DF" >upper ] unit-test
+[ "\u03C3\u03C2" ] [ "\u03A3\u03A3" >lower ] unit-test
+[ t ] [ "hello how are you?" lower? ] unit-test
+[
+"tr" locale set
+[ "i\u0131i \u0131jj" ] [ "i\u0131I\u0307 IJj" >lower ] unit-test
+! [ "I\u307\u0131i Ijj" ] [ "i\u0131I\u0307 IJj" >title ] unit-test
+[ "I\u0307II\u0307 IJJ" ] [ "i\u0131I\u0307 IJj" >upper ] unit-test
+"lt" locale set
+! Lithuanian casing tests
+] with-scope

View File

@@ -1,5 +1,5 @@
-USING: kernel unicode.load sequences sequences.next namespaces assocs.lib
-unicode.normalize math unicode combinators assocs ;
+USING: kernel unicode.data sequences sequences.next namespaces assocs.lib
+unicode.normalize math unicode.categories combinators assocs ;
 IN: unicode.case
 : ch>lower ( ch -- lower ) simple-lower at-default ;

View File

@@ -0,0 +1,7 @@
+USING: tools.test kernel unicode.categories words sequences unicode.syntax ;
+[ { f f t t f t t f f t } ] [ CHAR: A {
+blank? letter? LETTER? Letter? digit?
+printable? alpha? control? uncased? character?
+} [ execute ] curry* map ] unit-test
+[ "Nd" ] [ CHAR: 3 category ] unit-test

View File

@@ -0,0 +1,13 @@
+USING: unicode.syntax ;
+IN: unicode.categories
+CATEGORY: blank Zs Zl Zp ;
+CATEGORY: letter Ll ;
+CATEGORY: LETTER Lu ;
+CATEGORY: Letter Lu Ll Lt Lm Lo ;
+CATEGORY: digit Nd Nl No ;
+CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
+CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No ;
+CATEGORY: control Cc ;
+CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ;
+CATEGORY-NOT: character Cn ;

View File

@@ -1,7 +1,7 @@
 USING: assocs math kernel sequences io.files hashtables quotations
 splitting arrays math.parser combinators.lib hash2 byte-arrays words
 namespaces ;
-IN: unicode.load
+IN: unicode.data
 ! Convenience functions
 : 1+* ( n/f _ -- n+1 )
@@ -112,11 +112,6 @@ C: <code-point> code-point
 4 head [ multihex ] map first4
 <code-point> swap first set ;
-: load-special-casing ( -- special-casing )
-"extra/unicode/SpecialCasing.txt" resource-path data
-[ length 5 = ] subset
-[ [ set-code-point ] each ] H{ } make-assoc ;
 DEFER: simple-lower
 DEFER: simple-upper
 DEFER: simple-title
@@ -126,7 +121,6 @@ DEFER: class-map
 DEFER: compat-map
 DEFER: category-map
 DEFER: name-map
-DEFER: special-casing
 <<
 load-data
@@ -139,5 +133,22 @@ DEFER: special-casing
 \ combine-map define-value
 dup process-compat \ compat-map define-value
 process-category \ category-map define-value
-load-special-casing \ special-casing define-value
 >>
+: canonical-entry ( char -- seq ) canonical-map at ;
+: combine-chars ( a b -- char/f ) combine-map hash2 ;
+: compat-entry ( char -- seq ) compat-map at ;
+: combining-class ( char -- n ) class-map at ;
+: non-starter? ( char -- ? ) class-map key? ;
+: name>char ( string -- char ) name-map at ;
+: char>name ( char -- string ) name-map value-at ;
+! Special casing data
+: load-special-casing ( -- special-casing )
+"extra/unicode/SpecialCasing.txt" resource-path data
+[ length 5 = ] subset
+[ [ set-code-point ] each ] H{ } make-assoc ;
+DEFER: special-casing
+<< load-special-casing \ special-casing define-value >>

View File

@@ -0,0 +1,18 @@
+USING: unicode.normalize kernel tools.test sequences ;
+[ "ab\u0323\u0302cd" ] [ "ab\u0302" "\u0323cd" string-append ] unit-test
+[ "ab\u064b\u034d\u034e\u0347\u0346" ] [ "ab\u0346\u0347\u064b\u034e\u034d" dup reorder ] unit-test
+[ "hello" "hello" ] [ "hello" [ nfd ] keep nfkd ] unit-test
+[ "\uFB012\u2075\u017F\u0323\u0307" "fi25s\u0323\u0307" ]
+[ "\uFB012\u2075\u1E9B\u0323" [ nfd ] keep nfkd ] unit-test
+[ "\u1E69" "s\u0323\u0307" ] [ "\u1E69" [ nfc ] keep nfd ] unit-test
+[ "\u1E0D\u0307" ] [ "\u1E0B\u0323" nfc ] unit-test
+[ 54620 ] [ 4370 4449 4523 jamo>hangul ] unit-test
+[ 4370 4449 4523 ] [ 54620 hangul>jamo first3 ] unit-test
+[ t ] [ 54620 hangul? ] unit-test
+[ f ] [ 0 hangul? ] unit-test
+[ "\u1112\u1161\u11ab" ] [ "\ud55c" nfd ] unit-test
+[ "\ud55c" ] [ "\u1112\u1161\u11ab" nfc ] unit-test

View File

@@ -1,4 +1,4 @@
-USING: sequences namespaces unicode.load kernel combinators.lib math
+USING: sequences namespaces unicode.data kernel combinators.lib math
 unicode arrays ;
 IN: unicode.normalize

View File

@@ -0,0 +1,4 @@
+USING: unicode.syntax tools.test ;
+[ CHAR: ! ] [ UNICHAR: exclamation-mark ] unit-test
+! Write a test for CATEGORY and CATEGORY-NOT

View File

@@ -1,4 +1,4 @@
-USING: unicode.load kernel math sequences parser bit-arrays namespaces
+USING: unicode.data kernel math sequences parser bit-arrays namespaces
 sequences.private arrays quotations classes.predicate ;
 IN: unicode.syntax
@@ -54,3 +54,7 @@ C: <code-point> code-point
 : set-code-point ( seq -- )
 4 head [ multihex ] map first4
 <code-point> swap first set ;
+: UNICHAR:
+! This should be part of CHAR:
+scan name>char [ parsed ] [ "Invalid character" throw ] if* ; parsing

View File

@@ -1,37 +1 @@
-USING: unicode kernel tools.test words sequences namespaces ;
-[ "Hello How Are You? I'M Good" ] [ "hEllo how ARE yOU? I'm good" >title ] unit-test
-[ "FUSS" ] [ "Fu\u00DF" >upper ] unit-test
-[ "\u03C3\u03C2" ] [ "\u03A3\u03A3" >lower ] unit-test
-[ t ] [ "hello how are you?" lower? ] unit-test
-[
-"tr" locale set
-[ "i\u0131i \u0131jj" ] [ "i\u0131I\u0307 IJj" >lower ] unit-test
-! [ "I\u307\u0131i Ijj" ] [ "i\u0131I\u0307 IJj" >title ] unit-test
-[ "I\u0307II\u0307 IJJ" ] [ "i\u0131I\u0307 IJj" >upper ] unit-test
-"lt" locale set
-! Lithuanian casing tests
-] with-scope
-[ { f f t t f t t f f t } ] [ CHAR: A {
-blank? letter? LETTER? Letter? digit?
-printable? alpha? control? uncased? character?
-} [ execute ] curry* map ] unit-test
-[ "Nd" ] [ CHAR: 3 category ] unit-test
-[ CHAR: ! ] [ UNICHAR: exclamation-mark ] unit-test
-[ "ab\u0323\u0302cd" ] [ "ab\u0302" "\u0323cd" string-append ] unit-test
-[ "ab\u064b\u034d\u034e\u0347\u0346" ] [ "ab\u0346\u0347\u064b\u034e\u034d" dup reorder ] unit-test
-[ "hello" "hello" ] [ "hello" [ nfd ] keep nfkd ] unit-test
-[ "\uFB012\u2075\u017F\u0323\u0307" "fi25s\u0323\u0307" ]
-[ "\uFB012\u2075\u1E9B\u0323" [ nfd ] keep nfkd ] unit-test
-[ "\u1E69" "s\u0323\u0307" ] [ "\u1E69" [ nfc ] keep nfd ] unit-test
-[ "\u1E0D\u0307" ] [ "\u1E0B\u0323" nfc ] unit-test
-[ 54620 ] [ 4370 4449 4523 jamo>hangul ] unit-test
-[ 4370 4449 4523 ] [ 54620 hangul>jamo first3 ] unit-test
-[ t ] [ 54620 hangul? ] unit-test
-[ f ] [ 0 hangul? ] unit-test
-[ "\u1112\u1161\u11ab" ] [ "\ud55c" nfd ] unit-test
-[ "\ud55c" ] [ "\u1112\u1161\u11ab" nfc ] unit-test

View File

@@ -1,25 +1,5 @@
-USING: unicode.syntax hash2 assocs unicode.load kernel parser ;
+USING: unicode.syntax unicode.data unicode.breaks unicode.normalize
+unicode.case unicode.categories ;
 IN: unicode
-: canonical-entry ( char -- seq ) canonical-map at ;
-: combine-chars ( a b -- char/f ) combine-map hash2 ;
-: compat-entry ( char -- seq ) compat-map at ;
-: combining-class ( char -- n ) class-map at ;
-: non-starter? ( char -- ? ) class-map key? ;
-: name>char ( string -- char ) name-map at ;
-: char>name ( char -- string ) name-map value-at ;
-CATEGORY: blank Zs Zl Zp ;
-CATEGORY: letter Ll ;
-CATEGORY: LETTER Lu ;
-CATEGORY: Letter Lu Ll Lt Lm Lo ;
-CATEGORY: digit Nd Nl No ;
-CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
-CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No ;
-CATEGORY: control Cc ;
-CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ;
-CATEGORY-NOT: character Cn ;
-: UNICHAR:
-! This should be part of CHAR:
-scan name>char [ parsed ] [ "Invalid character" throw ] if* ; parsing
+! For now: convenience to load all Unicode vocabs

View File

@@ -1,6 +1,6 @@
 ! Copyright (C) 2005, 2007 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
-USING: kernel sequences unicode math ;
+USING: kernel sequences unicode.syntax math ;
 IN: xml.char-classes
 CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u0559\u06E5\u06E6_ ;