Fixing normalize errors

db4
Daniel Ehrenberg 2009-01-07 23:13:04 -06:00
parent c0ad6b7c55
commit 8b351b1ad6
4 changed files with 17 additions and 26 deletions

View File

@ -28,10 +28,6 @@ VALUE: properties
: char>name ( char -- string ) name-map value-at ; : char>name ( char -- string ) name-map value-at ;
: property? ( char property -- ? ) properties at interval-key? ; : property? ( char property -- ? ) properties at interval-key? ;
! Convenience functions
: ?between? ( n/f from to -- ? )
pick [ between? ] [ 3drop f ] if ;
! Loading data from UnicodeData.txt ! Loading data from UnicodeData.txt
: split-; ( line -- array ) : split-; ( line -- array )
@ -206,9 +202,9 @@ SYMBOL: interned
: expand-ranges ( assoc -- interval-map ) : expand-ranges ( assoc -- interval-map )
[ [
[ [
CHAR: . pick member? [ swap CHAR: . over member? [
swap ".." split1 [ hex> ] bi@ 2array ".." split1 [ hex> ] bi@ 2array
] [ swap hex> ] if range, ] [ hex> ] if range,
] assoc-each ] assoc-each
] { } make <interval-map> ; ] { } make <interval-map> ;

View File

@ -8,9 +8,7 @@ ARTICLE: "unicode.normalize" "Unicode normalization"
{ $subsection nfc } { $subsection nfc }
{ $subsection nfd } { $subsection nfd }
{ $subsection nfkc } { $subsection nfkc }
{ $subsection nfkd } { $subsection nfkd } ;
"If two strings in a normalization form are appended, the result may not be in that normalization form still. To append two strings in NFD and make sure the result is in NFD, the following procedure is supplied:"
{ $subsection string-append } ;
HELP: nfc HELP: nfc
{ $values { "string" string } { "nfc" "a string in NFC" } } { $values { "string" string } { "nfc" "a string in NFC" } }
@ -27,7 +25,3 @@ HELP: nfkc
HELP: nfkd HELP: nfkd
{ $values { "string" string } { "nfc" "a string in NFKD" } } { $values { "string" string } { "nfc" "a string in NFKD" } }
{ $description "Converts a string to Normalization Form KD" } ; { $description "Converts a string to Normalization Form KD" } ;
HELP: string-append
{ $values { "s1" "a string in NFD" } { "s2" "a string in NFD" } { "string" "a string in NFD" } }
{ $description "Appends two strings, putting the result in NFD." } ;

View File

@ -1,6 +1,6 @@
USING: unicode.normalize kernel tools.test sequences USING: unicode.normalize kernel tools.test sequences
unicode.data io.encodings.utf8 io.files splitting math.parser unicode.data io.encodings.utf8 io.files splitting math.parser
locals math quotations assocs combinators ; locals math quotations assocs combinators unicode.normalize.private ;
IN: unicode.normalize.tests IN: unicode.normalize.tests
[ "ab\u000323\u000302cd" ] [ "ab\u000302" "\u000323cd" string-append ] unit-test [ "ab\u000323\u000302cd" ] [ "ab\u000302" "\u000323cd" string-append ] unit-test

View File

@ -1,21 +1,24 @@
! Copyright (C) 2008 Daniel Ehrenberg. ! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: sequences namespaces make unicode.data kernel math arrays USING: sequences namespaces make unicode.data kernel math arrays
locals sorting.insertion accessors assocs ; locals sorting.insertion accessors assocs math.order ;
IN: unicode.normalize IN: unicode.normalize
<PRIVATE <PRIVATE
! Conjoining Jamo behavior ! Conjoining Jamo behavior
: hangul-base HEX: ac00 ; inline CONSTANT: hangul-base HEX: ac00
: hangul-end HEX: D7AF ; inline CONSTANT: hangul-end HEX: D7AF
: initial-base HEX: 1100 ; inline CONSTANT: initial-base HEX: 1100
: medial-base HEX: 1161 ; inline CONSTANT: medial-base HEX: 1161
: final-base HEX: 11a7 ; inline CONSTANT: final-base HEX: 11a7
: initial-count 19 ; inline CONSTANT: initial-count 19
: medial-count 21 ; inline CONSTANT: medial-count 21
: final-count 28 ; inline CONSTANT: final-count 28
: ?between? ( n/f from to -- ? )
pick [ between? ] [ 3drop f ] if ;
: hangul? ( ch -- ? ) hangul-base hangul-end ?between? ; : hangul? ( ch -- ? ) hangul-base hangul-end ?between? ;
: jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ; : jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ;
@ -84,8 +87,6 @@ PRIVATE>
[ compatibility-entry ] decompose ; [ compatibility-entry ] decompose ;
: string-append ( s1 s2 -- string ) : string-append ( s1 s2 -- string )
! This could be more optimized,
! but in practice, it'll almost always just be append
[ append ] keep [ append ] keep
0 over ?nth non-starter? 0 over ?nth non-starter?
[ length dupd reorder-back ] [ drop ] if ; [ length dupd reorder-back ] [ drop ] if ;