Fixing normalize errors

db4
Daniel Ehrenberg 2009-01-07 23:13:04 -06:00
parent c0ad6b7c55
commit 8b351b1ad6
4 changed files with 17 additions and 26 deletions

View File

@ -28,10 +28,6 @@ VALUE: properties
! Looks char up in name-map with value-at and returns whatever that lookup yields as the name.
: char>name ( char -- string ) name-map value-at ;
! Fetches the entry stored under property in properties, then asks interval-key?
! whether char is a key of that entry.
: property? ( char property -- ? ) properties at interval-key? ;
! Convenience functions
! Like between?, but tolerates f as the value under test:
! when n/f is f, all three inputs are dropped and the result is f;
! otherwise the result is that of between? on the same three inputs.
: ?between? ( n/f from to -- ? )
pick [ between? ] [ 3drop f ] if ;
! Loading data from UnicodeData.txt
: split-; ( line -- array )
@ -206,9 +202,9 @@ SYMBOL: interned
! Builds an interval-map from assoc. For every entry, a key containing "." is split
! on ".." and both halves are parsed as hex and paired with 2array; any other key is
! parsed as a single hex number. The parsed key and the entry's value are then handed
! to range, and the sequence collected by make is passed to <interval-map>.
! NOTE(review): the branch on CHAR: . appears twice below (a pick-based form and an
! over-based form) — these look like the before/after sides of one change; confirm
! which one should remain before using this text as source.
: expand-ranges ( assoc -- interval-map )
[
[
CHAR: . pick member? [
swap ".." split1 [ hex> ] bi@ 2array
] [ swap hex> ] if range,
swap CHAR: . over member? [
".." split1 [ hex> ] bi@ 2array
] [ hex> ] if range,
] assoc-each
] { } make <interval-map> ;

View File

@ -8,9 +8,7 @@ ARTICLE: "unicode.normalize" "Unicode normalization"
{ $subsection nfc }
{ $subsection nfd }
{ $subsection nfkc }
{ $subsection nfkd }
"If two strings in a normalization form are appended, the result may not be in that normalization form still. To append two strings in NFD and make sure the result is in NFD, the following procedure is supplied:"
{ $subsection string-append } ;
{ $subsection nfkd } ;
HELP: nfc
{ $values { "string" string } { "nfc" "a string in NFC" } }
@ -27,7 +25,3 @@ HELP: nfkc
! Help entry for nfkd.
! NOTE(review): the output value is labelled "nfc" here although the entry documents
! nfkd and describes the value as a string in NFKD — presumably carried over from the
! nfc entry; confirm against nfkd's declared stack effect.
HELP: nfkd
{ $values { "string" string } { "nfc" "a string in NFKD" } }
{ $description "Converts a string to Normalization Form KD" } ;
! Help entry for string-append: both inputs and the output are described as strings in NFD.
HELP: string-append
{ $values { "s1" "a string in NFD" } { "s2" "a string in NFD" } { "string" "a string in NFD" } }
{ $description "Appends two strings, putting the result in NFD." } ;

View File

@ -1,6 +1,6 @@
USING: unicode.normalize kernel tools.test sequences
unicode.data io.encodings.utf8 io.files splitting math.parser
locals math quotations assocs combinators ;
locals math quotations assocs combinators unicode.normalize.private ;
IN: unicode.normalize.tests
! string-append across a join: the first string ends in \u000302 and the second begins
! with \u000323; the expected result has \u000323 placed ahead of \u000302.
[ "ab\u000323\u000302cd" ] [ "ab\u000302" "\u000323cd" string-append ] unit-test

View File

@ -1,21 +1,24 @@
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: sequences namespaces make unicode.data kernel math arrays
locals sorting.insertion accessors assocs ;
locals sorting.insertion accessors assocs math.order ;
IN: unicode.normalize
<PRIVATE
! Conjoining Jamo behavior
! Base code points for the Hangul/jamo arithmetic. The values of hangul-base,
! initial-base, medial-base and final-base match SBase, LBase, VBase and TBase in the
! Unicode Standard's Hangul syllable algorithm; hangul-end is the upper bound used by hangul?.
! NOTE(review): every name in this group is defined twice, once as an inline word and
! once with CONSTANT: — these look like the old and new forms of the same definition;
! confirm which form should remain.
: hangul-base HEX: ac00 ; inline
: hangul-end HEX: D7AF ; inline
: initial-base HEX: 1100 ; inline
: medial-base HEX: 1161 ; inline
: final-base HEX: 11a7 ; inline
CONSTANT: hangul-base HEX: ac00
CONSTANT: hangul-end HEX: D7AF
CONSTANT: initial-base HEX: 1100
CONSTANT: medial-base HEX: 1161
CONSTANT: final-base HEX: 11a7
! Counts for the three jamo positions; the values match LCount, VCount and TCount
! in the same Unicode algorithm.
: initial-count 19 ; inline
: medial-count 21 ; inline
: final-count 28 ; inline
CONSTANT: initial-count 19
CONSTANT: medial-count 21
CONSTANT: final-count 28
! Like between?, but tolerates f as the value under test:
! when n/f is f, all three inputs are dropped and the result is f;
! otherwise the result is that of between? on the same three inputs.
: ?between? ( n/f from to -- ? )
pick [ between? ] [ 3drop f ] if ;
! True when ch is not f and lies between hangul-base and hangul-end (bounds as tested by between?).
: hangul? ( ch -- ? ) hangul-base hangul-end ?between? ;
! True when ch is not f and lies between HEX: 1100 and HEX: 11FF (bounds as tested by between?).
: jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ;
@ -84,8 +87,6 @@ PRIVATE>
[ compatibility-entry ] decompose ;
! Appends s2 to s1. When the first element of s2 satisfies non-starter?, reorder-back
! is then run on the joined string; otherwise the plain result of append is returned.
! ?nth is used for the lookup, so an empty s2 feeds f to non-starter? instead of erroring.
! NOTE(review): the index handed to reorder-back is the length of s2, not the length of
! s1 (the position where the two strings meet) — confirm this is what reorder-back expects.
: string-append ( s1 s2 -- string )
! This could be more optimized,
! but in practice, it'll almost always just be append
[ append ] keep
0 over ?nth non-starter?
[ length dupd reorder-back ] [ drop ] if ;