Bug fixes for normalization

db4
Daniel Ehrenberg 2008-05-20 16:57:53 -05:00
parent 8b6e234709
commit 1ef44694cd
9 changed files with 159886 additions and 57 deletions

View File

@ -79,3 +79,5 @@ IN: sequences.lib.tests
[ ] [ { } 0 firstn ] unit-test [ ] [ { } 0 firstn ] unit-test
[ "a" ] [ { "a" } 1 firstn ] unit-test [ "a" ] [ { "a" } 1 firstn ] unit-test
[ { { 1 1 } { 1 2 } { 2 0 } } ] [ { { 2 0 } { 1 1 } { 1 2 } } dup [ first ] insertion-sort ] unit-test

View File

@ -4,7 +4,7 @@
USING: combinators.lib kernel sequences math namespaces assocs USING: combinators.lib kernel sequences math namespaces assocs
random sequences.private shuffle math.functions mirrors random sequences.private shuffle math.functions mirrors
arrays math.parser math.private sorting strings ascii macros arrays math.parser math.private sorting strings ascii macros
assocs.lib quotations hashtables math.order ; assocs.lib quotations hashtables math.order locals ;
IN: sequences.lib IN: sequences.lib
: each-withn ( seq quot n -- ) nwith each ; inline : each-withn ( seq quot n -- ) nwith each ; inline
@ -243,3 +243,17 @@ PRIVATE>
: short ( seq n -- seq n' ) : short ( seq n -- seq n' )
over length min ; inline over length min ; inline
<PRIVATE
! One insertion-sort step: sift the element at index n toward the
! front of seq, swapping adjacent elements until the quot-transformed
! value at n-1 no longer exceeds the one at n; recursion stops at 0.
:: insert ( seq quot n -- )
n zero? [
! Stop when f(seq[n]) >= f(seq[n-1]) -- already in order; equal
! keys are never swapped, which keeps the sort stable.
n n 1- [ seq nth quot call ] bi@ >= [
n n 1- seq exchange
seq quot n 1- insert
] unless
] unless ; inline
PRIVATE>
: insertion-sort ( seq quot -- )
! quot is a transformation on elements
! In-place stable insertion sort: for each index of seq, sifts the
! element toward the front until the quot-mapped keys are in
! non-decreasing order.
over length [ insert ] 2with each ; inline

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

19811
extra/unicode/collation/allkeys.txt Executable file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
USING: io io.files splitting unicode.collation sequences kernel
io.encodings.utf8 math.parser math.order tools.test assocs ;
IN: unicode.collation.tests
! Reads the bundled UCA conformance file (SHIFTED variant), skipping
! its 5-line header; the part of each line before ";" is a list of
! hex code points, which is decoded into a string.
: parse-test ( -- strings )
"resource:extra/unicode/collation/CollationTest_SHIFTED.txt"
utf8 file-lines 5 tail
[ ";" split1 drop " " split [ hex> ] "" map-as ] map ;
! Asserts via unit-test that str1 compares strictly before str2
! under string<=> (expected result: +lt+).
: test-two ( str1 str2 -- )
[ +lt+ ] -rot [ string<=> ] 2curry unit-test ;
! Runs test-two on every adjacent pair of conformance-test strings.
: test parse-test 2 <clumps> [ test-two ] assoc-each ;
! Finds the first adjacent pair of conformance-test strings that
! does not compare as +lt+ under string<=>.
: find-failure
parse-test dup 2 <clumps>
[ string<=> +lt+ = not ] assoc-find drop ;
! Collects every adjacent pair of conformance-test strings that is
! mis-ordered under string<=>, plus the number of such pairs.
: failures
parse-test dup 2 <clumps>
[ string<=> +lt+ = not ] assoc-filter dup assoc-size ;

View File

@ -0,0 +1,89 @@
USING: sequences io.files io.encodings.ascii kernel values
splitting accessors math.parser ascii io assocs strings math
namespaces sorting combinators math.order arrays
unicode.normalize ;
IN: unicode.collation
! The DUCET (Default Unicode Collation Element Table), initialized
! below from the bundled allkeys.txt.
VALUE: ducet
! One collation element: three numeric weight levels plus a flag set
! for entries marked with "*" in allkeys.txt (see parse-weight).
TUPLE: weight primary secondary tertiary ignorable? ;
! Strips "#" comments and "@" directives from each line (keeping the
! text before the marker), then discards lines left empty.
: remove-comments ( lines -- lines )
[ "#" split1 drop "@" split1 drop ] map
[ empty? not ] filter ;
! Parses the weight portion of an allkeys.txt line: one weight tuple
! per "[...]" group. The character after "[" being "*" (rather than
! ".") sets ignorable?; the three dot-separated hex fields become
! primary, secondary and tertiary.
: parse-weight ( string -- weight )
"]" split but-last [
weight new swap rest unclip CHAR: * = swapd >>ignorable?
swap "." split first3 [ hex> ] tri@
[ >>primary ] [ >>secondary ] [ >>tertiary ] tri*
] map ;
! Parses one allkeys.txt line: the hex code points before ";" become
! the key string, the remainder is parsed into a weight list.
! (Fixes the misspelled "code-poing" effect element name.)
: parse-line ( line -- code-point weight )
";" split1 [ [ blank? ] trim ] bi@
[ " " split [ hex> ] "" map-as ] [ parse-weight ] bi* ;
! Reads an allkeys.txt stream into a hashtable mapping code-point
! strings to weight lists.
: parse-ducet ( stream -- ducet )
lines remove-comments
[ parse-line ] H{ } map>assoc ;
! Parse-time initialization: load the DUCET from the bundled
! allkeys.txt into the ducet value.
"resource:extra/unicode/collation/allkeys.txt"
ascii <file-reader> parse-ducet \ ducet set-value
! Stub: currently returns an empty weight list for any character
! missing from the DUCET (see TODOs below).
: derive-weight ( char -- weight )
! This should check Noncharacter_Code_Point
! If yes, then ignore the character
! otherwise, apply derivation formula
drop { } ;
! Looks up each character (as a one-element string) in the DUCET,
! falling back to derive-weight for absent characters, and
! concatenates the resulting weight lists.
: string>weights ( string -- weights )
! This should actually look things up with
! multichar collation elements
! Also, do weight derivation for things not in DUCET
[ dup 1string ducet at [ ] [ derive-weight ] ?if ]
{ } map-as concat ;
! Emits one key level into the surrounding `make`: quot-mapped
! values of the non-ignorable weights with zeros removed, followed
! by a single 0 as the level separator.
: append-weights ( weights quot -- )
swap [ ignorable?>> not ] filter
swap map [ zero? not ] filter % 0 , ;
! Emits the fourth-level entry for one weight: its primary weight
! when ignorable? (variable) is set, HEX: FFFF otherwise.
: variable-weight ( weight -- )
dup ignorable?>> [ primary>> ] [ drop HEX: FFFF ] if , ;
! Flattens a weight list into a sort key: the primary, secondary and
! tertiary levels (each zero-terminated by append-weights), followed
! by the fourth (variable-weight) level.
: weights>bytes ( weights -- byte-array )
[
{
[ [ primary>> ] append-weights ]
[ [ secondary>> ] append-weights ]
[ [ tertiary>> ] append-weights ]
[ [ variable-weight ] each ]
} cleave
] { } make ;
! Tests whether all three weight levels of a weight are zero.
: completely-ignorable? ( weight -- ? )
[ primary>> ] [ secondary>> ] [ tertiary>> ] tri
[ zero? ] tri@ and and ;
: filter-ignorable ( weights -- weights' )
! Filters primary-ignorables which follow variable weighteds
! and all completely-ignorables
! NOTE(review): threads a boolean through the filter quotation
! (seeded f via >r f r>, discarded by nip) to remember state across
! elements -- order-sensitive; verify against the UCA "shifted"
! option before touching.
>r f r> [
tuck primary>> zero? and
[ swap ignorable?>> or ]
[ swap completely-ignorable? or not ] 2bi
] filter nip ;
! Builds the collation sort key for a string: NFD-normalize, map to
! weights, drop ignorables, flatten to key form.
: collation-key ( string -- key )
nfd string>weights filter-ignorable weights>bytes ;
! Compares two {string,key} pairs: by collation key (the second
! element) first; when the keys are +eq+, falls back to <=> on the
! pairs themselves as a tie-breaker.
: compare-collation ( {str1,key} {str2,key} -- <=> )
2dup [ second ] bi@ <=> dup +eq+ =
[ drop <=> ] [ 2nip ] if ;
! Sorts strings into collation order: pair each string with its
! collation key, sort the pairs, then take the strings back out.
: sort-strings ( strings -- sorted )
[ dup collation-key ] { } map>assoc
[ compare-collation ] sort
keys ;
! Three-way collation comparison of two strings via their
! {string,key} pairs.
: string<=> ( str1 str2 -- <=> )
[ dup collation-key 2array ] bi@ compare-collation ;

View File

@ -2,7 +2,7 @@ USING: unicode.normalize kernel tools.test sequences ;
[ "ab\u000323\u000302cd" ] [ "ab\u000302" "\u000323cd" string-append ] unit-test [ "ab\u000323\u000302cd" ] [ "ab\u000302" "\u000323cd" string-append ] unit-test
[ "ab\u00064b\u00034d\u00034e\u000347\u000346" ] [ "ab\u000346\u000347\u00064b\u00034e\u00034d" dup reorder ] unit-test [ "ab\u00064b\u000347\u00034e\u00034d\u000346" ] [ "ab\u000346\u000347\u00064b\u00034e\u00034d" dup reorder ] unit-test
[ "hello" "hello" ] [ "hello" [ nfd ] keep nfkd ] unit-test [ "hello" "hello" ] [ "hello" [ nfd ] keep nfkd ] unit-test
[ "\u00FB012\u002075\u00017F\u000323\u000307" "fi25s\u000323\u000307" ] [ "\u00FB012\u002075\u00017F\u000323\u000307" "fi25s\u000323\u000307" ]
[ "\u00FB012\u002075\u001E9B\u000323" [ nfd ] keep nfkd ] unit-test [ "\u00FB012\u002075\u001E9B\u000323" [ nfd ] keep nfkd ] unit-test

18
extra/unicode/normalize/normalize.factor Normal file → Executable file
View File

@ -1,4 +1,5 @@
USING: sequences namespaces unicode.data kernel math arrays ; USING: sequences namespaces unicode.data kernel math arrays
locals combinators.lib sequences.lib ;
IN: unicode.normalize IN: unicode.normalize
! Conjoining Jamo behavior ! Conjoining Jamo behavior
@ -35,21 +36,6 @@ IN: unicode.normalize
! Normalization -- Decomposition ! Normalization -- Decomposition
! Stack-shuffling insertion sort (superseded by the locals-based
! version in sequences.lib).
! Sift step: compares quot-transformed neighbors at n and n-1,
! exchanging and recursing until in order or n reaches 0.
: (insert) ( seq n quot -- )
over 0 = [ 3drop ] [
[ >r dup 1- rot [ nth ] curry bi@ r> bi@ > ] 3keep
roll [ 3drop ]
[ >r [ dup 1- rot exchange ] 2keep 1- r> (insert) ] if
] if ; inline
! Stores elt at index n, then sifts it into position via (insert).
: insert ( seq quot elt n -- )
swap rot >r -rot [ swap set-nth ] 2keep r> (insert) ; inline
: insertion-sort ( seq quot -- )
! quot is a transformation on elements
over dup length
[ >r >r 2dup r> r> insert ] 2each 2drop ; inline
: reorder-slice ( string start -- slice done? ) : reorder-slice ( string start -- slice done? )
2dup swap [ non-starter? not ] find-from drop 2dup swap [ non-starter? not ] find-from drop
[ [ over length ] unless* rot <slice> ] keep not ; [ [ over length ] unless* rot <slice> ] keep not ;