Speeding up normalization

db4
Daniel Ehrenberg 2009-01-09 14:03:33 -06:00
parent 8d8efb6dce
commit 1ed964e539
3 changed files with 114 additions and 91 deletions

View File

@ -1,16 +1,18 @@
! Copyright (C) 2008 Daniel Ehrenberg. ! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: unicode.data sequences sequences.next namespaces make unicode.syntax USING: unicode.data sequences sequences.next namespaces
unicode.normalize math unicode.categories combinators unicode.syntax sbufs make unicode.syntax unicode.normalize math hints
assocs strings splitting kernel accessors unicode.breaks fry ; unicode.categories combinators unicode.syntax assocs
strings splitting kernel accessors unicode.breaks fry locals ;
QUALIFIED: ascii
IN: unicode.case IN: unicode.case
<PRIVATE <PRIVATE
: at-default ( key assoc -- value/key ) [ at ] [ drop ] 2bi or ; : at-default ( key assoc -- value/key ) [ at ] [ drop ] 2bi or ; inline
: ch>lower ( ch -- lower ) simple-lower at-default ; : ch>lower ( ch -- lower ) simple-lower at-default ; inline
: ch>upper ( ch -- upper ) simple-upper at-default ; : ch>upper ( ch -- upper ) simple-upper at-default ; inline
: ch>title ( ch -- title ) simple-title at-default ; : ch>title ( ch -- title ) simple-title at-default ; inline
PRIVATE> PRIVATE>
SYMBOL: locale ! Just casing locale, or overall? SYMBOL: locale ! Just casing locale, or overall?
@ -21,7 +23,7 @@ SYMBOL: locale ! Just casing locale, or overall?
[ dup ] swap '[ _ split1-slice swap ] [ ] produce nip ; [ dup ] swap '[ _ split1-slice swap ] [ ] produce nip ;
: replace ( old new str -- newstr ) : replace ( old new str -- newstr )
[ split-subseq ] dip join ; [ split-subseq ] dip join ; inline
: i-dot? ( -- ? ) : i-dot? ( -- ? )
locale get { "tr" "az" } member? ; locale get { "tr" "az" } member? ;
@ -44,24 +46,24 @@ SYMBOL: locale ! Just casing locale, or overall?
[ [ "" ] [ [ [ "" ] [
dup first mark-above? dup first mark-above?
[ CHAR: combining-dot-above prefix ] when [ CHAR: combining-dot-above prefix ] when
] if-empty ] with-rest ; ] if-empty ] with-rest ; inline
: lithuanian>lower ( string -- lower ) : lithuanian>lower ( string -- lower )
"i" split add-dots "i" join "i" split add-dots "i" join
"j" split add-dots "i" join ; "j" split add-dots "i" join ; inline
: turk>upper ( string -- upper-i ) : turk>upper ( string -- upper-i )
"i" "I\u000307" replace ; "i" "I\u000307" replace ; inline
: turk>lower ( string -- lower-i ) : turk>lower ( string -- lower-i )
"I\u000307" "i" replace "I\u000307" "i" replace
"I" "\u000131" replace ; "I" "\u000131" replace ; inline
: fix-sigma-end ( string -- string ) : fix-sigma-end ( string -- string )
[ "" ] [ [ "" ] [
dup peek CHAR: greek-small-letter-sigma = dup peek CHAR: greek-small-letter-sigma =
[ 1 head* CHAR: greek-small-letter-final-sigma suffix ] when [ 1 head* CHAR: greek-small-letter-final-sigma suffix ] when
] if-empty ; ] if-empty ; inline
: sigma-map ( string -- string ) : sigma-map ( string -- string )
{ CHAR: greek-capital-letter-sigma } split [ [ { CHAR: greek-capital-letter-sigma } split [ [
@ -70,19 +72,20 @@ SYMBOL: locale ! Just casing locale, or overall?
CHAR: greek-small-letter-final-sigma CHAR: greek-small-letter-final-sigma
CHAR: greek-small-letter-sigma ? prefix CHAR: greek-small-letter-sigma ? prefix
] if-empty ] if-empty
] map ] with-rest concat fix-sigma-end ; ] map ] with-rest concat fix-sigma-end ; inline
: final-sigma ( string -- string ) : final-sigma ( string -- string )
CHAR: greek-capital-letter-sigma CHAR: greek-capital-letter-sigma
over member? [ sigma-map ] when ; over member? [ sigma-map ] when
"" like ; inline
: map-case ( string string-quot char-quot -- case ) :: map-case ( string string-quot char-quot -- case )
[ string length <sbuf> :> out
[ string [
[ dup special-casing at ] 2dip dup special-casing at
[ [ % ] compose ] [ [ , ] compose ] bi* ?if [ string-quot call out push-all ]
] 2curry each [ char-quot call out push ] ?if
] "" make ; inline ] each out "" like ; inline
PRIVATE> PRIVATE>
@ -90,24 +93,30 @@ PRIVATE>
i-dot? [ turk>lower ] when final-sigma i-dot? [ turk>lower ] when final-sigma
[ lower>> ] [ ch>lower ] map-case ; [ lower>> ] [ ch>lower ] map-case ;
HINTS: >lower string ;
: >upper ( string -- upper ) : >upper ( string -- upper )
i-dot? [ turk>upper ] when i-dot? [ turk>upper ] when
[ upper>> ] [ ch>upper ] map-case ; [ upper>> ] [ ch>upper ] map-case ;
HINTS: >upper string ;
<PRIVATE <PRIVATE
: (>title) ( string -- title ) : (>title) ( string -- title )
i-dot? [ turk>upper ] when i-dot? [ turk>upper ] when
[ title>> ] [ ch>title ] map-case ; [ title>> ] [ ch>title ] map-case ; inline
: title-word ( string -- title ) : title-word ( string -- title )
unclip 1string [ >lower ] [ (>title) ] bi* prepend ; unclip 1string [ >lower ] [ (>title) ] bi* prepend ; inline
PRIVATE> PRIVATE>
: >title ( string -- title ) : >title ( string -- title )
final-sigma >words [ title-word ] map concat ; final-sigma >words [ title-word ] map concat ;
HINTS: >title string ;
: >case-fold ( string -- fold ) : >case-fold ( string -- fold )
>upper >lower ; >upper >lower ;

View File

@ -3,6 +3,8 @@ unicode.data io.encodings.utf8 io.files splitting math.parser
locals math quotations assocs combinators unicode.normalize.private ; locals math quotations assocs combinators unicode.normalize.private ;
IN: unicode.normalize.tests IN: unicode.normalize.tests
{ nfc nfkc nfd nfkd } [ must-infer ] each
[ "ab\u000323\u000302cd" ] [ "ab\u000302" "\u000323cd" string-append ] unit-test [ "ab\u000323\u000302cd" ] [ "ab\u000302" "\u000323cd" string-append ] unit-test
[ "ab\u00064b\u000347\u00034e\u00034d\u000346" ] [ "ab\u000346\u000347\u00064b\u00034e\u00034d" dup reorder ] unit-test [ "ab\u00064b\u000347\u00034e\u00034d\u000346" ] [ "ab\u000346\u000347\u00064b\u00034e\u00034d" dup reorder ] unit-test

View File

@ -1,8 +1,8 @@
! Copyright (C) 2008 Daniel Ehrenberg. ! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: sequences namespaces make unicode.data kernel math arrays USING: ascii sequences namespaces make unicode.data kernel math arrays
locals sorting.insertion accessors assocs math.order combinators locals sorting.insertion accessors assocs math.order combinators
unicode.syntax strings sbufs ; unicode.syntax strings sbufs hints combinators.short-circuit vectors ;
IN: unicode.normalize IN: unicode.normalize
<PRIVATE <PRIVATE
@ -19,16 +19,16 @@ CONSTANT: medial-count 21
CONSTANT: final-count 28 CONSTANT: final-count 28
: ?between? ( n/f from to -- ? ) : ?between? ( n/f from to -- ? )
pick [ between? ] [ 3drop f ] if ; pick [ between? ] [ 3drop f ] if ; inline
: hangul? ( ch -- ? ) hangul-base hangul-end ?between? ; : hangul? ( ch -- ? ) hangul-base hangul-end ?between? ; inline
: jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ; : jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ; inline
! These numbers come from UAX 29 ! These numbers come from UAX 29
: initial? ( ch -- ? ) : initial? ( ch -- ? )
dup HEX: 1100 HEX: 1159 ?between? [ ] [ HEX: 115F = ] ?if ; dup HEX: 1100 HEX: 1159 ?between? [ ] [ HEX: 115F = ] ?if ; inline
: medial? ( ch -- ? ) HEX: 1160 HEX: 11A2 ?between? ; : medial? ( ch -- ? ) HEX: 1160 HEX: 11A2 ?between? ; inline
: final? ( ch -- ? ) HEX: 11A8 HEX: 11F9 ?between? ; : final? ( ch -- ? ) HEX: 11A8 HEX: 11F9 ?between? ; inline
: hangul>jamo ( hangul -- jamo-string ) : hangul>jamo ( hangul -- jamo-string )
hangul-base - final-count /mod final-base + hangul-base - final-count /mod final-base +
@ -48,16 +48,16 @@ CONSTANT: final-count 28
: reorder-slice ( string start -- slice done? ) : reorder-slice ( string start -- slice done? )
2dup swap [ non-starter? not ] find-from drop 2dup swap [ non-starter? not ] find-from drop
[ [ over length ] unless* rot <slice> ] keep not ; [ [ over length ] unless* rot <slice> ] keep not ; inline
: reorder-next ( string i -- new-i done? ) : reorder-next ( string i -- new-i done? )
over [ non-starter? ] find-from drop [ over [ non-starter? ] find-from drop [
reorder-slice reorder-slice
[ dup [ combining-class ] insertion-sort to>> ] dip [ dup [ combining-class ] insertion-sort to>> ] dip
] [ length t ] if* ; ] [ length t ] if* ; inline
: reorder-loop ( string start -- ) : reorder-loop ( string start -- )
dupd reorder-next [ 2drop ] [ reorder-loop ] if ; dupd reorder-next [ 2drop ] [ reorder-loop ] if ; inline recursive
: reorder ( string -- ) : reorder ( string -- )
0 reorder-loop ; 0 reorder-loop ;
@ -66,12 +66,14 @@ CONSTANT: final-count 28
over [ non-starter? not ] find-last-from drop ?1+ reorder-next 2drop ; over [ non-starter? not ] find-last-from drop ?1+ reorder-next 2drop ;
:: decompose ( string quot -- decomposed ) :: decompose ( string quot -- decomposed )
[let | out [ string length <sbuf> ] | string length <sbuf> :> out
string [ string [
>fixnum dup ascii? [ out push ] [
dup hangul? [ hangul>jamo out push-all ] dup hangul? [ hangul>jamo out push-all ]
[ dup quot call [ out push-all ] [ out push ] ?if ] if [ dup quot call [ out push-all ] [ out push ] ?if ] if
] each out >string ] if
] dup reorder ; ] each
out "" like dup reorder ; inline
: with-string ( str quot -- str ) : with-string ( str quot -- str )
over aux>> [ call ] [ drop ] if ; inline over aux>> [ call ] [ drop ] if ; inline
@ -79,9 +81,13 @@ CONSTANT: final-count 28
: (nfd) ( string -- nfd ) : (nfd) ( string -- nfd )
[ canonical-entry ] decompose ; [ canonical-entry ] decompose ;
HINTS: (nfd) string ;
: (nfkd) ( string -- nfkd ) : (nfkd) ( string -- nfkd )
[ compatibility-entry ] decompose ; [ compatibility-entry ] decompose ;
HINTS: (nfkd) string ;
PRIVATE> PRIVATE>
: nfd ( string -- nfd ) : nfd ( string -- nfd )
@ -95,83 +101,89 @@ PRIVATE>
0 over ?nth non-starter? 0 over ?nth non-starter?
[ length dupd reorder-back ] [ drop ] if ; [ length dupd reorder-back ] [ drop ] if ;
HINTS: string-append string string ;
<PRIVATE <PRIVATE
! Normalization -- Composition ! Normalization -- Composition
SYMBOL: main-str
SYMBOL: ind
SYMBOL: after
SYMBOL: char
: get-str ( i -- ch ) ind get + main-str get ?nth ; : initial-medial? ( str i -- ? )
: current ( -- ch ) 0 get-str ; { [ swap nth initial? ] [ 1+ swap ?nth medial? ] } 2&& ;
: to ( -- ) ind inc ;
: initial-medial? ( -- ? ) : --final? ( str i -- ? )
current initial? [ 1 get-str medial? ] [ f ] if ; 2 + swap ?nth final? ;
: --final? ( -- ? ) : imf, ( str i -- str i )
2 get-str final? ; [ tail-slice first3 jamo>hangul , ]
[ 3 + ] 2bi ;
: imf, ( -- ) : im, ( str i -- str i )
current to current to current jamo>hangul , ; [ tail-slice first2 final-base jamo>hangul , ]
[ 2 + ] 2bi ;
: im, ( -- ) : compose-jamo ( str i -- str i )
current to current final-base jamo>hangul , ; 2dup initial-medial? [
2dup --final? [ imf, ] [ im, ] if
] [ 2dup swap nth , 1+ ] if ;
: compose-jamo ( -- ) : pass-combining ( str -- str i )
initial-medial? [ dup [ non-starter? not ] find drop
--final? [ imf, ] [ im, ] if [ dup length ] unless*
] [ current , ] if to ; 2dup head-slice % ;
: pass-combining ( -- ) TUPLE: compose-state i str char after last-class ;
current non-starter? [ current , to pass-combining ] when ;
:: try-compose ( last-class new-char current-class -- new-class ) : get-str ( state i -- ch )
last-class current-class = [ new-char after get push last-class ] [ swap [ i>> + ] [ str>> ] bi ?nth ;
char get new-char combine-chars : current ( state -- ch ) 0 get-str ;
[ char set last-class ] : to ( state -- state ) [ 1+ ] change-i ;
[ new-char after get push current-class ] if* : push-after ( ch state -- state ) [ ?push ] change-after ;
:: try-compose ( state new-char current-class -- state )
state last-class>> current-class =
[ new-char state push-after ] [
state char>> new-char combine-chars
[ state swap >>char ] [
new-char state push-after
current-class >>last-class
] if*
] if ; ] if ;
DEFER: compose-iter DEFER: compose-iter
: try-noncombining ( char -- ) : try-noncombining ( char state -- state )
char get swap combine-chars tuck char>> swap combine-chars
[ char set to f compose-iter ] when* ; [ >>char to f >>last-class compose-iter ] when* ;
: compose-iter ( last-class -- ) : compose-iter ( state -- state )
current [ dup current [
dup combining-class { dup combining-class {
{ f [ 2drop ] } { f [ drop ] }
{ 0 [ swap [ drop ] [ try-noncombining ] if ] } { 0 [
over last-class>>
[ drop ] [ swap try-noncombining ] if ] }
[ try-compose to compose-iter ] [ try-compose to compose-iter ]
} case } case
] [ drop ] if* ; ] when* ;
: ?new-after ( -- ) : compose-combining ( ch str i -- str i )
after [ dup empty? [ drop SBUF" " clone ] unless ] change ; compose-state new
swap >>i
swap >>str
swap >>char
compose-iter
{ [ char>> , ] [ after>> % ] [ str>> ] [ i>> ] } cleave ;
: compose-combining ( ch -- ) :: (compose) ( str i -- )
char set to ?new-after i str ?nth [
f compose-iter dup jamo? [ drop str i compose-jamo ] [
char get , after get % ; i 1+ str ?nth combining-class
[ str i 1+ compose-combining ] [ , str i 1+ ] if
: (compose) ( -- )
current [
dup jamo? [ drop compose-jamo ] [
1 get-str combining-class
[ compose-combining ] [ , to ] if
] if (compose) ] if (compose)
] when* ; ] when* ;
: combine ( str -- comp ) : combine ( str -- comp )
[ [ pass-combining (compose) ] "" make ;
main-str set
0 ind set
SBUF" " clone after set
pass-combining (compose)
] "" make ;
PRIVATE> PRIVATE>