Speeding up normalization
parent
8d8efb6dce
commit
1ed964e539
|
@ -1,16 +1,18 @@
|
||||||
! Copyright (C) 2008 Daniel Ehrenberg.
|
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: unicode.data sequences sequences.next namespaces make unicode.syntax
|
USING: unicode.data sequences sequences.next namespaces
|
||||||
unicode.normalize math unicode.categories combinators unicode.syntax
|
sbufs make unicode.syntax unicode.normalize math hints
|
||||||
assocs strings splitting kernel accessors unicode.breaks fry ;
|
unicode.categories combinators unicode.syntax assocs
|
||||||
|
strings splitting kernel accessors unicode.breaks fry locals ;
|
||||||
|
QUALIFIED: ascii
|
||||||
IN: unicode.case
|
IN: unicode.case
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
: at-default ( key assoc -- value/key ) [ at ] [ drop ] 2bi or ;
|
: at-default ( key assoc -- value/key ) [ at ] [ drop ] 2bi or ; inline
|
||||||
|
|
||||||
: ch>lower ( ch -- lower ) simple-lower at-default ;
|
: ch>lower ( ch -- lower ) simple-lower at-default ; inline
|
||||||
: ch>upper ( ch -- upper ) simple-upper at-default ;
|
: ch>upper ( ch -- upper ) simple-upper at-default ; inline
|
||||||
: ch>title ( ch -- title ) simple-title at-default ;
|
: ch>title ( ch -- title ) simple-title at-default ; inline
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
||||||
SYMBOL: locale ! Just casing locale, or overall?
|
SYMBOL: locale ! Just casing locale, or overall?
|
||||||
|
@ -21,7 +23,7 @@ SYMBOL: locale ! Just casing locale, or overall?
|
||||||
[ dup ] swap '[ _ split1-slice swap ] [ ] produce nip ;
|
[ dup ] swap '[ _ split1-slice swap ] [ ] produce nip ;
|
||||||
|
|
||||||
: replace ( old new str -- newstr )
|
: replace ( old new str -- newstr )
|
||||||
[ split-subseq ] dip join ;
|
[ split-subseq ] dip join ; inline
|
||||||
|
|
||||||
: i-dot? ( -- ? )
|
: i-dot? ( -- ? )
|
||||||
locale get { "tr" "az" } member? ;
|
locale get { "tr" "az" } member? ;
|
||||||
|
@ -44,24 +46,24 @@ SYMBOL: locale ! Just casing locale, or overall?
|
||||||
[ [ "" ] [
|
[ [ "" ] [
|
||||||
dup first mark-above?
|
dup first mark-above?
|
||||||
[ CHAR: combining-dot-above prefix ] when
|
[ CHAR: combining-dot-above prefix ] when
|
||||||
] if-empty ] with-rest ;
|
] if-empty ] with-rest ; inline
|
||||||
|
|
||||||
: lithuanian>lower ( string -- lower )
|
: lithuanian>lower ( string -- lower )
|
||||||
"i" split add-dots "i" join
|
"i" split add-dots "i" join
|
||||||
"j" split add-dots "i" join ;
|
"j" split add-dots "i" join ; inline
|
||||||
|
|
||||||
: turk>upper ( string -- upper-i )
|
: turk>upper ( string -- upper-i )
|
||||||
"i" "I\u000307" replace ;
|
"i" "I\u000307" replace ; inline
|
||||||
|
|
||||||
: turk>lower ( string -- lower-i )
|
: turk>lower ( string -- lower-i )
|
||||||
"I\u000307" "i" replace
|
"I\u000307" "i" replace
|
||||||
"I" "\u000131" replace ;
|
"I" "\u000131" replace ; inline
|
||||||
|
|
||||||
: fix-sigma-end ( string -- string )
|
: fix-sigma-end ( string -- string )
|
||||||
[ "" ] [
|
[ "" ] [
|
||||||
dup peek CHAR: greek-small-letter-sigma =
|
dup peek CHAR: greek-small-letter-sigma =
|
||||||
[ 1 head* CHAR: greek-small-letter-final-sigma suffix ] when
|
[ 1 head* CHAR: greek-small-letter-final-sigma suffix ] when
|
||||||
] if-empty ;
|
] if-empty ; inline
|
||||||
|
|
||||||
: sigma-map ( string -- string )
|
: sigma-map ( string -- string )
|
||||||
{ CHAR: greek-capital-letter-sigma } split [ [
|
{ CHAR: greek-capital-letter-sigma } split [ [
|
||||||
|
@ -70,19 +72,20 @@ SYMBOL: locale ! Just casing locale, or overall?
|
||||||
CHAR: greek-small-letter-final-sigma
|
CHAR: greek-small-letter-final-sigma
|
||||||
CHAR: greek-small-letter-sigma ? prefix
|
CHAR: greek-small-letter-sigma ? prefix
|
||||||
] if-empty
|
] if-empty
|
||||||
] map ] with-rest concat fix-sigma-end ;
|
] map ] with-rest concat fix-sigma-end ; inline
|
||||||
|
|
||||||
: final-sigma ( string -- string )
|
: final-sigma ( string -- string )
|
||||||
CHAR: greek-capital-letter-sigma
|
CHAR: greek-capital-letter-sigma
|
||||||
over member? [ sigma-map ] when ;
|
over member? [ sigma-map ] when
|
||||||
|
"" like ; inline
|
||||||
|
|
||||||
: map-case ( string string-quot char-quot -- case )
|
:: map-case ( string string-quot char-quot -- case )
|
||||||
[
|
string length <sbuf> :> out
|
||||||
[
|
string [
|
||||||
[ dup special-casing at ] 2dip
|
dup special-casing at
|
||||||
[ [ % ] compose ] [ [ , ] compose ] bi* ?if
|
[ string-quot call out push-all ]
|
||||||
] 2curry each
|
[ char-quot call out push ] ?if
|
||||||
] "" make ; inline
|
] each out "" like ; inline
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
||||||
|
@ -90,24 +93,30 @@ PRIVATE>
|
||||||
i-dot? [ turk>lower ] when final-sigma
|
i-dot? [ turk>lower ] when final-sigma
|
||||||
[ lower>> ] [ ch>lower ] map-case ;
|
[ lower>> ] [ ch>lower ] map-case ;
|
||||||
|
|
||||||
|
HINTS: >lower string ;
|
||||||
|
|
||||||
: >upper ( string -- upper )
|
: >upper ( string -- upper )
|
||||||
i-dot? [ turk>upper ] when
|
i-dot? [ turk>upper ] when
|
||||||
[ upper>> ] [ ch>upper ] map-case ;
|
[ upper>> ] [ ch>upper ] map-case ;
|
||||||
|
|
||||||
|
HINTS: >upper string ;
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
|
||||||
: (>title) ( string -- title )
|
: (>title) ( string -- title )
|
||||||
i-dot? [ turk>upper ] when
|
i-dot? [ turk>upper ] when
|
||||||
[ title>> ] [ ch>title ] map-case ;
|
[ title>> ] [ ch>title ] map-case ; inline
|
||||||
|
|
||||||
: title-word ( string -- title )
|
: title-word ( string -- title )
|
||||||
unclip 1string [ >lower ] [ (>title) ] bi* prepend ;
|
unclip 1string [ >lower ] [ (>title) ] bi* prepend ; inline
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
||||||
: >title ( string -- title )
|
: >title ( string -- title )
|
||||||
final-sigma >words [ title-word ] map concat ;
|
final-sigma >words [ title-word ] map concat ;
|
||||||
|
|
||||||
|
HINTS: >title string ;
|
||||||
|
|
||||||
: >case-fold ( string -- fold )
|
: >case-fold ( string -- fold )
|
||||||
>upper >lower ;
|
>upper >lower ;
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,8 @@ unicode.data io.encodings.utf8 io.files splitting math.parser
|
||||||
locals math quotations assocs combinators unicode.normalize.private ;
|
locals math quotations assocs combinators unicode.normalize.private ;
|
||||||
IN: unicode.normalize.tests
|
IN: unicode.normalize.tests
|
||||||
|
|
||||||
|
{ nfc nfkc nfd nfkd } [ must-infer ] each
|
||||||
|
|
||||||
[ "ab\u000323\u000302cd" ] [ "ab\u000302" "\u000323cd" string-append ] unit-test
|
[ "ab\u000323\u000302cd" ] [ "ab\u000302" "\u000323cd" string-append ] unit-test
|
||||||
|
|
||||||
[ "ab\u00064b\u000347\u00034e\u00034d\u000346" ] [ "ab\u000346\u000347\u00064b\u00034e\u00034d" dup reorder ] unit-test
|
[ "ab\u00064b\u000347\u00034e\u00034d\u000346" ] [ "ab\u000346\u000347\u00064b\u00034e\u00034d" dup reorder ] unit-test
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
! Copyright (C) 2008 Daniel Ehrenberg.
|
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: sequences namespaces make unicode.data kernel math arrays
|
USING: ascii sequences namespaces make unicode.data kernel math arrays
|
||||||
locals sorting.insertion accessors assocs math.order combinators
|
locals sorting.insertion accessors assocs math.order combinators
|
||||||
unicode.syntax strings sbufs ;
|
unicode.syntax strings sbufs hints combinators.short-circuit vectors ;
|
||||||
IN: unicode.normalize
|
IN: unicode.normalize
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
@ -19,16 +19,16 @@ CONSTANT: medial-count 21
|
||||||
CONSTANT: final-count 28
|
CONSTANT: final-count 28
|
||||||
|
|
||||||
: ?between? ( n/f from to -- ? )
|
: ?between? ( n/f from to -- ? )
|
||||||
pick [ between? ] [ 3drop f ] if ;
|
pick [ between? ] [ 3drop f ] if ; inline
|
||||||
|
|
||||||
: hangul? ( ch -- ? ) hangul-base hangul-end ?between? ;
|
: hangul? ( ch -- ? ) hangul-base hangul-end ?between? ; inline
|
||||||
: jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ;
|
: jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ; inline
|
||||||
|
|
||||||
! These numbers come from UAX 29
|
! These numbers come from UAX 29
|
||||||
: initial? ( ch -- ? )
|
: initial? ( ch -- ? )
|
||||||
dup HEX: 1100 HEX: 1159 ?between? [ ] [ HEX: 115F = ] ?if ;
|
dup HEX: 1100 HEX: 1159 ?between? [ ] [ HEX: 115F = ] ?if ; inline
|
||||||
: medial? ( ch -- ? ) HEX: 1160 HEX: 11A2 ?between? ;
|
: medial? ( ch -- ? ) HEX: 1160 HEX: 11A2 ?between? ; inline
|
||||||
: final? ( ch -- ? ) HEX: 11A8 HEX: 11F9 ?between? ;
|
: final? ( ch -- ? ) HEX: 11A8 HEX: 11F9 ?between? ; inline
|
||||||
|
|
||||||
: hangul>jamo ( hangul -- jamo-string )
|
: hangul>jamo ( hangul -- jamo-string )
|
||||||
hangul-base - final-count /mod final-base +
|
hangul-base - final-count /mod final-base +
|
||||||
|
@ -48,16 +48,16 @@ CONSTANT: final-count 28
|
||||||
|
|
||||||
: reorder-slice ( string start -- slice done? )
|
: reorder-slice ( string start -- slice done? )
|
||||||
2dup swap [ non-starter? not ] find-from drop
|
2dup swap [ non-starter? not ] find-from drop
|
||||||
[ [ over length ] unless* rot <slice> ] keep not ;
|
[ [ over length ] unless* rot <slice> ] keep not ; inline
|
||||||
|
|
||||||
: reorder-next ( string i -- new-i done? )
|
: reorder-next ( string i -- new-i done? )
|
||||||
over [ non-starter? ] find-from drop [
|
over [ non-starter? ] find-from drop [
|
||||||
reorder-slice
|
reorder-slice
|
||||||
[ dup [ combining-class ] insertion-sort to>> ] dip
|
[ dup [ combining-class ] insertion-sort to>> ] dip
|
||||||
] [ length t ] if* ;
|
] [ length t ] if* ; inline
|
||||||
|
|
||||||
: reorder-loop ( string start -- )
|
: reorder-loop ( string start -- )
|
||||||
dupd reorder-next [ 2drop ] [ reorder-loop ] if ;
|
dupd reorder-next [ 2drop ] [ reorder-loop ] if ; inline recursive
|
||||||
|
|
||||||
: reorder ( string -- )
|
: reorder ( string -- )
|
||||||
0 reorder-loop ;
|
0 reorder-loop ;
|
||||||
|
@ -66,12 +66,14 @@ CONSTANT: final-count 28
|
||||||
over [ non-starter? not ] find-last-from drop ?1+ reorder-next 2drop ;
|
over [ non-starter? not ] find-last-from drop ?1+ reorder-next 2drop ;
|
||||||
|
|
||||||
:: decompose ( string quot -- decomposed )
|
:: decompose ( string quot -- decomposed )
|
||||||
[let | out [ string length <sbuf> ] |
|
string length <sbuf> :> out
|
||||||
string [
|
string [
|
||||||
|
>fixnum dup ascii? [ out push ] [
|
||||||
dup hangul? [ hangul>jamo out push-all ]
|
dup hangul? [ hangul>jamo out push-all ]
|
||||||
[ dup quot call [ out push-all ] [ out push ] ?if ] if
|
[ dup quot call [ out push-all ] [ out push ] ?if ] if
|
||||||
] each out >string
|
] if
|
||||||
] dup reorder ;
|
] each
|
||||||
|
out "" like dup reorder ; inline
|
||||||
|
|
||||||
: with-string ( str quot -- str )
|
: with-string ( str quot -- str )
|
||||||
over aux>> [ call ] [ drop ] if ; inline
|
over aux>> [ call ] [ drop ] if ; inline
|
||||||
|
@ -79,9 +81,13 @@ CONSTANT: final-count 28
|
||||||
: (nfd) ( string -- nfd )
|
: (nfd) ( string -- nfd )
|
||||||
[ canonical-entry ] decompose ;
|
[ canonical-entry ] decompose ;
|
||||||
|
|
||||||
|
HINTS: (nfd) string ;
|
||||||
|
|
||||||
: (nfkd) ( string -- nfkd )
|
: (nfkd) ( string -- nfkd )
|
||||||
[ compatibility-entry ] decompose ;
|
[ compatibility-entry ] decompose ;
|
||||||
|
|
||||||
|
HINTS: (nfkd) string ;
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
||||||
: nfd ( string -- nfd )
|
: nfd ( string -- nfd )
|
||||||
|
@ -95,83 +101,89 @@ PRIVATE>
|
||||||
0 over ?nth non-starter?
|
0 over ?nth non-starter?
|
||||||
[ length dupd reorder-back ] [ drop ] if ;
|
[ length dupd reorder-back ] [ drop ] if ;
|
||||||
|
|
||||||
|
HINTS: string-append string string ;
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
|
||||||
! Normalization -- Composition
|
! Normalization -- Composition
|
||||||
SYMBOL: main-str
|
|
||||||
SYMBOL: ind
|
|
||||||
SYMBOL: after
|
|
||||||
SYMBOL: char
|
|
||||||
|
|
||||||
: get-str ( i -- ch ) ind get + main-str get ?nth ;
|
: initial-medial? ( str i -- ? )
|
||||||
: current ( -- ch ) 0 get-str ;
|
{ [ swap nth initial? ] [ 1+ swap ?nth medial? ] } 2&& ;
|
||||||
: to ( -- ) ind inc ;
|
|
||||||
|
|
||||||
: initial-medial? ( -- ? )
|
: --final? ( str i -- ? )
|
||||||
current initial? [ 1 get-str medial? ] [ f ] if ;
|
2 + swap ?nth final? ;
|
||||||
|
|
||||||
: --final? ( -- ? )
|
: imf, ( str i -- str i )
|
||||||
2 get-str final? ;
|
[ tail-slice first3 jamo>hangul , ]
|
||||||
|
[ 3 + ] 2bi ;
|
||||||
|
|
||||||
: imf, ( -- )
|
: im, ( str i -- str i )
|
||||||
current to current to current jamo>hangul , ;
|
[ tail-slice first2 final-base jamo>hangul , ]
|
||||||
|
[ 2 + ] 2bi ;
|
||||||
|
|
||||||
: im, ( -- )
|
: compose-jamo ( str i -- str i )
|
||||||
current to current final-base jamo>hangul , ;
|
2dup initial-medial? [
|
||||||
|
2dup --final? [ imf, ] [ im, ] if
|
||||||
|
] [ 2dup swap nth , 1+ ] if ;
|
||||||
|
|
||||||
: compose-jamo ( -- )
|
: pass-combining ( str -- str i )
|
||||||
initial-medial? [
|
dup [ non-starter? not ] find drop
|
||||||
--final? [ imf, ] [ im, ] if
|
[ dup length ] unless*
|
||||||
] [ current , ] if to ;
|
2dup head-slice % ;
|
||||||
|
|
||||||
: pass-combining ( -- )
|
TUPLE: compose-state i str char after last-class ;
|
||||||
current non-starter? [ current , to pass-combining ] when ;
|
|
||||||
|
|
||||||
:: try-compose ( last-class new-char current-class -- new-class )
|
: get-str ( state i -- ch )
|
||||||
last-class current-class = [ new-char after get push last-class ] [
|
swap [ i>> + ] [ str>> ] bi ?nth ;
|
||||||
char get new-char combine-chars
|
: current ( state -- ch ) 0 get-str ;
|
||||||
[ char set last-class ]
|
: to ( state -- state ) [ 1+ ] change-i ;
|
||||||
[ new-char after get push current-class ] if*
|
: push-after ( ch state -- state ) [ ?push ] change-after ;
|
||||||
|
|
||||||
|
:: try-compose ( state new-char current-class -- state )
|
||||||
|
state last-class>> current-class =
|
||||||
|
[ new-char state push-after ] [
|
||||||
|
state char>> new-char combine-chars
|
||||||
|
[ state swap >>char ] [
|
||||||
|
new-char state push-after
|
||||||
|
current-class >>last-class
|
||||||
|
] if*
|
||||||
] if ;
|
] if ;
|
||||||
|
|
||||||
DEFER: compose-iter
|
DEFER: compose-iter
|
||||||
|
|
||||||
: try-noncombining ( char -- )
|
: try-noncombining ( char state -- state )
|
||||||
char get swap combine-chars
|
tuck char>> swap combine-chars
|
||||||
[ char set to f compose-iter ] when* ;
|
[ >>char to f >>last-class compose-iter ] when* ;
|
||||||
|
|
||||||
: compose-iter ( last-class -- )
|
: compose-iter ( state -- state )
|
||||||
current [
|
dup current [
|
||||||
dup combining-class {
|
dup combining-class {
|
||||||
{ f [ 2drop ] }
|
{ f [ drop ] }
|
||||||
{ 0 [ swap [ drop ] [ try-noncombining ] if ] }
|
{ 0 [
|
||||||
|
over last-class>>
|
||||||
|
[ drop ] [ swap try-noncombining ] if ] }
|
||||||
[ try-compose to compose-iter ]
|
[ try-compose to compose-iter ]
|
||||||
} case
|
} case
|
||||||
] [ drop ] if* ;
|
] when* ;
|
||||||
|
|
||||||
: ?new-after ( -- )
|
: compose-combining ( ch str i -- str i )
|
||||||
after [ dup empty? [ drop SBUF" " clone ] unless ] change ;
|
compose-state new
|
||||||
|
swap >>i
|
||||||
|
swap >>str
|
||||||
|
swap >>char
|
||||||
|
compose-iter
|
||||||
|
{ [ char>> , ] [ after>> % ] [ str>> ] [ i>> ] } cleave ;
|
||||||
|
|
||||||
: compose-combining ( ch -- )
|
:: (compose) ( str i -- )
|
||||||
char set to ?new-after
|
i str ?nth [
|
||||||
f compose-iter
|
dup jamo? [ drop str i compose-jamo ] [
|
||||||
char get , after get % ;
|
i 1+ str ?nth combining-class
|
||||||
|
[ str i 1+ compose-combining ] [ , str i 1+ ] if
|
||||||
: (compose) ( -- )
|
|
||||||
current [
|
|
||||||
dup jamo? [ drop compose-jamo ] [
|
|
||||||
1 get-str combining-class
|
|
||||||
[ compose-combining ] [ , to ] if
|
|
||||||
] if (compose)
|
] if (compose)
|
||||||
] when* ;
|
] when* ;
|
||||||
|
|
||||||
: combine ( str -- comp )
|
: combine ( str -- comp )
|
||||||
[
|
[ pass-combining (compose) ] "" make ;
|
||||||
main-str set
|
|
||||||
0 ind set
|
|
||||||
SBUF" " clone after set
|
|
||||||
pass-combining (compose)
|
|
||||||
] "" make ;
|
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue