From 1ed964e53989341a94875dcf5d547dbba9b158e9 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Fri, 9 Jan 2009 14:03:33 -0600 Subject: [PATCH] Speeding up normalization --- basis/unicode/case/case.factor | 57 ++++--- .../unicode/normalize/normalize-tests.factor | 2 + basis/unicode/normalize/normalize.factor | 146 ++++++++++-------- 3 files changed, 114 insertions(+), 91 deletions(-) diff --git a/basis/unicode/case/case.factor b/basis/unicode/case/case.factor index 773bbeed5f..555a39ac88 100644 --- a/basis/unicode/case/case.factor +++ b/basis/unicode/case/case.factor @@ -1,16 +1,18 @@ ! Copyright (C) 2008 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: unicode.data sequences sequences.next namespaces make unicode.syntax -unicode.normalize math unicode.categories combinators unicode.syntax -assocs strings splitting kernel accessors unicode.breaks fry ; +USING: unicode.data sequences sequences.next namespaces +sbufs make unicode.syntax unicode.normalize math hints +unicode.categories combinators unicode.syntax assocs +strings splitting kernel accessors unicode.breaks fry locals ; +QUALIFIED: ascii IN: unicode.case lower ( ch -- lower ) simple-lower at-default ; -: ch>upper ( ch -- upper ) simple-upper at-default ; -: ch>title ( ch -- title ) simple-title at-default ; +: ch>lower ( ch -- lower ) simple-lower at-default ; inline +: ch>upper ( ch -- upper ) simple-upper at-default ; inline +: ch>title ( ch -- title ) simple-title at-default ; inline PRIVATE> SYMBOL: locale ! Just casing locale, or overall? @@ -21,7 +23,7 @@ SYMBOL: locale ! Just casing locale, or overall? [ dup ] swap '[ _ split1-slice swap ] [ ] produce nip ; : replace ( old new str -- newstr ) - [ split-subseq ] dip join ; + [ split-subseq ] dip join ; inline : i-dot? ( -- ? ) locale get { "tr" "az" } member? ; @@ -44,24 +46,24 @@ SYMBOL: locale ! Just casing locale, or overall? [ [ "" ] [ dup first mark-above? [ CHAR: combining-dot-above prefix ] when - ] if-empty ] with-rest ; + ] if-empty ] with-rest ; inline : lithuanian>lower ( string -- lower ) "i" split add-dots "i" join - "j" split add-dots "i" join ; + "j" split add-dots "i" join ; inline : turk>upper ( string -- upper-i ) - "i" "I\u000307" replace ; + "i" "I\u000307" replace ; inline : turk>lower ( string -- lower-i ) "I\u000307" "i" replace - "I" "\u000131" replace ; + "I" "\u000131" replace ; inline : fix-sigma-end ( string -- string ) [ "" ] [ dup peek CHAR: greek-small-letter-sigma = [ 1 head* CHAR: greek-small-letter-final-sigma suffix ] when - ] if-empty ; + ] if-empty ; inline : sigma-map ( string -- string ) { CHAR: greek-capital-letter-sigma } split [ [ @@ -70,19 +72,20 @@ SYMBOL: locale ! Just casing locale, or overall? CHAR: greek-small-letter-final-sigma CHAR: greek-small-letter-sigma ? prefix ] if-empty - ] map ] with-rest concat fix-sigma-end ; + ] map ] with-rest concat fix-sigma-end ; inline : final-sigma ( string -- string ) CHAR: greek-capital-letter-sigma - over member? [ sigma-map ] when ; + over member? [ sigma-map ] when + "" like ; inline -: map-case ( string string-quot char-quot -- case ) - [ - [ - [ dup special-casing at ] 2dip - [ [ % ] compose ] [ [ , ] compose ] bi* ?if - ] 2curry each - ] "" make ; inline +:: map-case ( string string-quot char-quot -- case ) + string length :> out + string [ + dup special-casing at + [ string-quot call out push-all ] + [ char-quot call out push ] ?if + ] each out "" like ; inline PRIVATE> @@ -90,24 +93,30 @@ PRIVATE> i-dot? [ turk>lower ] when final-sigma [ lower>> ] [ ch>lower ] map-case ; +HINTS: >lower string ; + : >upper ( string -- upper ) i-dot? [ turk>upper ] when [ upper>> ] [ ch>upper ] map-case ; +HINTS: >upper string ; + title) ( string -- title ) i-dot? [ turk>upper ] when - [ title>> ] [ ch>title ] map-case ; + [ title>> ] [ ch>title ] map-case ; inline : title-word ( string -- title ) - unclip 1string [ >lower ] [ (>title) ] bi* prepend ; + unclip 1string [ >lower ] [ (>title) ] bi* prepend ; inline PRIVATE> : >title ( string -- title ) final-sigma >words [ title-word ] map concat ; +HINTS: >title string ; + : >case-fold ( string -- fold ) >upper >lower ; diff --git a/basis/unicode/normalize/normalize-tests.factor b/basis/unicode/normalize/normalize-tests.factor index 25d5ce365c..1242e1d358 100644 --- a/basis/unicode/normalize/normalize-tests.factor +++ b/basis/unicode/normalize/normalize-tests.factor @@ -3,6 +3,8 @@ unicode.data io.encodings.utf8 io.files splitting math.parser locals math quotations assocs combinators unicode.normalize.private ; IN: unicode.normalize.tests +{ nfc nfkc nfd nfkd } [ must-infer ] each + [ "ab\u000323\u000302cd" ] [ "ab\u000302" "\u000323cd" string-append ] unit-test [ "ab\u00064b\u000347\u00034e\u00034d\u000346" ] [ "ab\u000346\u000347\u00064b\u00034e\u00034d" dup reorder ] unit-test diff --git a/basis/unicode/normalize/normalize.factor b/basis/unicode/normalize/normalize.factor index 7a41a768cd..f7aa248028 100644 --- a/basis/unicode/normalize/normalize.factor +++ b/basis/unicode/normalize/normalize.factor @@ -1,8 +1,8 @@ ! Copyright (C) 2008 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: sequences namespaces make unicode.data kernel math arrays +USING: ascii sequences namespaces make unicode.data kernel math arrays locals sorting.insertion accessors assocs math.order combinators -unicode.syntax strings sbufs ; +unicode.syntax strings sbufs hints combinators.short-circuit vectors ; IN: unicode.normalize jamo ( hangul -- jamo-string ) hangul-base - final-count /mod final-base + @@ -48,16 +48,16 @@ CONSTANT: final-count 28 : reorder-slice ( string start -- slice done? ) 2dup swap [ non-starter? not ] find-from drop - [ [ over length ] unless* rot ] keep not ; + [ [ over length ] unless* rot ] keep not ; inline : reorder-next ( string i -- new-i done? ) over [ non-starter? ] find-from drop [ reorder-slice [ dup [ combining-class ] insertion-sort to>> ] dip - ] [ length t ] if* ; + ] [ length t ] if* ; inline : reorder-loop ( string start -- ) - dupd reorder-next [ 2drop ] [ reorder-loop ] if ; + dupd reorder-next [ 2drop ] [ reorder-loop ] if ; inline recursive : reorder ( string -- ) 0 reorder-loop ; @@ -66,12 +66,14 @@ CONSTANT: final-count 28 over [ non-starter? not ] find-last-from drop ?1+ reorder-next 2drop ; :: decompose ( string quot -- decomposed ) - [let | out [ string length ] | - string [ + string length :> out + string [ + >fixnum dup ascii? [ out push ] [ dup hangul? [ hangul>jamo out push-all ] [ dup quot call [ out push-all ] [ out push ] ?if ] if - ] each out >string - ] dup reorder ; + ] if + ] each + out "" like dup reorder ; inline : with-string ( str quot -- str ) over aux>> [ call ] [ drop ] if ; inline @@ -79,9 +81,13 @@ CONSTANT: final-count 28 : (nfd) ( string -- nfd ) [ canonical-entry ] decompose ; +HINTS: (nfd) string ; + : (nfkd) ( string -- nfkd ) [ compatibility-entry ] decompose ; +HINTS: (nfkd) string ; + PRIVATE> : nfd ( string -- nfd ) @@ -95,83 +101,89 @@ PRIVATE> 0 over ?nth non-starter? [ length dupd reorder-back ] [ drop ] if ; +HINTS: string-append string string ; + hangul , ] + [ 3 + ] 2bi ; -: imf, ( -- ) - current to current to current jamo>hangul , ; +: im, ( str i -- str i ) + [ tail-slice first2 final-base jamo>hangul , ] + [ 2 + ] 2bi ; -: im, ( -- ) - current to current final-base jamo>hangul , ; +: compose-jamo ( str i -- str i ) + 2dup initial-medial? [ + 2dup --final? [ imf, ] [ im, ] if + ] [ 2dup swap nth , 1+ ] if ; -: compose-jamo ( -- ) - initial-medial? [ - --final? [ imf, ] [ im, ] if - ] [ current , ] if to ; +: pass-combining ( str -- str i ) + dup [ non-starter? not ] find drop + [ dup length ] unless* + 2dup head-slice % ; -: pass-combining ( -- ) - current non-starter? [ current , to pass-combining ] when ; +TUPLE: compose-state i str char after last-class ; -:: try-compose ( last-class new-char current-class -- new-class ) - last-class current-class = [ new-char after get push last-class ] [ - char get new-char combine-chars - [ char set last-class ] - [ new-char after get push current-class ] if* +: get-str ( state i -- ch ) + swap [ i>> + ] [ str>> ] bi ?nth ; +: current ( state -- ch ) 0 get-str ; +: to ( state -- state ) [ 1+ ] change-i ; +: push-after ( ch state -- state ) [ ?push ] change-after ; + +:: try-compose ( state new-char current-class -- state ) + state last-class>> current-class = + [ new-char state push-after ] [ + state char>> new-char combine-chars + [ state swap >>char ] [ + new-char state push-after + current-class >>last-class + ] if* ] if ; DEFER: compose-iter -: try-noncombining ( char -- ) - char get swap combine-chars - [ char set to f compose-iter ] when* ; +: try-noncombining ( char state -- state ) + tuck char>> swap combine-chars + [ >>char to f >>last-class compose-iter ] when* ; -: compose-iter ( last-class -- ) - current [ +: compose-iter ( state -- state ) + dup current [ dup combining-class { - { f [ 2drop ] } - { 0 [ swap [ drop ] [ try-noncombining ] if ] } + { f [ drop ] } + { 0 [ + over last-class>> + [ drop ] [ swap try-noncombining ] if ] } [ try-compose to compose-iter ] } case - ] [ drop ] if* ; + ] when* ; -: ?new-after ( -- ) - after [ dup empty? [ drop SBUF" " clone ] unless ] change ; +: compose-combining ( ch str i -- str i ) + compose-state new + swap >>i + swap >>str + swap >>char + compose-iter + { [ char>> , ] [ after>> % ] [ str>> ] [ i>> ] } cleave ; -: compose-combining ( ch -- ) - char set to ?new-after - f compose-iter - char get , after get % ; - -: (compose) ( -- ) - current [ - dup jamo? [ drop compose-jamo ] [ - 1 get-str combining-class - [ compose-combining ] [ , to ] if +:: (compose) ( str i -- ) + i str ?nth [ + dup jamo? [ drop str i compose-jamo ] [ + i 1+ str ?nth combining-class + [ str i 1+ compose-combining ] [ , str i 1+ ] if ] if (compose) ] when* ; : combine ( str -- comp ) - [ - main-str set - 0 ind set - SBUF" " clone after set - pass-combining (compose) - ] "" make ; + [ pass-combining (compose) ] "" make ; PRIVATE>