Speeding up normalization

2009-01-09 14:03:33 -06:00 · 2009-01-09 14:03:33 -06:00 · 1ed964e539
parent 8d8efb6dce
commit 1ed964e539
3 changed files with 114 additions and 91 deletions
--- a/basis/unicode/case/case.factor
+++ b/basis/unicode/case/case.factor
@ -1,16 +1,18 @@
 ! Copyright (C) 2008 Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: unicode.data sequences sequences.next namespaces make unicode.syntax
+USING: unicode.data sequences sequences.next namespaces
-unicode.normalize math unicode.categories combinators unicode.syntax
+sbufs make unicode.syntax unicode.normalize math hints
-assocs strings splitting kernel accessors unicode.breaks fry ;
+unicode.categories combinators unicode.syntax assocs
 strings splitting kernel accessors unicode.breaks fry locals ;
 QUALIFIED: ascii
 IN: unicode.case
 <PRIVATE
-: at-default ( key assoc -- value/key ) [ at ] [ drop ] 2bi or ;
+: at-default ( key assoc -- value/key ) [ at ] [ drop ] 2bi or ; inline
-: ch>lower ( ch -- lower ) simple-lower at-default ;
+: ch>lower ( ch -- lower ) simple-lower at-default ; inline
-: ch>upper ( ch -- upper ) simple-upper at-default ;
+: ch>upper ( ch -- upper ) simple-upper at-default ; inline
-: ch>title ( ch -- title ) simple-title at-default ;
+: ch>title ( ch -- title ) simple-title at-default ; inline
 PRIVATE>
 SYMBOL: locale ! Just casing locale, or overall?
@ -21,7 +23,7 @@ SYMBOL: locale ! Just casing locale, or overall?
    [ dup ] swap '[ _ split1-slice swap ] [ ] produce nip ;
 : replace ( old new str -- newstr )
-    [ split-subseq ] dip join ;
+    [ split-subseq ] dip join ; inline
 : i-dot? ( -- ? )
    locale get { "tr" "az" } member? ;
@ -44,24 +46,24 @@ SYMBOL: locale ! Just casing locale, or overall?
    [ [ "" ] [
        dup first mark-above?
        [ CHAR: combining-dot-above prefix ] when
-    ] if-empty ] with-rest ;
+    ] if-empty ] with-rest ; inline
 : lithuanian>lower ( string -- lower )
    "i" split add-dots "i" join
-    "j" split add-dots "i" join ;
+    "j" split add-dots "i" join ; inline
 : turk>upper ( string -- upper-i )
-    "i" "I\u000307" replace ;
+    "i" "I\u000307" replace ; inline
 : turk>lower ( string -- lower-i )
    "I\u000307" "i" replace
-    "I" "\u000131" replace ;
+    "I" "\u000131" replace ; inline
 : fix-sigma-end ( string -- string )
    [ "" ] [
        dup peek CHAR: greek-small-letter-sigma =
        [ 1 head* CHAR: greek-small-letter-final-sigma suffix ] when
-    ] if-empty ;
+    ] if-empty ; inline
 : sigma-map ( string -- string )
    { CHAR: greek-capital-letter-sigma } split [ [
@ -70,19 +72,20 @@ SYMBOL: locale ! Just casing locale, or overall?
            CHAR: greek-small-letter-final-sigma
            CHAR: greek-small-letter-sigma ? prefix
        ] if-empty
-    ] map ] with-rest concat fix-sigma-end ;
+    ] map ] with-rest concat fix-sigma-end ; inline
 : final-sigma ( string -- string )
    CHAR: greek-capital-letter-sigma
-    over member? [ sigma-map ] when ;
+    over member? [ sigma-map ] when
    "" like ; inline
-: map-case ( string string-quot char-quot -- case )
+:: map-case ( string string-quot char-quot -- case )
-    [
+    string length <sbuf> :> out
-        [
+    string [
-            [ dup special-casing at ] 2dip
+        dup special-casing at
-            [ [ % ] compose ] [ [ , ] compose ] bi* ?if
+        [ string-quot call out push-all ]
-        ] 2curry each
+        [ char-quot call out push ] ?if
-    ] "" make ; inline
+    ] each out "" like ; inline
 PRIVATE>
@ -90,24 +93,30 @@ PRIVATE>
    i-dot? [ turk>lower ] when final-sigma
    [ lower>> ] [ ch>lower ] map-case ;
 HINTS: >lower string ;
 : >upper ( string -- upper )
    i-dot? [ turk>upper ] when
    [ upper>> ] [ ch>upper ] map-case ;
 HINTS: >upper string ;
 <PRIVATE
 : (>title) ( string -- title )
    i-dot? [ turk>upper ] when
-    [ title>> ] [ ch>title ] map-case ;
+    [ title>> ] [ ch>title ] map-case ; inline
 : title-word ( string -- title )
-    unclip 1string [ >lower ] [ (>title) ] bi* prepend ;
+    unclip 1string [ >lower ] [ (>title) ] bi* prepend ; inline
 PRIVATE>
 : >title ( string -- title )
    final-sigma >words [ title-word ] map concat ;
 HINTS: >title string ;
 : >case-fold ( string -- fold )
    >upper >lower ;
--- a/basis/unicode/normalize/normalize-tests.factor
+++ b/basis/unicode/normalize/normalize-tests.factor
@ -3,6 +3,8 @@ unicode.data io.encodings.utf8 io.files splitting math.parser
 locals math quotations assocs combinators unicode.normalize.private ;
 IN: unicode.normalize.tests
 { nfc nfkc nfd nfkd } [ must-infer ] each
 [ "ab\u000323\u000302cd" ] [ "ab\u000302" "\u000323cd" string-append ] unit-test
 [ "ab\u00064b\u000347\u00034e\u00034d\u000346" ] [ "ab\u000346\u000347\u00064b\u00034e\u00034d" dup reorder ] unit-test
--- a/basis/unicode/normalize/normalize.factor
+++ b/basis/unicode/normalize/normalize.factor
@ -1,8 +1,8 @@
 ! Copyright (C) 2008 Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: sequences namespaces make unicode.data kernel math arrays
+USING: ascii sequences namespaces make unicode.data kernel math arrays
 locals sorting.insertion accessors assocs math.order combinators
-unicode.syntax strings sbufs ;
+unicode.syntax strings sbufs hints combinators.short-circuit vectors ;
 IN: unicode.normalize
 <PRIVATE
@ -19,16 +19,16 @@ CONSTANT: medial-count 21
 CONSTANT: final-count 28
 : ?between? ( n/f from to -- ? )
-    pick [ between? ] [ 3drop f ] if ;
+    pick [ between? ] [ 3drop f ] if ; inline
-: hangul? ( ch -- ? ) hangul-base hangul-end ?between? ;
+: hangul? ( ch -- ? ) hangul-base hangul-end ?between? ; inline
-: jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ;
+: jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ; inline
 ! These numbers come from UAX 29
 : initial? ( ch -- ? )
-    dup HEX: 1100 HEX: 1159 ?between? [ ] [ HEX: 115F = ] ?if ;
+    dup HEX: 1100 HEX: 1159 ?between? [ ] [ HEX: 115F = ] ?if ; inline
-: medial? ( ch -- ? ) HEX: 1160 HEX: 11A2 ?between? ;
+: medial? ( ch -- ? ) HEX: 1160 HEX: 11A2 ?between? ; inline
-: final? ( ch -- ? ) HEX: 11A8 HEX: 11F9 ?between? ;
+: final? ( ch -- ? ) HEX: 11A8 HEX: 11F9 ?between? ; inline
 : hangul>jamo ( hangul -- jamo-string )
    hangul-base - final-count /mod final-base +
@ -48,16 +48,16 @@ CONSTANT: final-count 28
 : reorder-slice ( string start -- slice done? )
    2dup swap [ non-starter? not ] find-from drop
-    [ [ over length ] unless* rot <slice> ] keep not ;
+    [ [ over length ] unless* rot <slice> ] keep not ; inline
 : reorder-next ( string i -- new-i done? )
    over [ non-starter? ] find-from drop [
        reorder-slice
        [ dup [ combining-class ] insertion-sort to>> ] dip
-    ] [ length t ] if* ;
+    ] [ length t ] if* ; inline
 : reorder-loop ( string start -- )
-    dupd reorder-next [ 2drop ] [ reorder-loop ] if ;
+    dupd reorder-next [ 2drop ] [ reorder-loop ] if ; inline recursive
 : reorder ( string -- )
    0 reorder-loop ;
@ -66,12 +66,14 @@ CONSTANT: final-count 28
    over [ non-starter? not ] find-last-from drop ?1+ reorder-next 2drop ;
 :: decompose ( string quot -- decomposed )
-    [let | out [ string length <sbuf> ] |
+    string length <sbuf> :> out
-        string [
+    string [
        >fixnum dup ascii? [ out push ] [
            dup hangul? [ hangul>jamo out push-all ]
            [ dup quot call [ out push-all ] [ out push ] ?if ] if
-        ] each out >string
+        ] if
-    ] dup reorder ;
+    ] each
    out "" like dup reorder ; inline
 : with-string ( str quot -- str )
    over aux>> [ call ] [ drop ] if ; inline
@ -79,9 +81,13 @@ CONSTANT: final-count 28
 : (nfd) ( string -- nfd )
    [ canonical-entry ] decompose ;
 HINTS: (nfd) string ;
 : (nfkd) ( string -- nfkd )
    [ compatibility-entry ] decompose ;
 HINTS: (nfkd) string ;
 PRIVATE>
 : nfd ( string -- nfd )
@ -95,83 +101,89 @@ PRIVATE>
    0 over ?nth non-starter?
    [ length dupd reorder-back ] [ drop ] if ;
 HINTS: string-append string string ;
 <PRIVATE
 ! Normalization -- Composition
 SYMBOL: main-str
 SYMBOL: ind
 SYMBOL: after
 SYMBOL: char
-: get-str ( i -- ch ) ind get + main-str get ?nth ;
+: initial-medial? ( str i -- ? )
-: current ( -- ch ) 0 get-str ;
+    { [ swap nth initial? ] [ 1+ swap ?nth medial? ] } 2&& ;
 : to ( -- ) ind inc ;
-: initial-medial? ( -- ? )
+: --final? ( str i -- ? )
-    current initial? [ 1 get-str medial? ] [ f ] if ;
+    2 + swap ?nth final? ;
-: --final? ( -- ? )
+: imf, ( str i -- str i )
-    2 get-str final? ;
+    [ tail-slice first3 jamo>hangul , ]
    [ 3 + ] 2bi ;
-: imf, ( -- )
+: im, ( str i -- str i )
-    current to current to current jamo>hangul , ;
+    [ tail-slice first2 final-base jamo>hangul , ]
    [ 2 + ] 2bi ;
-: im, ( -- )
+: compose-jamo ( str i -- str i )
-    current to current final-base jamo>hangul , ;
+    2dup initial-medial? [
        2dup --final? [ imf, ] [ im, ] if
    ] [ 2dup swap nth , 1+ ] if ;
-: compose-jamo ( -- )
+: pass-combining ( str -- str i )
-    initial-medial? [
+    dup [ non-starter? not ] find drop
-        --final? [ imf, ] [ im, ] if
+    [ dup length ] unless*
-    ] [ current , ] if to ;
+    2dup head-slice % ;
-: pass-combining ( -- )
+TUPLE: compose-state i str char after last-class ;
    current non-starter? [ current , to pass-combining ] when ;
-:: try-compose ( last-class new-char current-class -- new-class )
+: get-str ( state i -- ch )
-    last-class current-class = [ new-char after get push last-class ] [
+    swap [ i>> + ] [ str>> ] bi ?nth ;
-        char get new-char combine-chars
+: current ( state -- ch ) 0 get-str ;
-        [ char set last-class ]
+: to ( state -- state ) [ 1+ ] change-i ;
-        [ new-char after get push current-class ] if*
+: push-after ( ch state -- state ) [ ?push ] change-after ;
 :: try-compose ( state new-char current-class -- state )
    state last-class>> current-class =
    [ new-char state push-after ] [
        state char>> new-char combine-chars
        [ state swap >>char ] [
            new-char state push-after
            current-class >>last-class
        ] if*
    ] if ;
 DEFER: compose-iter
-: try-noncombining ( char -- )
+: try-noncombining ( char state -- state )
-    char get swap combine-chars
+    tuck char>> swap combine-chars
-    [ char set to f compose-iter ] when* ;
+    [ >>char to f >>last-class compose-iter ] when* ;
-: compose-iter ( last-class -- )
+: compose-iter ( state -- state )
-    current [
+    dup current [
        dup combining-class {
-            { f [ 2drop ] }
+            { f [ drop ] }
-            { 0 [ swap [ drop ] [ try-noncombining ] if ] }
+            { 0 [
                over last-class>>
                [ drop ] [ swap try-noncombining ] if ] }
            [ try-compose to compose-iter ]
        } case
-    ] [ drop ] if* ;
+    ] when* ;
-: ?new-after ( -- )
+: compose-combining ( ch str i -- str i )
-    after [ dup empty? [ drop SBUF" " clone ] unless ] change ;
+    compose-state new
        swap >>i
        swap >>str
        swap >>char
    compose-iter
    { [ char>> , ] [ after>> % ] [ str>> ] [ i>> ] } cleave ;
-: compose-combining ( ch -- )
+:: (compose) ( str i -- )
-    char set to ?new-after
+    i str ?nth [
-    f compose-iter
+        dup jamo? [ drop str i compose-jamo ] [
-    char get , after get % ;
+            i 1+ str ?nth combining-class
-
+            [ str i 1+ compose-combining ] [ , str i 1+ ] if
 : (compose) ( -- )
    current [
        dup jamo? [ drop compose-jamo ] [
            1 get-str combining-class
            [ compose-combining ] [ , to ] if
        ] if (compose)
    ] when* ;
 : combine ( str -- comp )
-    [
+    [ pass-combining (compose) ] "" make ;
        main-str set
        0 ind set
        SBUF" " clone after set
        pass-combining (compose)
    ] "" make ;
 PRIVATE>