From cf3473cc911c5f0b404675217d2196e1a080f611 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Thu, 8 Jan 2009 17:56:52 -0600 Subject: [PATCH] Making normalization and case conversion faster --- basis/unicode/case/case.factor | 2 +- basis/unicode/data/data.factor | 9 +++- basis/unicode/normalize/normalize.factor | 56 ++++++++++++++---------- 3 files changed, 42 insertions(+), 25 deletions(-) diff --git a/basis/unicode/case/case.factor b/basis/unicode/case/case.factor index b0472cd9cb..99278cd72e 100644 --- a/basis/unicode/case/case.factor +++ b/basis/unicode/case/case.factor @@ -18,7 +18,7 @@ SYMBOL: locale ! Just casing locale, or overall? char ( name -- char ) name-map at ; : char>name ( char -- name ) name-map value-at ; : property? ( char property -- ? ) properties at interval-key? ; @@ -183,6 +183,13 @@ load-data { [ process-category to: category-map ] } cleave +: postprocess-class ( -- ) + combine-map [ [ second ] map ] map concat + [ combining-class not ] filter + [ 0 swap class-map set-at ] each ; + +postprocess-class + load-special-casing to: special-casing load-properties to: properties diff --git a/basis/unicode/normalize/normalize.factor b/basis/unicode/normalize/normalize.factor index 58ce412a2e..c8d0eb3f7d 100644 --- a/basis/unicode/normalize/normalize.factor +++ b/basis/unicode/normalize/normalize.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2008 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: sequences namespaces make unicode.data kernel math arrays -locals sorting.insertion accessors assocs math.order ; +locals sorting.insertion accessors assocs math.order combinators ; IN: unicode.normalize jamo % ] - [ dup quot call [ % ] [ , ] ?if ] if - ] each - ] "" make - dup reorder - ] if ; inline + [ + string [ + dup hangul? [ hangul>jamo % ] + [ dup quot call [ % ] [ , ] ?if ] if + ] each + ] "" make + dup reorder ; + +: with-string ( str quot -- str ) + over aux>> [ call ] [ drop ] if ; inline + +: (nfd) ( string -- nfd ) + [ canonical-entry ] decompose ; + +: (nfkd) ( string -- nfkd ) + [ compatibility-entry ] decompose ; PRIVATE> : nfd ( string -- nfd ) - [ canonical-entry ] decompose ; + [ (nfd) ] with-string ; : nfkd ( string -- nfkd ) - [ compatibility-entry ] decompose ; + [ (nfkd) ] with-string ; : string-append ( s1 s2 -- string ) [ append ] keep @@ -138,20 +142,26 @@ DEFER: compose-iter : compose-iter ( last-class -- ) current [ - dup combining-class - [ try-compose to compose-iter ] - [ swap [ drop ] [ try-noncombining ] if ] if* + dup combining-class { + { f [ 2drop ] } + { 0 [ swap [ drop ] [ try-noncombining ] if ] } + [ try-compose to compose-iter ] + } case ] [ drop ] if* ; : ?new-after ( -- ) after [ dup empty? [ drop SBUF" " clone ] unless ] change ; +: compose-combining ( ch -- ) + char set to ?new-after + f compose-iter + char get , after get % ; + : (compose) ( -- ) current [ dup jamo? [ drop compose-jamo ] [ - char set to ?new-after - f compose-iter - char get , after get % + 1 get-str combining-class + [ compose-combining ] [ , to ] if ] if (compose) ] when* ; @@ -166,7 +176,7 @@ DEFER: compose-iter PRIVATE> : nfc ( string -- nfc ) - nfd combine ; + [ (nfd) combine ] with-string ; : nfkc ( string -- nfkc ) - nfkd combine ; + [ (nfkd) combine ] with-string ;