From 9d3d3f815faf2bcb5f4479bb7cc7abe901b9136d Mon Sep 17 00:00:00 2001
From: Daniel Ehrenberg
Date: Mon, 5 Jan 2009 22:19:14 -0600
Subject: [PATCH] Unicode normalization bug fixes (incomplete)

---
Reviewer notes (this area is ignored by `git am`):

* data.factor: adds `make` to the USING: list and inserts a blank line
  after `num-chars`.
* normalize-tests.factor: replaces the commented-out
  `parse-test [ run-line ] each` with a live run over `1000 head` of
  the parsed tests.
* normalize.factor: adds `assocs` to USING:. `try-compose` becomes a
  `::` word with named inputs and now leaves a class on the stack:
  last-class when the two classes are equal or when `combine-chars`
  succeeds, current-class when new-char is pushed onto `after`
  uncombined. `compose-iter` feeds that result to its recursive call
  instead of `keep`ing the current class. One `to` before
  `] if (compose)` is removed.
* NOTE(review): line breaks and leading indentation were reconstructed
  from the hunk headers and the diffstat; the counts agree, but exact
  whitespace of context lines is inferred. If a hunk fails to apply,
  retry with `git apply --ignore-whitespace` or `patch -l`.

 basis/unicode/data/data.factor                 |    3 ++-
 basis/unicode/normalize/normalize-tests.factor |    2 +-
 basis/unicode/normalize/normalize.factor       |   16 ++++++++--------
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/basis/unicode/data/data.factor b/basis/unicode/data/data.factor
index 80cf40fbf1..f86dccb555 100644
--- a/basis/unicode/data/data.factor
+++ b/basis/unicode/data/data.factor
@@ -4,7 +4,7 @@ USING: combinators.short-circuit assocs math kernel sequences
 io.files hashtables quotations splitting grouping arrays
 math.parser hash2 math.order byte-arrays words namespaces words
 compiler.units parser io.encodings.ascii values interval-maps
-ascii sets combinators locals math.ranges sorting ;
+ascii sets combinators locals math.ranges sorting make ;
 IN: unicode.data
 
 VALUE: simple-lower
@@ -102,6 +102,7 @@ VALUE: properties
       "Cc" "Cf" "Cs" "Co" } ;
 
 : num-chars HEX: 2FA1E ;
+
 ! the maximum unicode char in the first 3 planes
 
 : ?set-nth ( val index seq -- )
diff --git a/basis/unicode/normalize/normalize-tests.factor b/basis/unicode/normalize/normalize-tests.factor
index cae1380ab4..6970e1a2b6 100644
--- a/basis/unicode/normalize/normalize-tests.factor
+++ b/basis/unicode/normalize/normalize-tests.factor
@@ -41,4 +41,4 @@ IN: unicode.normalize.tests
         [ { { 5 { 1 2 3 4 5 } } } [ nfkd ] assert= ]
     } cleave ;
 
-! parse-test [ run-line ] each
+parse-test 1000 head [ run-line ] each
diff --git a/basis/unicode/normalize/normalize.factor b/basis/unicode/normalize/normalize.factor
index 0c00f526c7..0e1881785f 100644
--- a/basis/unicode/normalize/normalize.factor
+++ b/basis/unicode/normalize/normalize.factor
@@ -1,7 +1,7 @@
 ! Copyright (C) 2008 Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: sequences namespaces make unicode.data kernel math arrays
-locals sorting.insertion accessors ;
+locals sorting.insertion accessors assocs ;
 IN: unicode.normalize
 
 ! Conjoining Jamo behavior
@@ -117,16 +117,17 @@ SYMBOL: char
 : pass-combining ( -- )
     current non-starter? [ current , to pass-combining ] when ;
 
-: try-compose ( last-class char current-class -- )
-    swapd = [ after get push ] [
-        char get over combine-chars
-        [ nip char set ] [ after get push ] if*
+:: try-compose ( last-class new-char current-class -- new-class )
+    last-class current-class = [ new-char after get push last-class ] [
+        char get new-char combine-chars
+        [ char set last-class ]
+        [ new-char after get push current-class ] if*
     ] if ;
 
-: compose-iter ( n -- )
+: compose-iter ( last-class -- )
     current [
         dup combining-class dup
-        [ [ try-compose ] keep to compose-iter ] [ 3drop ] if
+        [ try-compose to compose-iter ] [ 3drop ] if
     ] [ drop ] if* ;
 
 : ?new-after ( -- )
@@ -138,7 +139,6 @@ SYMBOL: char
                 char set to ?new-after
                 0 compose-iter
                 char get , after get %
-                to
             ] if (compose)
         ] when* ;
 