From 1dd34ad776453109d25054bab8f8e59385a18a0b Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Wed, 18 Mar 2009 21:04:36 -0500 Subject: [PATCH] Reorganizing Unicode data, so that the data tables are private; ch>upper moves to unicode.data --- basis/regexp/nfa/nfa.factor | 4 +- basis/unicode/breaks/breaks.factor | 7 ++-- basis/unicode/case/case-tests.factor | 3 +- basis/unicode/case/case.factor | 8 +--- basis/unicode/data/data-docs.factor | 7 +++- basis/unicode/data/data.factor | 62 ++++++++++++++++++++++++---- basis/unicode/script/script.factor | 28 +------------ 7 files changed, 70 insertions(+), 49 deletions(-) diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index f04e88070a..a692f70778 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -3,11 +3,11 @@ USING: accessors arrays assocs grouping kernel locals math namespaces sequences fry quotations math.order math.ranges vectors unicode.categories regexp.transition-tables words sets hashtables -combinators.short-circuit unicode.case unicode.case.private regexp.ast +combinators.short-circuit unicode.data regexp.ast regexp.classes memoize ; IN: regexp.nfa -! This uses unicode.case.private for ch>upper and ch>lower +! This uses unicode.data for ch>upper and ch>lower ! but case-insensitive matching should be done by case-folding everything ! before processing starts diff --git a/basis/unicode/breaks/breaks.factor b/basis/unicode/breaks/breaks.factor index f2e9454545..91f6a45911 100644 --- a/basis/unicode/breaks/breaks.factor +++ b/basis/unicode/breaks/breaks.factor @@ -2,8 +2,9 @@ ! See http://factorcode.org/license.txt for BSD license. USING: combinators.short-circuit unicode.categories kernel math combinators splitting sequences math.parser io.files io assocs -arrays namespaces make math.ranges unicode.normalize.private values -io.encodings.ascii unicode.syntax unicode.data compiler.units fry +arrays namespaces make math.ranges unicode.normalize +unicode.normalize.private values io.encodings.ascii +unicode.syntax unicode.data compiler.units fry alien.syntax sets accessors interval-maps memoize locals words ; IN: unicode.breaks @@ -126,7 +127,7 @@ to: grapheme-table VALUE: word-break-table -"vocab:unicode/data/WordBreakProperty.txt" load-script +"vocab:unicode/data/WordBreakProperty.txt" load-key-value to: word-break-table C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter diff --git a/basis/unicode/case/case-tests.factor b/basis/unicode/case/case-tests.factor index 52a8d9755e..a76f5e78c4 100644 --- a/basis/unicode/case/case-tests.factor +++ b/basis/unicode/case/case-tests.factor @@ -1,6 +1,7 @@ ! Copyright (C) 2008, 2009 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: unicode.case unicode.case.private tools.test namespaces strings unicode.normalize ; +USING: unicode.case tools.test namespaces strings unicode.normalize +unicode.case.private ; IN: unicode.case.tests \ >upper must-infer diff --git a/basis/unicode/case/case.factor b/basis/unicode/case/case.factor index c75582dacd..fa842b8b81 100644 --- a/basis/unicode/case/case.factor +++ b/basis/unicode/case/case.factor @@ -7,12 +7,6 @@ strings splitting kernel accessors unicode.breaks fry locals ; QUALIFIED: ascii IN: unicode.case -lower ( ch -- lower ) simple-lower at-default ; inline -: ch>upper ( ch -- upper ) simple-upper at-default ; inline -: ch>title ( ch -- title ) simple-title at-default ; inline -PRIVATE> - SYMBOL: locale ! Just casing locale, or overall? :> out string [ - dup special-casing at + dup special-case [ string-quot call out push-all ] [ char-quot call out push ] ?if ] each out "" like ; inline diff --git a/basis/unicode/data/data-docs.factor b/basis/unicode/data/data-docs.factor index 0123c98a67..d1a458eb48 100644 --- a/basis/unicode/data/data-docs.factor +++ b/basis/unicode/data/data-docs.factor @@ -13,7 +13,8 @@ ARTICLE: "unicode.data" "Unicode data tables" { $subsection non-starter? } { $subsection name>char } { $subsection char>name } -{ $subsection property? } ; +{ $subsection property? } +{ $subsection load-key-value } ; HELP: canonical-entry { $values { "char" "a code point" } { "seq" string } } @@ -46,3 +47,7 @@ HELP: name>char HELP: property? { $values { "char" "a code point" } { "property" string } { "?" "a boolean" } } { $description "Tests whether the code point is listed under the given property in PropList.txt in the Unicode Character Database." } ; + +HELP: load-key-value +{ $values { "filename" string } { "table" "an interval map" } } +{ $description "This loads a file that looks like Script.txt in the Unicode Character Database and converts it into an efficient interval map, where the keys are characters and the values are strings for the properties." } ; diff --git a/basis/unicode/data/data.factor b/basis/unicode/data/data.factor index a1f663d03a..93df3d5a8c 100644 --- a/basis/unicode/data/data.factor +++ b/basis/unicode/data/data.factor @@ -8,6 +8,8 @@ ascii sets combinators locals math.ranges sorting make strings.parser io.encodings.utf8 memoize ; IN: unicode.data +char ( name -- char ) name-map at ; -: char>name ( char -- name ) name-map value-at ; -: property? ( char property -- ? ) properties at interval-key? ; +PRIVATE> + +: canonical-entry ( char -- seq ) canonical-map at ; inline +: combine-chars ( a b -- char/f ) combine-map hash2 ; inline +: compatibility-entry ( char -- seq ) compatibility-map at ; inline +: combining-class ( char -- n ) class-map at ; inline +: non-starter? ( char -- ? ) combining-class { 0 f } member? not ; inline +: name>char ( name -- char ) name-map at ; inline +: char>name ( char -- name ) name-map value-at ; inline +: property? ( char property -- ? ) properties at interval-key? ; inline +: ch>lower ( ch -- lower ) simple-lower at-default ; inline +: ch>upper ( ch -- upper ) simple-upper at-default ; inline +: ch>title ( ch -- title ) simple-title at-default ; inline +: special-case ( ch -- casing-tuple ) special-casing at ; inline ! For non-existent characters, use Cn CONSTANT: categories @@ -40,11 +48,15 @@ CONSTANT: categories "Zs" "Zl" "Zp" "Cc" "Cf" "Cs" "Co" } + [ swap ] H{ } assoc-map-as ; CONSTANT: num-chars HEX: 2FA1E +PRIVATE> + : category# ( char -- category ) ! There are a few characters that should be Cn ! that this gives Cf or Mn @@ -60,6 +72,8 @@ CONSTANT: num-chars HEX: 2FA1E : category ( char -- category ) category# categories nth ; + ] map sift ; +PRIVATE> + TUPLE: code-point lower title upper ; C: code-point + swap first set ; @@ -212,3 +230,31 @@ load-properties to: properties [ name>char [ "Invalid character" throw ] unless* ] name>char-hook set-global + +SYMBOL: interned + +: parse-key-value ( filename -- assoc ) + ! assoc is code point/range => name + ascii file-lines filter-comments [ split-; ] map ; + +: range, ( value key -- ) + swap interned get + [ = ] with find nip 2array , ; + +: expand-ranges ( assoc -- interval-map ) + [ + [ + swap CHAR: . over member? [ + ".." split1 [ hex> ] bi@ 2array + ] [ hex> ] if range, + ] assoc-each + ] { } make ; + +: process-key-value ( ranges -- table ) + dup values prune interned + [ expand-ranges ] with-variable ; + +PRIVATE> + +: load-key-value ( filename -- table ) + parse-key-value process-key-value ; diff --git a/basis/unicode/script/script.factor b/basis/unicode/script/script.factor index c8f818dbaa..ed80476084 100644 --- a/basis/unicode/script/script.factor +++ b/basis/unicode/script/script.factor @@ -9,35 +9,9 @@ IN: unicode.script name - ascii file-lines filter-comments [ split-; ] map ; - -: range, ( value key -- ) - swap interned get - [ = ] with find nip 2array , ; - -: expand-ranges ( assoc -- interval-map ) - [ - [ - swap CHAR: . over member? [ - ".." split1 [ hex> ] bi@ 2array - ] [ hex> ] if range, - ] assoc-each - ] { } make ; - -: process-script ( ranges -- table ) - dup values prune interned - [ expand-ranges ] with-variable ; - -: load-script ( filename -- table ) - parse-script process-script ; - VALUE: script-table -"vocab:unicode/script/Scripts.txt" load-script +"vocab:unicode/script/Scripts.txt" load-key-value to: script-table PRIVATE>