Reorganizing Unicode data, so that the data tables are private; ch>upper moves to unicode.data

db4
Daniel Ehrenberg 2009-03-18 21:04:36 -05:00
parent 3e63d3eb01
commit 1dd34ad776
7 changed files with 70 additions and 49 deletions

View File

@ -3,11 +3,11 @@
USING: accessors arrays assocs grouping kernel locals math namespaces
sequences fry quotations math.order math.ranges vectors
unicode.categories regexp.transition-tables words sets hashtables
combinators.short-circuit unicode.case unicode.case.private regexp.ast
combinators.short-circuit unicode.data regexp.ast
regexp.classes memoize ;
IN: regexp.nfa
! This uses unicode.case.private for ch>upper and ch>lower
! This uses unicode.data for ch>upper and ch>lower
! but case-insensitive matching should be done by case-folding everything
! before processing starts

View File

@ -2,8 +2,9 @@
! See http://factorcode.org/license.txt for BSD license.
USING: combinators.short-circuit unicode.categories kernel math
combinators splitting sequences math.parser io.files io assocs
arrays namespaces make math.ranges unicode.normalize.private values
io.encodings.ascii unicode.syntax unicode.data compiler.units fry
arrays namespaces make math.ranges unicode.normalize
unicode.normalize.private values io.encodings.ascii
unicode.syntax unicode.data compiler.units fry
alien.syntax sets accessors interval-maps memoize locals words ;
IN: unicode.breaks
@ -126,7 +127,7 @@ to: grapheme-table
VALUE: word-break-table
"vocab:unicode/data/WordBreakProperty.txt" load-script
"vocab:unicode/data/WordBreakProperty.txt" load-key-value
to: word-break-table
C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter

View File

@ -1,6 +1,7 @@
! Copyright (C) 2008, 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: unicode.case unicode.case.private tools.test namespaces strings unicode.normalize ;
USING: unicode.case tools.test namespaces strings unicode.normalize
unicode.case.private ;
IN: unicode.case.tests
\ >upper must-infer

View File

@ -7,12 +7,6 @@ strings splitting kernel accessors unicode.breaks fry locals ;
QUALIFIED: ascii
IN: unicode.case
<PRIVATE
: ch>lower ( ch -- lower ) simple-lower at-default ; inline
: ch>upper ( ch -- upper ) simple-upper at-default ; inline
: ch>title ( ch -- title ) simple-title at-default ; inline
PRIVATE>
SYMBOL: locale ! Just casing locale, or overall?
<PRIVATE
@ -86,7 +80,7 @@ SYMBOL: locale ! Just casing locale, or overall?
:: map-case ( string string-quot char-quot -- case )
string length <sbuf> :> out
string [
dup special-casing at
dup special-case
[ string-quot call out push-all ]
[ char-quot call out push ] ?if
] each out "" like ; inline

View File

@ -13,7 +13,8 @@ ARTICLE: "unicode.data" "Unicode data tables"
{ $subsection non-starter? }
{ $subsection name>char }
{ $subsection char>name }
{ $subsection property? } ;
{ $subsection property? }
{ $subsection load-key-value } ;
HELP: canonical-entry
{ $values { "char" "a code point" } { "seq" string } }
@ -46,3 +47,7 @@ HELP: name>char
HELP: property?
{ $values { "char" "a code point" } { "property" string } { "?" "a boolean" } }
{ $description "Tests whether the code point is listed under the given property in PropList.txt in the Unicode Character Database." } ;
HELP: load-key-value
{ $values { "filename" string } { "table" "an interval map" } }
{ $description "This loads a file that looks like Script.txt in the Unicode Character Database and converts it into an efficient interval map, where the keys are characters and the values are strings for the properties." } ;

View File

@ -8,6 +8,8 @@ ascii sets combinators locals math.ranges sorting make
strings.parser io.encodings.utf8 memoize ;
IN: unicode.data
<PRIVATE
VALUE: simple-lower
VALUE: simple-upper
VALUE: simple-title
@ -20,14 +22,20 @@ VALUE: name-map
VALUE: special-casing
VALUE: properties
: canonical-entry ( char -- seq ) canonical-map at ;
: combine-chars ( a b -- char/f ) combine-map hash2 ;
: compatibility-entry ( char -- seq ) compatibility-map at ;
: combining-class ( char -- n ) class-map at ;
: non-starter? ( char -- ? ) combining-class { 0 f } member? not ;
: name>char ( name -- char ) name-map at ;
: char>name ( char -- name ) name-map value-at ;
: property? ( char property -- ? ) properties at interval-key? ;
PRIVATE>
: canonical-entry ( char -- seq ) canonical-map at ; inline
: combine-chars ( a b -- char/f ) combine-map hash2 ; inline
: compatibility-entry ( char -- seq ) compatibility-map at ; inline
: combining-class ( char -- n ) class-map at ; inline
: non-starter? ( char -- ? ) combining-class { 0 f } member? not ; inline
: name>char ( name -- char ) name-map at ; inline
: char>name ( char -- name ) name-map value-at ; inline
: property? ( char property -- ? ) properties at interval-key? ; inline
: ch>lower ( ch -- lower ) simple-lower at-default ; inline
: ch>upper ( ch -- upper ) simple-upper at-default ; inline
: ch>title ( ch -- title ) simple-title at-default ; inline
: special-case ( ch -- casing-tuple ) special-casing at ; inline
! For non-existent characters, use Cn
CONSTANT: categories
@ -40,11 +48,15 @@ CONSTANT: categories
"Zs" "Zl" "Zp"
"Cc" "Cf" "Cs" "Co" }
<PRIVATE
MEMO: categories-map ( -- hashtable )
categories <enum> [ swap ] H{ } assoc-map-as ;
CONSTANT: num-chars HEX: 2FA1E
PRIVATE>
: category# ( char -- category )
! There are a few characters that should be Cn
! that this gives Cf or Mn
@ -60,6 +72,8 @@ CONSTANT: num-chars HEX: 2FA1E
: category ( char -- category )
category# categories nth ;
<PRIVATE
! Loading data from UnicodeData.txt
: split-; ( line -- array )
@ -155,10 +169,14 @@ CONSTANT: num-chars HEX: 2FA1E
: multihex ( hexstring -- string )
" " split [ hex> ] map sift ;
PRIVATE>
TUPLE: code-point lower title upper ;
C: <code-point> code-point
<PRIVATE
: set-code-point ( seq -- )
4 head [ multihex ] map first4
<code-point> swap first set ;
@ -212,3 +230,31 @@ load-properties to: properties
[ name>char [ "Invalid character" throw ] unless* ]
name>char-hook set-global
SYMBOL: interned
: parse-key-value ( filename -- assoc )
! assoc is code point/range => name
ascii file-lines filter-comments [ split-; ] map ;
: range, ( value key -- )
swap interned get
[ = ] with find nip 2array , ;
: expand-ranges ( assoc -- interval-map )
[
[
swap CHAR: . over member? [
".." split1 [ hex> ] bi@ 2array
] [ hex> ] if range,
] assoc-each
] { } make <interval-map> ;
: process-key-value ( ranges -- table )
dup values prune interned
[ expand-ranges ] with-variable ;
PRIVATE>
: load-key-value ( filename -- table )
parse-key-value process-key-value ;

View File

@ -9,35 +9,9 @@ IN: unicode.script
<PRIVATE
SYMBOL: interned
: parse-script ( filename -- assoc )
! assoc is code point/range => name
ascii file-lines filter-comments [ split-; ] map ;
: range, ( value key -- )
swap interned get
[ = ] with find nip 2array , ;
: expand-ranges ( assoc -- interval-map )
[
[
swap CHAR: . over member? [
".." split1 [ hex> ] bi@ 2array
] [ hex> ] if range,
] assoc-each
] { } make <interval-map> ;
: process-script ( ranges -- table )
dup values prune interned
[ expand-ranges ] with-variable ;
: load-script ( filename -- table )
parse-script process-script ;
VALUE: script-table
"vocab:unicode/script/Scripts.txt" load-script
"vocab:unicode/script/Scripts.txt" load-key-value
to: script-table
PRIVATE>