Reorganizing Unicode data, so that the data tables are private; ch>upper moves to unicode.data
parent
3e63d3eb01
commit
1dd34ad776
|
@ -3,11 +3,11 @@
|
||||||
USING: accessors arrays assocs grouping kernel locals math namespaces
|
USING: accessors arrays assocs grouping kernel locals math namespaces
|
||||||
sequences fry quotations math.order math.ranges vectors
|
sequences fry quotations math.order math.ranges vectors
|
||||||
unicode.categories regexp.transition-tables words sets hashtables
|
unicode.categories regexp.transition-tables words sets hashtables
|
||||||
combinators.short-circuit unicode.case unicode.case.private regexp.ast
|
combinators.short-circuit unicode.data regexp.ast
|
||||||
regexp.classes memoize ;
|
regexp.classes memoize ;
|
||||||
IN: regexp.nfa
|
IN: regexp.nfa
|
||||||
|
|
||||||
! This uses unicode.case.private for ch>upper and ch>lower
|
! This uses unicode.data for ch>upper and ch>lower
|
||||||
! but case-insensitive matching should be done by case-folding everything
|
! but case-insensitive matching should be done by case-folding everything
|
||||||
! before processing starts
|
! before processing starts
|
||||||
|
|
||||||
|
|
|
@ -2,8 +2,9 @@
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: combinators.short-circuit unicode.categories kernel math
|
USING: combinators.short-circuit unicode.categories kernel math
|
||||||
combinators splitting sequences math.parser io.files io assocs
|
combinators splitting sequences math.parser io.files io assocs
|
||||||
arrays namespaces make math.ranges unicode.normalize.private values
|
arrays namespaces make math.ranges unicode.normalize
|
||||||
io.encodings.ascii unicode.syntax unicode.data compiler.units fry
|
unicode.normalize.private values io.encodings.ascii
|
||||||
|
unicode.syntax unicode.data compiler.units fry
|
||||||
alien.syntax sets accessors interval-maps memoize locals words ;
|
alien.syntax sets accessors interval-maps memoize locals words ;
|
||||||
IN: unicode.breaks
|
IN: unicode.breaks
|
||||||
|
|
||||||
|
@ -126,7 +127,7 @@ to: grapheme-table
|
||||||
|
|
||||||
VALUE: word-break-table
|
VALUE: word-break-table
|
||||||
|
|
||||||
"vocab:unicode/data/WordBreakProperty.txt" load-script
|
"vocab:unicode/data/WordBreakProperty.txt" load-key-value
|
||||||
to: word-break-table
|
to: word-break-table
|
||||||
|
|
||||||
C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
|
C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
! Copyright (C) 2008, 2009 Daniel Ehrenberg.
|
! Copyright (C) 2008, 2009 Daniel Ehrenberg.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: unicode.case unicode.case.private tools.test namespaces strings unicode.normalize ;
|
USING: unicode.case tools.test namespaces strings unicode.normalize
|
||||||
|
unicode.case.private ;
|
||||||
IN: unicode.case.tests
|
IN: unicode.case.tests
|
||||||
|
|
||||||
\ >upper must-infer
|
\ >upper must-infer
|
||||||
|
|
|
@ -7,12 +7,6 @@ strings splitting kernel accessors unicode.breaks fry locals ;
|
||||||
QUALIFIED: ascii
|
QUALIFIED: ascii
|
||||||
IN: unicode.case
|
IN: unicode.case
|
||||||
|
|
||||||
<PRIVATE
|
|
||||||
: ch>lower ( ch -- lower ) simple-lower at-default ; inline
|
|
||||||
: ch>upper ( ch -- upper ) simple-upper at-default ; inline
|
|
||||||
: ch>title ( ch -- title ) simple-title at-default ; inline
|
|
||||||
PRIVATE>
|
|
||||||
|
|
||||||
SYMBOL: locale ! Just casing locale, or overall?
|
SYMBOL: locale ! Just casing locale, or overall?
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
@ -86,7 +80,7 @@ SYMBOL: locale ! Just casing locale, or overall?
|
||||||
:: map-case ( string string-quot char-quot -- case )
|
:: map-case ( string string-quot char-quot -- case )
|
||||||
string length <sbuf> :> out
|
string length <sbuf> :> out
|
||||||
string [
|
string [
|
||||||
dup special-casing at
|
dup special-case
|
||||||
[ string-quot call out push-all ]
|
[ string-quot call out push-all ]
|
||||||
[ char-quot call out push ] ?if
|
[ char-quot call out push ] ?if
|
||||||
] each out "" like ; inline
|
] each out "" like ; inline
|
||||||
|
|
|
@ -13,7 +13,8 @@ ARTICLE: "unicode.data" "Unicode data tables"
|
||||||
{ $subsection non-starter? }
|
{ $subsection non-starter? }
|
||||||
{ $subsection name>char }
|
{ $subsection name>char }
|
||||||
{ $subsection char>name }
|
{ $subsection char>name }
|
||||||
{ $subsection property? } ;
|
{ $subsection property? }
|
||||||
|
{ $subsection load-key-value } ;
|
||||||
|
|
||||||
HELP: canonical-entry
|
HELP: canonical-entry
|
||||||
{ $values { "char" "a code point" } { "seq" string } }
|
{ $values { "char" "a code point" } { "seq" string } }
|
||||||
|
@ -46,3 +47,7 @@ HELP: name>char
|
||||||
HELP: property?
|
HELP: property?
|
||||||
{ $values { "char" "a code point" } { "property" string } { "?" "a boolean" } }
|
{ $values { "char" "a code point" } { "property" string } { "?" "a boolean" } }
|
||||||
{ $description "Tests whether the code point is listed under the given property in PropList.txt in the Unicode Character Database." } ;
|
{ $description "Tests whether the code point is listed under the given property in PropList.txt in the Unicode Character Database." } ;
|
||||||
|
|
||||||
|
HELP: load-key-value
|
||||||
|
{ $values { "filename" string } { "table" "an interval map" } }
|
||||||
|
{ $description "This loads a file that looks like Script.txt in the Unicode Character Database and converts it into an efficient interval map, where the keys are characters and the values are strings for the properties." } ;
|
||||||
|
|
|
@ -8,6 +8,8 @@ ascii sets combinators locals math.ranges sorting make
|
||||||
strings.parser io.encodings.utf8 memoize ;
|
strings.parser io.encodings.utf8 memoize ;
|
||||||
IN: unicode.data
|
IN: unicode.data
|
||||||
|
|
||||||
|
<PRIVATE
|
||||||
|
|
||||||
VALUE: simple-lower
|
VALUE: simple-lower
|
||||||
VALUE: simple-upper
|
VALUE: simple-upper
|
||||||
VALUE: simple-title
|
VALUE: simple-title
|
||||||
|
@ -20,14 +22,20 @@ VALUE: name-map
|
||||||
VALUE: special-casing
|
VALUE: special-casing
|
||||||
VALUE: properties
|
VALUE: properties
|
||||||
|
|
||||||
: canonical-entry ( char -- seq ) canonical-map at ;
|
PRIVATE>
|
||||||
: combine-chars ( a b -- char/f ) combine-map hash2 ;
|
|
||||||
: compatibility-entry ( char -- seq ) compatibility-map at ;
|
: canonical-entry ( char -- seq ) canonical-map at ; inline
|
||||||
: combining-class ( char -- n ) class-map at ;
|
: combine-chars ( a b -- char/f ) combine-map hash2 ; inline
|
||||||
: non-starter? ( char -- ? ) combining-class { 0 f } member? not ;
|
: compatibility-entry ( char -- seq ) compatibility-map at ; inline
|
||||||
: name>char ( name -- char ) name-map at ;
|
: combining-class ( char -- n ) class-map at ; inline
|
||||||
: char>name ( char -- name ) name-map value-at ;
|
: non-starter? ( char -- ? ) combining-class { 0 f } member? not ; inline
|
||||||
: property? ( char property -- ? ) properties at interval-key? ;
|
: name>char ( name -- char ) name-map at ; inline
|
||||||
|
: char>name ( char -- name ) name-map value-at ; inline
|
||||||
|
: property? ( char property -- ? ) properties at interval-key? ; inline
|
||||||
|
: ch>lower ( ch -- lower ) simple-lower at-default ; inline
|
||||||
|
: ch>upper ( ch -- upper ) simple-upper at-default ; inline
|
||||||
|
: ch>title ( ch -- title ) simple-title at-default ; inline
|
||||||
|
: special-case ( ch -- casing-tuple ) special-casing at ; inline
|
||||||
|
|
||||||
! For non-existent characters, use Cn
|
! For non-existent characters, use Cn
|
||||||
CONSTANT: categories
|
CONSTANT: categories
|
||||||
|
@ -40,11 +48,15 @@ CONSTANT: categories
|
||||||
"Zs" "Zl" "Zp"
|
"Zs" "Zl" "Zp"
|
||||||
"Cc" "Cf" "Cs" "Co" }
|
"Cc" "Cf" "Cs" "Co" }
|
||||||
|
|
||||||
|
<PRIVATE
|
||||||
|
|
||||||
MEMO: categories-map ( -- hashtable )
|
MEMO: categories-map ( -- hashtable )
|
||||||
categories <enum> [ swap ] H{ } assoc-map-as ;
|
categories <enum> [ swap ] H{ } assoc-map-as ;
|
||||||
|
|
||||||
CONSTANT: num-chars HEX: 2FA1E
|
CONSTANT: num-chars HEX: 2FA1E
|
||||||
|
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
: category# ( char -- category )
|
: category# ( char -- category )
|
||||||
! There are a few characters that should be Cn
|
! There are a few characters that should be Cn
|
||||||
! that this gives Cf or Mn
|
! that this gives Cf or Mn
|
||||||
|
@ -60,6 +72,8 @@ CONSTANT: num-chars HEX: 2FA1E
|
||||||
: category ( char -- category )
|
: category ( char -- category )
|
||||||
category# categories nth ;
|
category# categories nth ;
|
||||||
|
|
||||||
|
<PRIVATE
|
||||||
|
|
||||||
! Loading data from UnicodeData.txt
|
! Loading data from UnicodeData.txt
|
||||||
|
|
||||||
: split-; ( line -- array )
|
: split-; ( line -- array )
|
||||||
|
@ -155,10 +169,14 @@ CONSTANT: num-chars HEX: 2FA1E
|
||||||
: multihex ( hexstring -- string )
|
: multihex ( hexstring -- string )
|
||||||
" " split [ hex> ] map sift ;
|
" " split [ hex> ] map sift ;
|
||||||
|
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
TUPLE: code-point lower title upper ;
|
TUPLE: code-point lower title upper ;
|
||||||
|
|
||||||
C: <code-point> code-point
|
C: <code-point> code-point
|
||||||
|
|
||||||
|
<PRIVATE
|
||||||
|
|
||||||
: set-code-point ( seq -- )
|
: set-code-point ( seq -- )
|
||||||
4 head [ multihex ] map first4
|
4 head [ multihex ] map first4
|
||||||
<code-point> swap first set ;
|
<code-point> swap first set ;
|
||||||
|
@ -212,3 +230,31 @@ load-properties to: properties
|
||||||
|
|
||||||
[ name>char [ "Invalid character" throw ] unless* ]
|
[ name>char [ "Invalid character" throw ] unless* ]
|
||||||
name>char-hook set-global
|
name>char-hook set-global
|
||||||
|
|
||||||
|
SYMBOL: interned
|
||||||
|
|
||||||
|
: parse-key-value ( filename -- assoc )
|
||||||
|
! assoc is code point/range => name
|
||||||
|
ascii file-lines filter-comments [ split-; ] map ;
|
||||||
|
|
||||||
|
: range, ( value key -- )
|
||||||
|
swap interned get
|
||||||
|
[ = ] with find nip 2array , ;
|
||||||
|
|
||||||
|
: expand-ranges ( assoc -- interval-map )
|
||||||
|
[
|
||||||
|
[
|
||||||
|
swap CHAR: . over member? [
|
||||||
|
".." split1 [ hex> ] bi@ 2array
|
||||||
|
] [ hex> ] if range,
|
||||||
|
] assoc-each
|
||||||
|
] { } make <interval-map> ;
|
||||||
|
|
||||||
|
: process-key-value ( ranges -- table )
|
||||||
|
dup values prune interned
|
||||||
|
[ expand-ranges ] with-variable ;
|
||||||
|
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
|
: load-key-value ( filename -- table )
|
||||||
|
parse-key-value process-key-value ;
|
||||||
|
|
|
@ -9,35 +9,9 @@ IN: unicode.script
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
|
||||||
SYMBOL: interned
|
|
||||||
|
|
||||||
: parse-script ( filename -- assoc )
|
|
||||||
! assoc is code point/range => name
|
|
||||||
ascii file-lines filter-comments [ split-; ] map ;
|
|
||||||
|
|
||||||
: range, ( value key -- )
|
|
||||||
swap interned get
|
|
||||||
[ = ] with find nip 2array , ;
|
|
||||||
|
|
||||||
: expand-ranges ( assoc -- interval-map )
|
|
||||||
[
|
|
||||||
[
|
|
||||||
swap CHAR: . over member? [
|
|
||||||
".." split1 [ hex> ] bi@ 2array
|
|
||||||
] [ hex> ] if range,
|
|
||||||
] assoc-each
|
|
||||||
] { } make <interval-map> ;
|
|
||||||
|
|
||||||
: process-script ( ranges -- table )
|
|
||||||
dup values prune interned
|
|
||||||
[ expand-ranges ] with-variable ;
|
|
||||||
|
|
||||||
: load-script ( filename -- table )
|
|
||||||
parse-script process-script ;
|
|
||||||
|
|
||||||
VALUE: script-table
|
VALUE: script-table
|
||||||
|
|
||||||
"vocab:unicode/script/Scripts.txt" load-script
|
"vocab:unicode/script/Scripts.txt" load-key-value
|
||||||
to: script-table
|
to: script-table
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
Loading…
Reference in New Issue