More docs for Unicode and simple-flat-file; moving more parsing code there
parent
5e80ffb0be
commit
f3038f2ae8
|
@ -72,7 +72,7 @@ ARTICLE: { "regexp" "syntax" } "Regular expression syntax"
|
||||||
{ { $snippet "\\p{blank}" } "Non-newline whitespace" }
|
{ { $snippet "\\p{blank}" } "Non-newline whitespace" }
|
||||||
{ { $snippet "\\p{cntrl}" } "Control character" }
|
{ { $snippet "\\p{cntrl}" } "Control character" }
|
||||||
{ { $snippet "\\p{space}" } "Whitespace" }
|
{ { $snippet "\\p{space}" } "Whitespace" }
|
||||||
{ { $snippet "\\p{xdigit}" } "Hexidecimal digit" }
|
{ { $snippet "\\p{xdigit}" } "Hexadecimal digit" }
|
||||||
{ { $snippet "\\p{Nd}" } "Character in Unicode category Nd" }
|
{ { $snippet "\\p{Nd}" } "Character in Unicode category Nd" }
|
||||||
{ { $snippet "\\p{Z}" } "Character in Unicode category beginning with Z" }
|
{ { $snippet "\\p{Z}" } "Character in Unicode category beginning with Z" }
|
||||||
{ { $snippet "\\p{script=Cham}" } "Character in the Cham writing system" } }
|
{ { $snippet "\\p{script=Cham}" } "Character in the Cham writing system" } }
|
||||||
|
|
|
@ -1,8 +1,24 @@
|
||||||
USING: help.syntax help.markup strings ;
|
! Copyright (C) 2009 Daniel Ehrenberg
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: help.syntax help.markup strings biassocs arrays ;
|
||||||
IN: simple-flat-file
|
IN: simple-flat-file
|
||||||
|
|
||||||
ABOUT: "simple-flat-file"
|
ABOUT: "simple-flat-file"
|
||||||
|
|
||||||
ARTICLE: "simple-flat-file" "Parsing simple flat files"
|
ARTICLE: "simple-flat-file" "Parsing simple flat files"
|
||||||
"The " { $vocab-link "simple-flat-file" } " vocabulary provides words for loading and parsing simple flat files in a particular format which is common for encoding tasks."
|
"The " { $vocab-link "simple-flat-file" } " vocabulary provides words for loading and parsing simple flat files in a particular format which is common for encoding and Unicode tasks."
|
||||||
{ $subsection flat-file>biassoc } ;
|
{ $subsection flat-file>biassoc }
|
||||||
|
{ $subsection load-interval-file }
|
||||||
|
{ $subsection data } ;
|
||||||
|
|
||||||
|
HELP: load-interval-file
|
||||||
|
{ $values { "filename" string } { "table" "an interval map" } }
|
||||||
|
{ $description "This loads a file that looks like Scripts.txt in the Unicode Character Database and converts it into an efficient interval map, where the keys are characters and the values are strings for the properties." } ;
|
||||||
|
|
||||||
|
HELP: data
|
||||||
|
{ $values { "filename" string } { "data" array } }
|
||||||
|
{ $description "This loads a file that's delimited by semicolons and line breaks, returning an array of lines, where each line is an array split by the semicolons, with whitespace trimmed off." } ;
|
||||||
|
|
||||||
|
HELP: flat-file>biassoc
|
||||||
|
{ $values { "filename" string } { "biassoc" biassoc } }
|
||||||
|
{ $description "This loads a flat file, in the form that many encoding resource files are in, with two columns of numeric data in hex, and returns a biassoc associating them." } ;
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
! Copyright (C) 2009 Daniel Ehrenberg
|
! Copyright (C) 2009 Daniel Ehrenberg
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: sequences splitting kernel math.parser io.files io.encodings.utf8
|
USING: sequences splitting kernel math.parser io.files io.encodings.utf8
|
||||||
biassocs ascii ;
|
biassocs ascii namespaces arrays make assocs interval-maps sets ;
|
||||||
IN: simple-flat-file
|
IN: simple-flat-file
|
||||||
|
|
||||||
: drop-comments ( seq -- newseq )
|
: drop-comments ( seq -- newseq )
|
||||||
|
@ -30,3 +30,25 @@ IN: simple-flat-file
|
||||||
|
|
||||||
: data ( filename -- data )
|
: data ( filename -- data )
|
||||||
utf8 file-lines drop-comments [ split-; ] map ;
|
utf8 file-lines drop-comments [ split-; ] map ;
|
||||||
|
|
||||||
|
SYMBOL: interned
|
||||||
|
|
||||||
|
: range, ( value key -- )
|
||||||
|
swap interned get
|
||||||
|
[ = ] with find nip 2array , ;
|
||||||
|
|
||||||
|
: expand-ranges ( assoc -- interval-map )
|
||||||
|
[
|
||||||
|
[
|
||||||
|
swap CHAR: . over member? [
|
||||||
|
".." split1 [ hex> ] bi@ 2array
|
||||||
|
] [ hex> ] if range,
|
||||||
|
] assoc-each
|
||||||
|
] { } make <interval-map> ;
|
||||||
|
|
||||||
|
: process-interval-file ( ranges -- table )
|
||||||
|
dup values prune interned
|
||||||
|
[ expand-ranges ] with-variable ;
|
||||||
|
|
||||||
|
: load-interval-file ( filename -- table )
|
||||||
|
data process-interval-file ;
|
||||||
|
|
|
@ -5,7 +5,8 @@ combinators splitting sequences math.parser io.files io assocs
|
||||||
arrays namespaces make math.ranges unicode.normalize
|
arrays namespaces make math.ranges unicode.normalize
|
||||||
unicode.normalize.private values io.encodings.ascii
|
unicode.normalize.private values io.encodings.ascii
|
||||||
unicode.syntax unicode.data compiler.units fry
|
unicode.syntax unicode.data compiler.units fry
|
||||||
alien.syntax sets accessors interval-maps memoize locals words ;
|
alien.syntax sets accessors interval-maps memoize locals words
|
||||||
|
simple-flat-file ;
|
||||||
IN: unicode.breaks
|
IN: unicode.breaks
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
@ -127,7 +128,7 @@ to: grapheme-table
|
||||||
|
|
||||||
VALUE: word-break-table
|
VALUE: word-break-table
|
||||||
|
|
||||||
"vocab:unicode/data/WordBreakProperty.txt" load-key-value
|
"vocab:unicode/data/WordBreakProperty.txt" load-interval-file
|
||||||
to: word-break-table
|
to: word-break-table
|
||||||
|
|
||||||
C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
|
C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
|
||||||
|
|
|
@ -6,7 +6,7 @@ IN: unicode.data
|
||||||
ABOUT: "unicode.data"
|
ABOUT: "unicode.data"
|
||||||
|
|
||||||
ARTICLE: "unicode.data" "Unicode data tables"
|
ARTICLE: "unicode.data" "Unicode data tables"
|
||||||
"The " { $vocab-link "unicode.data" "unicode.data" } " vocabulary contains core Unicode data tables and code for parsing this from files."
|
"The " { $vocab-link "unicode.data" "unicode.data" } " vocabulary contains core Unicode data tables and code for parsing them from files. The following words access these data tables."
|
||||||
{ $subsection canonical-entry }
|
{ $subsection canonical-entry }
|
||||||
{ $subsection combine-chars }
|
{ $subsection combine-chars }
|
||||||
{ $subsection combining-class }
|
{ $subsection combining-class }
|
||||||
|
@ -14,7 +14,11 @@ ARTICLE: "unicode.data" "Unicode data tables"
|
||||||
{ $subsection name>char }
|
{ $subsection name>char }
|
||||||
{ $subsection char>name }
|
{ $subsection char>name }
|
||||||
{ $subsection property? }
|
{ $subsection property? }
|
||||||
{ $subsection load-key-value } ;
|
{ $subsection category }
|
||||||
|
{ $subsection ch>upper }
|
||||||
|
{ $subsection ch>lower }
|
||||||
|
{ $subsection ch>title }
|
||||||
|
{ $subsection special-case } ;
|
||||||
|
|
||||||
HELP: canonical-entry
|
HELP: canonical-entry
|
||||||
{ $values { "char" "a code point" } { "seq" string } }
|
{ $values { "char" "a code point" } { "seq" string } }
|
||||||
|
@ -48,6 +52,22 @@ HELP: property?
|
||||||
{ $values { "char" "a code point" } { "property" string } { "?" "a boolean" } }
|
{ $values { "char" "a code point" } { "property" string } { "?" "a boolean" } }
|
||||||
{ $description "Tests whether the code point is listed under the given property in PropList.txt in the Unicode Character Database." } ;
|
{ $description "Tests whether the code point is listed under the given property in PropList.txt in the Unicode Character Database." } ;
|
||||||
|
|
||||||
HELP: load-key-value
|
HELP: category
|
||||||
{ $values { "filename" string } { "table" "an interval map" } }
|
{ $values { "char" "a code point" } { "category" string } }
|
||||||
{ $description "This loads a file that looks like Script.txt in the Unicode Character Database and converts it into an efficient interval map, where the keys are characters and the values are strings for the properties." } ;
|
{ $description "Returns the general category of a code point, in the form of a string. This will always be a two-character ASCII string. If the code point is unassigned, then it returns " { $snippet "Cn" } "." } ;
|
||||||
|
|
||||||
|
HELP: ch>upper
|
||||||
|
{ $values { "ch" "a code point" } { "upper" "a code point" } }
|
||||||
|
{ $description "Returns the simple upper-cased version of the code point, if it exists. This does not handle context-sensitive or locale-dependent properties of linguistically accurate case conversion, and does not correctly handle characters which become multiple characters on conversion to this case." } ;
|
||||||
|
|
||||||
|
HELP: ch>lower
|
||||||
|
{ $values { "ch" "a code point" } { "lower" "a code point" } }
|
||||||
|
{ $description "Returns the simple lower-cased version of the code point, if it exists. This does not handle context-sensitive or locale-dependent properties of linguistically accurate case conversion, and does not correctly handle characters which become multiple characters on conversion to this case." } ;
|
||||||
|
|
||||||
|
HELP: ch>title
|
||||||
|
{ $values { "ch" "a code point" } { "title" "a code point" } }
|
||||||
|
{ $description "Returns the simple title-cased version of the code point, if it exists. This does not handle context-sensitive or locale-dependent properties of linguistically accurate case conversion, and does not correctly handle characters which become multiple characters on conversion to this case." } ;
|
||||||
|
|
||||||
|
HELP: special-case
|
||||||
|
{ $values { "ch" "a code point" } { "casing-tuple" { "a tuple, or " { $link f } } } }
|
||||||
|
{ $description "If a code point has special casing behavior, returns a tuple which represents that information." } ;
|
||||||
|
|
|
@ -58,7 +58,7 @@ CONSTANT: num-chars HEX: 2FA1E
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
||||||
: category# ( char -- category )
|
: category# ( char -- n )
|
||||||
! There are a few characters that should be Cn
|
! There are a few characters that should be Cn
|
||||||
! that this gives Cf or Mn
|
! that this gives Cf or Mn
|
||||||
! Cf = 26; Mn = 5; Cn = 29
|
! Cf = 26; Mn = 5; Cn = 29
|
||||||
|
@ -219,27 +219,3 @@ load-properties to: properties
|
||||||
|
|
||||||
[ name>char [ "Invalid character" throw ] unless* ]
|
[ name>char [ "Invalid character" throw ] unless* ]
|
||||||
name>char-hook set-global
|
name>char-hook set-global
|
||||||
|
|
||||||
SYMBOL: interned
|
|
||||||
|
|
||||||
: range, ( value key -- )
|
|
||||||
swap interned get
|
|
||||||
[ = ] with find nip 2array , ;
|
|
||||||
|
|
||||||
: expand-ranges ( assoc -- interval-map )
|
|
||||||
[
|
|
||||||
[
|
|
||||||
swap CHAR: . over member? [
|
|
||||||
".." split1 [ hex> ] bi@ 2array
|
|
||||||
] [ hex> ] if range,
|
|
||||||
] assoc-each
|
|
||||||
] { } make <interval-map> ;
|
|
||||||
|
|
||||||
: process-key-value ( ranges -- table )
|
|
||||||
dup values prune interned
|
|
||||||
[ expand-ranges ] with-variable ;
|
|
||||||
|
|
||||||
PRIVATE>
|
|
||||||
|
|
||||||
: load-key-value ( filename -- table )
|
|
||||||
data process-key-value ;
|
|
||||||
|
|
|
@ -1,17 +1,13 @@
|
||||||
! Copyright (C) 2008 Daniel Ehrenberg.
|
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: accessors values kernel sequences assocs io.files
|
USING: values interval-maps simple-flat-file ;
|
||||||
io.encodings ascii math.ranges io splitting math.parser
|
|
||||||
namespaces make byte-arrays locals math sets io.encodings.ascii
|
|
||||||
words words.symbol compiler.units arrays interval-maps
|
|
||||||
unicode.data ;
|
|
||||||
IN: unicode.script
|
IN: unicode.script
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
|
||||||
VALUE: script-table
|
VALUE: script-table
|
||||||
|
|
||||||
"vocab:unicode/script/Scripts.txt" load-key-value
|
"vocab:unicode/script/Scripts.txt" load-interval-file
|
||||||
to: script-table
|
to: script-table
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: unicode.data kernel math sequences parser lexer
|
USING: unicode.data kernel math sequences parser lexer
|
||||||
bit-arrays namespaces make sequences.private arrays quotations
|
bit-arrays namespaces make sequences.private arrays quotations
|
||||||
assocs classes.predicate math.order strings.parser ;
|
assocs classes.predicate math.order strings.parser sets ;
|
||||||
IN: unicode.syntax
|
IN: unicode.syntax
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
@ -30,9 +30,6 @@ PRIVATE>
|
||||||
: CATEGORY:
|
: CATEGORY:
|
||||||
CREATE ";" parse-tokens define-category ; parsing
|
CREATE ";" parse-tokens define-category ; parsing
|
||||||
|
|
||||||
: seq-minus ( seq1 seq2 -- diff )
|
|
||||||
[ member? not ] curry filter ;
|
|
||||||
|
|
||||||
: CATEGORY-NOT:
|
: CATEGORY-NOT:
|
||||||
CREATE ";" parse-tokens
|
CREATE ";" parse-tokens
|
||||||
categories swap seq-minus define-category ; parsing
|
categories swap diff define-category ; parsing
|
||||||
|
|
Loading…
Reference in New Issue