Some reorganizing in Unicode; regexp class changes
parent
7a010063c0
commit
54194d269c
|
@ -5,18 +5,30 @@ ascii unicode.categories combinators.short-circuit sequences
|
||||||
fry macros arrays assocs sets classes mirrors ;
|
fry macros arrays assocs sets classes mirrors ;
|
||||||
IN: regexp.classes
|
IN: regexp.classes
|
||||||
|
|
||||||
SINGLETONS: any-char any-char-no-nl
|
SINGLETONS: dot letter-class LETTER-class Letter-class digit-class
|
||||||
letter-class LETTER-class Letter-class digit-class
|
|
||||||
alpha-class non-newline-blank-class
|
alpha-class non-newline-blank-class
|
||||||
ascii-class punctuation-class java-printable-class blank-class
|
ascii-class punctuation-class java-printable-class blank-class
|
||||||
control-character-class hex-digit-class java-blank-class c-identifier-class
|
control-character-class hex-digit-class java-blank-class c-identifier-class
|
||||||
unmatchable-class terminator-class word-boundary-class ;
|
unmatchable-class terminator-class word-boundary-class ;
|
||||||
|
|
||||||
SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ^unix $unix word-break ;
|
SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file
|
||||||
|
^unix $unix word-break ;
|
||||||
|
|
||||||
TUPLE: range from to ;
|
TUPLE: range from to ;
|
||||||
C: <range> range
|
C: <range> range
|
||||||
|
|
||||||
|
TUPLE: primitive-class class ;
|
||||||
|
C: <primitive-class> primitive-class
|
||||||
|
|
||||||
|
TUPLE: category-class category ;
|
||||||
|
C: <category-class> category-class
|
||||||
|
|
||||||
|
TUPLE: category-range-class category ;
|
||||||
|
C: <category-range-class> category-range-class
|
||||||
|
|
||||||
|
TUPLE: script-class script ;
|
||||||
|
C: <script-class> script-class
|
||||||
|
|
||||||
GENERIC: class-member? ( obj class -- ? )
|
GENERIC: class-member? ( obj class -- ? )
|
||||||
|
|
||||||
M: t class-member? ( obj class -- ? ) 2drop t ;
|
M: t class-member? ( obj class -- ? ) 2drop t ;
|
||||||
|
@ -26,12 +38,6 @@ M: integer class-member? ( obj class -- ? ) = ;
|
||||||
M: range class-member? ( obj class -- ? )
|
M: range class-member? ( obj class -- ? )
|
||||||
[ from>> ] [ to>> ] bi between? ;
|
[ from>> ] [ to>> ] bi between? ;
|
||||||
|
|
||||||
M: any-char class-member? ( obj class -- ? )
|
|
||||||
2drop t ;
|
|
||||||
|
|
||||||
M: any-char-no-nl class-member? ( obj class -- ? )
|
|
||||||
drop CHAR: \n = not ;
|
|
||||||
|
|
||||||
M: letter-class class-member? ( obj class -- ? )
|
M: letter-class class-member? ( obj class -- ? )
|
||||||
drop letter? ;
|
drop letter? ;
|
||||||
|
|
||||||
|
@ -99,16 +105,16 @@ M: unmatchable-class class-member? ( obj class -- ? )
|
||||||
M: terminator-class class-member? ( obj class -- ? )
|
M: terminator-class class-member? ( obj class -- ? )
|
||||||
drop "\r\n\u000085\u002029\u002028" member? ;
|
drop "\r\n\u000085\u002029\u002028" member? ;
|
||||||
|
|
||||||
M: ^ class-member? ( obj class -- ? )
|
|
||||||
2drop f ;
|
|
||||||
|
|
||||||
M: $ class-member? ( obj class -- ? )
|
|
||||||
2drop f ;
|
|
||||||
|
|
||||||
M: f class-member? 2drop f ;
|
M: f class-member? 2drop f ;
|
||||||
|
|
||||||
TUPLE: primitive-class class ;
|
M: script-class class-member?
|
||||||
C: <primitive-class> primitive-class
|
[ script-of ] [ script>> ] bi* = ;
|
||||||
|
|
||||||
|
M: category-class class-member?
|
||||||
|
[ category# ] [ category>> ] bi* = ;
|
||||||
|
|
||||||
|
M: category-range-class class-member?
|
||||||
|
[ category first ] [ category>> ] bi* = ;
|
||||||
|
|
||||||
TUPLE: not-class class ;
|
TUPLE: not-class class ;
|
||||||
|
|
||||||
|
|
|
@ -117,8 +117,17 @@ M: or-class modify-class
|
||||||
M: not-class modify-class
|
M: not-class modify-class
|
||||||
class>> modify-class <not-class> ;
|
class>> modify-class <not-class> ;
|
||||||
|
|
||||||
M: any-char modify-class
|
MEMO: unix-dot ( -- class )
|
||||||
drop dotall option? t any-char-no-nl ? ;
|
CHAR: \n <not-class> ;
|
||||||
|
|
||||||
|
MEMO: nonl-dot ( -- class )
|
||||||
|
{ CHAR: \n CHAR: \r } <or-class> <not-class> ;
|
||||||
|
|
||||||
|
M: dot modify-class
|
||||||
|
drop dotall option? [ t ] [
|
||||||
|
unix-lines option?
|
||||||
|
unix-dot nonl-dot ?
|
||||||
|
] if ;
|
||||||
|
|
||||||
: modify-letter-class ( class -- newclass )
|
: modify-letter-class ( class -- newclass )
|
||||||
case-insensitive option? [ drop Letter-class ] when ;
|
case-insensitive option? [ drop Letter-class ] when ;
|
||||||
|
|
|
@ -18,6 +18,13 @@ ERROR: bad-number ;
|
||||||
|
|
||||||
ERROR: bad-class name ;
|
ERROR: bad-class name ;
|
||||||
|
|
||||||
|
: parse-unicode-class ( name -- class )
|
||||||
|
! Implement this!
|
||||||
|
drop f ;
|
||||||
|
|
||||||
|
: unicode-class ( name -- class )
|
||||||
|
parse-unicode-class [ bad-class ] unless* ;
|
||||||
|
|
||||||
: name>class ( name -- class )
|
: name>class ( name -- class )
|
||||||
>string >case-fold {
|
>string >case-fold {
|
||||||
{ "lower" letter-class }
|
{ "lower" letter-class }
|
||||||
|
@ -32,8 +39,7 @@ ERROR: bad-class name ;
|
||||||
{ "cntrl" control-character-class }
|
{ "cntrl" control-character-class }
|
||||||
{ "xdigit" hex-digit-class }
|
{ "xdigit" hex-digit-class }
|
||||||
{ "space" java-blank-class }
|
{ "space" java-blank-class }
|
||||||
! TODO: unicode-character-class
|
} [ unicode-class ] at-error ;
|
||||||
} [ bad-class ] at-error ;
|
|
||||||
|
|
||||||
: lookup-escape ( char -- ast )
|
: lookup-escape ( char -- ast )
|
||||||
{
|
{
|
||||||
|
@ -144,7 +150,7 @@ Parenthized = "?:" Alternation:a => [[ a ]]
|
||||||
|
|
||||||
Element = "(" Parenthized:p ")" => [[ p ]]
|
Element = "(" Parenthized:p ")" => [[ p ]]
|
||||||
| "[" CharClass:r "]" => [[ r ]]
|
| "[" CharClass:r "]" => [[ r ]]
|
||||||
| ".":d => [[ any-char <primitive-class> ]]
|
| ".":d => [[ dot ]]
|
||||||
| Character
|
| Character
|
||||||
|
|
||||||
Number = (!(","|"}").)* => [[ string>number ensure-number ]]
|
Number = (!(","|"}").)* => [[ string>number ensure-number ]]
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
! Copyright (C) 2008 Daniel Ehrenberg.
|
! Copyright (C) 2008, 2009 Daniel Ehrenberg.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: combinators.short-circuit assocs math kernel sequences
|
USING: combinators.short-circuit assocs math kernel sequences
|
||||||
io.files hashtables quotations splitting grouping arrays io
|
io.files hashtables quotations splitting grouping arrays io
|
||||||
|
@ -29,6 +29,21 @@ VALUE: properties
|
||||||
: char>name ( char -- name ) name-map value-at ;
|
: char>name ( char -- name ) name-map value-at ;
|
||||||
: property? ( char property -- ? ) properties at interval-key? ;
|
: property? ( char property -- ? ) properties at interval-key? ;
|
||||||
|
|
||||||
|
: category# ( char -- category )
|
||||||
|
! There are a few characters that should be Cn
|
||||||
|
! that this gives Cf or Mn
|
||||||
|
! Cf = 26; Mn = 5; Cn = 29
|
||||||
|
! Use a compressed array instead?
|
||||||
|
dup category-map ?nth [ ] [
|
||||||
|
dup HEX: E0001 HEX: E007F between?
|
||||||
|
[ drop 26 ] [
|
||||||
|
HEX: E0100 HEX: E01EF between? 5 29 ?
|
||||||
|
] if
|
||||||
|
] ?if ;
|
||||||
|
|
||||||
|
: category ( char -- category )
|
||||||
|
category# categories nth ;
|
||||||
|
|
||||||
! Loading data from UnicodeData.txt
|
! Loading data from UnicodeData.txt
|
||||||
|
|
||||||
: split-; ( line -- array )
|
: split-; ( line -- array )
|
||||||
|
@ -195,33 +210,5 @@ load-special-casing to: special-casing
|
||||||
|
|
||||||
load-properties to: properties
|
load-properties to: properties
|
||||||
|
|
||||||
! Utility to load resource files that look like Scripts.txt
|
|
||||||
|
|
||||||
SYMBOL: interned
|
|
||||||
|
|
||||||
: parse-script ( filename -- assoc )
|
|
||||||
! assoc is code point/range => name
|
|
||||||
ascii file-lines filter-comments [ split-; ] map ;
|
|
||||||
|
|
||||||
: range, ( value key -- )
|
|
||||||
swap interned get
|
|
||||||
[ = ] with find nip 2array , ;
|
|
||||||
|
|
||||||
: expand-ranges ( assoc -- interval-map )
|
|
||||||
[
|
|
||||||
[
|
|
||||||
swap CHAR: . over member? [
|
|
||||||
".." split1 [ hex> ] bi@ 2array
|
|
||||||
] [ hex> ] if range,
|
|
||||||
] assoc-each
|
|
||||||
] { } make <interval-map> ;
|
|
||||||
|
|
||||||
: process-script ( ranges -- table )
|
|
||||||
dup values prune interned
|
|
||||||
[ expand-ranges ] with-variable ;
|
|
||||||
|
|
||||||
: load-script ( filename -- table )
|
|
||||||
parse-script process-script ;
|
|
||||||
|
|
||||||
[ name>char [ "Invalid character" throw ] unless* ]
|
[ name>char [ "Invalid character" throw ] unless* ]
|
||||||
name>char-hook set-global
|
name>char-hook set-global
|
||||||
|
|
|
@ -7,10 +7,40 @@ words words.symbol compiler.units arrays interval-maps
|
||||||
unicode.data ;
|
unicode.data ;
|
||||||
IN: unicode.script
|
IN: unicode.script
|
||||||
|
|
||||||
|
<PRIVATE
|
||||||
|
|
||||||
|
SYMBOL: interned
|
||||||
|
|
||||||
|
: parse-script ( filename -- assoc )
|
||||||
|
! assoc is code point/range => name
|
||||||
|
ascii file-lines filter-comments [ split-; ] map ;
|
||||||
|
|
||||||
|
: range, ( value key -- )
|
||||||
|
swap interned get
|
||||||
|
[ = ] with find nip 2array , ;
|
||||||
|
|
||||||
|
: expand-ranges ( assoc -- interval-map )
|
||||||
|
[
|
||||||
|
[
|
||||||
|
swap CHAR: . over member? [
|
||||||
|
".." split1 [ hex> ] bi@ 2array
|
||||||
|
] [ hex> ] if range,
|
||||||
|
] assoc-each
|
||||||
|
] { } make <interval-map> ;
|
||||||
|
|
||||||
|
: process-script ( ranges -- table )
|
||||||
|
dup values prune interned
|
||||||
|
[ expand-ranges ] with-variable ;
|
||||||
|
|
||||||
|
: load-script ( filename -- table )
|
||||||
|
parse-script process-script ;
|
||||||
|
|
||||||
VALUE: script-table
|
VALUE: script-table
|
||||||
|
|
||||||
"vocab:unicode/script/Scripts.txt" load-script
|
"vocab:unicode/script/Scripts.txt" load-script
|
||||||
to: script-table
|
to: script-table
|
||||||
|
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
: script-of ( char -- script )
|
: script-of ( char -- script )
|
||||||
script-table interval-at ;
|
script-table interval-at ;
|
||||||
|
|
|
@ -5,22 +5,7 @@ bit-arrays namespaces make sequences.private arrays quotations
|
||||||
assocs classes.predicate math.order strings.parser ;
|
assocs classes.predicate math.order strings.parser ;
|
||||||
IN: unicode.syntax
|
IN: unicode.syntax
|
||||||
|
|
||||||
! Character classes (categories)
|
<PRIVATE
|
||||||
|
|
||||||
: category# ( char -- category )
|
|
||||||
! There are a few characters that should be Cn
|
|
||||||
! that this gives Cf or Mn
|
|
||||||
! Cf = 26; Mn = 5; Cn = 29
|
|
||||||
! Use a compressed array instead?
|
|
||||||
dup category-map ?nth [ ] [
|
|
||||||
dup HEX: E0001 HEX: E007F between?
|
|
||||||
[ drop 26 ] [
|
|
||||||
HEX: E0100 HEX: E01EF between? 5 29 ?
|
|
||||||
] if
|
|
||||||
] ?if ;
|
|
||||||
|
|
||||||
: category ( char -- category )
|
|
||||||
category# categories nth ;
|
|
||||||
|
|
||||||
: >category-array ( categories -- bitarray )
|
: >category-array ( categories -- bitarray )
|
||||||
categories [ swap member? ] with map >bit-array ;
|
categories [ swap member? ] with map >bit-array ;
|
||||||
|
@ -40,6 +25,8 @@ IN: unicode.syntax
|
||||||
: define-category ( word categories -- )
|
: define-category ( word categories -- )
|
||||||
[category] integer swap define-predicate-class ;
|
[category] integer swap define-predicate-class ;
|
||||||
|
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
: CATEGORY:
|
: CATEGORY:
|
||||||
CREATE ";" parse-tokens define-category ; parsing
|
CREATE ";" parse-tokens define-category ; parsing
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue