Some reorganizing in Unicode; regexp class changes
parent
7a010063c0
commit
54194d269c
|
@ -5,18 +5,30 @@ ascii unicode.categories combinators.short-circuit sequences
|
|||
fry macros arrays assocs sets classes mirrors ;
|
||||
IN: regexp.classes
|
||||
|
||||
SINGLETONS: any-char any-char-no-nl
|
||||
letter-class LETTER-class Letter-class digit-class
|
||||
SINGLETONS: dot letter-class LETTER-class Letter-class digit-class
|
||||
alpha-class non-newline-blank-class
|
||||
ascii-class punctuation-class java-printable-class blank-class
|
||||
control-character-class hex-digit-class java-blank-class c-identifier-class
|
||||
unmatchable-class terminator-class word-boundary-class ;
|
||||
|
||||
SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ^unix $unix word-break ;
|
||||
SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file
|
||||
^unix $unix word-break ;
|
||||
|
||||
TUPLE: range from to ;
|
||||
C: <range> range
|
||||
|
||||
TUPLE: primitive-class class ;
|
||||
C: <primitive-class> primitive-class
|
||||
|
||||
TUPLE: category-class category ;
|
||||
C: <category-class> category-class
|
||||
|
||||
TUPLE: category-range-class category ;
|
||||
C: <category-range-class> category-range-class
|
||||
|
||||
TUPLE: script-class script ;
|
||||
C: <script-class> script-class
|
||||
|
||||
GENERIC: class-member? ( obj class -- ? )
|
||||
|
||||
M: t class-member? ( obj class -- ? ) 2drop t ;
|
||||
|
@ -26,12 +38,6 @@ M: integer class-member? ( obj class -- ? ) = ;
|
|||
M: range class-member? ( obj class -- ? )
|
||||
[ from>> ] [ to>> ] bi between? ;
|
||||
|
||||
M: any-char class-member? ( obj class -- ? )
|
||||
2drop t ;
|
||||
|
||||
M: any-char-no-nl class-member? ( obj class -- ? )
|
||||
drop CHAR: \n = not ;
|
||||
|
||||
M: letter-class class-member? ( obj class -- ? )
|
||||
drop letter? ;
|
||||
|
||||
|
@ -99,16 +105,16 @@ M: unmatchable-class class-member? ( obj class -- ? )
|
|||
M: terminator-class class-member? ( obj class -- ? )
|
||||
drop "\r\n\u000085\u002029\u002028" member? ;
|
||||
|
||||
M: ^ class-member? ( obj class -- ? )
|
||||
2drop f ;
|
||||
|
||||
M: $ class-member? ( obj class -- ? )
|
||||
2drop f ;
|
||||
|
||||
M: f class-member? 2drop f ;
|
||||
|
||||
TUPLE: primitive-class class ;
|
||||
C: <primitive-class> primitive-class
|
||||
M: script-class class-member?
|
||||
[ script-of ] [ script>> ] bi* = ;
|
||||
|
||||
M: category-class class-member?
|
||||
[ category# ] [ category>> ] bi* = ;
|
||||
|
||||
M: category-range-class class-member?
|
||||
[ category first ] [ category>> ] bi* = ;
|
||||
|
||||
TUPLE: not-class class ;
|
||||
|
||||
|
|
|
@ -117,8 +117,17 @@ M: or-class modify-class
|
|||
M: not-class modify-class
|
||||
class>> modify-class <not-class> ;
|
||||
|
||||
M: any-char modify-class
|
||||
drop dotall option? t any-char-no-nl ? ;
|
||||
MEMO: unix-dot ( -- class )
|
||||
CHAR: \n <not-class> ;
|
||||
|
||||
MEMO: nonl-dot ( -- class )
|
||||
{ CHAR: \n CHAR: \r } <or-class> <not-class> ;
|
||||
|
||||
M: dot modify-class
|
||||
drop dotall option? [ t ] [
|
||||
unix-lines option?
|
||||
unix-dot nonl-dot ?
|
||||
] if ;
|
||||
|
||||
: modify-letter-class ( class -- newclass )
|
||||
case-insensitive option? [ drop Letter-class ] when ;
|
||||
|
|
|
@ -18,6 +18,13 @@ ERROR: bad-number ;
|
|||
|
||||
ERROR: bad-class name ;
|
||||
|
||||
: parse-unicode-class ( name -- class )
|
||||
! Implement this!
|
||||
drop f ;
|
||||
|
||||
: unicode-class ( name -- class )
|
||||
parse-unicode-class [ bad-class ] unless* ;
|
||||
|
||||
: name>class ( name -- class )
|
||||
>string >case-fold {
|
||||
{ "lower" letter-class }
|
||||
|
@ -32,8 +39,7 @@ ERROR: bad-class name ;
|
|||
{ "cntrl" control-character-class }
|
||||
{ "xdigit" hex-digit-class }
|
||||
{ "space" java-blank-class }
|
||||
! TODO: unicode-character-class
|
||||
} [ bad-class ] at-error ;
|
||||
} [ unicode-class ] at-error ;
|
||||
|
||||
: lookup-escape ( char -- ast )
|
||||
{
|
||||
|
@ -144,7 +150,7 @@ Parenthized = "?:" Alternation:a => [[ a ]]
|
|||
|
||||
Element = "(" Parenthized:p ")" => [[ p ]]
|
||||
| "[" CharClass:r "]" => [[ r ]]
|
||||
| ".":d => [[ any-char <primitive-class> ]]
|
||||
| ".":d => [[ dot ]]
|
||||
| Character
|
||||
|
||||
Number = (!(","|"}").)* => [[ string>number ensure-number ]]
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||
! Copyright (C) 2008, 2009 Daniel Ehrenberg.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: combinators.short-circuit assocs math kernel sequences
|
||||
io.files hashtables quotations splitting grouping arrays io
|
||||
|
@ -29,6 +29,21 @@ VALUE: properties
|
|||
: char>name ( char -- name ) name-map value-at ;
|
||||
: property? ( char property -- ? ) properties at interval-key? ;
|
||||
|
||||
: category# ( char -- category )
|
||||
! There are a few characters that should be Cn
|
||||
! that this gives Cf or Mn
|
||||
! Cf = 26; Mn = 5; Cn = 29
|
||||
! Use a compressed array instead?
|
||||
dup category-map ?nth [ ] [
|
||||
dup HEX: E0001 HEX: E007F between?
|
||||
[ drop 26 ] [
|
||||
HEX: E0100 HEX: E01EF between? 5 29 ?
|
||||
] if
|
||||
] ?if ;
|
||||
|
||||
: category ( char -- category )
|
||||
category# categories nth ;
|
||||
|
||||
! Loading data from UnicodeData.txt
|
||||
|
||||
: split-; ( line -- array )
|
||||
|
@ -195,33 +210,5 @@ load-special-casing to: special-casing
|
|||
|
||||
load-properties to: properties
|
||||
|
||||
! Utility to load resource files that look like Scripts.txt
|
||||
|
||||
SYMBOL: interned
|
||||
|
||||
: parse-script ( filename -- assoc )
|
||||
! assoc is code point/range => name
|
||||
ascii file-lines filter-comments [ split-; ] map ;
|
||||
|
||||
: range, ( value key -- )
|
||||
swap interned get
|
||||
[ = ] with find nip 2array , ;
|
||||
|
||||
: expand-ranges ( assoc -- interval-map )
|
||||
[
|
||||
[
|
||||
swap CHAR: . over member? [
|
||||
".." split1 [ hex> ] bi@ 2array
|
||||
] [ hex> ] if range,
|
||||
] assoc-each
|
||||
] { } make <interval-map> ;
|
||||
|
||||
: process-script ( ranges -- table )
|
||||
dup values prune interned
|
||||
[ expand-ranges ] with-variable ;
|
||||
|
||||
: load-script ( filename -- table )
|
||||
parse-script process-script ;
|
||||
|
||||
[ name>char [ "Invalid character" throw ] unless* ]
|
||||
name>char-hook set-global
|
||||
|
|
|
@ -7,10 +7,40 @@ words words.symbol compiler.units arrays interval-maps
|
|||
unicode.data ;
|
||||
IN: unicode.script
|
||||
|
||||
<PRIVATE
|
||||
|
||||
SYMBOL: interned
|
||||
|
||||
: parse-script ( filename -- assoc )
|
||||
! assoc is code point/range => name
|
||||
ascii file-lines filter-comments [ split-; ] map ;
|
||||
|
||||
: range, ( value key -- )
|
||||
swap interned get
|
||||
[ = ] with find nip 2array , ;
|
||||
|
||||
: expand-ranges ( assoc -- interval-map )
|
||||
[
|
||||
[
|
||||
swap CHAR: . over member? [
|
||||
".." split1 [ hex> ] bi@ 2array
|
||||
] [ hex> ] if range,
|
||||
] assoc-each
|
||||
] { } make <interval-map> ;
|
||||
|
||||
: process-script ( ranges -- table )
|
||||
dup values prune interned
|
||||
[ expand-ranges ] with-variable ;
|
||||
|
||||
: load-script ( filename -- table )
|
||||
parse-script process-script ;
|
||||
|
||||
VALUE: script-table
|
||||
|
||||
"vocab:unicode/script/Scripts.txt" load-script
|
||||
to: script-table
|
||||
|
||||
PRIVATE>
|
||||
|
||||
: script-of ( char -- script )
|
||||
script-table interval-at ;
|
||||
|
|
|
@ -5,22 +5,7 @@ bit-arrays namespaces make sequences.private arrays quotations
|
|||
assocs classes.predicate math.order strings.parser ;
|
||||
IN: unicode.syntax
|
||||
|
||||
! Character classes (categories)
|
||||
|
||||
: category# ( char -- category )
|
||||
! There are a few characters that should be Cn
|
||||
! that this gives Cf or Mn
|
||||
! Cf = 26; Mn = 5; Cn = 29
|
||||
! Use a compressed array instead?
|
||||
dup category-map ?nth [ ] [
|
||||
dup HEX: E0001 HEX: E007F between?
|
||||
[ drop 26 ] [
|
||||
HEX: E0100 HEX: E01EF between? 5 29 ?
|
||||
] if
|
||||
] ?if ;
|
||||
|
||||
: category ( char -- category )
|
||||
category# categories nth ;
|
||||
<PRIVATE
|
||||
|
||||
: >category-array ( categories -- bitarray )
|
||||
categories [ swap member? ] with map >bit-array ;
|
||||
|
@ -40,6 +25,8 @@ IN: unicode.syntax
|
|||
: define-category ( word categories -- )
|
||||
[category] integer swap define-predicate-class ;
|
||||
|
||||
PRIVATE>
|
||||
|
||||
: CATEGORY:
|
||||
CREATE ";" parse-tokens define-category ; parsing
|
||||
|
||||
|
|
Loading…
Reference in New Issue