Some reorganizing in Unicode; regexp class changes

db4
Daniel Ehrenberg 2009-03-17 19:39:04 -05:00
parent 7a010063c0
commit 54194d269c
6 changed files with 92 additions and 67 deletions

View File

@ -5,18 +5,30 @@ ascii unicode.categories combinators.short-circuit sequences
fry macros arrays assocs sets classes mirrors ;
IN: regexp.classes
SINGLETONS: any-char any-char-no-nl
letter-class LETTER-class Letter-class digit-class
SINGLETONS: dot letter-class LETTER-class Letter-class digit-class
alpha-class non-newline-blank-class
ascii-class punctuation-class java-printable-class blank-class
control-character-class hex-digit-class java-blank-class c-identifier-class
unmatchable-class terminator-class word-boundary-class ;
SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ^unix $unix word-break ;
SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file
^unix $unix word-break ;
TUPLE: range from to ;
C: <range> range
TUPLE: primitive-class class ;
C: <primitive-class> primitive-class
TUPLE: category-class category ;
C: <category-class> category-class
TUPLE: category-range-class category ;
C: <category-range-class> category-range-class
TUPLE: script-class script ;
C: <script-class> script-class
GENERIC: class-member? ( obj class -- ? )
M: t class-member? ( obj class -- ? ) 2drop t ;
@ -26,12 +38,6 @@ M: integer class-member? ( obj class -- ? ) = ;
M: range class-member? ( obj class -- ? )
[ from>> ] [ to>> ] bi between? ;
M: any-char class-member? ( obj class -- ? )
2drop t ;
M: any-char-no-nl class-member? ( obj class -- ? )
drop CHAR: \n = not ;
M: letter-class class-member? ( obj class -- ? )
drop letter? ;
@ -99,16 +105,16 @@ M: unmatchable-class class-member? ( obj class -- ? )
M: terminator-class class-member? ( obj class -- ? )
drop "\r\n\u000085\u002029\u002028" member? ;
M: ^ class-member? ( obj class -- ? )
2drop f ;
M: $ class-member? ( obj class -- ? )
2drop f ;
M: f class-member? 2drop f ;
TUPLE: primitive-class class ;
C: <primitive-class> primitive-class
M: script-class class-member?
[ script-of ] [ script>> ] bi* = ;
M: category-class class-member?
[ category# ] [ category>> ] bi* = ;
M: category-range-class class-member?
[ category first ] [ category>> ] bi* = ;
TUPLE: not-class class ;

View File

@ -117,8 +117,17 @@ M: or-class modify-class
M: not-class modify-class
class>> modify-class <not-class> ;
M: any-char modify-class
drop dotall option? t any-char-no-nl ? ;
MEMO: unix-dot ( -- class )
CHAR: \n <not-class> ;
MEMO: nonl-dot ( -- class )
{ CHAR: \n CHAR: \r } <or-class> <not-class> ;
M: dot modify-class
drop dotall option? [ t ] [
unix-lines option?
unix-dot nonl-dot ?
] if ;
: modify-letter-class ( class -- newclass )
case-insensitive option? [ drop Letter-class ] when ;

View File

@ -18,6 +18,13 @@ ERROR: bad-number ;
ERROR: bad-class name ;
: parse-unicode-class ( name -- class )
! Implement this!
drop f ;
: unicode-class ( name -- class )
parse-unicode-class [ bad-class ] unless* ;
: name>class ( name -- class )
>string >case-fold {
{ "lower" letter-class }
@ -32,8 +39,7 @@ ERROR: bad-class name ;
{ "cntrl" control-character-class }
{ "xdigit" hex-digit-class }
{ "space" java-blank-class }
! TODO: unicode-character-class
} [ bad-class ] at-error ;
} [ unicode-class ] at-error ;
: lookup-escape ( char -- ast )
{
@ -144,7 +150,7 @@ Parenthized = "?:" Alternation:a => [[ a ]]
Element = "(" Parenthized:p ")" => [[ p ]]
| "[" CharClass:r "]" => [[ r ]]
| ".":d => [[ any-char <primitive-class> ]]
| ".":d => [[ dot ]]
| Character
Number = (!(","|"}").)* => [[ string>number ensure-number ]]

View File

@ -1,4 +1,4 @@
! Copyright (C) 2008 Daniel Ehrenberg.
! Copyright (C) 2008, 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: combinators.short-circuit assocs math kernel sequences
io.files hashtables quotations splitting grouping arrays io
@ -29,6 +29,21 @@ VALUE: properties
: char>name ( char -- name ) name-map value-at ;
: property? ( char property -- ? ) properties at interval-key? ;
: category# ( char -- category )
! There are a few characters that should be Cn
! that this gives Cf or Mn
! Cf = 26; Mn = 5; Cn = 29
! Use a compressed array instead?
dup category-map ?nth [ ] [
dup HEX: E0001 HEX: E007F between?
[ drop 26 ] [
HEX: E0100 HEX: E01EF between? 5 29 ?
] if
] ?if ;
: category ( char -- category )
category# categories nth ;
! Loading data from UnicodeData.txt
: split-; ( line -- array )
@ -195,33 +210,5 @@ load-special-casing to: special-casing
load-properties to: properties
! Utility to load resource files that look like Scripts.txt
SYMBOL: interned
: parse-script ( filename -- assoc )
! assoc is code point/range => name
ascii file-lines filter-comments [ split-; ] map ;
: range, ( value key -- )
swap interned get
[ = ] with find nip 2array , ;
: expand-ranges ( assoc -- interval-map )
[
[
swap CHAR: . over member? [
".." split1 [ hex> ] bi@ 2array
] [ hex> ] if range,
] assoc-each
] { } make <interval-map> ;
: process-script ( ranges -- table )
dup values prune interned
[ expand-ranges ] with-variable ;
: load-script ( filename -- table )
parse-script process-script ;
[ name>char [ "Invalid character" throw ] unless* ]
name>char-hook set-global

View File

@ -7,10 +7,40 @@ words words.symbol compiler.units arrays interval-maps
unicode.data ;
IN: unicode.script
<PRIVATE
SYMBOL: interned
: parse-script ( filename -- assoc )
! assoc is code point/range => name
ascii file-lines filter-comments [ split-; ] map ;
: range, ( value key -- )
swap interned get
[ = ] with find nip 2array , ;
: expand-ranges ( assoc -- interval-map )
[
[
swap CHAR: . over member? [
".." split1 [ hex> ] bi@ 2array
] [ hex> ] if range,
] assoc-each
] { } make <interval-map> ;
: process-script ( ranges -- table )
dup values prune interned
[ expand-ranges ] with-variable ;
: load-script ( filename -- table )
parse-script process-script ;
VALUE: script-table
"vocab:unicode/script/Scripts.txt" load-script
to: script-table
PRIVATE>
: script-of ( char -- script )
script-table interval-at ;

View File

@ -5,22 +5,7 @@ bit-arrays namespaces make sequences.private arrays quotations
assocs classes.predicate math.order strings.parser ;
IN: unicode.syntax
! Character classes (categories)
: category# ( char -- category )
! There are a few characters that should be Cn
! that this gives Cf or Mn
! Cf = 26; Mn = 5; Cn = 29
! Use a compressed array instead?
dup category-map ?nth [ ] [
dup HEX: E0001 HEX: E007F between?
[ drop 26 ] [
HEX: E0100 HEX: E01EF between? 5 29 ?
] if
] ?if ;
: category ( char -- category )
category# categories nth ;
<PRIVATE
: >category-array ( categories -- bitarray )
categories [ swap member? ] with map >bit-array ;
@ -40,6 +25,8 @@ IN: unicode.syntax
: define-category ( word categories -- )
[category] integer swap define-predicate-class ;
PRIVATE>
: CATEGORY:
CREATE ";" parse-tokens define-category ; parsing