diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index e3a1774585..c586932075 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -5,18 +5,30 @@ ascii unicode.categories combinators.short-circuit sequences fry macros arrays assocs sets classes mirrors ; IN: regexp.classes -SINGLETONS: any-char any-char-no-nl -letter-class LETTER-class Letter-class digit-class +SINGLETONS: dot letter-class LETTER-class Letter-class digit-class alpha-class non-newline-blank-class ascii-class punctuation-class java-printable-class blank-class control-character-class hex-digit-class java-blank-class c-identifier-class unmatchable-class terminator-class word-boundary-class ; -SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ^unix $unix word-break ; +SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file +^unix $unix word-break ; TUPLE: range from to ; C: <range> range +TUPLE: primitive-class class ; +C: <primitive-class> primitive-class + +TUPLE: category-class category ; +C: <category-class> category-class + +TUPLE: category-range-class category ; +C: <category-range-class> category-range-class + +TUPLE: script-class script ; +C: <script-class> script-class + GENERIC: class-member? ( obj class -- ? ) M: t class-member? ( obj class -- ? ) 2drop t ; @@ -26,12 +38,6 @@ M: integer class-member? ( obj class -- ? ) = ; M: range class-member? ( obj class -- ? ) [ from>> ] [ to>> ] bi between? ; -M: any-char class-member? ( obj class -- ? ) - 2drop t ; - -M: any-char-no-nl class-member? ( obj class -- ? ) - drop CHAR: \n = not ; - M: letter-class class-member? ( obj class -- ? ) drop letter? ; @@ -99,16 +105,16 @@ M: unmatchable-class class-member? ( obj class -- ? ) M: terminator-class class-member? ( obj class -- ? ) drop "\r\n\u000085\u002029\u002028" member? ; -M: ^ class-member? ( obj class -- ? ) - 2drop f ; - -M: $ class-member? ( obj class -- ? ) - 2drop f ; - M: f class-member? 2drop f ; -TUPLE: primitive-class class ; -C: <primitive-class> primitive-class +M: script-class class-member? + [ script-of ] [ script>> ] bi* = ; + +M: category-class class-member? + [ category# ] [ category>> ] bi* = ; + +M: category-range-class class-member? + [ category first ] [ category>> ] bi* = ; TUPLE: not-class class ; diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index d59d4818ec..82c22a5af9 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -117,8 +117,17 @@ M: or-class modify-class M: not-class modify-class class>> modify-class <not-class> ; -M: any-char modify-class - drop dotall option? t any-char-no-nl ? ; +MEMO: unix-dot ( -- class ) + CHAR: \n <not-class> ; + +MEMO: nonl-dot ( -- class ) + { CHAR: \n CHAR: \r } <or-class> <not-class> ; + +M: dot modify-class + drop dotall option? [ t ] [ + unix-lines option? + unix-dot nonl-dot ? + ] if ; : modify-letter-class ( class -- newclass ) case-insensitive option? [ drop Letter-class ] when ; diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index 7b2d6af2c1..db18275f04 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -18,6 +18,13 @@ ERROR: bad-number ; ERROR: bad-class name ; +: parse-unicode-class ( name -- class ) + ! Implement this! + drop f ; + +: unicode-class ( name -- class ) + parse-unicode-class [ bad-class ] unless* ; + : name>class ( name -- class ) >string >case-fold { { "lower" letter-class } @@ -32,8 +39,7 @@ ERROR: bad-class name ; { "cntrl" control-character-class } { "xdigit" hex-digit-class } { "space" java-blank-class } - ! TODO: unicode-character-class - } [ bad-class ] at-error ; + } [ unicode-class ] at-error ; : lookup-escape ( char -- ast ) { @@ -144,7 +150,7 @@ Parenthized = "?:" Alternation:a => [[ a ]] Element = "(" Parenthized:p ")" => [[ p ]] | "[" CharClass:r "]" => [[ r ]] - | ".":d => [[ any-char <primitive-class> ]] + | ".":d => [[ dot ]] | Character Number = (!(","|"}").)* => [[ string>number ensure-number ]] diff --git a/basis/unicode/data/data.factor b/basis/unicode/data/data.factor index 74914e8537..90064ca376 100644 --- a/basis/unicode/data/data.factor +++ b/basis/unicode/data/data.factor @@ -1,4 +1,4 @@ -! Copyright (C) 2008 Daniel Ehrenberg. +! Copyright (C) 2008, 2009 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: combinators.short-circuit assocs math kernel sequences io.files hashtables quotations splitting grouping arrays io @@ -29,6 +29,21 @@ VALUE: properties : char>name ( char -- name ) name-map value-at ; : property? ( char property -- ? ) properties at interval-key? ; +: category# ( char -- category ) + ! There are a few characters that should be Cn + ! that this gives Cf or Mn + ! Cf = 26; Mn = 5; Cn = 29 + ! Use a compressed array instead? + dup category-map ?nth [ ] [ + dup HEX: E0001 HEX: E007F between? + [ drop 26 ] [ + HEX: E0100 HEX: E01EF between? 5 29 ? + ] if + ] ?if ; + +: category ( char -- category ) + category# categories nth ; + ! Loading data from UnicodeData.txt : split-; ( line -- array ) @@ -195,33 +210,5 @@ load-special-casing to: special-casing load-properties to: properties -! Utility to load resource files that look like Scripts.txt - -SYMBOL: interned - -: parse-script ( filename -- assoc ) - ! assoc is code point/range => name - ascii file-lines filter-comments [ split-; ] map ; - -: range, ( value key -- ) - swap interned get - [ = ] with find nip 2array , ; - -: expand-ranges ( assoc -- interval-map ) - [ - [ - swap CHAR: . over member? [ - ".." split1 [ hex> ] bi@ 2array - ] [ hex> ] if range, - ] assoc-each - ] { } make <interval-map> ; - -: process-script ( ranges -- table ) - dup values prune interned - [ expand-ranges ] with-variable ; - -: load-script ( filename -- table ) - parse-script process-script ; - [ name>char [ "Invalid character" throw ] unless* ] name>char-hook set-global diff --git a/basis/unicode/script/script.factor b/basis/unicode/script/script.factor index 383f9e3de3..c8f818dbaa 100644 --- a/basis/unicode/script/script.factor +++ b/basis/unicode/script/script.factor @@ -7,10 +7,40 @@ words words.symbol compiler.units arrays interval-maps unicode.data ; IN: unicode.script +<PRIVATE + +SYMBOL: interned + +: parse-script ( filename -- assoc ) + ! assoc is code point/range => name + ascii file-lines filter-comments [ split-; ] map ; + +: range, ( value key -- ) + swap interned get + [ = ] with find nip 2array , ; + +: expand-ranges ( assoc -- interval-map ) + [ + [ + swap CHAR: . over member? [ + ".." split1 [ hex> ] bi@ 2array + ] [ hex> ] if range, + ] assoc-each + ] { } make <interval-map> ; + +: process-script ( ranges -- table ) + dup values prune interned + [ expand-ranges ] with-variable ; + +: load-script ( filename -- table ) + parse-script process-script ; + VALUE: script-table "vocab:unicode/script/Scripts.txt" load-script to: script-table +PRIVATE> + : script-of ( char -- script ) script-table interval-at ; diff --git a/basis/unicode/syntax/syntax.factor b/basis/unicode/syntax/syntax.factor index b7ac022d0e..5bd8c05e15 100644 --- a/basis/unicode/syntax/syntax.factor +++ b/basis/unicode/syntax/syntax.factor @@ -5,22 +5,7 @@ bit-arrays namespaces make sequences.private arrays quotations assocs classes.predicate math.order strings.parser ; IN: unicode.syntax -! Character classes (categories) - -: category# ( char -- category ) - ! There are a few characters that should be Cn - ! that this gives Cf or Mn - ! Cf = 26; Mn = 5; Cn = 29 - ! Use a compressed array instead? - dup category-map ?nth [ ] [ - dup HEX: E0001 HEX: E007F between? - [ drop 26 ] [ - HEX: E0100 HEX: E01EF between? 5 29 ? - ] if - ] ?if ; - -: category ( char -- category ) - category# categories nth ; +<PRIVATE : >category-array ( categories -- bitarray ) categories [ swap member? ] with map >bit-array ; @@ -40,6 +25,8 @@ IN: unicode.syntax : define-category ( word categories -- ) [category] integer swap define-predicate-class ; +PRIVATE> + : CATEGORY: CREATE ";" parse-tokens define-category ; parsing