From ba9938c30f5255718ff36d092ff010fcf454fe84 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Wed, 18 Mar 2009 16:09:45 -0500 Subject: [PATCH] Reorganizing things in regexp, mostly --- basis/regexp/classes/classes.factor | 34 +++++++++++---------- basis/regexp/combinators/combinators.factor | 4 +-- basis/regexp/nfa/nfa.factor | 10 +++--- basis/regexp/parser/parser.factor | 6 ++-- basis/unicode/data/data.factor | 32 +++++++++---------- 5 files changed, 44 insertions(+), 42 deletions(-) diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index c586932075..28b0ed1563 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -2,7 +2,8 @@ ! See http://factorcode.org/license.txt for BSD license. USING: accessors kernel math math.order words combinators locals ascii unicode.categories combinators.short-circuit sequences -fry macros arrays assocs sets classes mirrors ; +fry macros arrays assocs sets classes mirrors unicode.script +unicode.data ; IN: regexp.classes SINGLETONS: dot letter-class LETTER-class Letter-class digit-class @@ -14,8 +15,8 @@ unmatchable-class terminator-class word-boundary-class ; SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ^unix $unix word-break ; -TUPLE: range from to ; -C: range +TUPLE: range-class from to ; +C: range-class TUPLE: primitive-class class ; C: primitive-class @@ -35,7 +36,7 @@ M: t class-member? ( obj class -- ? ) 2drop t ; M: integer class-member? ( obj class -- ? ) = ; -M: range class-member? ( obj class -- ? ) +M: range-class class-member? ( obj class -- ? ) [ from>> ] [ to>> ] bi between? ; M: letter-class class-member? ( obj class -- ? ) @@ -119,7 +120,10 @@ M: category-range-class class-member? TUPLE: not-class class ; PREDICATE: not-integer < not-class class>> integer? ; -PREDICATE: not-primitive < not-class class>> primitive-class? ; + +UNION: simple-class + primitive-class range-class category-class category-range-class dot ; +PREDICATE: not-simple < not-class class>> simple-class? ; M: not-class class-member? class>> class-member? not ; @@ -146,14 +150,14 @@ DEFER: substitute [ drop class new seq { } like >>seq ] } case ; inline -TUPLE: class-partition integers not-integers primitives not-primitives and or other ; +TUPLE: class-partition integers not-integers simples not-simples and or other ; : partition-classes ( seq -- class-partition ) prune [ integer? ] partition [ not-integer? ] partition - [ primitive-class? ] partition ! extend primitive-class to epsilon tags - [ not-primitive? ] partition + [ simple-class? ] partition + [ not-simple? ] partition [ and-class? ] partition [ or-class? ] partition class-partition boa ; @@ -167,17 +171,17 @@ TUPLE: class-partition integers not-integers primitives not-primitives and or ot : filter-not-integers ( partition -- partition' ) dup - [ primitives>> ] [ not-primitives>> ] [ or>> ] tri + [ simples>> ] [ not-simples>> ] [ or>> ] tri 3append and-class boa '[ [ class>> _ class-member? ] filter ] change-not-integers ; : answer-ors ( partition -- partition' ) - dup [ not-integers>> ] [ not-primitives>> ] [ primitives>> ] tri 3append + dup [ not-integers>> ] [ not-simples>> ] [ simples>> ] tri 3append '[ [ _ [ t substitute ] each ] map ] change-or ; : contradiction? ( partition -- ? ) { - [ [ primitives>> ] [ not-primitives>> ] bi intersects? ] + [ [ simples>> ] [ not-simples>> ] bi intersects? ] [ other>> f swap member? ] } 1|| ; @@ -198,17 +202,17 @@ TUPLE: class-partition integers not-integers primitives not-primitives and or ot : filter-integers ( partition -- partition' ) dup - [ primitives>> ] [ not-primitives>> ] [ and>> ] tri + [ simples>> ] [ not-simples>> ] [ and>> ] tri 3append or-class boa '[ [ _ class-member? not ] filter ] change-integers ; : answer-ands ( partition -- partition' ) - dup [ integers>> ] [ not-primitives>> ] [ primitives>> ] tri 3append + dup [ integers>> ] [ not-simples>> ] [ simples>> ] tri 3append '[ [ _ [ f substitute ] each ] map ] change-and ; : tautology? ( partition -- ? ) { - [ [ primitives>> ] [ not-primitives>> ] bi intersects? ] + [ [ simples>> ] [ not-simples>> ] bi intersects? ] [ other>> t swap member? ] } 1|| ; @@ -247,8 +251,6 @@ M: f drop t ; M: primitive-class class-member? class>> class-member? ; -UNION: class primitive-class not-class or-class and-class range ; - TUPLE: condition question yes no ; C: condition diff --git a/basis/regexp/combinators/combinators.factor b/basis/regexp/combinators/combinators.factor index 2941afd99e..3bb5fcef6d 100644 --- a/basis/regexp/combinators/combinators.factor +++ b/basis/regexp/combinators/combinators.factor @@ -13,14 +13,14 @@ IN: regexp.combinators PRIVATE> -CONSTANT: R/ (?~.*)/ +CONSTANT: R/ (?~.*)/s : ( string -- regexp ) [ "\\Q" "\\E" surround ] [ ] bi make-regexp ; foldable : ( char1 char2 -- regexp ) [ [ "[" "-" surround ] [ "]" append ] bi* append ] - [ ] + [ ] 2bi make-regexp ; : ( regexps -- disjunction ) diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index 82c22a5af9..f04e88070a 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -4,7 +4,7 @@ USING: accessors arrays assocs grouping kernel locals math namespaces sequences fry quotations math.order math.ranges vectors unicode.categories regexp.transition-tables words sets hashtables combinators.short-circuit unicode.case unicode.case.private regexp.ast -regexp.classes ; +regexp.classes memoize ; IN: regexp.nfa ! This uses unicode.case.private for ch>upper and ch>lower @@ -140,17 +140,17 @@ M: LETTER-class modify-class modify-letter-class ; [ [ LETTER? ] bi@ and ] } 2|| ; -M: range modify-class +M: range-class modify-class case-insensitive option? [ dup cased-range? [ [ from>> ] [ to>> ] bi - [ [ ch>lower ] bi@ ] - [ [ ch>upper ] bi@ ] 2bi + [ [ ch>lower ] bi@ ] + [ [ ch>upper ] bi@ ] 2bi 2array ] when ] when ; -M: class nfa-node +M: object nfa-node modify-class add-simple-entry ; M: with-options nfa-node ( node -- start end ) diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index db18275f04..bf5465e0e2 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -23,7 +23,7 @@ ERROR: bad-class name ; drop f ; : unicode-class ( name -- class ) - parse-unicode-class [ bad-class ] unless* ; + dup parse-unicode-class [ ] [ bad-class ] ?if ; : name>class ( name -- class ) >string >case-fold { @@ -125,10 +125,10 @@ AnyRangeCharacter = EscapeSequence | . RangeCharacter = !("]") AnyRangeCharacter -Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b ]] +Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b ]] | RangeCharacter -StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b ]] +StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b ]] | AnyRangeCharacter Ranges = StartRange:s Range*:r => [[ r s prefix ]] diff --git a/basis/unicode/data/data.factor b/basis/unicode/data/data.factor index 90064ca376..a1f663d03a 100644 --- a/basis/unicode/data/data.factor +++ b/basis/unicode/data/data.factor @@ -29,6 +29,22 @@ VALUE: properties : char>name ( char -- name ) name-map value-at ; : property? ( char property -- ? ) properties at interval-key? ; +! For non-existent characters, use Cn +CONSTANT: categories + { "Cn" + "Lu" "Ll" "Lt" "Lm" "Lo" + "Mn" "Mc" "Me" + "Nd" "Nl" "No" + "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po" + "Sm" "Sc" "Sk" "So" + "Zs" "Zl" "Zp" + "Cc" "Cf" "Cs" "Co" } + +MEMO: categories-map ( -- hashtable ) + categories [ swap ] H{ } assoc-map-as ; + +CONSTANT: num-chars HEX: 2FA1E + : category# ( char -- category ) ! There are a few characters that should be Cn ! that this gives Cf or Mn @@ -112,22 +128,6 @@ VALUE: properties [ nip zero? not ] assoc-filter >hashtable ; -! For non-existent characters, use Cn -CONSTANT: categories - { "Cn" - "Lu" "Ll" "Lt" "Lm" "Lo" - "Mn" "Mc" "Me" - "Nd" "Nl" "No" - "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po" - "Sm" "Sc" "Sk" "So" - "Zs" "Zl" "Zp" - "Cc" "Cf" "Cs" "Co" } - -MEMO: categories-map ( -- hashtable ) - categories [ swap ] H{ } assoc-map-as ; - -CONSTANT: num-chars HEX: 2FA1E - ! the maximum unicode char in the first 3 planes : ?set-nth ( val index seq -- )