Reorganizing things in regexp, mostly

db4
Daniel Ehrenberg 2009-03-18 16:09:45 -05:00
parent 54194d269c
commit ba9938c30f
5 changed files with 44 additions and 42 deletions

View File

@ -2,7 +2,8 @@
! See http://factorcode.org/license.txt for BSD license.
USING: accessors kernel math math.order words combinators locals
ascii unicode.categories combinators.short-circuit sequences
fry macros arrays assocs sets classes mirrors ;
fry macros arrays assocs sets classes mirrors unicode.script
unicode.data ;
IN: regexp.classes
SINGLETONS: dot letter-class LETTER-class Letter-class digit-class
@ -14,8 +15,8 @@ unmatchable-class terminator-class word-boundary-class ;
SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file
^unix $unix word-break ;
TUPLE: range from to ;
C: <range> range
TUPLE: range-class from to ;
C: <range-class> range-class
TUPLE: primitive-class class ;
C: <primitive-class> primitive-class
@ -35,7 +36,7 @@ M: t class-member? ( obj class -- ? ) 2drop t ;
M: integer class-member? ( obj class -- ? ) = ;
M: range class-member? ( obj class -- ? )
M: range-class class-member? ( obj class -- ? )
[ from>> ] [ to>> ] bi between? ;
M: letter-class class-member? ( obj class -- ? )
@ -119,7 +120,10 @@ M: category-range-class class-member?
TUPLE: not-class class ;
PREDICATE: not-integer < not-class class>> integer? ;
PREDICATE: not-primitive < not-class class>> primitive-class? ;
UNION: simple-class
primitive-class range-class category-class category-range-class dot ;
PREDICATE: not-simple < not-class class>> simple-class? ;
M: not-class class-member?
class>> class-member? not ;
@ -146,14 +150,14 @@ DEFER: substitute
[ drop class new seq { } like >>seq ]
} case ; inline
TUPLE: class-partition integers not-integers primitives not-primitives and or other ;
TUPLE: class-partition integers not-integers simples not-simples and or other ;
: partition-classes ( seq -- class-partition )
prune
[ integer? ] partition
[ not-integer? ] partition
[ primitive-class? ] partition ! extend primitive-class to epsilon tags
[ not-primitive? ] partition
[ simple-class? ] partition
[ not-simple? ] partition
[ and-class? ] partition
[ or-class? ] partition
class-partition boa ;
@ -167,17 +171,17 @@ TUPLE: class-partition integers not-integers primitives not-primitives and or ot
: filter-not-integers ( partition -- partition' )
dup
[ primitives>> ] [ not-primitives>> ] [ or>> ] tri
[ simples>> ] [ not-simples>> ] [ or>> ] tri
3append and-class boa
'[ [ class>> _ class-member? ] filter ] change-not-integers ;
: answer-ors ( partition -- partition' )
dup [ not-integers>> ] [ not-primitives>> ] [ primitives>> ] tri 3append
dup [ not-integers>> ] [ not-simples>> ] [ simples>> ] tri 3append
'[ [ _ [ t substitute ] each ] map ] change-or ;
: contradiction? ( partition -- ? )
{
[ [ primitives>> ] [ not-primitives>> ] bi intersects? ]
[ [ simples>> ] [ not-simples>> ] bi intersects? ]
[ other>> f swap member? ]
} 1|| ;
@ -198,17 +202,17 @@ TUPLE: class-partition integers not-integers primitives not-primitives and or ot
: filter-integers ( partition -- partition' )
dup
[ primitives>> ] [ not-primitives>> ] [ and>> ] tri
[ simples>> ] [ not-simples>> ] [ and>> ] tri
3append or-class boa
'[ [ _ class-member? not ] filter ] change-integers ;
: answer-ands ( partition -- partition' )
dup [ integers>> ] [ not-primitives>> ] [ primitives>> ] tri 3append
dup [ integers>> ] [ not-simples>> ] [ simples>> ] tri 3append
'[ [ _ [ f substitute ] each ] map ] change-and ;
: tautology? ( partition -- ? )
{
[ [ primitives>> ] [ not-primitives>> ] bi intersects? ]
[ [ simples>> ] [ not-simples>> ] bi intersects? ]
[ other>> t swap member? ]
} 1|| ;
@ -247,8 +251,6 @@ M: f <not-class> drop t ;
M: primitive-class class-member?
class>> class-member? ;
UNION: class primitive-class not-class or-class and-class range ;
TUPLE: condition question yes no ;
C: <condition> condition

View File

@ -13,14 +13,14 @@ IN: regexp.combinators
PRIVATE>
CONSTANT: <nothing> R/ (?~.*)/
CONSTANT: <nothing> R/ (?~.*)/s
: <literal> ( string -- regexp )
[ "\\Q" "\\E" surround ] [ <concatenation> ] bi make-regexp ; foldable
: <char-range> ( char1 char2 -- regexp )
[ [ "[" "-" surround ] [ "]" append ] bi* append ]
[ <range> ]
[ <range-class> ]
2bi make-regexp ;
: <or> ( regexps -- disjunction )

View File

@ -4,7 +4,7 @@ USING: accessors arrays assocs grouping kernel locals math namespaces
sequences fry quotations math.order math.ranges vectors
unicode.categories regexp.transition-tables words sets hashtables
combinators.short-circuit unicode.case unicode.case.private regexp.ast
regexp.classes ;
regexp.classes memoize ;
IN: regexp.nfa
! This uses unicode.case.private for ch>upper and ch>lower
@ -140,17 +140,17 @@ M: LETTER-class modify-class modify-letter-class ;
[ [ LETTER? ] bi@ and ]
} 2|| ;
M: range modify-class
M: range-class modify-class
case-insensitive option? [
dup cased-range? [
[ from>> ] [ to>> ] bi
[ [ ch>lower ] bi@ <range> ]
[ [ ch>upper ] bi@ <range> ] 2bi
[ [ ch>lower ] bi@ <range-class> ]
[ [ ch>upper ] bi@ <range-class> ] 2bi
2array <or-class>
] when
] when ;
M: class nfa-node
M: object nfa-node
modify-class add-simple-entry ;
M: with-options nfa-node ( node -- start end )

View File

@ -23,7 +23,7 @@ ERROR: bad-class name ;
drop f ;
: unicode-class ( name -- class )
parse-unicode-class [ bad-class ] unless* ;
dup parse-unicode-class [ ] [ bad-class ] ?if ;
: name>class ( name -- class )
>string >case-fold {
@ -125,10 +125,10 @@ AnyRangeCharacter = EscapeSequence | .
RangeCharacter = !("]") AnyRangeCharacter
Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range-class> ]]
| RangeCharacter
StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range-class> ]]
| AnyRangeCharacter
Ranges = StartRange:s Range*:r => [[ r s prefix ]]

View File

@ -29,6 +29,22 @@ VALUE: properties
: char>name ( char -- name ) name-map value-at ;
: property? ( char property -- ? ) properties at interval-key? ;
! For non-existent characters, use Cn
CONSTANT: categories
{ "Cn"
"Lu" "Ll" "Lt" "Lm" "Lo"
"Mn" "Mc" "Me"
"Nd" "Nl" "No"
"Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
"Sm" "Sc" "Sk" "So"
"Zs" "Zl" "Zp"
"Cc" "Cf" "Cs" "Co" }
MEMO: categories-map ( -- hashtable )
categories <enum> [ swap ] H{ } assoc-map-as ;
CONSTANT: num-chars HEX: 2FA1E
: category# ( char -- category )
! There are a few characters that should be Cn
! that this gives Cf or Mn
@ -112,22 +128,6 @@ VALUE: properties
[ nip zero? not ] assoc-filter
>hashtable ;
! For non-existent characters, use Cn
CONSTANT: categories
{ "Cn"
"Lu" "Ll" "Lt" "Lm" "Lo"
"Mn" "Mc" "Me"
"Nd" "Nl" "No"
"Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
"Sm" "Sc" "Sk" "So"
"Zs" "Zl" "Zp"
"Cc" "Cf" "Cs" "Co" }
MEMO: categories-map ( -- hashtable )
categories <enum> [ swap ] H{ } assoc-map-as ;
CONSTANT: num-chars HEX: 2FA1E
! the maximum unicode char in the first 3 planes
: ?set-nth ( val index seq -- )