Reorganizing things in regexp, mostly
parent
54194d269c
commit
ba9938c30f
|
|
@ -2,7 +2,8 @@
|
|||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: accessors kernel math math.order words combinators locals
|
||||
ascii unicode.categories combinators.short-circuit sequences
|
||||
fry macros arrays assocs sets classes mirrors ;
|
||||
fry macros arrays assocs sets classes mirrors unicode.script
|
||||
unicode.data ;
|
||||
IN: regexp.classes
|
||||
|
||||
SINGLETONS: dot letter-class LETTER-class Letter-class digit-class
|
||||
|
|
@ -14,8 +15,8 @@ unmatchable-class terminator-class word-boundary-class ;
|
|||
SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file
|
||||
^unix $unix word-break ;
|
||||
|
||||
TUPLE: range from to ;
|
||||
C: <range> range
|
||||
TUPLE: range-class from to ;
|
||||
C: <range-class> range-class
|
||||
|
||||
TUPLE: primitive-class class ;
|
||||
C: <primitive-class> primitive-class
|
||||
|
|
@ -35,7 +36,7 @@ M: t class-member? ( obj class -- ? ) 2drop t ;
|
|||
|
||||
M: integer class-member? ( obj class -- ? ) = ;
|
||||
|
||||
M: range class-member? ( obj class -- ? )
|
||||
M: range-class class-member? ( obj class -- ? )
|
||||
[ from>> ] [ to>> ] bi between? ;
|
||||
|
||||
M: letter-class class-member? ( obj class -- ? )
|
||||
|
|
@ -119,7 +120,10 @@ M: category-range-class class-member?
|
|||
TUPLE: not-class class ;
|
||||
|
||||
PREDICATE: not-integer < not-class class>> integer? ;
|
||||
PREDICATE: not-primitive < not-class class>> primitive-class? ;
|
||||
|
||||
UNION: simple-class
|
||||
primitive-class range-class category-class category-range-class dot ;
|
||||
PREDICATE: not-simple < not-class class>> simple-class? ;
|
||||
|
||||
M: not-class class-member?
|
||||
class>> class-member? not ;
|
||||
|
|
@ -146,14 +150,14 @@ DEFER: substitute
|
|||
[ drop class new seq { } like >>seq ]
|
||||
} case ; inline
|
||||
|
||||
TUPLE: class-partition integers not-integers primitives not-primitives and or other ;
|
||||
TUPLE: class-partition integers not-integers simples not-simples and or other ;
|
||||
|
||||
: partition-classes ( seq -- class-partition )
|
||||
prune
|
||||
[ integer? ] partition
|
||||
[ not-integer? ] partition
|
||||
[ primitive-class? ] partition ! extend primitive-class to epsilon tags
|
||||
[ not-primitive? ] partition
|
||||
[ simple-class? ] partition
|
||||
[ not-simple? ] partition
|
||||
[ and-class? ] partition
|
||||
[ or-class? ] partition
|
||||
class-partition boa ;
|
||||
|
|
@ -167,17 +171,17 @@ TUPLE: class-partition integers not-integers primitives not-primitives and or ot
|
|||
|
||||
: filter-not-integers ( partition -- partition' )
|
||||
dup
|
||||
[ primitives>> ] [ not-primitives>> ] [ or>> ] tri
|
||||
[ simples>> ] [ not-simples>> ] [ or>> ] tri
|
||||
3append and-class boa
|
||||
'[ [ class>> _ class-member? ] filter ] change-not-integers ;
|
||||
|
||||
: answer-ors ( partition -- partition' )
|
||||
dup [ not-integers>> ] [ not-primitives>> ] [ primitives>> ] tri 3append
|
||||
dup [ not-integers>> ] [ not-simples>> ] [ simples>> ] tri 3append
|
||||
'[ [ _ [ t substitute ] each ] map ] change-or ;
|
||||
|
||||
: contradiction? ( partition -- ? )
|
||||
{
|
||||
[ [ primitives>> ] [ not-primitives>> ] bi intersects? ]
|
||||
[ [ simples>> ] [ not-simples>> ] bi intersects? ]
|
||||
[ other>> f swap member? ]
|
||||
} 1|| ;
|
||||
|
||||
|
|
@ -198,17 +202,17 @@ TUPLE: class-partition integers not-integers primitives not-primitives and or ot
|
|||
|
||||
: filter-integers ( partition -- partition' )
|
||||
dup
|
||||
[ primitives>> ] [ not-primitives>> ] [ and>> ] tri
|
||||
[ simples>> ] [ not-simples>> ] [ and>> ] tri
|
||||
3append or-class boa
|
||||
'[ [ _ class-member? not ] filter ] change-integers ;
|
||||
|
||||
: answer-ands ( partition -- partition' )
|
||||
dup [ integers>> ] [ not-primitives>> ] [ primitives>> ] tri 3append
|
||||
dup [ integers>> ] [ not-simples>> ] [ simples>> ] tri 3append
|
||||
'[ [ _ [ f substitute ] each ] map ] change-and ;
|
||||
|
||||
: tautology? ( partition -- ? )
|
||||
{
|
||||
[ [ primitives>> ] [ not-primitives>> ] bi intersects? ]
|
||||
[ [ simples>> ] [ not-simples>> ] bi intersects? ]
|
||||
[ other>> t swap member? ]
|
||||
} 1|| ;
|
||||
|
||||
|
|
@ -247,8 +251,6 @@ M: f <not-class> drop t ;
|
|||
M: primitive-class class-member?
|
||||
class>> class-member? ;
|
||||
|
||||
UNION: class primitive-class not-class or-class and-class range ;
|
||||
|
||||
TUPLE: condition question yes no ;
|
||||
C: <condition> condition
|
||||
|
||||
|
|
|
|||
|
|
@ -13,14 +13,14 @@ IN: regexp.combinators
|
|||
|
||||
PRIVATE>
|
||||
|
||||
CONSTANT: <nothing> R/ (?~.*)/
|
||||
CONSTANT: <nothing> R/ (?~.*)/s
|
||||
|
||||
: <literal> ( string -- regexp )
|
||||
[ "\\Q" "\\E" surround ] [ <concatenation> ] bi make-regexp ; foldable
|
||||
|
||||
: <char-range> ( char1 char2 -- regexp )
|
||||
[ [ "[" "-" surround ] [ "]" append ] bi* append ]
|
||||
[ <range> ]
|
||||
[ <range-class> ]
|
||||
2bi make-regexp ;
|
||||
|
||||
: <or> ( regexps -- disjunction )
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ USING: accessors arrays assocs grouping kernel locals math namespaces
|
|||
sequences fry quotations math.order math.ranges vectors
|
||||
unicode.categories regexp.transition-tables words sets hashtables
|
||||
combinators.short-circuit unicode.case unicode.case.private regexp.ast
|
||||
regexp.classes ;
|
||||
regexp.classes memoize ;
|
||||
IN: regexp.nfa
|
||||
|
||||
! This uses unicode.case.private for ch>upper and ch>lower
|
||||
|
|
@ -140,17 +140,17 @@ M: LETTER-class modify-class modify-letter-class ;
|
|||
[ [ LETTER? ] bi@ and ]
|
||||
} 2|| ;
|
||||
|
||||
M: range modify-class
|
||||
M: range-class modify-class
|
||||
case-insensitive option? [
|
||||
dup cased-range? [
|
||||
[ from>> ] [ to>> ] bi
|
||||
[ [ ch>lower ] bi@ <range> ]
|
||||
[ [ ch>upper ] bi@ <range> ] 2bi
|
||||
[ [ ch>lower ] bi@ <range-class> ]
|
||||
[ [ ch>upper ] bi@ <range-class> ] 2bi
|
||||
2array <or-class>
|
||||
] when
|
||||
] when ;
|
||||
|
||||
M: class nfa-node
|
||||
M: object nfa-node
|
||||
modify-class add-simple-entry ;
|
||||
|
||||
M: with-options nfa-node ( node -- start end )
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ ERROR: bad-class name ;
|
|||
drop f ;
|
||||
|
||||
: unicode-class ( name -- class )
|
||||
parse-unicode-class [ bad-class ] unless* ;
|
||||
dup parse-unicode-class [ ] [ bad-class ] ?if ;
|
||||
|
||||
: name>class ( name -- class )
|
||||
>string >case-fold {
|
||||
|
|
@ -125,10 +125,10 @@ AnyRangeCharacter = EscapeSequence | .
|
|||
|
||||
RangeCharacter = !("]") AnyRangeCharacter
|
||||
|
||||
Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
|
||||
Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range-class> ]]
|
||||
| RangeCharacter
|
||||
|
||||
StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
|
||||
StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range-class> ]]
|
||||
| AnyRangeCharacter
|
||||
|
||||
Ranges = StartRange:s Range*:r => [[ r s prefix ]]
|
||||
|
|
|
|||
|
|
@ -29,6 +29,22 @@ VALUE: properties
|
|||
: char>name ( char -- name ) name-map value-at ;
|
||||
: property? ( char property -- ? ) properties at interval-key? ;
|
||||
|
||||
! For non-existent characters, use Cn
|
||||
CONSTANT: categories
|
||||
{ "Cn"
|
||||
"Lu" "Ll" "Lt" "Lm" "Lo"
|
||||
"Mn" "Mc" "Me"
|
||||
"Nd" "Nl" "No"
|
||||
"Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
|
||||
"Sm" "Sc" "Sk" "So"
|
||||
"Zs" "Zl" "Zp"
|
||||
"Cc" "Cf" "Cs" "Co" }
|
||||
|
||||
MEMO: categories-map ( -- hashtable )
|
||||
categories <enum> [ swap ] H{ } assoc-map-as ;
|
||||
|
||||
CONSTANT: num-chars HEX: 2FA1E
|
||||
|
||||
: category# ( char -- category )
|
||||
! There are a few characters that should be Cn
|
||||
! that this gives Cf or Mn
|
||||
|
|
@ -112,22 +128,6 @@ VALUE: properties
|
|||
[ nip zero? not ] assoc-filter
|
||||
>hashtable ;
|
||||
|
||||
! For non-existent characters, use Cn
|
||||
CONSTANT: categories
|
||||
{ "Cn"
|
||||
"Lu" "Ll" "Lt" "Lm" "Lo"
|
||||
"Mn" "Mc" "Me"
|
||||
"Nd" "Nl" "No"
|
||||
"Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
|
||||
"Sm" "Sc" "Sk" "So"
|
||||
"Zs" "Zl" "Zp"
|
||||
"Cc" "Cf" "Cs" "Co" }
|
||||
|
||||
MEMO: categories-map ( -- hashtable )
|
||||
categories <enum> [ swap ] H{ } assoc-map-as ;
|
||||
|
||||
CONSTANT: num-chars HEX: 2FA1E
|
||||
|
||||
! the maximum unicode char in the first 3 planes
|
||||
|
||||
: ?set-nth ( val index seq -- )
|
||||
|
|
|
|||
Loading…
Reference in New Issue