Various regexp changes, including the addition of regexp combinators
parent
e54727f9bf
commit
c708bfcbca
|
@ -13,7 +13,10 @@ C: <from-to> from-to
|
||||||
TUPLE: at-least n ;
|
TUPLE: at-least n ;
|
||||||
C: <at-least> at-least
|
C: <at-least> at-least
|
||||||
|
|
||||||
SINGLETON: epsilon
|
TUPLE: tagged-epsilon tag ;
|
||||||
|
C: <tagged-epsilon> tagged-epsilon
|
||||||
|
|
||||||
|
CONSTANT: epsilon T{ tagged-epsilon }
|
||||||
|
|
||||||
TUPLE: concatenation first second ;
|
TUPLE: concatenation first second ;
|
||||||
|
|
||||||
|
@ -60,3 +63,10 @@ C: <lookahead> lookahead
|
||||||
|
|
||||||
TUPLE: lookbehind term ;
|
TUPLE: lookbehind term ;
|
||||||
C: <lookbehind> lookbehind
|
C: <lookbehind> lookbehind
|
||||||
|
|
||||||
|
TUPLE: possessive-star term ;
|
||||||
|
C: <possessive-star> possessive-star
|
||||||
|
|
||||||
|
: <possessive-plus> ( term -- term' )
|
||||||
|
dup <possessive-star> 2array <concatenation> ;
|
||||||
|
|
||||||
|
|
|
@ -12,8 +12,7 @@ ascii-class punctuation-class java-printable-class blank-class
|
||||||
control-character-class hex-digit-class java-blank-class c-identifier-class
|
control-character-class hex-digit-class java-blank-class c-identifier-class
|
||||||
unmatchable-class terminator-class word-boundary-class ;
|
unmatchable-class terminator-class word-boundary-class ;
|
||||||
|
|
||||||
SINGLETONS: beginning-of-input beginning-of-line
|
SINGLETONS: beginning-of-input ^ end-of-input $ ;
|
||||||
end-of-input end-of-line ;
|
|
||||||
|
|
||||||
TUPLE: range from to ;
|
TUPLE: range from to ;
|
||||||
C: <range> range
|
C: <range> range
|
||||||
|
@ -100,10 +99,10 @@ M: unmatchable-class class-member? ( obj class -- ? )
|
||||||
M: terminator-class class-member? ( obj class -- ? )
|
M: terminator-class class-member? ( obj class -- ? )
|
||||||
drop "\r\n\u000085\u002029\u002028" member? ;
|
drop "\r\n\u000085\u002029\u002028" member? ;
|
||||||
|
|
||||||
M: beginning-of-line class-member? ( obj class -- ? )
|
M: ^ class-member? ( obj class -- ? )
|
||||||
2drop f ;
|
2drop f ;
|
||||||
|
|
||||||
M: end-of-line class-member? ( obj class -- ? )
|
M: $ class-member? ( obj class -- ? )
|
||||||
2drop f ;
|
2drop f ;
|
||||||
|
|
||||||
M: f class-member? 2drop f ;
|
M: f class-member? 2drop f ;
|
||||||
|
|
|
@ -0,0 +1,29 @@
|
||||||
|
! Copyright (C) 2009 Daniel Ehrenberg
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: regexp.combinators tools.test regexp kernel sequences ;
|
||||||
|
IN: regexp.combinators.tests
|
||||||
|
|
||||||
|
: strings ( -- regexp )
|
||||||
|
{ "foo" "bar" "baz" } <any-of> ;
|
||||||
|
|
||||||
|
[ t t t ] [ "foo" "bar" "baz" [ strings matches? ] tri@ ] unit-test
|
||||||
|
[ f f f ] [ "food" "ibar" "ba" [ strings matches? ] tri@ ] unit-test
|
||||||
|
|
||||||
|
: conj ( -- regexp )
|
||||||
|
{ R/ .*a/ R/ b.*/ } <and> ;
|
||||||
|
|
||||||
|
[ t ] [ "bljhasflsda" conj matches? ] unit-test
|
||||||
|
[ f ] [ "bsdfdfs" conj matches? ] unit-test ! why does this fail?
|
||||||
|
[ f ] [ "fsfa" conj matches? ] unit-test
|
||||||
|
|
||||||
|
! For some reason, creating this DFA doesn't work
|
||||||
|
! [ f ] [ "bljhasflsda" conj <not> matches? ] unit-test
|
||||||
|
! [ t ] [ "bsdfdfs" conj <not> matches? ] unit-test
|
||||||
|
! [ t ] [ "fsfa" conj <not> matches? ] unit-test
|
||||||
|
|
||||||
|
[ f f ] [ "" "hi" [ <nothing> matches? ] bi@ ] unit-test
|
||||||
|
[ t t ] [ "" "hi" [ <nothing> <not> matches? ] bi@ ] unit-test
|
||||||
|
|
||||||
|
[ { t t t f } ] [ { "" "a" "aaaaa" "aab" } [ "a" <literal> <zero-or-more> matches? ] map ] unit-test
|
||||||
|
[ { f t t f } ] [ { "" "a" "aaaaa" "aab" } [ "a" <literal> <one-or-more> matches? ] map ] unit-test
|
||||||
|
[ { t t f f } ] [ { "" "a" "aaaaa" "aab" } [ "a" <literal> <option> matches? ] map ] unit-test
|
|
@ -0,0 +1,48 @@
|
||||||
|
! Copyright (C) 2009 Daniel Ehrenberg
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: regexp sequences kernel regexp.negation regexp.ast
|
||||||
|
accessors fry ;
|
||||||
|
IN: regexp.combinators
|
||||||
|
|
||||||
|
: <nothing> ( -- regexp )
|
||||||
|
R/ (?~.*)/ ;
|
||||||
|
|
||||||
|
: <literal> ( string -- regexp )
|
||||||
|
[ "\\Q" "\\E" surround ] [ <concatenation> ] bi make-regexp ;
|
||||||
|
|
||||||
|
: <or> ( regexps -- disjunction )
|
||||||
|
[ [ raw>> "(" ")" surround ] map "|" join ]
|
||||||
|
[ [ parse-tree>> ] map <alternation> ] bi
|
||||||
|
make-regexp ;
|
||||||
|
|
||||||
|
: <any-of> ( strings -- regexp )
|
||||||
|
[ <literal> ] map <or> ;
|
||||||
|
|
||||||
|
: <sequence> ( regexps -- regexp )
|
||||||
|
[ [ raw>> ] map concat ]
|
||||||
|
[ [ parse-tree>> ] map <concatenation> ] bi
|
||||||
|
make-regexp ;
|
||||||
|
|
||||||
|
: modify-regexp ( regexp raw-quot tree-quot -- new-regexp )
|
||||||
|
[ '[ raw>> @ ] ]
|
||||||
|
[ '[ parse-tree>> @ ] ] bi* bi
|
||||||
|
make-regexp ; inline
|
||||||
|
|
||||||
|
: <not> ( regexp -- not-regexp )
|
||||||
|
[ "(?~" ")" surround ]
|
||||||
|
[ <negation> ] modify-regexp ;
|
||||||
|
|
||||||
|
: <and> ( regexps -- conjunction )
|
||||||
|
[ <not> ] map <or> <not> ;
|
||||||
|
|
||||||
|
: <zero-or-more> ( regexp -- regexp* )
|
||||||
|
[ "(" ")*" surround ]
|
||||||
|
[ <star> ] modify-regexp ;
|
||||||
|
|
||||||
|
: <one-or-more> ( regexp -- regexp+ )
|
||||||
|
[ "(" ")+" surround ]
|
||||||
|
[ <plus> ] modify-regexp ;
|
||||||
|
|
||||||
|
: <option> ( regexp -- regexp? )
|
||||||
|
[ "(" ")?" surround ]
|
||||||
|
[ <maybe> ] modify-regexp ;
|
|
@ -2,7 +2,7 @@
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: accessors arrays assocs combinators fry kernel locals
|
USING: accessors arrays assocs combinators fry kernel locals
|
||||||
math math.order regexp.nfa regexp.transition-tables sequences
|
math math.order regexp.nfa regexp.transition-tables sequences
|
||||||
sets sorting vectors ;
|
sets sorting vectors regexp.ast ;
|
||||||
IN: regexp.dfa
|
IN: regexp.dfa
|
||||||
|
|
||||||
:: (while-changes) ( obj quot: ( obj -- obj' ) comp: ( obj -- key ) old-key -- obj )
|
:: (while-changes) ( obj quot: ( obj -- obj' ) comp: ( obj -- key ) old-key -- obj )
|
||||||
|
@ -20,7 +20,7 @@ IN: regexp.dfa
|
||||||
transitions>> '[ _ swap _ at at ] gather sift ;
|
transitions>> '[ _ swap _ at at ] gather sift ;
|
||||||
|
|
||||||
: (find-epsilon-closure) ( states nfa -- new-states )
|
: (find-epsilon-closure) ( states nfa -- new-states )
|
||||||
eps swap find-delta ;
|
epsilon swap find-delta ;
|
||||||
|
|
||||||
: find-epsilon-closure ( states nfa -- new-states )
|
: find-epsilon-closure ( states nfa -- new-states )
|
||||||
'[ dup _ (find-epsilon-closure) union ] [ length ] while-changes
|
'[ dup _ (find-epsilon-closure) union ] [ length ] while-changes
|
||||||
|
@ -35,7 +35,7 @@ IN: regexp.dfa
|
||||||
: find-transitions ( dfa-state nfa -- next-dfa-state )
|
: find-transitions ( dfa-state nfa -- next-dfa-state )
|
||||||
transitions>>
|
transitions>>
|
||||||
'[ _ at keys ] gather
|
'[ _ at keys ] gather
|
||||||
eps swap remove ;
|
epsilon swap remove ;
|
||||||
|
|
||||||
: add-todo-state ( state visited-states new-states -- )
|
: add-todo-state ( state visited-states new-states -- )
|
||||||
3dup drop key? [ 3drop ] [
|
3dup drop key? [ 3drop ] [
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: regexp.nfa regexp.disambiguate kernel sequences
|
USING: regexp.nfa regexp.disambiguate kernel sequences
|
||||||
assocs regexp.classes hashtables accessors fry vectors
|
assocs regexp.classes hashtables accessors fry vectors
|
||||||
regexp.ast regexp.transition-tables regexp.minimize ;
|
regexp.ast regexp.transition-tables regexp.minimize namespaces ;
|
||||||
IN: regexp.negation
|
IN: regexp.negation
|
||||||
|
|
||||||
: ast>dfa ( parse-tree -- minimal-dfa )
|
: ast>dfa ( parse-tree -- minimal-dfa )
|
||||||
|
@ -48,14 +48,14 @@ CONSTANT: fail-state -1
|
||||||
|
|
||||||
: unify-final-state ( transition-table -- transition-table )
|
: unify-final-state ( transition-table -- transition-table )
|
||||||
dup [ final-states>> keys ] keep
|
dup [ final-states>> keys ] keep
|
||||||
'[ -2 eps <literal-transition> _ add-transition ] each
|
'[ -2 epsilon <literal-transition> _ add-transition ] each
|
||||||
H{ { -2 -2 } } >>final-states ;
|
H{ { -2 -2 } } >>final-states ;
|
||||||
|
|
||||||
: adjoin-dfa ( transition-table -- start end )
|
: adjoin-dfa ( transition-table -- start end )
|
||||||
box-transitions unify-final-state renumber-states
|
box-transitions unify-final-state renumber-states
|
||||||
[ start-state>> ]
|
[ start-state>> ]
|
||||||
[ final-states>> keys first ]
|
[ final-states>> keys first ]
|
||||||
[ table [ transitions>> ] bi@ swap update ] tri ;
|
[ nfa-table get [ transitions>> ] bi@ swap update ] tri ;
|
||||||
|
|
||||||
M: negation nfa-node ( node -- start end )
|
M: negation nfa-node ( node -- start end )
|
||||||
term>> ast>dfa negate-table adjoin-dfa ;
|
term>> ast>dfa negate-table adjoin-dfa ;
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
! Copyright (C) 2008 Doug Coleman.
|
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: accessors arrays assocs grouping kernel
|
USING: accessors arrays assocs grouping kernel
|
||||||
locals math namespaces sequences fry quotations
|
locals math namespaces sequences fry quotations
|
||||||
|
@ -24,8 +24,6 @@ M: alternation remove-lookahead
|
||||||
|
|
||||||
M: concatenation remove-lookahead ;
|
M: concatenation remove-lookahead ;
|
||||||
|
|
||||||
SINGLETON: eps
|
|
||||||
|
|
||||||
SYMBOL: option-stack
|
SYMBOL: option-stack
|
||||||
|
|
||||||
SYMBOL: state
|
SYMBOL: state
|
||||||
|
@ -34,7 +32,6 @@ SYMBOL: state
|
||||||
state [ get ] [ inc ] bi ;
|
state [ get ] [ inc ] bi ;
|
||||||
|
|
||||||
SYMBOL: nfa-table
|
SYMBOL: nfa-table
|
||||||
: table ( -- table ) nfa-table get ;
|
|
||||||
|
|
||||||
: set-each ( keys value hashtable -- )
|
: set-each ( keys value hashtable -- )
|
||||||
'[ _ swap _ set-at ] each ;
|
'[ _ swap _ set-at ] each ;
|
||||||
|
@ -56,10 +53,10 @@ GENERIC: nfa-node ( node -- start-state end-state )
|
||||||
|
|
||||||
: add-simple-entry ( obj class -- start-state end-state )
|
: add-simple-entry ( obj class -- start-state end-state )
|
||||||
[ next-state next-state 2dup ] 2dip
|
[ next-state next-state 2dup ] 2dip
|
||||||
make-transition table add-transition ;
|
make-transition nfa-table get add-transition ;
|
||||||
|
|
||||||
: epsilon-transition ( source target -- )
|
: epsilon-transition ( source target -- )
|
||||||
eps <literal-transition> table add-transition ;
|
epsilon <literal-transition> nfa-table get add-transition ;
|
||||||
|
|
||||||
M:: star nfa-node ( node -- start end )
|
M:: star nfa-node ( node -- start end )
|
||||||
node term>> nfa-node :> s1 :> s0
|
node term>> nfa-node :> s1 :> s0
|
||||||
|
@ -71,8 +68,8 @@ M:: star nfa-node ( node -- start end )
|
||||||
s1 s3 epsilon-transition
|
s1 s3 epsilon-transition
|
||||||
s2 s3 ;
|
s2 s3 ;
|
||||||
|
|
||||||
M: epsilon nfa-node
|
M: tagged-epsilon nfa-node
|
||||||
drop eps literal-transition add-simple-entry ;
|
literal-transition add-simple-entry ;
|
||||||
|
|
||||||
M: concatenation nfa-node ( node -- start end )
|
M: concatenation nfa-node ( node -- start end )
|
||||||
[ first>> ] [ second>> ] bi
|
[ first>> ] [ second>> ] bi
|
||||||
|
@ -154,7 +151,7 @@ M: with-options nfa-node ( node -- start end )
|
||||||
0 state set
|
0 state set
|
||||||
<transition-table> nfa-table set
|
<transition-table> nfa-table set
|
||||||
remove-lookahead nfa-node
|
remove-lookahead nfa-node
|
||||||
table
|
nfa-table get
|
||||||
swap dup associate >>final-states
|
swap dup associate >>final-states
|
||||||
swap >>start-state
|
swap >>start-state
|
||||||
] with-scope ;
|
] with-scope ;
|
||||||
|
|
|
@ -6,7 +6,7 @@ regexp.ast ;
|
||||||
IN: regexp.parser
|
IN: regexp.parser
|
||||||
|
|
||||||
: allowed-char? ( ch -- ? )
|
: allowed-char? ( ch -- ? )
|
||||||
".()|[*+?" member? not ;
|
".()|[*+?$^" member? not ;
|
||||||
|
|
||||||
ERROR: bad-number ;
|
ERROR: bad-number ;
|
||||||
|
|
||||||
|
@ -53,6 +53,8 @@ ERROR: bad-class name ;
|
||||||
{ CHAR: d [ digit-class <primitive-class> ] }
|
{ CHAR: d [ digit-class <primitive-class> ] }
|
||||||
{ CHAR: D [ digit-class <primitive-class> <not-class> ] }
|
{ CHAR: D [ digit-class <primitive-class> <not-class> ] }
|
||||||
|
|
||||||
|
{ CHAR: z [ end-of-input <tagged-epsilon> ] }
|
||||||
|
{ CHAR: A [ beginning-of-input <tagged-epsilon> ] }
|
||||||
[ ]
|
[ ]
|
||||||
} case ;
|
} case ;
|
||||||
|
|
||||||
|
@ -109,7 +111,10 @@ Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-cl
|
||||||
|
|
||||||
EscapeSequence = "\\" Escape:e => [[ e ]]
|
EscapeSequence = "\\" Escape:e => [[ e ]]
|
||||||
|
|
||||||
Character = EscapeSequence | . ?[ allowed-char? ]?
|
Character = EscapeSequence
|
||||||
|
| "$" => [[ $ <tagged-epsilon> ]]
|
||||||
|
| "^" => [[ ^ <tagged-epsilon> ]]
|
||||||
|
| . ?[ allowed-char? ]?
|
||||||
|
|
||||||
AnyRangeCharacter = EscapeSequence | .
|
AnyRangeCharacter = EscapeSequence | .
|
||||||
|
|
||||||
|
@ -152,6 +157,8 @@ Times = "," Number:n "}" => [[ 0 n <from-to> ]]
|
||||||
| Number:n "," Number:m "}" => [[ n m <from-to> ]]
|
| Number:n "," Number:m "}" => [[ n m <from-to> ]]
|
||||||
|
|
||||||
Repeated = Element:e "{" Times:t => [[ e t <times> ]]
|
Repeated = Element:e "{" Times:t => [[ e t <times> ]]
|
||||||
|
| Element:e "*+" => [[ e <possessive-star> ]]
|
||||||
|
| Element:e "++" => [[ e <possessive-plus> ]]
|
||||||
| Element:e "?" => [[ e <maybe> ]]
|
| Element:e "?" => [[ e <maybe> ]]
|
||||||
| Element:e "*" => [[ e <star> ]]
|
| Element:e "*" => [[ e <star> ]]
|
||||||
| Element:e "+" => [[ e <plus> ]]
|
| Element:e "+" => [[ e <plus> ]]
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
USING: regexp tools.test kernel sequences regexp.parser
|
USING: regexp tools.test kernel sequences regexp.parser
|
||||||
regexp.traversal eval strings multiline ;
|
regexp.traversal eval strings multiline accessors ;
|
||||||
IN: regexp-tests
|
IN: regexp-tests
|
||||||
|
|
||||||
\ <regexp> must-infer
|
\ <regexp> must-infer
|
||||||
|
@ -332,6 +332,16 @@ IN: regexp-tests
|
||||||
! Intersecting classes
|
! Intersecting classes
|
||||||
[ t ] [ "ab" R/ ac|\p{Lower}b/ matches? ] unit-test
|
[ t ] [ "ab" R/ ac|\p{Lower}b/ matches? ] unit-test
|
||||||
[ t ] [ "ab" R/ ac|[a-z]b/ matches? ] unit-test
|
[ t ] [ "ab" R/ ac|[a-z]b/ matches? ] unit-test
|
||||||
|
[ t ] [ "ac" R/ ac|\p{Lower}b/ matches? ] unit-test
|
||||||
|
[ t ] [ "ac" R/ ac|[a-z]b/ matches? ] unit-test
|
||||||
|
[ t ] [ "ac" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
|
||||||
|
[ t ] [ "ab" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
|
||||||
|
[ t ] [ "πb" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
|
||||||
|
[ f ] [ "πc" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
|
||||||
|
[ f ] [ "Ab" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
|
||||||
|
|
||||||
|
[ f ] [ "foo" <regexp> dfa>> >boolean ] unit-test
|
||||||
|
[ t ] [ R/ foo/ dfa>> >boolean ] unit-test
|
||||||
|
|
||||||
! [ t ] [ "a" R/ ^a/ matches? ] unit-test
|
! [ t ] [ "a" R/ ^a/ matches? ] unit-test
|
||||||
! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
|
! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
! Copyright (C) 2008 Doug Coleman.
|
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: accessors combinators kernel math sequences strings sets
|
USING: accessors combinators kernel math sequences strings sets
|
||||||
assocs prettyprint.backend prettyprint.custom make lexer
|
assocs prettyprint.backend prettyprint.custom make lexer
|
||||||
|
@ -10,17 +10,28 @@ IN: regexp
|
||||||
|
|
||||||
TUPLE: regexp raw parse-tree options dfa ;
|
TUPLE: regexp raw parse-tree options dfa ;
|
||||||
|
|
||||||
|
: make-regexp ( string ast -- regexp )
|
||||||
|
f f <options> f regexp boa ;
|
||||||
|
|
||||||
: <optioned-regexp> ( string options -- regexp )
|
: <optioned-regexp> ( string options -- regexp )
|
||||||
[ dup parse-regexp ] [ string>options ] bi*
|
[ dup parse-regexp ] [ string>options ] bi*
|
||||||
2dup <with-options> ast>dfa
|
f regexp boa ;
|
||||||
regexp boa ;
|
|
||||||
|
|
||||||
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
|
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
|
||||||
|
: get-dfa ( regexp -- dfa )
|
||||||
|
dup dfa>> [ ] [
|
||||||
|
dup
|
||||||
|
[ parse-tree>> ]
|
||||||
|
[ options>> ] bi
|
||||||
|
<with-options> ast>dfa
|
||||||
|
[ >>dfa drop ] keep
|
||||||
|
] ?if ;
|
||||||
|
|
||||||
: (match) ( string regexp -- dfa-traverser )
|
: (match) ( string regexp -- dfa-traverser )
|
||||||
dfa>> <dfa-traverser> do-match ; inline
|
get-dfa <dfa-traverser> do-match ; inline
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
||||||
|
@ -97,7 +108,7 @@ PRIVATE>
|
||||||
[ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
|
[ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
|
||||||
lexer get dup still-parsing-line?
|
lexer get dup still-parsing-line?
|
||||||
[ (parse-token) ] [ drop f ] if
|
[ (parse-token) ] [ drop f ] if
|
||||||
<optioned-regexp> parsed ;
|
<optioned-regexp> dup get-dfa drop parsed ;
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue