Rewriting regexp parser
parent
00c5395d31
commit
105ef28433
|
@ -11,22 +11,10 @@ IN: regexp.nfa
|
||||||
|
|
||||||
ERROR: feature-is-broken feature ;
|
ERROR: feature-is-broken feature ;
|
||||||
|
|
||||||
SYMBOL: negation-mode
|
SYMBOL: negated?
|
||||||
: negated? ( -- ? ) negation-mode get 0 or odd? ;
|
|
||||||
|
|
||||||
SINGLETON: eps
|
SINGLETON: eps
|
||||||
|
|
||||||
MIXIN: traversal-flag
|
|
||||||
SINGLETON: lookahead-on INSTANCE: lookahead-on traversal-flag
|
|
||||||
SINGLETON: lookahead-off INSTANCE: lookahead-off traversal-flag
|
|
||||||
SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag
|
|
||||||
SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag
|
|
||||||
SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag
|
|
||||||
SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag
|
|
||||||
SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag
|
|
||||||
SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag
|
|
||||||
SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag
|
|
||||||
|
|
||||||
: options ( -- obj ) current-regexp get options>> ;
|
: options ( -- obj ) current-regexp get options>> ;
|
||||||
|
|
||||||
: option? ( obj -- ? ) options key? ;
|
: option? ( obj -- ? ) options key? ;
|
||||||
|
@ -53,7 +41,7 @@ GENERIC: nfa-node ( node -- )
|
||||||
s1 [ regexp next-state ]
|
s1 [ regexp next-state ]
|
||||||
stack [ regexp stack>> ]
|
stack [ regexp stack>> ]
|
||||||
table [ regexp nfa-table>> ] |
|
table [ regexp nfa-table>> ] |
|
||||||
negated? [
|
negated? get [
|
||||||
s0 f obj class make-transition table add-transition
|
s0 f obj class make-transition table add-transition
|
||||||
s0 s1 <default-transition> table add-transition
|
s0 s1 <default-transition> table add-transition
|
||||||
] [
|
] [
|
||||||
|
@ -62,10 +50,6 @@ GENERIC: nfa-node ( node -- )
|
||||||
s0 s1 2array stack push
|
s0 s1 2array stack push
|
||||||
t s1 table final-states>> set-at ] ;
|
t s1 table final-states>> set-at ] ;
|
||||||
|
|
||||||
: add-traversal-flag ( flag -- )
|
|
||||||
stack peek second
|
|
||||||
current-regexp get nfa-traversal-flags>> push-at ;
|
|
||||||
|
|
||||||
:: concatenate-nodes ( -- )
|
:: concatenate-nodes ( -- )
|
||||||
[let* | regexp [ current-regexp get ]
|
[let* | regexp [ current-regexp get ]
|
||||||
stack [ regexp stack>> ]
|
stack [ regexp stack>> ]
|
||||||
|
@ -97,7 +81,7 @@ GENERIC: nfa-node ( node -- )
|
||||||
t s5 table final-states>> set-at
|
t s5 table final-states>> set-at
|
||||||
s4 s5 2array stack push ] ;
|
s4 s5 2array stack push ] ;
|
||||||
|
|
||||||
M: kleene-star nfa-node ( node -- )
|
M: star nfa-node ( node -- )
|
||||||
term>> nfa-node
|
term>> nfa-node
|
||||||
[let* | regexp [ current-regexp get ]
|
[let* | regexp [ current-regexp get ]
|
||||||
stack [ regexp stack>> ]
|
stack [ regexp stack>> ]
|
||||||
|
@ -139,17 +123,12 @@ M: constant nfa-node ( node -- )
|
||||||
char>> literal-transition add-simple-entry
|
char>> literal-transition add-simple-entry
|
||||||
] if ;
|
] if ;
|
||||||
|
|
||||||
M: epsilon nfa-node ( node -- )
|
|
||||||
drop eps literal-transition add-simple-entry ;
|
|
||||||
|
|
||||||
M: word nfa-node ( node -- ) class-transition add-simple-entry ;
|
M: word nfa-node ( node -- ) class-transition add-simple-entry ;
|
||||||
|
|
||||||
M: any-char nfa-node ( node -- )
|
M: any-char nfa-node ( node -- )
|
||||||
[ dotall option? ] dip any-char-no-nl ?
|
[ dotall option? ] dip any-char-no-nl ?
|
||||||
class-transition add-simple-entry ;
|
class-transition add-simple-entry ;
|
||||||
|
|
||||||
! M: beginning-of-text nfa-node ( node -- ) ;
|
|
||||||
|
|
||||||
M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
|
M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
|
||||||
|
|
||||||
M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
|
M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
|
||||||
|
@ -182,38 +161,6 @@ M: character-class-range nfa-node ( node -- )
|
||||||
class-transition add-simple-entry
|
class-transition add-simple-entry
|
||||||
] if ;
|
] if ;
|
||||||
|
|
||||||
M: capture-group nfa-node ( node -- )
|
|
||||||
term>> nfa-node ;
|
|
||||||
|
|
||||||
M: non-capture-group nfa-node ( node -- )
|
|
||||||
term>> nfa-node ;
|
|
||||||
|
|
||||||
M: reluctant-kleene-star nfa-node ( node -- )
|
|
||||||
term>> <kleene-star> nfa-node ;
|
|
||||||
|
|
||||||
M: negation nfa-node ( node -- )
|
|
||||||
negation-mode inc
|
|
||||||
term>> nfa-node
|
|
||||||
negation-mode dec ;
|
|
||||||
|
|
||||||
M: lookahead nfa-node ( node -- )
|
|
||||||
"lookahead" feature-is-broken
|
|
||||||
eps literal-transition add-simple-entry
|
|
||||||
lookahead-on add-traversal-flag
|
|
||||||
term>> nfa-node
|
|
||||||
eps literal-transition add-simple-entry
|
|
||||||
lookahead-off add-traversal-flag
|
|
||||||
2 [ concatenate-nodes ] times ;
|
|
||||||
|
|
||||||
M: lookbehind nfa-node ( node -- )
|
|
||||||
"lookbehind" feature-is-broken
|
|
||||||
eps literal-transition add-simple-entry
|
|
||||||
lookbehind-on add-traversal-flag
|
|
||||||
term>> nfa-node
|
|
||||||
eps literal-transition add-simple-entry
|
|
||||||
lookbehind-off add-traversal-flag
|
|
||||||
2 [ concatenate-nodes ] times ;
|
|
||||||
|
|
||||||
M: option nfa-node ( node -- )
|
M: option nfa-node ( node -- )
|
||||||
[ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if
|
[ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if
|
||||||
eps literal-transition add-simple-entry ;
|
eps literal-transition add-simple-entry ;
|
||||||
|
@ -221,7 +168,6 @@ M: option nfa-node ( node -- )
|
||||||
: construct-nfa ( regexp -- )
|
: construct-nfa ( regexp -- )
|
||||||
[
|
[
|
||||||
reset-regexp
|
reset-regexp
|
||||||
negation-mode off
|
|
||||||
[ current-regexp set ]
|
[ current-regexp set ]
|
||||||
[ parse-tree>> nfa-node ]
|
[ parse-tree>> nfa-node ]
|
||||||
[ set-start-state ] tri
|
[ set-start-state ] tri
|
||||||
|
|
|
@ -1,34 +1,24 @@
|
||||||
USING: kernel tools.test regexp.backend regexp ;
|
USING: kernel tools.test regexp.parser fry sequences ;
|
||||||
IN: regexp.parser
|
IN: regexp.parser.tests
|
||||||
|
|
||||||
: test-regexp ( string -- )
|
: regexp-parses ( string -- )
|
||||||
default-regexp parse-regexp ;
|
[ [ ] ] dip '[ _ parse-regexp drop ] unit-test ;
|
||||||
|
|
||||||
! [ "(" ] [ unmatched-parentheses? ] must-fail-with
|
: regexp-fails ( string -- )
|
||||||
|
'[ _ parse-regexp ] must-fail ;
|
||||||
|
|
||||||
[ ] [ "a|b" test-regexp ] unit-test
|
{
|
||||||
[ ] [ "a.b" test-regexp ] unit-test
|
"a|b" "a.b" "a|b|c" "abc|b" "a|bcd" "a|(b)" "(?-i:a)" "||"
|
||||||
[ ] [ "a|b|c" test-regexp ] unit-test
|
"(a)|b" "(a|b)" "((a)|(b))" "(?:a)" "(?i:a)" "|b" "b|"
|
||||||
[ ] [ "abc|b" test-regexp ] unit-test
|
"[abc]" "[a-c]" "[^a-c]" "[^]]" "[]a]" "[[]" "[]-a]" "[a-]" "[-]"
|
||||||
[ ] [ "a|bcd" test-regexp ] unit-test
|
"[--a]" "foo*" "(foo)*" "(a|b)|c" "(foo){2,3}" "(foo){2,}"
|
||||||
[ ] [ "a|(b)" test-regexp ] unit-test
|
"(foo){2}" "{2,3}" "{," "{,}" "}" "foo}" "[^]-a]" "[^-]a]"
|
||||||
[ ] [ "(a)|b" test-regexp ] unit-test
|
"[a-]" "[^a-]" "[^a-]" "a{,2}" "(?#foobar)"
|
||||||
[ ] [ "(a|b)" test-regexp ] unit-test
|
"\\p{Space}" "\\t" "\\[" "[\\]]" "\\P{Space}"
|
||||||
[ ] [ "((a)|(b))" test-regexp ] unit-test
|
"\\ueeee" "\\0333" "\\xff" "\\\\" "\\w"
|
||||||
|
} [ regexp-parses ] each
|
||||||
|
|
||||||
[ ] [ "(?:a)" test-regexp ] unit-test
|
{
|
||||||
[ ] [ "(?i:a)" test-regexp ] unit-test
|
"[^]" "[]" "a{foo}" "a{,}" "a{}" "(?)" "\\p{foo}" "\\P{foo}"
|
||||||
[ ] [ "(?-i:a)" test-regexp ] unit-test
|
"\\ueeeg" "\\0339" "\\xfg"
|
||||||
[ "(?z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with
|
} [ regexp-fails ] each
|
||||||
[ "(?-z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with
|
|
||||||
|
|
||||||
[ ] [ "(?=a)" test-regexp ] unit-test
|
|
||||||
|
|
||||||
[ ] [ "[abc]" test-regexp ] unit-test
|
|
||||||
[ ] [ "[a-c]" test-regexp ] unit-test
|
|
||||||
[ ] [ "[^a-c]" test-regexp ] unit-test
|
|
||||||
[ "[^]" test-regexp ] must-fail
|
|
||||||
|
|
||||||
[ ] [ "|b" test-regexp ] unit-test
|
|
||||||
[ ] [ "b|" test-regexp ] unit-test
|
|
||||||
[ ] [ "||" test-regexp ] unit-test
|
|
||||||
|
|
|
@ -1,437 +1,183 @@
|
||||||
! Copyright (C) 2008 Doug Coleman.
|
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: accessors arrays assocs combinators io io.streams.string
|
USING: peg.ebnf kernel math.parser sequences assocs arrays
|
||||||
kernel math math.parser namespaces sets
|
combinators regexp.classes strings splitting peg locals ;
|
||||||
quotations sequences splitting vectors math.order
|
|
||||||
strings regexp.backend regexp.utils
|
|
||||||
unicode.case unicode.categories words locals regexp.classes ;
|
|
||||||
IN: regexp.parser
|
IN: regexp.parser
|
||||||
|
|
||||||
FROM: math.ranges => [a,b] ;
|
TUPLE: range from to ;
|
||||||
|
TUPLE: char-class ranges ;
|
||||||
|
TUPLE: primitive-class class ;
|
||||||
|
TUPLE: not-char-class ranges ;
|
||||||
|
TUPLE: not-primitive-class class ;
|
||||||
|
TUPLE: from-to n m ;
|
||||||
|
TUPLE: at-least n ;
|
||||||
|
TUPLE: up-to n ;
|
||||||
|
TUPLE: exactly n ;
|
||||||
|
TUPLE: times expression times ;
|
||||||
|
TUPLE: concatenation seq ;
|
||||||
|
TUPLE: alternation seq ;
|
||||||
|
TUPLE: maybe term ;
|
||||||
|
TUPLE: star term ;
|
||||||
|
TUPLE: plus term ;
|
||||||
|
TUPLE: with-options tree options ;
|
||||||
|
TUPLE: ast ^? $? tree ;
|
||||||
|
SINGLETON: any-char
|
||||||
|
|
||||||
TUPLE: concatenation seq ; INSTANCE: concatenation node
|
: allowed-char? ( ch -- ? )
|
||||||
TUPLE: alternation seq ; INSTANCE: alternation node
|
".()|[*+?" member? not ;
|
||||||
TUPLE: kleene-star term ; INSTANCE: kleene-star node
|
|
||||||
|
|
||||||
! !!!!!!!!
|
ERROR: bad-number ;
|
||||||
TUPLE: possessive-question term ; INSTANCE: possessive-question node
|
|
||||||
TUPLE: possessive-kleene-star term ; INSTANCE: possessive-kleene-star node
|
|
||||||
|
|
||||||
! !!!!!!!!
|
: ensure-number ( n -- n )
|
||||||
TUPLE: reluctant-question term ; INSTANCE: reluctant-question node
|
[ bad-number ] unless* ;
|
||||||
TUPLE: reluctant-kleene-star term ; INSTANCE: reluctant-kleene-star node
|
|
||||||
|
|
||||||
TUPLE: negation term ; INSTANCE: negation node
|
:: at-error ( key assoc quot: ( key -- replacement ) -- value )
|
||||||
TUPLE: constant char ; INSTANCE: constant node
|
key assoc at* [ drop key quot call ] unless ; inline
|
||||||
TUPLE: range from to ; INSTANCE: range node
|
|
||||||
|
|
||||||
MIXIN: parentheses-group
|
ERROR: bad-class name ;
|
||||||
TUPLE: lookahead term ; INSTANCE: lookahead node
|
|
||||||
INSTANCE: lookahead parentheses-group
|
|
||||||
TUPLE: lookbehind term ; INSTANCE: lookbehind node
|
|
||||||
INSTANCE: lookbehind parentheses-group
|
|
||||||
TUPLE: capture-group term ; INSTANCE: capture-group node
|
|
||||||
INSTANCE: capture-group parentheses-group
|
|
||||||
TUPLE: non-capture-group term ; INSTANCE: non-capture-group node
|
|
||||||
INSTANCE: non-capture-group parentheses-group
|
|
||||||
TUPLE: independent-group term ; INSTANCE: independent-group node ! atomic group
|
|
||||||
INSTANCE: independent-group parentheses-group
|
|
||||||
TUPLE: comment-group term ; INSTANCE: comment-group node
|
|
||||||
INSTANCE: comment-group parentheses-group
|
|
||||||
|
|
||||||
SINGLETON: epsilon INSTANCE: epsilon node
|
: name>class ( name -- class )
|
||||||
|
{
|
||||||
|
{ "Lower" letter-class }
|
||||||
|
{ "Upper" LETTER-class }
|
||||||
|
{ "Alpha" Letter-class }
|
||||||
|
{ "ASCII" ascii-class }
|
||||||
|
{ "Digit" digit-class }
|
||||||
|
{ "Alnum" alpha-class }
|
||||||
|
{ "Punct" punctuation-class }
|
||||||
|
{ "Graph" java-printable-class }
|
||||||
|
{ "Print" java-printable-class }
|
||||||
|
{ "Blank" non-newline-blank-class }
|
||||||
|
{ "Cntrl" control-character-class }
|
||||||
|
{ "XDigit" hex-digit-class }
|
||||||
|
{ "Space" java-blank-class }
|
||||||
|
! TODO: unicode-character-class
|
||||||
|
} [ bad-class ] at-error ;
|
||||||
|
|
||||||
TUPLE: option option on? ; INSTANCE: option node
|
: lookup-escape ( char -- ast )
|
||||||
|
{
|
||||||
|
{ CHAR: t [ CHAR: \t ] }
|
||||||
|
{ CHAR: n [ CHAR: \n ] }
|
||||||
|
{ CHAR: r [ CHAR: \r ] }
|
||||||
|
{ CHAR: f [ HEX: c ] }
|
||||||
|
{ CHAR: a [ HEX: 7 ] }
|
||||||
|
{ CHAR: e [ HEX: 1b ] }
|
||||||
|
{ CHAR: \\ [ CHAR: \\ ] }
|
||||||
|
|
||||||
|
{ CHAR: w [ c-identifier-class primitive-class boa ] }
|
||||||
|
{ CHAR: W [ c-identifier-class not-primitive-class boa ] }
|
||||||
|
{ CHAR: s [ java-blank-class primitive-class boa ] }
|
||||||
|
{ CHAR: S [ java-blank-class not-primitive-class boa ] }
|
||||||
|
{ CHAR: d [ digit-class primitive-class boa ] }
|
||||||
|
{ CHAR: D [ digit-class not-primitive-class boa ] }
|
||||||
|
|
||||||
|
[ ]
|
||||||
|
} case ;
|
||||||
|
|
||||||
|
TUPLE: options on off ;
|
||||||
|
|
||||||
SINGLETONS: unix-lines dotall multiline comments case-insensitive
|
SINGLETONS: unix-lines dotall multiline comments case-insensitive
|
||||||
unicode-case reversed-regexp ;
|
unicode-case reversed-regexp ;
|
||||||
|
|
||||||
SINGLETONS: beginning-of-character-class end-of-character-class
|
: options-assoc ( -- assoc )
|
||||||
left-parenthesis pipe caret dash ;
|
H{
|
||||||
|
{ CHAR: i case-insensitive }
|
||||||
: push1 ( obj -- ) input-stream get stream>> push ;
|
{ CHAR: d unix-lines }
|
||||||
: peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ;
|
{ CHAR: m multiline }
|
||||||
: pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ;
|
{ CHAR: n multiline }
|
||||||
: drop1 ( -- ) read1 drop ;
|
{ CHAR: r reversed-regexp }
|
||||||
|
{ CHAR: s dotall }
|
||||||
: stack ( -- obj ) current-regexp get stack>> ;
|
{ CHAR: u unicode-case }
|
||||||
: change-whole-stack ( quot -- )
|
{ CHAR: x comments }
|
||||||
current-regexp get
|
} ;
|
||||||
[ stack>> swap call ] keep (>>stack) ; inline
|
|
||||||
: push-stack ( obj -- ) stack push ;
|
|
||||||
: pop-stack ( -- obj ) stack pop ;
|
|
||||||
: cut-out ( vector n -- vector' vector ) cut rest ;
|
|
||||||
ERROR: cut-stack-error ;
|
|
||||||
: cut-stack ( obj vector -- vector' vector )
|
|
||||||
[ nip ] [ last-index ] 2bi [ cut-stack-error ] unless* cut-out swap ;
|
|
||||||
|
|
||||||
: <possessive-kleene-star> ( obj -- kleene ) possessive-kleene-star boa ;
|
|
||||||
: <reluctant-kleene-star> ( obj -- kleene ) reluctant-kleene-star boa ;
|
|
||||||
: <possessive-question> ( obj -- kleene ) possessive-question boa ;
|
|
||||||
: <reluctant-question> ( obj -- kleene ) reluctant-question boa ;
|
|
||||||
|
|
||||||
: <negation> ( obj -- negation ) negation boa ;
|
|
||||||
: <concatenation> ( seq -- concatenation )
|
|
||||||
>vector [ epsilon ] [ concatenation boa ] if-empty ;
|
|
||||||
: <alternation> ( seq -- alternation ) >vector alternation boa ;
|
|
||||||
: <capture-group> ( obj -- capture-group ) capture-group boa ;
|
|
||||||
: <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
|
|
||||||
: <constant> ( obj -- constant ) constant boa ;
|
|
||||||
|
|
||||||
: first|concatenation ( seq -- first/concatenation )
|
|
||||||
dup length 1 = [ first ] [ <concatenation> ] if ;
|
|
||||||
|
|
||||||
: first|alternation ( seq -- first/alternation )
|
|
||||||
dup length 1 = [ first ] [ <alternation> ] if ;
|
|
||||||
|
|
||||||
: <character-class-range> ( from to -- obj )
|
|
||||||
2dup <
|
|
||||||
[ character-class-range boa ] [ 2drop unmatchable-class ] if ;
|
|
||||||
|
|
||||||
ERROR: unmatched-parentheses ;
|
|
||||||
|
|
||||||
ERROR: unknown-regexp-option option ;
|
|
||||||
|
|
||||||
: ch>option ( ch -- singleton )
|
: ch>option ( ch -- singleton )
|
||||||
{
|
options-assoc at ;
|
||||||
{ CHAR: i [ case-insensitive ] }
|
|
||||||
{ CHAR: d [ unix-lines ] }
|
|
||||||
{ CHAR: m [ multiline ] }
|
|
||||||
{ CHAR: n [ multiline ] }
|
|
||||||
{ CHAR: r [ reversed-regexp ] }
|
|
||||||
{ CHAR: s [ dotall ] }
|
|
||||||
{ CHAR: u [ unicode-case ] }
|
|
||||||
{ CHAR: x [ comments ] }
|
|
||||||
[ unknown-regexp-option ]
|
|
||||||
} case ;
|
|
||||||
|
|
||||||
: option>ch ( option -- string )
|
: option>ch ( option -- string )
|
||||||
{
|
options-assoc value-at ;
|
||||||
{ case-insensitive [ CHAR: i ] }
|
|
||||||
{ multiline [ CHAR: m ] }
|
|
||||||
{ reversed-regexp [ CHAR: r ] }
|
|
||||||
{ dotall [ CHAR: s ] }
|
|
||||||
[ unknown-regexp-option ]
|
|
||||||
} case ;
|
|
||||||
|
|
||||||
: toggle-option ( ch ? -- )
|
: parse-options ( on off -- options )
|
||||||
[ ch>option ] dip option boa push-stack ;
|
[ [ ch>option ] map ] bi@ options boa ;
|
||||||
|
|
||||||
: (parse-options) ( string ? -- ) [ toggle-option ] curry each ;
|
! TODO: make range syntax better (negation, and, etc),
|
||||||
|
! add syntax for various parenthized things,
|
||||||
|
! add greedy and nongreedy forms of matching
|
||||||
|
! (once it's all implemented)
|
||||||
|
|
||||||
: parse-options ( string -- )
|
EBNF: (parse-regexp)
|
||||||
"-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ;
|
|
||||||
|
|
||||||
ERROR: bad-special-group string ;
|
CharacterInBracket = !("}") Character
|
||||||
|
|
||||||
DEFER: (parse-regexp)
|
Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class primitive-class boa ]]
|
||||||
: nested-parse-regexp ( token ? -- )
|
| "P{" CharacterInBracket*:s "}" => [[ s >string name>class not-primitive-class boa ]]
|
||||||
[ push-stack (parse-regexp) pop-stack ] dip
|
| "u" Character:a Character:b Character:c Character:d
|
||||||
[ <negation> ] when pop-stack new swap >>term push-stack ;
|
=> [[ { a b c d } hex> ensure-number ]]
|
||||||
|
| "x" Character:a Character:b
|
||||||
|
=> [[ { a b } hex> ensure-number ]]
|
||||||
|
| "0" Character:a Character:b Character:c
|
||||||
|
=> [[ { a b c } oct> ensure-number ]]
|
||||||
|
| . => [[ lookup-escape ]]
|
||||||
|
|
||||||
! non-capturing groups
|
Character = "\\" Escape:e => [[ e ]]
|
||||||
: (parse-special-group) ( -- )
|
| . ?[ allowed-char? ]?
|
||||||
read1 {
|
|
||||||
{ [ dup CHAR: # = ] ! comment
|
|
||||||
[ drop comment-group f nested-parse-regexp pop-stack drop ] }
|
|
||||||
{ [ dup CHAR: : = ]
|
|
||||||
[ drop non-capture-group f nested-parse-regexp ] }
|
|
||||||
{ [ dup CHAR: = = ]
|
|
||||||
[ drop lookahead f nested-parse-regexp ] }
|
|
||||||
{ [ dup CHAR: ! = ]
|
|
||||||
[ drop lookahead t nested-parse-regexp ] }
|
|
||||||
{ [ dup CHAR: > = ]
|
|
||||||
[ drop non-capture-group f nested-parse-regexp ] }
|
|
||||||
{ [ dup CHAR: < = peek1 CHAR: = = and ]
|
|
||||||
[ drop drop1 lookbehind f nested-parse-regexp ] }
|
|
||||||
{ [ dup CHAR: < = peek1 CHAR: ! = and ]
|
|
||||||
[ drop drop1 lookbehind t nested-parse-regexp ] }
|
|
||||||
[
|
|
||||||
":)" read-until
|
|
||||||
[ swap prefix ] dip
|
|
||||||
{
|
|
||||||
{ CHAR: : [ parse-options non-capture-group f nested-parse-regexp ] }
|
|
||||||
{ CHAR: ) [ parse-options ] }
|
|
||||||
[ drop bad-special-group ]
|
|
||||||
} case
|
|
||||||
]
|
|
||||||
} cond ;
|
|
||||||
|
|
||||||
: handle-left-parenthesis ( -- )
|
AnyRangeCharacter = Character | "["
|
||||||
peek1 CHAR: ? =
|
|
||||||
[ drop1 (parse-special-group) ]
|
|
||||||
[ capture-group f nested-parse-regexp ] if ;
|
|
||||||
|
|
||||||
: handle-dot ( -- ) any-char push-stack ;
|
RangeCharacter = !("]") AnyRangeCharacter
|
||||||
: handle-pipe ( -- ) pipe push-stack ;
|
|
||||||
: (handle-star) ( obj -- kleene-star )
|
|
||||||
peek1 {
|
|
||||||
{ CHAR: + [ drop1 <possessive-kleene-star> ] }
|
|
||||||
{ CHAR: ? [ drop1 <reluctant-kleene-star> ] }
|
|
||||||
[ drop <kleene-star> ]
|
|
||||||
} case ;
|
|
||||||
: handle-star ( -- ) stack pop (handle-star) push-stack ;
|
|
||||||
: handle-question ( -- )
|
|
||||||
stack pop peek1 {
|
|
||||||
{ CHAR: + [ drop1 <possessive-question> ] }
|
|
||||||
{ CHAR: ? [ drop1 <reluctant-question> ] }
|
|
||||||
[ drop epsilon 2array <alternation> ]
|
|
||||||
} case push-stack ;
|
|
||||||
: handle-plus ( -- )
|
|
||||||
stack pop dup (handle-star)
|
|
||||||
2array <concatenation> push-stack ;
|
|
||||||
|
|
||||||
ERROR: unmatched-brace ;
|
Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]]
|
||||||
: parse-repetition ( -- start finish ? )
|
| RangeCharacter
|
||||||
"}" read-until [ unmatched-brace ] unless
|
|
||||||
[ "," split1 [ string>number ] bi@ ]
|
|
||||||
[ CHAR: , swap index >boolean ] bi ;
|
|
||||||
|
|
||||||
: replicate/concatenate ( n obj -- obj' )
|
StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]]
|
||||||
over zero? [ 2drop epsilon ]
|
| AnyRangeCharacter
|
||||||
[ <repetition> first|concatenation ] if ;
|
|
||||||
|
|
||||||
: exactly-n ( n -- )
|
Ranges = StartRange:s Range*:r => [[ r s prefix ]]
|
||||||
stack pop replicate/concatenate push-stack ;
|
|
||||||
|
|
||||||
: at-least-n ( n -- )
|
CharClass = "^" Ranges:e => [[ e not-char-class boa ]]
|
||||||
stack pop
|
| Ranges:e => [[ e char-class boa ]]
|
||||||
[ replicate/concatenate ] keep
|
|
||||||
<kleene-star> 2array <concatenation> push-stack ;
|
|
||||||
|
|
||||||
: at-most-n ( n -- )
|
Options = [idmsux]*
|
||||||
1+
|
|
||||||
stack pop
|
|
||||||
[ replicate/concatenate ] curry map <alternation> push-stack ;
|
|
||||||
|
|
||||||
: from-m-to-n ( m n -- )
|
Parenthized = "?:" Alternation:a => [[ a ]]
|
||||||
[a,b]
|
| "?" Options:on "-"? Options:off ":" Alternation:a
|
||||||
stack pop
|
=> [[ a on off parse-options with-options boa ]]
|
||||||
[ replicate/concatenate ] curry map
|
| "?#" [^)]* => [[ ignore ]]
|
||||||
<alternation> push-stack ;
|
| Alternation
|
||||||
|
|
||||||
ERROR: invalid-range a b ;
|
Element = "(" Parenthized:p ")" => [[ p ]]
|
||||||
|
| "[" CharClass:r "]" => [[ r ]]
|
||||||
|
| ".":d => [[ any-char ]]
|
||||||
|
| Character
|
||||||
|
|
||||||
: handle-left-brace ( -- )
|
Number = (!(","|"}").)* => [[ string>number ensure-number ]]
|
||||||
parse-repetition
|
|
||||||
[ 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ ] dip
|
|
||||||
[
|
|
||||||
2dup and [ from-m-to-n ]
|
|
||||||
[ [ nip at-most-n ] [ at-least-n ] if* ] if
|
|
||||||
] [ drop 0 max exactly-n ] if ;
|
|
||||||
|
|
||||||
: handle-front-anchor ( -- ) beginning-of-line push-stack ;
|
Times = "," Number:n "}" => [[ n up-to boa ]]
|
||||||
: handle-back-anchor ( -- ) end-of-line push-stack ;
|
| Number:n ",}" => [[ n at-least boa ]]
|
||||||
|
| Number:n "}" => [[ n exactly boa ]]
|
||||||
|
| "}" => [[ bad-number ]]
|
||||||
|
| Number:n "," Number:m "}" => [[ n m from-to boa ]]
|
||||||
|
|
||||||
ERROR: bad-character-class obj ;
|
Repeated = Element:e "{" Times:t => [[ e t times boa ]]
|
||||||
ERROR: expected-posix-class ;
|
| Element:e "?" => [[ e maybe boa ]]
|
||||||
|
| Element:e "*" => [[ e star boa ]]
|
||||||
|
| Element:e "+" => [[ e plus boa ]]
|
||||||
|
| Element
|
||||||
|
|
||||||
: parse-posix-class ( -- obj )
|
Concatenation = Repeated*:r => [[ r concatenation boa ]]
|
||||||
read1 CHAR: { = [ expected-posix-class ] unless
|
|
||||||
"}" read-until [ bad-character-class ] unless
|
|
||||||
{
|
|
||||||
{ "Lower" [ letter-class ] }
|
|
||||||
{ "Upper" [ LETTER-class ] }
|
|
||||||
{ "Alpha" [ Letter-class ] }
|
|
||||||
{ "ASCII" [ ascii-class ] }
|
|
||||||
{ "Digit" [ digit-class ] }
|
|
||||||
{ "Alnum" [ alpha-class ] }
|
|
||||||
{ "Punct" [ punctuation-class ] }
|
|
||||||
{ "Graph" [ java-printable-class ] }
|
|
||||||
{ "Print" [ java-printable-class ] }
|
|
||||||
{ "Blank" [ non-newline-blank-class ] }
|
|
||||||
{ "Cntrl" [ control-character-class ] }
|
|
||||||
{ "XDigit" [ hex-digit-class ] }
|
|
||||||
{ "Space" [ java-blank-class ] }
|
|
||||||
! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss
|
|
||||||
[ bad-character-class ]
|
|
||||||
} case ;
|
|
||||||
|
|
||||||
: parse-octal ( -- n ) 3 read oct> check-octal ;
|
Alternation = Concatenation:c ("|" Concatenation)*:a
|
||||||
: parse-short-hex ( -- n ) 2 read hex> check-hex ;
|
=> [[ a empty? [ c ] [ a values c prefix alternation boa ] if ]]
|
||||||
: parse-long-hex ( -- n ) 6 read hex> check-hex ;
|
|
||||||
: parse-control-character ( -- n ) read1 ;
|
|
||||||
|
|
||||||
ERROR: bad-escaped-literals seq ;
|
End = !(.)
|
||||||
|
|
||||||
: parse-til-E ( -- obj )
|
Main = Alternation End
|
||||||
"\\E" read-until [ bad-escaped-literals ] unless ;
|
;EBNF
|
||||||
|
|
||||||
:: (parse-escaped-literals) ( quot: ( obj -- obj' ) -- obj )
|
: parse-regexp ( string -- regexp )
|
||||||
parse-til-E
|
! Hack because I want $ allowable in regexps,
|
||||||
drop1
|
! but with special behavior at the end
|
||||||
[ epsilon ] [
|
! This fails if the regexp is stupid, though...
|
||||||
quot call [ <constant> ] V{ } map-as
|
dup first CHAR: ^ = tuck [ rest ] when
|
||||||
first|concatenation
|
dup peek CHAR: $ = tuck [ but-last ] when
|
||||||
] if-empty ; inline
|
(parse-regexp) ast boa ;
|
||||||
|
|
||||||
: parse-escaped-literals ( -- obj )
|
|
||||||
[ ] (parse-escaped-literals) ;
|
|
||||||
|
|
||||||
: lower-case-literals ( -- obj )
|
|
||||||
[ >lower ] (parse-escaped-literals) ;
|
|
||||||
|
|
||||||
: upper-case-literals ( -- obj )
|
|
||||||
[ >upper ] (parse-escaped-literals) ;
|
|
||||||
|
|
||||||
: parse-escaped ( -- obj )
|
|
||||||
read1
|
|
||||||
{
|
|
||||||
{ CHAR: t [ CHAR: \t <constant> ] }
|
|
||||||
{ CHAR: n [ CHAR: \n <constant> ] }
|
|
||||||
{ CHAR: r [ CHAR: \r <constant> ] }
|
|
||||||
{ CHAR: f [ HEX: c <constant> ] }
|
|
||||||
{ CHAR: a [ HEX: 7 <constant> ] }
|
|
||||||
{ CHAR: e [ HEX: 1b <constant> ] }
|
|
||||||
|
|
||||||
{ CHAR: w [ c-identifier-class ] }
|
|
||||||
{ CHAR: W [ c-identifier-class <negation> ] }
|
|
||||||
{ CHAR: s [ java-blank-class ] }
|
|
||||||
{ CHAR: S [ java-blank-class <negation> ] }
|
|
||||||
{ CHAR: d [ digit-class ] }
|
|
||||||
{ CHAR: D [ digit-class <negation> ] }
|
|
||||||
|
|
||||||
{ CHAR: p [ parse-posix-class ] }
|
|
||||||
{ CHAR: P [ parse-posix-class <negation> ] }
|
|
||||||
{ CHAR: x [ parse-short-hex <constant> ] }
|
|
||||||
{ CHAR: u [ parse-long-hex <constant> ] }
|
|
||||||
{ CHAR: 0 [ parse-octal <constant> ] }
|
|
||||||
{ CHAR: c [ parse-control-character ] }
|
|
||||||
|
|
||||||
{ CHAR: Q [ parse-escaped-literals ] }
|
|
||||||
|
|
||||||
! { CHAR: b [ word-boundary-class ] }
|
|
||||||
! { CHAR: B [ word-boundary-class <negation> ] }
|
|
||||||
! { CHAR: A [ handle-beginning-of-input ] }
|
|
||||||
! { CHAR: z [ handle-end-of-input ] }
|
|
||||||
|
|
||||||
! { CHAR: Z [ handle-end-of-input ] } ! plus a final terminator
|
|
||||||
|
|
||||||
! m//g mode
|
|
||||||
! { CHAR: G [ end of previous match ] }
|
|
||||||
|
|
||||||
! Group capture
|
|
||||||
! { CHAR: 1 [ CHAR: 1 <constant> ] }
|
|
||||||
! { CHAR: 2 [ CHAR: 2 <constant> ] }
|
|
||||||
! { CHAR: 3 [ CHAR: 3 <constant> ] }
|
|
||||||
! { CHAR: 4 [ CHAR: 4 <constant> ] }
|
|
||||||
! { CHAR: 5 [ CHAR: 5 <constant> ] }
|
|
||||||
! { CHAR: 6 [ CHAR: 6 <constant> ] }
|
|
||||||
! { CHAR: 7 [ CHAR: 7 <constant> ] }
|
|
||||||
! { CHAR: 8 [ CHAR: 8 <constant> ] }
|
|
||||||
! { CHAR: 9 [ CHAR: 9 <constant> ] }
|
|
||||||
|
|
||||||
! Perl extensions
|
|
||||||
! can't do \l and \u because \u is already a 4-hex
|
|
||||||
{ CHAR: L [ lower-case-literals ] }
|
|
||||||
{ CHAR: U [ upper-case-literals ] }
|
|
||||||
|
|
||||||
[ <constant> ]
|
|
||||||
} case ;
|
|
||||||
|
|
||||||
: handle-escape ( -- ) parse-escaped push-stack ;
|
|
||||||
|
|
||||||
: handle-dash ( vector -- vector' )
|
|
||||||
H{ { dash CHAR: - } } substitute ;
|
|
||||||
|
|
||||||
: character-class>alternation ( seq -- alternation )
|
|
||||||
[ dup number? [ <constant> ] when ] map first|alternation ;
|
|
||||||
|
|
||||||
: handle-caret ( vector -- vector' )
|
|
||||||
dup [ length 2 >= ] [ first caret eq? ] bi and [
|
|
||||||
rest-slice character-class>alternation <negation>
|
|
||||||
] [
|
|
||||||
character-class>alternation
|
|
||||||
] if ;
|
|
||||||
|
|
||||||
: make-character-class ( -- character-class )
|
|
||||||
[ beginning-of-character-class swap cut-stack ] change-whole-stack
|
|
||||||
handle-dash handle-caret ;
|
|
||||||
|
|
||||||
: apply-dash ( -- )
|
|
||||||
stack [ pop3 nip <character-class-range> ] keep push ;
|
|
||||||
|
|
||||||
: apply-dash? ( -- ? )
|
|
||||||
stack dup length 3 >=
|
|
||||||
[ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ;
|
|
||||||
|
|
||||||
ERROR: empty-negated-character-class ;
|
|
||||||
DEFER: handle-left-bracket
|
|
||||||
: (parse-character-class) ( -- )
|
|
||||||
read1 [ empty-negated-character-class ] unless* {
|
|
||||||
{ CHAR: [ [ handle-left-bracket t ] }
|
|
||||||
{ CHAR: ] [ make-character-class push-stack f ] }
|
|
||||||
{ CHAR: - [ dash push-stack t ] }
|
|
||||||
{ CHAR: \ [ parse-escaped push-stack t ] }
|
|
||||||
[ push-stack apply-dash? [ apply-dash ] when t ]
|
|
||||||
} case
|
|
||||||
[ (parse-character-class) ] when ;
|
|
||||||
|
|
||||||
: push-constant ( ch -- ) <constant> push-stack ;
|
|
||||||
|
|
||||||
: parse-character-class-second ( -- )
|
|
||||||
read1 {
|
|
||||||
{ CHAR: [ [ CHAR: [ push-constant ] }
|
|
||||||
{ CHAR: ] [ CHAR: ] push-constant ] }
|
|
||||||
{ CHAR: - [ CHAR: - push-constant ] }
|
|
||||||
[ push1 ]
|
|
||||||
} case ;
|
|
||||||
|
|
||||||
: parse-character-class-first ( -- )
|
|
||||||
read1 {
|
|
||||||
{ CHAR: ^ [ caret push-stack parse-character-class-second ] }
|
|
||||||
{ CHAR: [ [ CHAR: [ push-constant ] }
|
|
||||||
{ CHAR: ] [ CHAR: ] push-constant ] }
|
|
||||||
{ CHAR: - [ CHAR: - push-constant ] }
|
|
||||||
[ push1 ]
|
|
||||||
} case ;
|
|
||||||
|
|
||||||
: handle-left-bracket ( -- )
|
|
||||||
beginning-of-character-class push-stack
|
|
||||||
parse-character-class-first (parse-character-class) ;
|
|
||||||
|
|
||||||
: finish-regexp-parse ( stack -- obj )
|
|
||||||
{ pipe } split
|
|
||||||
[ first|concatenation ] map first|alternation ;
|
|
||||||
|
|
||||||
: handle-right-parenthesis ( -- )
|
|
||||||
stack dup [ parentheses-group "members" word-prop member? ] find-last
|
|
||||||
-rot cut rest
|
|
||||||
[ [ push ] keep current-regexp get (>>stack) ]
|
|
||||||
[ finish-regexp-parse push-stack ] bi* ;
|
|
||||||
|
|
||||||
: parse-regexp-token ( token -- ? )
|
|
||||||
{
|
|
||||||
{ CHAR: ( [ handle-left-parenthesis t ] } ! handle (?..) at beginning?
|
|
||||||
{ CHAR: ) [ handle-right-parenthesis f ] }
|
|
||||||
{ CHAR: . [ handle-dot t ] }
|
|
||||||
{ CHAR: | [ handle-pipe t ] }
|
|
||||||
{ CHAR: ? [ handle-question t ] }
|
|
||||||
{ CHAR: * [ handle-star t ] }
|
|
||||||
{ CHAR: + [ handle-plus t ] }
|
|
||||||
{ CHAR: { [ handle-left-brace t ] }
|
|
||||||
{ CHAR: [ [ handle-left-bracket t ] }
|
|
||||||
{ CHAR: \ [ handle-escape t ] }
|
|
||||||
[
|
|
||||||
dup CHAR: $ = peek1 f = and
|
|
||||||
[ drop handle-back-anchor f ]
|
|
||||||
[ push-constant t ] if
|
|
||||||
]
|
|
||||||
} case ;
|
|
||||||
|
|
||||||
: (parse-regexp) ( -- )
|
|
||||||
read1 [ parse-regexp-token [ (parse-regexp) ] when ] when* ;
|
|
||||||
|
|
||||||
: parse-regexp-beginning ( -- )
|
|
||||||
peek1 CHAR: ^ = [ drop1 handle-front-anchor ] when ;
|
|
||||||
|
|
||||||
: parse-regexp ( regexp -- )
|
|
||||||
dup current-regexp [
|
|
||||||
raw>> [
|
|
||||||
<string-reader> [
|
|
||||||
parse-regexp-beginning (parse-regexp)
|
|
||||||
] with-input-stream
|
|
||||||
] unless-empty
|
|
||||||
current-regexp get [ finish-regexp-parse ] change-stack
|
|
||||||
dup stack>> >>parse-tree drop
|
|
||||||
] with-variable ;
|
|
||||||
|
|
|
@ -21,7 +21,7 @@ IN: regexp
|
||||||
|
|
||||||
: construct-regexp ( regexp -- regexp' )
|
: construct-regexp ( regexp -- regexp' )
|
||||||
{
|
{
|
||||||
[ parse-regexp ]
|
[ dup raw>> parse-regexp >>parse-tree drop ]
|
||||||
[ construct-nfa ]
|
[ construct-nfa ]
|
||||||
[ construct-dfa ]
|
[ construct-dfa ]
|
||||||
[ ]
|
[ ]
|
||||||
|
@ -33,9 +33,6 @@ IN: regexp
|
||||||
: match ( string regexp -- slice/f )
|
: match ( string regexp -- slice/f )
|
||||||
(match) return-match ;
|
(match) return-match ;
|
||||||
|
|
||||||
: match* ( string regexp -- slice/f captured-groups )
|
|
||||||
(match) [ return-match ] [ captured-groups>> ] bi ;
|
|
||||||
|
|
||||||
: matches? ( string regexp -- ? )
|
: matches? ( string regexp -- ? )
|
||||||
dupd match
|
dupd match
|
||||||
[ [ length ] bi@ = ] [ drop f ] if* ;
|
[ [ length ] bi@ = ] [ drop f ] if* ;
|
||||||
|
|
|
@ -68,7 +68,7 @@ TUPLE: dfa-traverser
|
||||||
|
|
||||||
: match-class ( transition from-state table -- to-state/f )
|
: match-class ( transition from-state table -- to-state/f )
|
||||||
transitions>> at* [
|
transitions>> at* [
|
||||||
'[ drop _ swap class-member? ] assoc-find [ nip ] [ drop ] if
|
swap '[ drop _ swap class-member? ] assoc-find spin ?
|
||||||
] [ drop ] if ;
|
] [ drop ] if ;
|
||||||
|
|
||||||
: match-default ( transition from-state table -- to-state/f )
|
: match-default ( transition from-state table -- to-state/f )
|
||||||
|
|
Loading…
Reference in New Issue