Merge branch 'master' of git://factorcode.org/git/factor

db4
Slava Pestov 2008-11-24 00:30:09 -06:00
commit d3d57c800b
9 changed files with 310 additions and 160 deletions

View File

@ -7,6 +7,7 @@ IN: regexp.classes
GENERIC: class-member? ( obj class -- ? )
M: word class-member? ( obj class -- ? ) 2drop f ;
M: integer class-member? ( obj class -- ? ) 2drop f ;
M: character-class-range class-member? ( obj class -- ? )
@ -60,3 +61,12 @@ M: java-blank-class class-member? ( obj class -- ? )
M: unmatchable-class class-member? ( obj class -- ? )
2drop f ;
M: terminator-class class-member? ( obj class -- ? )
drop {
[ CHAR: \r = ]
[ CHAR: \n = ]
[ CHAR: \u000085 = ]
[ CHAR: \u002028 = ]
[ CHAR: \u002029 = ]
} 1|| ;

View File

@ -43,7 +43,8 @@ IN: regexp.dfa
dupd pop dup pick find-transitions rot
[
[ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep
>r swapd transition make-transition r> dfa-table>> add-transition
[ swapd transition make-transition ] dip
dfa-table>> add-transition
] curry with each
new-transitions
] if-empty ;

View File

@ -18,6 +18,9 @@ SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag
SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag
SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag
SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag
SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag
SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag
SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag
: next-state ( regexp -- state )
[ state>> ] [ [ 1+ ] change-state drop ] bi ;
@ -135,7 +138,21 @@ M: non-capture-group nfa-node ( node -- )
M: reluctant-kleene-star nfa-node ( node -- )
term>> <kleene-star> nfa-node ;
!
: add-epsilon-flag ( flag -- )
eps literal-transition add-simple-entry add-traversal-flag ;
M: beginning-of-line nfa-node ( node -- )
drop beginning-of-line add-epsilon-flag ;
M: end-of-line nfa-node ( node -- )
drop end-of-line add-epsilon-flag ;
M: beginning-of-input nfa-node ( node -- )
drop beginning-of-input add-epsilon-flag ;
M: end-of-input nfa-node ( node -- )
drop end-of-input add-epsilon-flag ;
M: negation nfa-node ( node -- )
negation-mode inc

View File

@ -4,7 +4,7 @@ USING: accessors arrays assocs combinators io io.streams.string
kernel math math.parser namespaces qualified sets
quotations sequences splitting symbols vectors math.order
unicode.categories strings regexp.backend regexp.utils
unicode.case words ;
unicode.case words locals ;
IN: regexp.parser
FROM: math.ranges => [a,b] ;
@ -44,18 +44,21 @@ TUPLE: character-class-range from to ; INSTANCE: character-class-range node
SINGLETON: epsilon INSTANCE: epsilon node
SINGLETON: any-char INSTANCE: any-char node
SINGLETON: any-char-no-nl INSTANCE: any-char-no-nl node
SINGLETON: front-anchor INSTANCE: front-anchor node
SINGLETON: back-anchor INSTANCE: back-anchor node
SINGLETON: beginning-of-input INSTANCE: beginning-of-input node
SINGLETON: end-of-input INSTANCE: end-of-input node
SINGLETON: beginning-of-line INSTANCE: beginning-of-line node
SINGLETON: end-of-line INSTANCE: end-of-line node
TUPLE: option-on option ; INSTANCE: option-on node
TUPLE: option-off option ; INSTANCE: option-off node
SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case reversed-regexp ;
SINGLETONS: unix-lines dotall multiline comments case-insensitive
unicode-case reversed-regexp ;
SINGLETONS: letter-class LETTER-class Letter-class digit-class
alpha-class non-newline-blank-class
ascii-class punctuation-class java-printable-class blank-class
control-character-class hex-digit-class java-blank-class c-identifier-class
unmatchable-class ;
terminator-class unmatchable-class word-boundary-class ;
SINGLETONS: beginning-of-group end-of-group
beginning-of-character-class end-of-character-class
@ -225,26 +228,12 @@ ERROR: invalid-range a b ;
: handle-left-brace ( -- )
parse-repetition
>r 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ r>
[ 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ ] dip
[
2dup and [ from-m-to-n ]
[ [ nip at-most-n ] [ at-least-n ] if* ] if
] [ drop 0 max exactly-n ] if ;
SINGLETON: beginning-of-input
SINGLETON: end-of-input
: newlines ( -- obj1 obj2 obj3 )
CHAR: \r <constant>
CHAR: \n <constant>
2dup 2array <concatenation> ;
: beginning-of-line ( -- obj )
beginning-of-input newlines 4array <alternation> lookbehind boa ;
: end-of-line ( -- obj )
end-of-input newlines 4array <alternation> lookahead boa ;
: handle-front-anchor ( -- )
get-multiline beginning-of-line beginning-of-input ? push-stack ;
@ -281,13 +270,26 @@ ERROR: expected-posix-class ;
: parse-control-character ( -- n ) read1 ;
ERROR: bad-escaped-literals seq ;
: parse-escaped-literals ( -- obj )
"\\E" read-until [ bad-escaped-literals ] unless
: parse-til-E ( -- obj )
"\\E" read-until [ bad-escaped-literals ] unless ;
:: (parse-escaped-literals) ( quot: ( obj -- obj' ) -- obj )
parse-til-E
drop1
[ epsilon ] [
[ <constant> ] V{ } map-as
[ quot call <constant> ] V{ } map-as
first|concatenation
] if-empty ;
] if-empty ; inline
: parse-escaped-literals ( -- obj )
[ ] (parse-escaped-literals) ;
: lower-case-literals ( -- obj )
[ ch>lower ] (parse-escaped-literals) ;
: upper-case-literals ( -- obj )
[ ch>upper ] (parse-escaped-literals) ;
: parse-escaped ( -- obj )
read1
@ -299,12 +301,12 @@ ERROR: bad-escaped-literals seq ;
{ CHAR: a [ HEX: 7 <constant> ] }
{ CHAR: e [ HEX: 1b <constant> ] }
{ CHAR: d [ digit-class ] }
{ CHAR: D [ digit-class <negation> ] }
{ CHAR: s [ java-blank-class ] }
{ CHAR: S [ java-blank-class <negation> ] }
{ CHAR: w [ c-identifier-class ] }
{ CHAR: W [ c-identifier-class <negation> ] }
{ CHAR: s [ java-blank-class ] }
{ CHAR: S [ java-blank-class <negation> ] }
{ CHAR: d [ digit-class ] }
{ CHAR: D [ digit-class <negation> ] }
{ CHAR: p [ parse-posix-class ] }
{ CHAR: P [ parse-posix-class <negation> ] }
@ -313,13 +315,19 @@ ERROR: bad-escaped-literals seq ;
{ CHAR: 0 [ parse-octal <constant> ] }
{ CHAR: c [ parse-control-character ] }
! { CHAR: b [ handle-word-boundary ] }
! { CHAR: B [ handle-word-boundary <negation> ] }
! { CHAR: A [ handle-beginning-of-input ] }
! { CHAR: G [ end of previous match ] }
! { CHAR: Z [ handle-end-of-input ] }
! { CHAR: z [ handle-end-of-input ] } ! except for terminator
{ CHAR: Q [ parse-escaped-literals ] }
! { CHAR: b [ word-boundary-class ] }
! { CHAR: B [ word-boundary-class <negation> ] }
! { CHAR: A [ handle-beginning-of-input ] }
! { CHAR: z [ handle-end-of-input ] }
! { CHAR: Z [ handle-end-of-input ] } ! plus a final terminator
! m//g mode
! { CHAR: G [ end of previous match ] }
! Group capture
! { CHAR: 1 [ CHAR: 1 <constant> ] }
! { CHAR: 2 [ CHAR: 2 <constant> ] }
! { CHAR: 3 [ CHAR: 3 <constant> ] }
@ -330,7 +338,11 @@ ERROR: bad-escaped-literals seq ;
! { CHAR: 8 [ CHAR: 8 <constant> ] }
! { CHAR: 9 [ CHAR: 9 <constant> ] }
{ CHAR: Q [ parse-escaped-literals ] }
! Perl extensions
! can't do \l and \u because \u is already a 4-hex
{ CHAR: L [ lower-case-literals ] }
{ CHAR: U [ upper-case-literals ] }
[ <constant> ]
} case ;

View File

@ -45,6 +45,7 @@ IN: regexp-tests
! Off by default.
[ f ] [ "\n" "." <regexp> matches? ] unit-test
[ t ] [ "\n" "(?s)." <regexp> matches? ] unit-test
[ t ] [ "\n" R/ ./s matches? ] unit-test
[ f ] [ "\n\n" "(?s).(?-s)." <regexp> matches? ] unit-test
[ f ] [ "" ".+" <regexp> matches? ] unit-test
@ -210,34 +211,34 @@ IN: regexp-tests
[ 3 ] [ "aaacb" "a*" <regexp> match-head ] unit-test
[ 2 ] [ "aaacb" "aa?" <regexp> match-head ] unit-test
[ t ] [ "aaa" "AAA" <iregexp> matches? ] unit-test
[ f ] [ "aax" "AAA" <iregexp> matches? ] unit-test
[ t ] [ "aaa" "A*" <iregexp> matches? ] unit-test
[ f ] [ "aaba" "A*" <iregexp> matches? ] unit-test
[ t ] [ "b" "[AB]" <iregexp> matches? ] unit-test
[ f ] [ "c" "[AB]" <iregexp> matches? ] unit-test
[ t ] [ "c" "[A-Z]" <iregexp> matches? ] unit-test
[ f ] [ "3" "[A-Z]" <iregexp> matches? ] unit-test
[ t ] [ "aaa" R/ AAA/i matches? ] unit-test
[ f ] [ "aax" R/ AAA/i matches? ] unit-test
[ t ] [ "aaa" R/ A*/i matches? ] unit-test
[ f ] [ "aaba" R/ A*/i matches? ] unit-test
[ t ] [ "b" R/ [AB]/i matches? ] unit-test
[ f ] [ "c" R/ [AB]/i matches? ] unit-test
[ t ] [ "c" R/ [A-Z]/i matches? ] unit-test
[ f ] [ "3" R/ [A-Z]/i matches? ] unit-test
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test
[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test
[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test
[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test
[ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test
[ t ] [ "A" "[a-z]" <iregexp> matches? ] unit-test
[ t ] [ "A" R/ [a-z]/i matches? ] unit-test
[ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
[ t ] [ "A" "\\p{Lower}" <iregexp> matches? ] unit-test
[ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test
[ t ] [ "abc" <reversed> "abc" <rregexp> matches? ] unit-test
[ t ] [ "abc" <reversed> "a[bB][cC]" <rregexp> matches? ] unit-test
[ t ] [ "adcbe" "a(?r)(bcd)(?-r)e" <rregexp> matches? ] unit-test
[ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test
[ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test
[ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/r matches? ] unit-test
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
@ -253,7 +254,7 @@ IN: regexp-tests
[ ] [ "(\\$[\\p{XDigit}]|[\\p{Digit}])" <regexp> drop ] unit-test
! Comment
! Comment inside a regular expression
[ t ] [ "ac" "a(?#boo)c" <regexp> matches? ] unit-test
[ ] [ "USING: regexp kernel ; R' -{3}[+]{1,6}(?:!!)?\\s' drop" eval ] unit-test
@ -283,32 +284,90 @@ IN: regexp-tests
[ { "ABC" "DEF" "GHI" } ]
[ "1ABC2DEF3GHI4" R/ [A-Z]+/ all-matches [ >string ] map ] unit-test
[ 3 ]
[ "1ABC2DEF3GHI4" R/ [A-Z]+/ count-matches ] unit-test
[ 0 ]
[ "123" R/ [A-Z]+/ count-matches ] unit-test
[ "1.2.3.4" ]
[ "1ABC2DEF3GHI4JK" R/ [A-Z]+/ "." re-replace ] unit-test
[ f ] [ "ab" "a(?!b)" <regexp> first-match ] unit-test
[ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test
[ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
[ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
[ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> first-match >string ] unit-test
[ "a" ] [ "ba" "a(?<=b)(?<=b)" <regexp> first-match >string ] unit-test
[ "a" ] [ "cab" "a(?=b)(?<=c)" <regexp> first-match >string ] unit-test
! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
! [ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
! [ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
! [ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
! [ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
! [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
! [ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
[ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test
[ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test
! Bug in parsing word
[ t ] [ "a" R' a' matches? ] unit-test
! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
[ t ] [ "a" R/ ^a/ matches? ] unit-test
[ f ] [ "\na" R/ ^a/ matches? ] unit-test
[ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
[ f ] [ "\ra" R/ ^a/ matches? ] unit-test
[ t ] [ "a" R/ a$/ matches? ] unit-test
[ f ] [ "a\n" R/ a$/ matches? ] unit-test
[ f ] [ "a\r" R/ a$/ matches? ] unit-test
[ f ] [ "a\r\n" R/ a$/ matches? ] unit-test
! [ t ] [ "a" R/ \Aa/ matches? ] unit-test
! [ f ] [ "\na" R/ \Aaa/ matches? ] unit-test
! [ f ] [ "\r\na" R/ \Aa/ matches? ] unit-test
! [ f ] [ "\ra" R/ \Aa/ matches? ] unit-test
! [ t ] [ "a" R/ \Aa/m matches? ] unit-test
! [ f ] [ "\na" R/ \Aaa/m matches? ] unit-test
! [ f ] [ "\r\na" R/ \Aa/m matches? ] unit-test
! [ f ] [ "\ra" R/ \Aa/m matches? ] unit-test
! [ t ] [ "\r\n\n\n\nam" R/ ^am/m matches? ] unit-test
! [ t ] [ "a" R/ \Aa\z/m matches? ] unit-test
! [ f ] [ "a\n" R/ \Aa\z/m matches? ] unit-test
! [ t ] [ "a\r\n" R/ \Aa\Z/m matches? ] unit-test
! [ t ] [ "a\n" R/ \Aa\Z/m matches? ] unit-test
! [ t ] [ "a" R/ \Aa\Z/m matches? ] unit-test
! [ f ] [ "\na" R/ \Aaa\Z/m matches? ] unit-test
! [ f ] [ "\r\na" R/ \Aa\Z/m matches? ] unit-test
! [ f ] [ "\ra" R/ \Aa\Z/m matches? ] unit-test
! [ t ] [ "a" R/ ^a/m matches? ] unit-test
! [ t ] [ "\na" R/ ^a/m matches? ] unit-test
! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
! Convert to lowercase until E
[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
! Convert to uppercase until E
[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test
! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test
! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test
! [ t ] [ "a\r\n" "a$" R/ a$/m matches? ] unit-test
! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
! [ t ] [ "foo" "\\bfoo\\b" <regexp> matches? ] unit-test
! [ t ] [ "afoob" "\\Bfoo\\B" <regexp> matches? ] unit-test
! [ t ] [ "afoob" "\\bfoo\\b" <regexp> matches? ] unit-test
! [ f ] [ "foo" "\\Bfoo\\B" <regexp> matches? ] unit-test
! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-head ] unit-test
! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test
! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test
@ -323,39 +382,29 @@ IN: regexp-tests
! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
! [ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
! [ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
! [ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
! [ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
! Bug in parsing word
! [ t ] [ "a" R' a' matches? ] unit-test
! "ab" "a(?=b*)" <regexp> match
! "abbbbbc" "a(?=b*c)" <regexp> match
! "ab" "a(?=b*)" <regexp> match
! clear "a(?=b*)" <regexp> "ab" over match
! clear "a(?=b*c)" <regexp> "abbbbbc" over match
! clear "a(?=b*)" <regexp> "ab" over match
! "baz" "(az)(?<=b)" <regexp> first-match
! "cbaz" "a(?<=b*)" <regexp> first-match
! "baz" "a(?<=b)" <regexp> first-match
! clear "^a" <regexp> "a" over match
! clear "^a" <regexp> "\na" over match
! clear "^a" <regexp> "\r\na" over match
! clear "^a" <regexp> "\ra" over match
! "baz" "a(?<!b)" <regexp> first-match
! "caz" "a(?<!b)" <regexp> first-match
! clear "a$" <regexp> "a" over match
! clear "a$" <regexp> "a\n" over match
! clear "a$" <regexp> "a\r" over match
! clear "a$" <regexp> "a\r\n" over match
! "(az)(?<=b)" <regexp> "baz" over first-match
! "a(?<=b*)" <regexp> "cbaz" over first-match
! "a(?<=b)" <regexp> "baz" over first-match
! "a(?<!b)" <regexp> "baz" over first-match
! "a(?<!b)" <regexp> "caz" over first-match
! "a(?=bcdefg)bcd" <regexp> "abcdefg" over first-match
! "a(?#bcdefg)bcd" <regexp> "abcdefg" over first-match
! "a(?:bcdefg)" <regexp> "abcdefg" over first-match
[ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test
! "a(?<=b)" <regexp> "caba" over first-match
! "abcdefg" "a(?=bcdefg)bcd" <regexp> first-match
! "abcdefg" "a(?#bcdefg)bcd" <regexp> first-match
! "abcdefg" "a(?:bcdefg)" <regexp> first-match
! "caba" "a(?<=b)" <regexp> first-match
! capture group 1: "aaaa" 2: ""
! "aaaa" "(a*)(a*)" <regexp> match*

View File

@ -1,9 +1,10 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors combinators kernel math sequences
USING: accessors combinators kernel math sequences strings
sets assocs prettyprint.backend make lexer namespaces parser
arrays fry regexp.backend regexp.utils regexp.parser regexp.nfa
regexp.dfa regexp.traversal regexp.transition-tables splitting ;
regexp.dfa regexp.traversal regexp.transition-tables splitting
sorting ;
IN: regexp
: default-regexp ( string -- regexp )
@ -73,42 +74,9 @@ IN: regexp
[ dup ] swap '[ _ next-match ] [ ] produce nip harvest ;
: count-matches ( string regexp -- n )
all-matches length 1- ;
all-matches length ;
: initial-option ( regexp option -- regexp' )
over options>> conjoin ;
: <regexp> ( string -- regexp )
default-regexp construct-regexp ;
: <iregexp> ( string -- regexp )
default-regexp
case-insensitive initial-option
construct-regexp ;
: <rregexp> ( string -- regexp )
default-regexp
reversed-regexp initial-option
construct-regexp ;
: parsing-regexp ( accum end -- accum )
lexer get dup skip-blank
[ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
lexer get dup still-parsing-line?
[ (parse-token) ] [ drop f ] if
"i" = [ <iregexp> ] [ <regexp> ] if parsed ;
: R! CHAR: ! parsing-regexp ; parsing
: R" CHAR: " parsing-regexp ; parsing
: R# CHAR: # parsing-regexp ; parsing
: R' CHAR: ' parsing-regexp ; parsing
: R( CHAR: ) parsing-regexp ; parsing
: R/ CHAR: / parsing-regexp ; parsing
: R@ CHAR: @ parsing-regexp ; parsing
: R[ CHAR: ] parsing-regexp ; parsing
: R` CHAR: ` parsing-regexp ; parsing
: R{ CHAR: } parsing-regexp ; parsing
: R| CHAR: | parsing-regexp ; parsing
<PRIVATE
: find-regexp-syntax ( string -- prefix suffix )
{
@ -125,14 +93,67 @@ IN: regexp
{ "R| " "|" }
} swap [ subseq? not nip ] curry assoc-find drop ;
: option? ( option regexp -- ? )
options>> key? ;
ERROR: unknown-regexp-option option ;
: option>ch ( option -- string )
{
{ case-insensitive [ CHAR: i ] }
{ multiline [ CHAR: m ] }
{ reversed-regexp [ CHAR: r ] }
{ dotall [ CHAR: s ] }
[ unknown-regexp-option ]
} case ;
: ch>option ( ch -- option )
{
{ CHAR: i [ case-insensitive ] }
{ CHAR: m [ multiline ] }
{ CHAR: r [ reversed-regexp ] }
{ CHAR: s [ dotall ] }
[ unknown-regexp-option ]
} case ;
: string>options ( string -- options )
[ ch>option dup ] H{ } map>assoc ;
: options>string ( options -- string )
keys [ option>ch ] map natural-sort >string ;
PRIVATE>
: <optioned-regexp> ( string option-string -- regexp )
[ default-regexp ] [ string>options ] bi* >>options
construct-regexp ;
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
<PRIVATE
: parsing-regexp ( accum end -- accum )
lexer get dup skip-blank
[ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
lexer get dup still-parsing-line?
[ (parse-token) ] [ drop f ] if
<optioned-regexp> parsed ;
PRIVATE>
: R! CHAR: ! parsing-regexp ; parsing
: R" CHAR: " parsing-regexp ; parsing
: R# CHAR: # parsing-regexp ; parsing
: R' CHAR: ' parsing-regexp ; parsing
: R( CHAR: ) parsing-regexp ; parsing
: R/ CHAR: / parsing-regexp ; parsing
: R@ CHAR: @ parsing-regexp ; parsing
: R[ CHAR: ] parsing-regexp ; parsing
: R` CHAR: ` parsing-regexp ; parsing
: R{ CHAR: } parsing-regexp ; parsing
: R| CHAR: | parsing-regexp ; parsing
M: regexp pprint*
[
[
dup raw>>
dup find-regexp-syntax swap % swap % %
case-insensitive swap option? [ "i" % ] when
[ raw>> dup find-regexp-syntax swap % swap % % ]
[ options>> options>string % ] bi
] "" make
] keep present-text ;

View File

@ -40,7 +40,7 @@ TUPLE: transition-table transitions start-state final-states ;
2dup [ to>> ] dip maybe-initialize-key
[ [ to>> ] [ obj>> ] [ from>> ] tri ] dip
2dup at* [ 2nip insert-at ]
[ drop >r >r H{ } clone [ insert-at ] keep r> r> set-at ] if ;
[ drop [ H{ } clone [ insert-at ] keep ] 2dip set-at ] if ;
: add-transition ( transition transition-table -- )
transitions>> set-transition ;

View File

@ -17,6 +17,7 @@ TUPLE: dfa-traverser
capture-group-index
last-state current-state
text
match-failed?
start-index current-index
matches ;
@ -37,14 +38,20 @@ TUPLE: dfa-traverser
H{ } clone >>captured-groups ;
: final-state? ( dfa-traverser -- ? )
[ current-state>> ] [ dfa-table>> final-states>> ] bi
key? ;
[ current-state>> ]
[ dfa-table>> final-states>> ] bi key? ;
: beginning-of-text? ( dfa-traverser -- ? )
current-index>> 0 <= ; inline
: end-of-text? ( dfa-traverser -- ? )
[ current-index>> ] [ text>> length ] bi >= ; inline
: text-finished? ( dfa-traverser -- ? )
{
[ current-state>> empty? ]
[ [ current-index>> ] [ text>> length ] bi >= ]
! [ current-index>> 0 < ]
[ end-of-text? ]
[ match-failed?>> ]
} 1|| ;
: save-final-state ( dfa-straverser -- )
@ -55,8 +62,47 @@ TUPLE: dfa-traverser
dup save-final-state
] when text-finished? ;
: previous-text-character ( dfa-traverser -- ch )
[ text>> ] [ current-index>> 1- ] bi nth ;
: current-text-character ( dfa-traverser -- ch )
[ text>> ] [ current-index>> ] bi nth ;
: next-text-character ( dfa-traverser -- ch )
[ text>> ] [ current-index>> 1+ ] bi nth ;
GENERIC: flag-action ( dfa-traverser flag -- )
M: beginning-of-input flag-action ( dfa-traverser flag -- )
drop
dup beginning-of-text? [ t >>match-failed? ] unless drop ;
M: end-of-input flag-action ( dfa-traverser flag -- )
drop
dup end-of-text? [ t >>match-failed? ] unless drop ;
M: beginning-of-line flag-action ( dfa-traverser flag -- )
drop
dup {
[ beginning-of-text? ]
[ previous-text-character terminator-class class-member? ]
} 1|| [ t >>match-failed? ] unless drop ;
M: end-of-line flag-action ( dfa-traverser flag -- )
drop
dup {
[ end-of-text? ]
[ next-text-character terminator-class class-member? ]
} 1|| [ t >>match-failed? ] unless drop ;
M: word-boundary flag-action ( dfa-traverser flag -- )
drop
dup {
[ end-of-text? ]
[ current-text-character terminator-class class-member? ]
} 1|| [ t >>match-failed? ] unless drop ;
M: lookahead-on flag-action ( dfa-traverser flag -- )
drop
lookahead-counters>> 0 swap push ;
@ -110,11 +156,7 @@ M: capture-group-off flag-action ( dfa-traverser flag -- )
[ [ 1+ ] change-current-index ]
[ [ 1- ] change-current-index ] if
dup current-state>> >>last-state
] dip
first >>current-state ;
: match-failed ( dfa-traverser -- dfa-traverser )
V{ } clone >>matches ;
] [ first ] bi* >>current-state ;
: match-literal ( transition from-state table -- to-state/f )
transitions>> at at ;
@ -131,11 +173,9 @@ M: capture-group-off flag-action ( dfa-traverser flag -- )
{ [ match-literal ] [ match-class ] [ match-default ] } 3|| ;
: setup-match ( match -- obj state dfa-table )
{
[ current-index>> ] [ text>> ]
[ current-state>> ] [ dfa-table>> ]
} cleave
[ nth ] 2dip ;
[ [ current-index>> ] [ text>> ] bi nth ]
[ current-state>> ]
[ dfa-table>> ] tri ;
: do-match ( dfa-traverser -- dfa-traverser )
dup process-flags

View File

@ -11,14 +11,14 @@ IN: benchmark.regex-dna
: count-patterns ( string -- )
{
R/ agggtaaa|tttaccct/i,
R/ [cgt]gggtaaa|tttaccc[acg]/i,
R/ a[act]ggtaaa|tttacc[agt]t/i,
R/ ag[act]gtaaa|tttac[agt]ct/i,
R/ agg[act]taaa|ttta[agt]cct/i,
R/ aggg[acg]aaa|ttt[cgt]ccct/i,
R/ agggt[cgt]aa|tt[acg]accct/i,
R/ agggta[cgt]a|t[acg]taccct/i,
R/ agggtaaa|tttaccct/i
R/ [cgt]gggtaaa|tttaccc[acg]/i
R/ a[act]ggtaaa|tttacc[agt]t/i
R/ ag[act]gtaaa|tttac[agt]ct/i
R/ agg[act]taaa|ttta[agt]cct/i
R/ aggg[acg]aaa|ttt[cgt]ccct/i
R/ agggt[cgt]aa|tt[acg]accct/i
R/ agggta[cgt]a|t[acg]taccct/i
R/ agggtaa[cgt]|[acg]ttaccct/i
} [
[ raw>> write bl ]