move regexp2 to regexp, bug fixes, more tests

Doug Coleman 2008-09-18 14:42:16 -05:00
parent c6ab49e8ca
commit e59b320df3
15 changed files with 123 additions and 71 deletions

View File

@ -1,7 +1,7 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors hashtables kernel math state-tables vars vectors ; USING: accessors hashtables kernel math state-tables vars vectors ;
IN: regexp2.backend IN: regexp.backend
TUPLE: regexp TUPLE: regexp
raw raw

View File

@ -1,8 +1,8 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors kernel math math.order symbols regexp2.parser USING: accessors kernel math math.order symbols regexp.parser
words regexp2.utils unicode.categories combinators.short-circuit ; words regexp.utils unicode.categories combinators.short-circuit ;
IN: regexp2.classes IN: regexp.classes
GENERIC: class-member? ( obj class -- ? ) GENERIC: class-member? ( obj class -- ? )

View File

@ -1,15 +1,14 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators fry kernel locals USING: accessors arrays assocs combinators fry kernel locals
math math.order regexp2.nfa regexp2.transition-tables sequences math math.order regexp.nfa regexp.transition-tables sequences
sets sorting vectors regexp2.utils sequences.lib combinators.lib sets sorting vectors regexp.utils sequences.deep ;
sequences.deep ;
USING: io prettyprint threads ; USING: io prettyprint threads ;
IN: regexp2.dfa IN: regexp.dfa
: find-delta ( states transition regexp -- new-states ) : find-delta ( states transition regexp -- new-states )
nfa-table>> transitions>> nfa-table>> transitions>>
rot [ swap at at ] with with map sift concat prune ; rot [ swap at at ] with with gather sift ;
: (find-epsilon-closure) ( states regexp -- new-states ) : (find-epsilon-closure) ( states regexp -- new-states )
eps swap find-delta ; eps swap find-delta ;
@ -26,7 +25,9 @@ IN: regexp2.dfa
: find-transitions ( seq1 regexp -- seq2 ) : find-transitions ( seq1 regexp -- seq2 )
nfa-table>> transitions>> nfa-table>> transitions>>
[ at keys ] curry map concat eps swap remove ; [ at keys ] curry map concat
eps swap remove ;
! dup t member? [ t swap remove t suffix ] when ;
: add-todo-state ( state regexp -- ) : add-todo-state ( state regexp -- )
2dup visited-states>> key? [ 2dup visited-states>> key? [

View File

@ -1,10 +1,10 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs grouping kernel regexp2.backend USING: accessors arrays assocs grouping kernel regexp.backend
locals math namespaces regexp2.parser sequences state-tables fry locals math namespaces regexp.parser sequences state-tables fry
quotations math.order math.ranges vectors unicode.categories quotations math.order math.ranges vectors unicode.categories
regexp2.utils regexp2.transition-tables words sequences.lib sets ; regexp.utils regexp.transition-tables words sets ;
IN: regexp2.nfa IN: regexp.nfa
SYMBOL: negation-mode SYMBOL: negation-mode
: negated? ( -- ? ) negation-mode get 0 or odd? ; : negated? ( -- ? ) negation-mode get 0 or odd? ;
@ -121,6 +121,15 @@ M: character-class-range nfa-node ( node -- )
M: capture-group nfa-node ( node -- ) M: capture-group nfa-node ( node -- )
term>> nfa-node ; term>> nfa-node ;
! xyzzy
M: non-capture-group nfa-node ( node -- )
term>> nfa-node ;
M: reluctant-kleene-star nfa-node ( node -- )
term>> <kleene-star> nfa-node ;
!
M: negation nfa-node ( node -- ) M: negation nfa-node ( node -- )
negation-mode inc negation-mode inc
term>> nfa-node term>> nfa-node

View File

@ -1,13 +1,10 @@
USING: kernel tools.test regexp2.backend regexp2 ; USING: kernel tools.test regexp.backend regexp ;
IN: regexp2.parser IN: regexp.parser
: test-regexp ( string -- ) : test-regexp ( string -- )
default-regexp parse-regexp ; default-regexp parse-regexp ;
: test-regexp2 ( string -- regexp ) ! [ "(" ] [ unmatched-parentheses? ] must-fail-with
default-regexp dup parse-regexp ;
[ "(" ] [ unmatched-parentheses? ] must-fail-with
[ ] [ "a|b" test-regexp ] unit-test [ ] [ "a|b" test-regexp ] unit-test
[ ] [ "a.b" test-regexp ] unit-test [ ] [ "a.b" test-regexp ] unit-test

View File

@ -2,10 +2,10 @@
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators io io.streams.string USING: accessors arrays assocs combinators io io.streams.string
kernel math math.parser multi-methods namespaces qualified sets kernel math math.parser multi-methods namespaces qualified sets
quotations sequences sequences.lib splitting symbols vectors quotations sequences splitting symbols vectors math.order
dlists math.order combinators.lib unicode.categories strings unicode.categories strings regexp.backend regexp.utils
sequences.lib regexp2.backend regexp2.utils unicode.case ; unicode.case ;
IN: regexp2.parser IN: regexp.parser
FROM: math.ranges => [a,b] ; FROM: math.ranges => [a,b] ;
@ -280,11 +280,26 @@ ERROR: bad-escaped-literals seq ;
first|concatenation first|concatenation
] if-empty ; ] if-empty ;
ERROR: unrecognized-escape char ;
: parse-escaped ( -- obj ) : parse-escaped ( -- obj )
read1 read1
{ {
{ CHAR: \ [ CHAR: \ <constant> ] } { CHAR: \ [ CHAR: \ <constant> ] }
{ CHAR: - [ CHAR: - <constant> ] }
{ CHAR: { [ CHAR: { <constant> ] }
{ CHAR: } [ CHAR: } <constant> ] }
{ CHAR: [ [ CHAR: [ <constant> ] }
{ CHAR: ] [ CHAR: ] <constant> ] }
{ CHAR: ( [ CHAR: ( <constant> ] }
{ CHAR: ) [ CHAR: ) <constant> ] }
{ CHAR: @ [ CHAR: @ <constant> ] }
{ CHAR: * [ CHAR: * <constant> ] }
{ CHAR: + [ CHAR: + <constant> ] }
{ CHAR: ? [ CHAR: ? <constant> ] }
{ CHAR: . [ CHAR: . <constant> ] } { CHAR: . [ CHAR: . <constant> ] }
! xyzzy
{ CHAR: : [ CHAR: : <constant> ] }
{ CHAR: t [ CHAR: \t <constant> ] } { CHAR: t [ CHAR: \t <constant> ] }
{ CHAR: n [ CHAR: \n <constant> ] } { CHAR: n [ CHAR: \n <constant> ] }
{ CHAR: r [ CHAR: \r <constant> ] } { CHAR: r [ CHAR: \r <constant> ] }
@ -314,8 +329,19 @@ ERROR: bad-escaped-literals seq ;
! { CHAR: G [ end of previous match ] } ! { CHAR: G [ end of previous match ] }
! { CHAR: Z [ handle-end-of-input ] } ! { CHAR: Z [ handle-end-of-input ] }
! { CHAR: z [ handle-end-of-input ] } ! except for terminator ! { CHAR: z [ handle-end-of-input ] } ! except for terminator
! xyzzy
{ CHAR: 1 [ CHAR: 1 <constant> ] }
{ CHAR: 2 [ CHAR: 2 <constant> ] }
{ CHAR: 3 [ CHAR: 3 <constant> ] }
{ CHAR: 4 [ CHAR: 4 <constant> ] }
{ CHAR: 5 [ CHAR: 5 <constant> ] }
{ CHAR: 6 [ CHAR: 6 <constant> ] }
{ CHAR: 7 [ CHAR: 7 <constant> ] }
{ CHAR: 8 [ CHAR: 8 <constant> ] }
{ CHAR: 9 [ CHAR: 9 <constant> ] }
{ CHAR: Q [ parse-escaped-literals ] } { CHAR: Q [ parse-escaped-literals ] }
[ unrecognized-escape ]
} case ; } case ;
: handle-escape ( -- ) parse-escaped push-stack ; : handle-escape ( -- ) parse-escaped push-stack ;

View File

@ -1,7 +1,7 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: kernel strings help.markup help.syntax regexp2.backend ; USING: kernel strings help.markup help.syntax regexp.backend ;
IN: regexp2 IN: regexp
HELP: <regexp> HELP: <regexp>
{ $values { "string" string } { "regexp" regexp } } { $values { "string" string } { "regexp" regexp } }

View File

@ -1,6 +1,6 @@
USING: regexp2 tools.test kernel sequences regexp2.parser USING: regexp tools.test kernel sequences regexp.parser
regexp2.traversal ; regexp.traversal eval ;
IN: regexp2-tests IN: regexp-tests
[ f ] [ "b" "a*" <regexp> matches? ] unit-test [ f ] [ "b" "a*" <regexp> matches? ] unit-test
[ t ] [ "" "a*" <regexp> matches? ] unit-test [ t ] [ "" "a*" <regexp> matches? ] unit-test
@ -224,6 +224,9 @@ IN: regexp2-tests
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test [ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
[ t ] [ ".o" "\\.[a-z]" <regexp> matches? ] unit-test [ t ] [ ".o" "\\.[a-z]" <regexp> matches? ] unit-test
[ t ] [ "abc*" "[^\\*]*\\*" <regexp> matches? ] unit-test
[ t ] [ "bca" "[^a]*a" <regexp> matches? ] unit-test
[ ] [ [ ] [
"(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))" "(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))"
<regexp> drop <regexp> drop
@ -236,20 +239,20 @@ IN: regexp2-tests
[ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with ! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
[ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test ! [ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
[ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test ! [ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
[ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test ! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
[ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test ! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
[ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test ! [ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
[ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test ! [ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
[ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test ! [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
[ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test ! [ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
[ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test ! [ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test
[ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test ! [ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test
! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test ! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test ! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
@ -268,6 +271,12 @@ IN: regexp2-tests
! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test ! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test ! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
[ ] [ "USING: regexp kernel ; R' -{3}[+]{1,6}(?:!!)?\\s' drop" eval ] unit-test
[ ] [ "USING: regexp kernel ; R' (ftp|http|https)://(\\w+:?\\w*@)?(\\S+)(:[0-9]+)?(/|/([\\w#!:.?+=&%@!\\-/]))?' drop" eval ] unit-test
[ ] [ "USING: regexp kernel ; R' \\*[^\s*][^*]*\\*' drop" eval ] unit-test
! Bug in parsing word ! Bug in parsing word
! [ t ] [ "a" R' a' matches? ] unit-test ! [ t ] [ "a" R' a' matches? ] unit-test

View File

@ -1,11 +1,11 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors combinators kernel math math.ranges USING: accessors combinators kernel math math.ranges
sequences regexp2.backend regexp2.utils memoize sets sequences regexp.backend regexp.utils memoize sets
regexp2.parser regexp2.nfa regexp2.dfa regexp2.traversal regexp.parser regexp.nfa regexp.dfa regexp.traversal
regexp2.transition-tables assocs prettyprint.backend regexp.transition-tables assocs prettyprint.backend
make ; make lexer namespaces parser ;
IN: regexp2 IN: regexp
: default-regexp ( string -- regexp ) : default-regexp ( string -- regexp )
regexp new regexp new
@ -51,17 +51,26 @@ IN: regexp2
reversed-regexp initial-option reversed-regexp initial-option
construct-regexp ; construct-regexp ;
: R! CHAR: ! <regexp> ; parsing
: R" CHAR: " <regexp> ; parsing : parsing-regexp ( accum end -- accum )
: R# CHAR: # <regexp> ; parsing lexer get dup skip-blank
: R' CHAR: ' <regexp> ; parsing [ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
: R( CHAR: ) <regexp> ; parsing lexer get dup still-parsing-line?
: R/ CHAR: / <regexp> ; parsing [ (parse-token) ] [ drop f ] if
: R@ CHAR: @ <regexp> ; parsing "i" = [ <iregexp> ] [ <regexp> ] if parsed ;
: R[ CHAR: ] <regexp> ; parsing
: R` CHAR: ` <regexp> ; parsing : R! CHAR: ! parsing-regexp ; parsing
: R{ CHAR: } <regexp> ; parsing : R" CHAR: " parsing-regexp ; parsing
: R| CHAR: | <regexp> ; parsing : R# CHAR: # parsing-regexp ; parsing
: R' CHAR: ' parsing-regexp ; parsing
: R( CHAR: ) parsing-regexp ; parsing
: R/ CHAR: / parsing-regexp ; parsing
: R@ CHAR: @ parsing-regexp ; parsing
: R[ CHAR: ] parsing-regexp ; parsing
: R` CHAR: ` parsing-regexp ; parsing
: R{ CHAR: } parsing-regexp ; parsing
: R| CHAR: | parsing-regexp ; parsing
: find-regexp-syntax ( string -- prefix suffix ) : find-regexp-syntax ( string -- prefix suffix )
{ {
@ -81,6 +90,8 @@ IN: regexp2
: option? ( option regexp -- ? ) : option? ( option regexp -- ? )
options>> key? ; options>> key? ;
USE: multiline
/*
M: regexp pprint* M: regexp pprint*
[ [
[ [
@ -89,3 +100,4 @@ M: regexp pprint*
case-insensitive swap option? [ "i" % ] when case-insensitive swap option? [ "i" % ] when
] "" make ] "" make
] keep present-text ; ] keep present-text ;
*/

View File

@ -1,8 +1,8 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs fry hashtables kernel sequences USING: accessors arrays assocs fry hashtables kernel sequences
vectors regexp2.utils ; vectors regexp.utils ;
IN: regexp2.transition-tables IN: regexp.transition-tables
TUPLE: transition from to obj ; TUPLE: transition from to obj ;
TUPLE: literal-transition < transition ; TUPLE: literal-transition < transition ;

View File

@ -1,10 +1,9 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors assocs combinators combinators.lib kernel USING: accessors assocs combinators kernel math math.ranges
math math.ranges quotations sequences regexp2.parser quotations sequences regexp.parser regexp.classes
regexp2.classes combinators.short-circuit assocs.lib combinators.short-circuit regexp.utils ;
sequences.lib regexp2.utils ; IN: regexp.traversal
IN: regexp2.traversal
TUPLE: dfa-traverser TUPLE: dfa-traverser
dfa-table dfa-table
@ -54,7 +53,7 @@ TUPLE: dfa-traverser
V{ } clone >>matches ; V{ } clone >>matches ;
: match-literal ( transition from-state table -- to-state/f ) : match-literal ( transition from-state table -- to-state/f )
transitions>> [ at ] [ 2drop f ] if-at ; transitions>> at* [ at ] [ 2drop f ] if ;
: match-class ( transition from-state table -- to-state/f ) : match-class ( transition from-state table -- to-state/f )
transitions>> at* [ transitions>> at* [
@ -62,8 +61,8 @@ TUPLE: dfa-traverser
] [ drop ] if ; ] [ drop ] if ;
: match-default ( transition from-state table -- to-state/f ) : match-default ( transition from-state table -- to-state/f )
[ nip ] dip transitions>> [ nip ] dip transitions>> at*
[ t swap [ drop f ] unless-at ] [ drop f ] if-at ; [ t swap at* [ ] [ drop f ] if ] [ drop f ] if ;
: match-transition ( obj from-state dfa -- to-state/f ) : match-transition ( obj from-state dfa -- to-state/f )
{ [ match-literal ] [ match-class ] [ match-default ] } 3|| ; { [ match-literal ] [ match-class ] [ match-default ] } 3|| ;

View File

@ -1,10 +1,9 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators.lib io kernel USING: accessors arrays assocs io kernel math math.order
math math.order namespaces regexp2.backend sequences namespaces regexp.backend sequences unicode.categories
sequences.lib unicode.categories math.ranges fry math.ranges fry combinators.short-circuit vectors ;
combinators.short-circuit vectors ; IN: regexp.utils
IN: regexp2.utils
: (while-changes) ( obj quot pred pred-ret -- obj ) : (while-changes) ( obj quot pred pred-ret -- obj )
! quot: ( obj -- obj' ) ! quot: ( obj -- obj' )