move regexp2 to regexp, bug fixes, more tests

Doug Coleman 2008-09-18 14:42:16 -05:00
parent c6ab49e8ca
commit e59b320df3
15 changed files with 123 additions and 71 deletions

View File

@ -1,7 +1,7 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors hashtables kernel math state-tables vars vectors ;
IN: regexp2.backend
IN: regexp.backend
TUPLE: regexp
raw

View File

@ -1,8 +1,8 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors kernel math math.order symbols regexp2.parser
words regexp2.utils unicode.categories combinators.short-circuit ;
IN: regexp2.classes
USING: accessors kernel math math.order symbols regexp.parser
words regexp.utils unicode.categories combinators.short-circuit ;
IN: regexp.classes
GENERIC: class-member? ( obj class -- ? )

View File

@ -1,15 +1,14 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators fry kernel locals
math math.order regexp2.nfa regexp2.transition-tables sequences
sets sorting vectors regexp2.utils sequences.lib combinators.lib
sequences.deep ;
math math.order regexp.nfa regexp.transition-tables sequences
sets sorting vectors regexp.utils sequences.deep ;
USING: io prettyprint threads ;
IN: regexp2.dfa
IN: regexp.dfa
: find-delta ( states transition regexp -- new-states )
nfa-table>> transitions>>
rot [ swap at at ] with with map sift concat prune ;
rot [ swap at at ] with with gather sift ;
: (find-epsilon-closure) ( states regexp -- new-states )
eps swap find-delta ;
@ -26,7 +25,9 @@ IN: regexp2.dfa
: find-transitions ( seq1 regexp -- seq2 )
nfa-table>> transitions>>
[ at keys ] curry map concat eps swap remove ;
[ at keys ] curry map concat
eps swap remove ;
! dup t member? [ t swap remove t suffix ] when ;
: add-todo-state ( state regexp -- )
2dup visited-states>> key? [

View File

@ -1,10 +1,10 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs grouping kernel regexp2.backend
locals math namespaces regexp2.parser sequences state-tables fry
USING: accessors arrays assocs grouping kernel regexp.backend
locals math namespaces regexp.parser sequences state-tables fry
quotations math.order math.ranges vectors unicode.categories
regexp2.utils regexp2.transition-tables words sequences.lib sets ;
IN: regexp2.nfa
regexp.utils regexp.transition-tables words sets ;
IN: regexp.nfa
SYMBOL: negation-mode
: negated? ( -- ? ) negation-mode get 0 or odd? ;
@ -121,6 +121,15 @@ M: character-class-range nfa-node ( node -- )
M: capture-group nfa-node ( node -- )
term>> nfa-node ;
! xyzzy
M: non-capture-group nfa-node ( node -- )
term>> nfa-node ;
M: reluctant-kleene-star nfa-node ( node -- )
term>> <kleene-star> nfa-node ;
!
M: negation nfa-node ( node -- )
negation-mode inc
term>> nfa-node

View File

@ -1,13 +1,10 @@
USING: kernel tools.test regexp2.backend regexp2 ;
IN: regexp2.parser
USING: kernel tools.test regexp.backend regexp ;
IN: regexp.parser
: test-regexp ( string -- )
default-regexp parse-regexp ;
: test-regexp2 ( string -- regexp )
default-regexp dup parse-regexp ;
[ "(" ] [ unmatched-parentheses? ] must-fail-with
! [ "(" ] [ unmatched-parentheses? ] must-fail-with
[ ] [ "a|b" test-regexp ] unit-test
[ ] [ "a.b" test-regexp ] unit-test

View File

@ -2,10 +2,10 @@
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators io io.streams.string
kernel math math.parser multi-methods namespaces qualified sets
quotations sequences sequences.lib splitting symbols vectors
dlists math.order combinators.lib unicode.categories strings
sequences.lib regexp2.backend regexp2.utils unicode.case ;
IN: regexp2.parser
quotations sequences splitting symbols vectors math.order
unicode.categories strings regexp.backend regexp.utils
unicode.case ;
IN: regexp.parser
FROM: math.ranges => [a,b] ;
@ -280,11 +280,26 @@ ERROR: bad-escaped-literals seq ;
first|concatenation
] if-empty ;
ERROR: unrecognized-escape char ;
: parse-escaped ( -- obj )
read1
{
{ CHAR: \ [ CHAR: \ <constant> ] }
{ CHAR: - [ CHAR: - <constant> ] }
{ CHAR: { [ CHAR: { <constant> ] }
{ CHAR: } [ CHAR: } <constant> ] }
{ CHAR: [ [ CHAR: [ <constant> ] }
{ CHAR: ] [ CHAR: ] <constant> ] }
{ CHAR: ( [ CHAR: ( <constant> ] }
{ CHAR: ) [ CHAR: ) <constant> ] }
{ CHAR: @ [ CHAR: @ <constant> ] }
{ CHAR: * [ CHAR: * <constant> ] }
{ CHAR: + [ CHAR: + <constant> ] }
{ CHAR: ? [ CHAR: ? <constant> ] }
{ CHAR: . [ CHAR: . <constant> ] }
! xyzzy
{ CHAR: : [ CHAR: : <constant> ] }
{ CHAR: t [ CHAR: \t <constant> ] }
{ CHAR: n [ CHAR: \n <constant> ] }
{ CHAR: r [ CHAR: \r <constant> ] }
@ -314,8 +329,19 @@ ERROR: bad-escaped-literals seq ;
! { CHAR: G [ end of previous match ] }
! { CHAR: Z [ handle-end-of-input ] }
! { CHAR: z [ handle-end-of-input ] } ! except for terminator
! xyzzy
{ CHAR: 1 [ CHAR: 1 <constant> ] }
{ CHAR: 2 [ CHAR: 2 <constant> ] }
{ CHAR: 3 [ CHAR: 3 <constant> ] }
{ CHAR: 4 [ CHAR: 4 <constant> ] }
{ CHAR: 5 [ CHAR: 5 <constant> ] }
{ CHAR: 6 [ CHAR: 6 <constant> ] }
{ CHAR: 7 [ CHAR: 7 <constant> ] }
{ CHAR: 8 [ CHAR: 8 <constant> ] }
{ CHAR: 9 [ CHAR: 9 <constant> ] }
{ CHAR: Q [ parse-escaped-literals ] }
[ unrecognized-escape ]
} case ;
: handle-escape ( -- ) parse-escaped push-stack ;

View File

@ -1,7 +1,7 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: kernel strings help.markup help.syntax regexp2.backend ;
IN: regexp2
USING: kernel strings help.markup help.syntax regexp.backend ;
IN: regexp
HELP: <regexp>
{ $values { "string" string } { "regexp" regexp } }

View File

@ -1,6 +1,6 @@
USING: regexp2 tools.test kernel sequences regexp2.parser
regexp2.traversal ;
IN: regexp2-tests
USING: regexp tools.test kernel sequences regexp.parser
regexp.traversal eval ;
IN: regexp-tests
[ f ] [ "b" "a*" <regexp> matches? ] unit-test
[ t ] [ "" "a*" <regexp> matches? ] unit-test
@ -224,6 +224,9 @@ IN: regexp2-tests
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
[ t ] [ ".o" "\\.[a-z]" <regexp> matches? ] unit-test
[ t ] [ "abc*" "[^\\*]*\\*" <regexp> matches? ] unit-test
[ t ] [ "bca" "[^a]*a" <regexp> matches? ] unit-test
[ ] [
"(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))"
<regexp> drop
@ -236,20 +239,20 @@ IN: regexp2-tests
[ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
[ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
[ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
[ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
[ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
[ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
[ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
! [ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
! [ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
! [ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
! [ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
[ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
[ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
! [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
! [ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
[ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test
[ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test
! [ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test
! [ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test
! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
@ -268,6 +271,12 @@ IN: regexp2-tests
! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
[ ] [ "USING: regexp kernel ; R' -{3}[+]{1,6}(?:!!)?\\s' drop" eval ] unit-test
[ ] [ "USING: regexp kernel ; R' (ftp|http|https)://(\\w+:?\\w*@)?(\\S+)(:[0-9]+)?(/|/([\\w#!:.?+=&%@!\\-/]))?' drop" eval ] unit-test
[ ] [ "USING: regexp kernel ; R' \\*[^\s*][^*]*\\*' drop" eval ] unit-test
! Bug in parsing word
! [ t ] [ "a" R' a' matches? ] unit-test

View File

@ -1,11 +1,11 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors combinators kernel math math.ranges
sequences regexp2.backend regexp2.utils memoize sets
regexp2.parser regexp2.nfa regexp2.dfa regexp2.traversal
regexp2.transition-tables assocs prettyprint.backend
make ;
IN: regexp2
sequences regexp.backend regexp.utils memoize sets
regexp.parser regexp.nfa regexp.dfa regexp.traversal
regexp.transition-tables assocs prettyprint.backend
make lexer namespaces parser ;
IN: regexp
: default-regexp ( string -- regexp )
regexp new
@ -51,17 +51,26 @@ IN: regexp2
reversed-regexp initial-option
construct-regexp ;
: R! CHAR: ! <regexp> ; parsing
: R" CHAR: " <regexp> ; parsing
: R# CHAR: # <regexp> ; parsing
: R' CHAR: ' <regexp> ; parsing
: R( CHAR: ) <regexp> ; parsing
: R/ CHAR: / <regexp> ; parsing
: R@ CHAR: @ <regexp> ; parsing
: R[ CHAR: ] <regexp> ; parsing
: R` CHAR: ` <regexp> ; parsing
: R{ CHAR: } <regexp> ; parsing
: R| CHAR: | <regexp> ; parsing
: parsing-regexp ( accum end -- accum )
lexer get dup skip-blank
[ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
lexer get dup still-parsing-line?
[ (parse-token) ] [ drop f ] if
"i" = [ <iregexp> ] [ <regexp> ] if parsed ;
: R! CHAR: ! parsing-regexp ; parsing
: R" CHAR: " parsing-regexp ; parsing
: R# CHAR: # parsing-regexp ; parsing
: R' CHAR: ' parsing-regexp ; parsing
: R( CHAR: ) parsing-regexp ; parsing
: R/ CHAR: / parsing-regexp ; parsing
: R@ CHAR: @ parsing-regexp ; parsing
: R[ CHAR: ] parsing-regexp ; parsing
: R` CHAR: ` parsing-regexp ; parsing
: R{ CHAR: } parsing-regexp ; parsing
: R| CHAR: | parsing-regexp ; parsing
: find-regexp-syntax ( string -- prefix suffix )
{
@ -81,6 +90,8 @@ IN: regexp2
: option? ( option regexp -- ? )
options>> key? ;
USE: multiline
/*
M: regexp pprint*
[
[
@ -89,3 +100,4 @@ M: regexp pprint*
case-insensitive swap option? [ "i" % ] when
] "" make
] keep present-text ;
*/

View File

@ -1,8 +1,8 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs fry hashtables kernel sequences
vectors regexp2.utils ;
IN: regexp2.transition-tables
vectors regexp.utils ;
IN: regexp.transition-tables
TUPLE: transition from to obj ;
TUPLE: literal-transition < transition ;

View File

@ -1,10 +1,9 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors assocs combinators combinators.lib kernel
math math.ranges quotations sequences regexp2.parser
regexp2.classes combinators.short-circuit assocs.lib
sequences.lib regexp2.utils ;
IN: regexp2.traversal
USING: accessors assocs combinators kernel math math.ranges
quotations sequences regexp.parser regexp.classes
combinators.short-circuit regexp.utils ;
IN: regexp.traversal
TUPLE: dfa-traverser
dfa-table
@ -54,7 +53,7 @@ TUPLE: dfa-traverser
V{ } clone >>matches ;
: match-literal ( transition from-state table -- to-state/f )
transitions>> [ at ] [ 2drop f ] if-at ;
transitions>> at* [ at ] [ 2drop f ] if ;
: match-class ( transition from-state table -- to-state/f )
transitions>> at* [
@ -62,8 +61,8 @@ TUPLE: dfa-traverser
] [ drop ] if ;
: match-default ( transition from-state table -- to-state/f )
[ nip ] dip transitions>>
[ t swap [ drop f ] unless-at ] [ drop f ] if-at ;
[ nip ] dip transitions>> at*
[ t swap at* [ ] [ drop f ] if ] [ drop f ] if ;
: match-transition ( obj from-state dfa -- to-state/f )
{ [ match-literal ] [ match-class ] [ match-default ] } 3|| ;

View File

@ -1,10 +1,9 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators.lib io kernel
math math.order namespaces regexp2.backend sequences
sequences.lib unicode.categories math.ranges fry
combinators.short-circuit vectors ;
IN: regexp2.utils
USING: accessors arrays assocs io kernel math math.order
namespaces regexp.backend sequences unicode.categories
math.ranges fry combinators.short-circuit vectors ;
IN: regexp.utils
: (while-changes) ( obj quot pred pred-ret -- obj )
! quot: ( obj -- obj' )