diff --git a/unfinished/regexp2/authors.txt b/unfinished/regexp/authors.txt similarity index 100% rename from unfinished/regexp2/authors.txt rename to unfinished/regexp/authors.txt diff --git a/unfinished/regexp2/backend/backend.factor b/unfinished/regexp/backend/backend.factor similarity index 96% rename from unfinished/regexp2/backend/backend.factor rename to unfinished/regexp/backend/backend.factor index fa5c1f7f97..1a261fb0af 100644 --- a/unfinished/regexp2/backend/backend.factor +++ b/unfinished/regexp/backend/backend.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors hashtables kernel math state-tables vars vectors ; -IN: regexp2.backend +IN: regexp.backend TUPLE: regexp raw diff --git a/unfinished/regexp2/classes/classes.factor b/unfinished/regexp/classes/classes.factor similarity index 90% rename from unfinished/regexp2/classes/classes.factor rename to unfinished/regexp/classes/classes.factor index 7737e02d40..a2d91b97fb 100644 --- a/unfinished/regexp2/classes/classes.factor +++ b/unfinished/regexp/classes/classes.factor @@ -1,8 +1,8 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors kernel math math.order symbols regexp2.parser -words regexp2.utils unicode.categories combinators.short-circuit ; -IN: regexp2.classes +USING: accessors kernel math math.order symbols regexp.parser +words regexp.utils unicode.categories combinators.short-circuit ; +IN: regexp.classes GENERIC: class-member? ( obj class -- ? ) diff --git a/unfinished/regexp2/dfa/dfa.factor b/unfinished/regexp/dfa/dfa.factor similarity index 87% rename from unfinished/regexp2/dfa/dfa.factor rename to unfinished/regexp/dfa/dfa.factor index cd2f4186f4..6f244dc8af 100644 --- a/unfinished/regexp2/dfa/dfa.factor +++ b/unfinished/regexp/dfa/dfa.factor @@ -1,15 +1,14 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs combinators fry kernel locals -math math.order regexp2.nfa regexp2.transition-tables sequences -sets sorting vectors regexp2.utils sequences.lib combinators.lib -sequences.deep ; +math math.order regexp.nfa regexp.transition-tables sequences +sets sorting vectors regexp.utils sequences.deep ; USING: io prettyprint threads ; -IN: regexp2.dfa +IN: regexp.dfa : find-delta ( states transition regexp -- new-states ) nfa-table>> transitions>> - rot [ swap at at ] with with map sift concat prune ; + rot [ swap at at ] with with gather sift ; : (find-epsilon-closure) ( states regexp -- new-states ) eps swap find-delta ; @@ -26,7 +25,9 @@ IN: regexp2.dfa : find-transitions ( seq1 regexp -- seq2 ) nfa-table>> transitions>> - [ at keys ] curry map concat eps swap remove ; + [ at keys ] curry map concat + eps swap remove ; + ! dup t member? [ t swap remove t suffix ] when ; : add-todo-state ( state regexp -- ) 2dup visited-states>> key? [ diff --git a/unfinished/regexp2/nfa/nfa.factor b/unfinished/regexp/nfa/nfa.factor similarity index 92% rename from unfinished/regexp2/nfa/nfa.factor rename to unfinished/regexp/nfa/nfa.factor index 792d9fe30f..f070c3528b 100644 --- a/unfinished/regexp2/nfa/nfa.factor +++ b/unfinished/regexp/nfa/nfa.factor @@ -1,10 +1,10 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors arrays assocs grouping kernel regexp2.backend -locals math namespaces regexp2.parser sequences state-tables fry +USING: accessors arrays assocs grouping kernel regexp.backend +locals math namespaces regexp.parser sequences state-tables fry quotations math.order math.ranges vectors unicode.categories -regexp2.utils regexp2.transition-tables words sequences.lib sets ; -IN: regexp2.nfa +regexp.utils regexp.transition-tables words sets ; +IN: regexp.nfa SYMBOL: negation-mode : negated? ( -- ? ) negation-mode get 0 or odd? ; @@ -121,6 +121,15 @@ M: character-class-range nfa-node ( node -- ) M: capture-group nfa-node ( node -- ) term>> nfa-node ; +! xyzzy +M: non-capture-group nfa-node ( node -- ) + term>> nfa-node ; + +M: reluctant-kleene-star nfa-node ( node -- ) + term>> nfa-node ; + +! + M: negation nfa-node ( node -- ) negation-mode inc term>> nfa-node diff --git a/unfinished/regexp2/parser/parser-tests.factor b/unfinished/regexp/parser/parser-tests.factor similarity index 82% rename from unfinished/regexp2/parser/parser-tests.factor rename to unfinished/regexp/parser/parser-tests.factor index 6911e8e76d..0f25b2e3bf 100644 --- a/unfinished/regexp2/parser/parser-tests.factor +++ b/unfinished/regexp/parser/parser-tests.factor @@ -1,13 +1,10 @@ -USING: kernel tools.test regexp2.backend regexp2 ; -IN: regexp2.parser +USING: kernel tools.test regexp.backend regexp ; +IN: regexp.parser : test-regexp ( string -- ) default-regexp parse-regexp ; -: test-regexp2 ( string -- regexp ) - default-regexp dup parse-regexp ; - -[ "(" ] [ unmatched-parentheses? ] must-fail-with +! [ "(" ] [ unmatched-parentheses? ] must-fail-with [ ] [ "a|b" test-regexp ] unit-test [ ] [ "a.b" test-regexp ] unit-test diff --git a/unfinished/regexp2/parser/parser.factor b/unfinished/regexp/parser/parser.factor similarity index 92% rename from unfinished/regexp2/parser/parser.factor rename to unfinished/regexp/parser/parser.factor index fb1bd08bfe..eaee70210e 100644 --- a/unfinished/regexp2/parser/parser.factor +++ b/unfinished/regexp/parser/parser.factor @@ -2,10 +2,10 @@ ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs combinators io io.streams.string kernel math math.parser multi-methods namespaces qualified sets -quotations sequences sequences.lib splitting symbols vectors -dlists math.order combinators.lib unicode.categories strings -sequences.lib regexp2.backend regexp2.utils unicode.case ; -IN: regexp2.parser +quotations sequences splitting symbols vectors math.order +unicode.categories strings regexp.backend regexp.utils +unicode.case ; +IN: regexp.parser FROM: math.ranges => [a,b] ; @@ -280,11 +280,26 @@ ERROR: bad-escaped-literals seq ; first|concatenation ] if-empty ; +ERROR: unrecognized-escape char ; + : parse-escaped ( -- obj ) read1 { { CHAR: \ [ CHAR: \ ] } + { CHAR: - [ CHAR: - ] } + { CHAR: { [ CHAR: { ] } + { CHAR: } [ CHAR: } ] } + { CHAR: [ [ CHAR: [ ] } + { CHAR: ] [ CHAR: ] ] } + { CHAR: ( [ CHAR: ( ] } + { CHAR: ) [ CHAR: ) ] } + { CHAR: @ [ CHAR: @ ] } + { CHAR: * [ CHAR: * ] } + { CHAR: + [ CHAR: + ] } + { CHAR: ? [ CHAR: ? ] } { CHAR: . [ CHAR: . ] } +! xyzzy + { CHAR: : [ CHAR: : ] } { CHAR: t [ CHAR: \t ] } { CHAR: n [ CHAR: \n ] } { CHAR: r [ CHAR: \r ] } @@ -314,8 +329,19 @@ ERROR: bad-escaped-literals seq ; ! { CHAR: G [ end of previous match ] } ! { CHAR: Z [ handle-end-of-input ] } ! { CHAR: z [ handle-end-of-input ] } ! except for terminator +! xyzzy + { CHAR: 1 [ CHAR: 1 ] } + { CHAR: 2 [ CHAR: 2 ] } + { CHAR: 3 [ CHAR: 3 ] } + { CHAR: 4 [ CHAR: 4 ] } + { CHAR: 5 [ CHAR: 5 ] } + { CHAR: 6 [ CHAR: 6 ] } + { CHAR: 7 [ CHAR: 7 ] } + { CHAR: 8 [ CHAR: 8 ] } + { CHAR: 9 [ CHAR: 9 ] } { CHAR: Q [ parse-escaped-literals ] } + [ unrecognized-escape ] } case ; : handle-escape ( -- ) parse-escaped push-stack ; diff --git a/unfinished/regexp2/regexp2-docs.factor b/unfinished/regexp/regexp-docs.factor similarity index 89% rename from unfinished/regexp2/regexp2-docs.factor rename to unfinished/regexp/regexp-docs.factor index f903c14bc4..f6a1fe1876 100644 --- a/unfinished/regexp2/regexp2-docs.factor +++ b/unfinished/regexp/regexp-docs.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: kernel strings help.markup help.syntax regexp2.backend ; -IN: regexp2 +USING: kernel strings help.markup help.syntax regexp.backend ; +IN: regexp HELP: { $values { "string" string } { "regexp" regexp } } diff --git a/unfinished/regexp2/regexp2-tests.factor b/unfinished/regexp/regexp-tests.factor similarity index 90% rename from unfinished/regexp2/regexp2-tests.factor rename to unfinished/regexp/regexp-tests.factor index e77a7a4419..78098952d3 100644 --- a/unfinished/regexp2/regexp2-tests.factor +++ b/unfinished/regexp/regexp-tests.factor @@ -1,6 +1,6 @@ -USING: regexp2 tools.test kernel sequences regexp2.parser -regexp2.traversal ; -IN: regexp2-tests +USING: regexp tools.test kernel sequences regexp.parser +regexp.traversal eval ; +IN: regexp-tests [ f ] [ "b" "a*" matches? ] unit-test [ t ] [ "" "a*" matches? ] unit-test @@ -224,6 +224,9 @@ IN: regexp2-tests [ f ] [ "a" "[a-z.-]@[a-z]" matches? ] unit-test [ t ] [ ".o" "\\.[a-z]" matches? ] unit-test +[ t ] [ "abc*" "[^\\*]*\\*" matches? ] unit-test +[ t ] [ "bca" "[^a]*a" matches? ] unit-test + [ ] [ "(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))" drop @@ -236,20 +239,20 @@ IN: regexp2-tests -[ "{Lower}" ] [ invalid-range? ] must-fail-with +! [ "{Lower}" ] [ invalid-range? ] must-fail-with -[ 1 ] [ "aaacb" "a+?" match-head ] unit-test -[ 1 ] [ "aaacb" "aa??" match-head ] unit-test -[ f ] [ "aaaab" "a++ab" matches? ] unit-test -[ t ] [ "aaacb" "a++cb" matches? ] unit-test -[ 3 ] [ "aacb" "aa?c" match-head ] unit-test -[ 3 ] [ "aacb" "aa??c" match-head ] unit-test +! [ 1 ] [ "aaacb" "a+?" match-head ] unit-test +! [ 1 ] [ "aaacb" "aa??" match-head ] unit-test +! [ f ] [ "aaaab" "a++ab" matches? ] unit-test +! [ t ] [ "aaacb" "a++cb" matches? ] unit-test +! [ 3 ] [ "aacb" "aa?c" match-head ] unit-test +! [ 3 ] [ "aacb" "aa??c" match-head ] unit-test -[ t ] [ "fxxbar" "(?!foo).{3}bar" matches? ] unit-test -[ f ] [ "foobar" "(?!foo).{3}bar" matches? ] unit-test +! [ t ] [ "fxxbar" "(?!foo).{3}bar" matches? ] unit-test +! [ f ] [ "foobar" "(?!foo).{3}bar" matches? ] unit-test -[ 3 ] [ "foobar" "foo(?=bar)" match-head ] unit-test -[ f ] [ "foobxr" "foo(?=bar)" match-head ] unit-test +! [ 3 ] [ "foobar" "foo(?=bar)" match-head ] unit-test +! [ f ] [ "foobxr" "foo(?=bar)" match-head ] unit-test ! [ f ] [ "foobxr" "foo\\z" match-head ] unit-test ! [ 3 ] [ "foo" "foo\\z" match-head ] unit-test @@ -268,6 +271,12 @@ IN: regexp2-tests ! [ t ] [ "fooxbar" "foo\\Bxbar" matches? ] unit-test ! [ f ] [ "foo" "foo\\Bbar" matches? ] unit-test +[ ] [ "USING: regexp kernel ; R' -{3}[+]{1,6}(?:!!)?\\s' drop" eval ] unit-test + +[ ] [ "USING: regexp kernel ; R' (ftp|http|https)://(\\w+:?\\w*@)?(\\S+)(:[0-9]+)?(/|/([\\w#!:.?+=&%@!\\-/]))?' drop" eval ] unit-test + +[ ] [ "USING: regexp kernel ; R' \\*[^\s*][^*]*\\*' drop" eval ] unit-test + ! Bug in parsing word ! [ t ] [ "a" R' a' matches? ] unit-test diff --git a/unfinished/regexp2/regexp2.factor b/unfinished/regexp/regexp.factor similarity index 67% rename from unfinished/regexp2/regexp2.factor rename to unfinished/regexp/regexp.factor index feec8ea97e..47c6e52c39 100644 --- a/unfinished/regexp2/regexp2.factor +++ b/unfinished/regexp/regexp.factor @@ -1,11 +1,11 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors combinators kernel math math.ranges -sequences regexp2.backend regexp2.utils memoize sets -regexp2.parser regexp2.nfa regexp2.dfa regexp2.traversal -regexp2.transition-tables assocs prettyprint.backend -make ; -IN: regexp2 +sequences regexp.backend regexp.utils memoize sets +regexp.parser regexp.nfa regexp.dfa regexp.traversal +regexp.transition-tables assocs prettyprint.backend +make lexer namespaces parser ; +IN: regexp : default-regexp ( string -- regexp ) regexp new @@ -51,17 +51,26 @@ IN: regexp2 reversed-regexp initial-option construct-regexp ; -: R! CHAR: ! ; parsing -: R" CHAR: " ; parsing -: R# CHAR: # ; parsing -: R' CHAR: ' ; parsing -: R( CHAR: ) ; parsing -: R/ CHAR: / ; parsing -: R@ CHAR: @ ; parsing -: R[ CHAR: ] ; parsing -: R` CHAR: ` ; parsing -: R{ CHAR: } ; parsing -: R| CHAR: | ; parsing + +: parsing-regexp ( accum end -- accum ) + lexer get dup skip-blank + [ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column + lexer get dup still-parsing-line? + [ (parse-token) ] [ drop f ] if + "i" = [ ] [ ] if parsed ; + +: R! CHAR: ! parsing-regexp ; parsing +: R" CHAR: " parsing-regexp ; parsing +: R# CHAR: # parsing-regexp ; parsing +: R' CHAR: ' parsing-regexp ; parsing +: R( CHAR: ) parsing-regexp ; parsing +: R/ CHAR: / parsing-regexp ; parsing +: R@ CHAR: @ parsing-regexp ; parsing +: R[ CHAR: ] parsing-regexp ; parsing +: R` CHAR: ` parsing-regexp ; parsing +: R{ CHAR: } parsing-regexp ; parsing +: R| CHAR: | parsing-regexp ; parsing + : find-regexp-syntax ( string -- prefix suffix ) { @@ -81,6 +90,8 @@ IN: regexp2 : option? ( option regexp -- ? ) options>> key? ; +USE: multiline +/* M: regexp pprint* [ [ @@ -89,3 +100,4 @@ M: regexp pprint* case-insensitive swap option? [ "i" % ] when ] "" make ] keep present-text ; +*/ diff --git a/unfinished/regexp2/summary.txt b/unfinished/regexp/summary.txt similarity index 100% rename from unfinished/regexp2/summary.txt rename to unfinished/regexp/summary.txt diff --git a/unfinished/regexp2/tags.txt b/unfinished/regexp/tags.txt similarity index 100% rename from unfinished/regexp2/tags.txt rename to unfinished/regexp/tags.txt diff --git a/unfinished/regexp2/transition-tables/transition-tables.factor b/unfinished/regexp/transition-tables/transition-tables.factor similarity index 95% rename from unfinished/regexp2/transition-tables/transition-tables.factor rename to unfinished/regexp/transition-tables/transition-tables.factor index c67985af4a..82e2db8496 100644 --- a/unfinished/regexp2/transition-tables/transition-tables.factor +++ b/unfinished/regexp/transition-tables/transition-tables.factor @@ -1,8 +1,8 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs fry hashtables kernel sequences -vectors regexp2.utils ; -IN: regexp2.transition-tables +vectors regexp.utils ; +IN: regexp.transition-tables TUPLE: transition from to obj ; TUPLE: literal-transition < transition ; diff --git a/unfinished/regexp2/traversal/traversal.factor b/unfinished/regexp/traversal/traversal.factor similarity index 86% rename from unfinished/regexp2/traversal/traversal.factor rename to unfinished/regexp/traversal/traversal.factor index ba9284c110..752323de91 100644 --- a/unfinished/regexp2/traversal/traversal.factor +++ b/unfinished/regexp/traversal/traversal.factor @@ -1,10 +1,9 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors assocs combinators combinators.lib kernel -math math.ranges quotations sequences regexp2.parser -regexp2.classes combinators.short-circuit assocs.lib -sequences.lib regexp2.utils ; -IN: regexp2.traversal +USING: accessors assocs combinators kernel math math.ranges +quotations sequences regexp.parser regexp.classes +combinators.short-circuit regexp.utils ; +IN: regexp.traversal TUPLE: dfa-traverser dfa-table @@ -54,7 +53,7 @@ TUPLE: dfa-traverser V{ } clone >>matches ; : match-literal ( transition from-state table -- to-state/f ) - transitions>> [ at ] [ 2drop f ] if-at ; + transitions>> at* [ at ] [ 2drop f ] if ; : match-class ( transition from-state table -- to-state/f ) transitions>> at* [ @@ -62,8 +61,8 @@ TUPLE: dfa-traverser ] [ drop ] if ; : match-default ( transition from-state table -- to-state/f ) - [ nip ] dip transitions>> - [ t swap [ drop f ] unless-at ] [ drop f ] if-at ; + [ nip ] dip transitions>> at* + [ t swap at* [ ] [ drop f ] if ] [ drop f ] if ; : match-transition ( obj from-state dfa -- to-state/f ) { [ match-literal ] [ match-class ] [ match-default ] } 3|| ; diff --git a/unfinished/regexp2/utils/utils.factor b/unfinished/regexp/utils/utils.factor similarity index 91% rename from unfinished/regexp2/utils/utils.factor rename to unfinished/regexp/utils/utils.factor index ab51436f8b..fb058ecf92 100644 --- a/unfinished/regexp2/utils/utils.factor +++ b/unfinished/regexp/utils/utils.factor @@ -1,10 +1,9 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors arrays assocs combinators.lib io kernel -math math.order namespaces regexp2.backend sequences -sequences.lib unicode.categories math.ranges fry -combinators.short-circuit vectors ; -IN: regexp2.utils +USING: accessors arrays assocs io kernel math math.order +namespaces regexp.backend sequences unicode.categories +math.ranges fry combinators.short-circuit vectors ; +IN: regexp.utils : (while-changes) ( obj quot pred pred-ret -- obj ) ! quot: ( obj -- obj' )