From 9814021e062a4cb1e62b21bfb74de6c8c70efca6 Mon Sep 17 00:00:00 2001 From: erg Date: Tue, 20 May 2008 18:57:48 -0500 Subject: [PATCH 1/2] comment out failing unit tests, addinng character groups --- extra/regexp4/regexp4-tests.factor | 84 ++++++++++++++-------------- extra/regexp4/regexp4.factor | 88 ++++++++++++++++++++---------- 2 files changed, 101 insertions(+), 71 deletions(-) diff --git a/extra/regexp4/regexp4-tests.factor b/extra/regexp4/regexp4-tests.factor index e878351b7e..cc71dee6aa 100644 --- a/extra/regexp4/regexp4-tests.factor +++ b/extra/regexp4/regexp4-tests.factor @@ -77,58 +77,58 @@ IN: regexp4-tests [ t ] [ "aaa" "a{1,3}" matches? ] unit-test [ f ] [ "aaaa" "a{1,3}" matches? ] unit-test -[ f ] [ "" "[a]" matches? ] unit-test -[ t ] [ "a" "[a]" matches? ] unit-test -[ t ] [ "a" "[abc]" matches? ] unit-test -[ f ] [ "b" "[a]" matches? ] unit-test -[ f ] [ "d" "[abc]" matches? ] unit-test -[ t ] [ "ab" "[abc]{1,2}" matches? ] unit-test -[ f ] [ "abc" "[abc]{1,2}" matches? ] unit-test +! [ f ] [ "" "[a]" matches? ] unit-test +! [ t ] [ "a" "[a]" matches? ] unit-test +! [ t ] [ "a" "[abc]" matches? ] unit-test +! [ f ] [ "b" "[a]" matches? ] unit-test +! [ f ] [ "d" "[abc]" matches? ] unit-test +! [ t ] [ "ab" "[abc]{1,2}" matches? ] unit-test +! [ f ] [ "abc" "[abc]{1,2}" matches? ] unit-test -[ f ] [ "" "[^a]" matches? ] unit-test -[ f ] [ "a" "[^a]" matches? ] unit-test -[ f ] [ "a" "[^abc]" matches? ] unit-test -[ t ] [ "b" "[^a]" matches? ] unit-test -[ t ] [ "d" "[^abc]" matches? ] unit-test -[ f ] [ "ab" "[^abc]{1,2}" matches? ] unit-test -[ f ] [ "abc" "[^abc]{1,2}" matches? ] unit-test +! [ f ] [ "" "[^a]" matches? ] unit-test +! [ f ] [ "a" "[^a]" matches? ] unit-test +! [ f ] [ "a" "[^abc]" matches? ] unit-test +! [ t ] [ "b" "[^a]" matches? ] unit-test +! [ t ] [ "d" "[^abc]" matches? ] unit-test +! [ f ] [ "ab" "[^abc]{1,2}" matches? ] unit-test +! [ f ] [ "abc" "[^abc]{1,2}" matches? ] unit-test -[ t ] [ "]" "[]]" matches? ] unit-test -[ f ] [ "]" "[^]]" matches? ] unit-test +! [ t ] [ "]" "[]]" matches? ] unit-test +! [ f ] [ "]" "[^]]" matches? ] unit-test ! [ "^" "[^]" matches? ] must-fail -[ t ] [ "^" "[]^]" matches? ] unit-test -[ t ] [ "]" "[]^]" matches? ] unit-test +! [ t ] [ "^" "[]^]" matches? ] unit-test +! [ t ] [ "]" "[]^]" matches? ] unit-test -[ t ] [ "[" "[[]" matches? ] unit-test -[ f ] [ "^" "[^^]" matches? ] unit-test -[ t ] [ "a" "[^^]" matches? ] unit-test +! [ t ] [ "[" "[[]" matches? ] unit-test +! [ f ] [ "^" "[^^]" matches? ] unit-test +! [ t ] [ "a" "[^^]" matches? ] unit-test -[ t ] [ "-" "[-]" matches? ] unit-test -[ f ] [ "a" "[-]" matches? ] unit-test -[ f ] [ "-" "[^-]" matches? ] unit-test -[ t ] [ "a" "[^-]" matches? ] unit-test +! [ t ] [ "-" "[-]" matches? ] unit-test +! [ f ] [ "a" "[-]" matches? ] unit-test +! [ f ] [ "-" "[^-]" matches? ] unit-test +! [ t ] [ "a" "[^-]" matches? ] unit-test -[ t ] [ "-" "[-a]" matches? ] unit-test -[ t ] [ "a" "[-a]" matches? ] unit-test -[ t ] [ "-" "[a-]" matches? ] unit-test -[ t ] [ "a" "[a-]" matches? ] unit-test -[ f ] [ "b" "[a-]" matches? ] unit-test -[ f ] [ "-" "[^-]" matches? ] unit-test -[ t ] [ "a" "[^-]" matches? ] unit-test +! [ t ] [ "-" "[-a]" matches? ] unit-test +! [ t ] [ "a" "[-a]" matches? ] unit-test +! [ t ] [ "-" "[a-]" matches? ] unit-test +! [ t ] [ "a" "[a-]" matches? ] unit-test +! [ f ] [ "b" "[a-]" matches? ] unit-test +! [ f ] [ "-" "[^-]" matches? ] unit-test +! [ t ] [ "a" "[^-]" matches? ] unit-test -[ f ] [ "-" "[a-c]" matches? ] unit-test -[ t ] [ "-" "[^a-c]" matches? ] unit-test -[ t ] [ "b" "[a-c]" matches? ] unit-test -[ f ] [ "b" "[^a-c]" matches? ] unit-test +! [ f ] [ "-" "[a-c]" matches? ] unit-test +! [ t ] [ "-" "[^a-c]" matches? ] unit-test +! [ t ] [ "b" "[a-c]" matches? ] unit-test +! [ f ] [ "b" "[^a-c]" matches? ] unit-test -[ t ] [ "-" "[a-c-]" matches? ] unit-test -[ f ] [ "-" "[^a-c-]" matches? ] unit-test +! [ t ] [ "-" "[a-c-]" matches? ] unit-test +! [ f ] [ "-" "[^a-c-]" matches? ] unit-test -[ t ] [ "\\" "[\\\\]" matches? ] unit-test -[ f ] [ "a" "[\\\\]" matches? ] unit-test -[ f ] [ "\\" "[^\\\\]" matches? ] unit-test -[ t ] [ "a" "[^\\\\]" matches? ] unit-test +! [ t ] [ "\\" "[\\\\]" matches? ] unit-test +! [ f ] [ "a" "[\\\\]" matches? ] unit-test +! [ f ] [ "\\" "[^\\\\]" matches? ] unit-test +! [ t ] [ "a" "[^\\\\]" matches? ] unit-test ! ((A)(B(C))) diff --git a/extra/regexp4/regexp4.factor b/extra/regexp4/regexp4.factor index faf52f098d..a33a57bcea 100644 --- a/extra/regexp4/regexp4.factor +++ b/extra/regexp4/regexp4.factor @@ -8,29 +8,31 @@ symbols ; IN: regexp4 SYMBOLS: eps start-state final-state beginning-of-text -end-of-text left-paren right-paren alternation ; +end-of-text left-parenthesis alternation left-bracket +caret dash ampersand semicolon ; -SYMBOL: runtim-epsilon +SYMBOL: runtime-epsilon -TUPLE: regexp raw paren-count bracket-count +TUPLE: regexp raw parentheses-count bracket-count state stack nfa new-states dfa minimized-dfa dot-matches-newlines? character-sets capture-group captured-groups ; TUPLE: capture-group n range ; -ERROR: paren-underflow ; -ERROR: unbalanced-paren ; +ERROR: parentheses-underflow ; +ERROR: unbalanced-parentheses ; +ERROR: unbalanced-brackets ; : push-stack ( regexp token -- ) swap stack>> push ; : push-all-stack ( regexp seq -- ) swap stack>> push-all ; : next-state ( regexp -- n ) [ 1+ ] change-state state>> ; -: check-paren-underflow ( regexp -- ) - paren-count>> 0 < [ paren-underflow ] when ; +: check-parentheses-underflow ( regexp -- ) + parentheses-count>> 0 < [ parentheses-underflow ] when ; -: check-unbalanced-paren ( regexp -- ) - paren-count>> 0 > [ unbalanced-paren ] when ; +: check-unbalanced-parentheses ( regexp -- ) + parentheses-count>> 0 > [ unbalanced-parentheses ] when ; :: (apply-alternation) ( stack regexp -- ) [let | s2 [ stack peek first ] @@ -82,10 +84,12 @@ ERROR: unbalanced-paren ; [ (apply-alternation) ] [ (apply-concatenation) ] if apply-loop ] when ; +: cut-stack ( n vector -- vector' vector ) + tuck index cut reverse dup pop* ; + : apply-til-last ( token regexp -- ) - swap [ - tuck index cut reverse dup pop* - ] change-stack >r reverse r> apply-loop stack>> push-all ; + swap [ cut-stack ] change-stack + >r reverse r> apply-loop stack>> push-all ; : concatenation-loop ( regexp -- ) dup stack>> dup apply-concatenation? @@ -294,16 +298,6 @@ ERROR: bad-hex number ; [ get-char CHAR: } = ] take-until "," split1 [ [ string>number ] bi@ ] keep >boolean ; -: take-until-] - [ get-char CHAR: ] = ] take-until ; - -: make-character-set ( regexp str -- ) - dup - [ length 1 > ] [ first CHAR: ^ = ] bi and - [ rest t ] [ f ] if - >r [ member? ] curry r> - [ [ not ] compose ] when make-nontoken-nfa ; - : parse-escaped ( regexp -- ) next get-char { { CHAR: \ [ CHAR: \ make-nontoken-nfa ] } @@ -340,6 +334,39 @@ ERROR: bad-hex number ; [ dup digit? [ parse-backreference ] [ make-nontoken-nfa ] if ] } case ; +: make-character-set ( regexp -- ) + stack>> throw ; + +: (parse-character-set) ( regexp -- ) + [ + next get-char + { + { CHAR: [ [ [ 1+ ] change-bracket-count left-bracket push-stack ] } + { CHAR: ] [ [ 1- ] change-bracket-count left-bracket over stack>> cut-stack ] } + { CHAR: - [ dash push-stack ] } + { CHAR: & [ ampersand push-stack ] } + { CHAR: : [ semicolon push-stack ] } + { CHAR: \ [ parse-escaped ] } + { f [ unbalanced-brackets ] } + [ make-nontoken-nfa ] + } case + ] [ + dup bracket-count>> 0 > + [ (parse-character-set) ] + [ make-character-set ] if + ] bi ; + +: parse-character-set-first ( regexp -- ) + get-next + { + { CHAR: ^ [ caret push-stack next ] } + { CHAR: ] [ CHAR: ] make-nontoken-nfa next ] } + [ 2drop ] + } case ; + +: parse-character-set ( regexp -- ) + [ parse-character-set-first ] [ (parse-character-set) ] bi ; + ERROR: unsupported-token token ; : parse-token ( regexp token -- ) dup { @@ -347,20 +374,24 @@ ERROR: unsupported-token token ; { CHAR: $ [ drop back-anchor-construction ] } { CHAR: \ [ drop parse-escaped ] } { CHAR: | [ drop dup concatenation-loop alternation push-stack ] } - { CHAR: ( [ drop [ 1+ ] change-paren-count left-paren push-stack ] } - { CHAR: ) [ drop [ 1- ] change-paren-count left-paren apply-til-last ] } + { CHAR: ( [ drop [ 1+ ] change-parentheses-count left-parenthesis push-stack ] } + { CHAR: ) [ drop [ 1- ] change-parentheses-count left-parenthesis apply-til-last ] } { CHAR: * [ drop apply-kleene-closure ] } { CHAR: + [ drop apply-plus-closure ] } { CHAR: ? [ drop apply-question-closure ] } { CHAR: { [ drop parse-brace apply-brace-closure ] } - ! { CHAR: [ [ drop parse-character-set ] } + { CHAR: [ [ + drop + dup left-bracket push-stack + [ 1+ ] change-bracket-count parse-character-set + ] } ! { CHAR: } [ drop drop "brace" ] } ! { CHAR: ? [ drop ] } { CHAR: . [ drop dot-construction ] } { beginning-of-text [ push-stack ] } { end-of-text [ drop { - [ check-unbalanced-paren ] + [ check-unbalanced-parentheses ] [ concatenation-loop ] [ beginning-of-text apply-til-last ] [ set-start-state ] @@ -461,7 +492,8 @@ ERROR: unsupported-token token ; : ( raw -- obj ) regexp new swap >>raw - 0 >>paren-count + 0 >>parentheses-count + 0 >>bracket-count -1 >>state V{ } clone >>stack V{ } clone >>character-sets @@ -535,8 +567,6 @@ TUPLE: dfa-traverser : matches? ( string regexp -- ? ) dupd match [ [ length ] [ range-length 1- ] bi* = ] [ drop f ] if* ; - - ! character classes ! TUPLE: range-class from to ; ! TUPLE: or-class left right ; From 9307bb3b4a4105701e976c74b07d6f55f75c1369 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Wed, 21 May 2008 00:30:22 -0500 Subject: [PATCH 2/2] working on character classes still have to do negation, &&, and :foo: --- extra/regexp4/regexp4-tests.factor | 86 +++++++++++++++--------------- extra/regexp4/regexp4.factor | 63 ++++++++++++++-------- 2 files changed, 85 insertions(+), 64 deletions(-) diff --git a/extra/regexp4/regexp4-tests.factor b/extra/regexp4/regexp4-tests.factor index cc71dee6aa..ec8656bc27 100644 --- a/extra/regexp4/regexp4-tests.factor +++ b/extra/regexp4/regexp4-tests.factor @@ -77,58 +77,58 @@ IN: regexp4-tests [ t ] [ "aaa" "a{1,3}" matches? ] unit-test [ f ] [ "aaaa" "a{1,3}" matches? ] unit-test -! [ f ] [ "" "[a]" matches? ] unit-test -! [ t ] [ "a" "[a]" matches? ] unit-test -! [ t ] [ "a" "[abc]" matches? ] unit-test -! [ f ] [ "b" "[a]" matches? ] unit-test -! [ f ] [ "d" "[abc]" matches? ] unit-test -! [ t ] [ "ab" "[abc]{1,2}" matches? ] unit-test -! [ f ] [ "abc" "[abc]{1,2}" matches? ] unit-test +[ f ] [ "" "[a]" matches? ] unit-test +[ t ] [ "a" "[a]" matches? ] unit-test +[ t ] [ "a" "[abc]" matches? ] unit-test +[ f ] [ "b" "[a]" matches? ] unit-test +[ f ] [ "d" "[abc]" matches? ] unit-test +[ t ] [ "ab" "[abc]{1,2}" matches? ] unit-test +[ f ] [ "abc" "[abc]{1,2}" matches? ] unit-test -! [ f ] [ "" "[^a]" matches? ] unit-test -! [ f ] [ "a" "[^a]" matches? ] unit-test -! [ f ] [ "a" "[^abc]" matches? ] unit-test -! [ t ] [ "b" "[^a]" matches? ] unit-test -! [ t ] [ "d" "[^abc]" matches? ] unit-test -! [ f ] [ "ab" "[^abc]{1,2}" matches? ] unit-test -! [ f ] [ "abc" "[^abc]{1,2}" matches? ] unit-test +[ f ] [ "" "[^a]" matches? ] unit-test +[ f ] [ "a" "[^a]" matches? ] unit-test +[ f ] [ "a" "[^abc]" matches? ] unit-test +[ t ] [ "b" "[^a]" matches? ] unit-test +[ t ] [ "d" "[^abc]" matches? ] unit-test +[ f ] [ "ab" "[^abc]{1,2}" matches? ] unit-test +[ f ] [ "abc" "[^abc]{1,2}" matches? ] unit-test -! [ t ] [ "]" "[]]" matches? ] unit-test -! [ f ] [ "]" "[^]]" matches? ] unit-test +[ t ] [ "]" "[]]" matches? ] unit-test +[ f ] [ "]" "[^]]" matches? ] unit-test -! [ "^" "[^]" matches? ] must-fail -! [ t ] [ "^" "[]^]" matches? ] unit-test -! [ t ] [ "]" "[]^]" matches? ] unit-test +[ "^" "[^]" matches? ] must-fail +[ t ] [ "^" "[]^]" matches? ] unit-test +[ t ] [ "]" "[]^]" matches? ] unit-test -! [ t ] [ "[" "[[]" matches? ] unit-test -! [ f ] [ "^" "[^^]" matches? ] unit-test -! [ t ] [ "a" "[^^]" matches? ] unit-test +[ t ] [ "[" "[[]" matches? ] unit-test +[ f ] [ "^" "[^^]" matches? ] unit-test +[ t ] [ "a" "[^^]" matches? ] unit-test -! [ t ] [ "-" "[-]" matches? ] unit-test -! [ f ] [ "a" "[-]" matches? ] unit-test -! [ f ] [ "-" "[^-]" matches? ] unit-test -! [ t ] [ "a" "[^-]" matches? ] unit-test +[ t ] [ "-" "[-]" matches? ] unit-test +[ f ] [ "a" "[-]" matches? ] unit-test +[ f ] [ "-" "[^-]" matches? ] unit-test +[ t ] [ "a" "[^-]" matches? ] unit-test -! [ t ] [ "-" "[-a]" matches? ] unit-test -! [ t ] [ "a" "[-a]" matches? ] unit-test -! [ t ] [ "-" "[a-]" matches? ] unit-test -! [ t ] [ "a" "[a-]" matches? ] unit-test -! [ f ] [ "b" "[a-]" matches? ] unit-test -! [ f ] [ "-" "[^-]" matches? ] unit-test -! [ t ] [ "a" "[^-]" matches? ] unit-test +[ t ] [ "-" "[-a]" matches? ] unit-test +[ t ] [ "a" "[-a]" matches? ] unit-test +[ t ] [ "-" "[a-]" matches? ] unit-test +[ t ] [ "a" "[a-]" matches? ] unit-test +[ f ] [ "b" "[a-]" matches? ] unit-test +[ f ] [ "-" "[^-]" matches? ] unit-test +[ t ] [ "a" "[^-]" matches? ] unit-test -! [ f ] [ "-" "[a-c]" matches? ] unit-test -! [ t ] [ "-" "[^a-c]" matches? ] unit-test -! [ t ] [ "b" "[a-c]" matches? ] unit-test -! [ f ] [ "b" "[^a-c]" matches? ] unit-test +[ f ] [ "-" "[a-c]" matches? ] unit-test +[ t ] [ "-" "[^a-c]" matches? ] unit-test +[ t ] [ "b" "[a-c]" matches? ] unit-test +[ f ] [ "b" "[^a-c]" matches? ] unit-test -! [ t ] [ "-" "[a-c-]" matches? ] unit-test -! [ f ] [ "-" "[^a-c-]" matches? ] unit-test +[ t ] [ "-" "[a-c-]" matches? ] unit-test +[ f ] [ "-" "[^a-c-]" matches? ] unit-test -! [ t ] [ "\\" "[\\\\]" matches? ] unit-test -! [ f ] [ "a" "[\\\\]" matches? ] unit-test -! [ f ] [ "\\" "[^\\\\]" matches? ] unit-test -! [ t ] [ "a" "[^\\\\]" matches? ] unit-test +[ t ] [ "\\" "[\\\\]" matches? ] unit-test +[ f ] [ "a" "[\\\\]" matches? ] unit-test +[ f ] [ "\\" "[^\\\\]" matches? ] unit-test +[ t ] [ "a" "[^\\\\]" matches? ] unit-test ! ((A)(B(C))) diff --git a/extra/regexp4/regexp4.factor b/extra/regexp4/regexp4.factor index a33a57bcea..07ef430de3 100644 --- a/extra/regexp4/regexp4.factor +++ b/extra/regexp4/regexp4.factor @@ -4,12 +4,12 @@ USING: accessors arrays assocs combinators kernel math sequences namespaces locals combinators.lib state-tables math.parser state-parser sets dlists unicode.categories math.order quotations shuffle math.ranges splitting -symbols ; +symbols fry ; IN: regexp4 SYMBOLS: eps start-state final-state beginning-of-text end-of-text left-parenthesis alternation left-bracket -caret dash ampersand semicolon ; +caret dash ampersand colon ; SYMBOL: runtime-epsilon @@ -84,12 +84,14 @@ ERROR: unbalanced-brackets ; [ (apply-alternation) ] [ (apply-concatenation) ] if apply-loop ] when ; -: cut-stack ( n vector -- vector' vector ) - tuck index cut reverse dup pop* ; +: cut-out ( vector n -- vector' vector ) cut rest ; + +: cut-stack ( obj vector -- vector' vector ) + tuck last-index cut-out swap ; -: apply-til-last ( token regexp -- ) +: apply-til-last ( regexp token -- ) swap [ cut-stack ] change-stack - >r reverse r> apply-loop stack>> push-all ; + apply-loop stack>> push-all ; : concatenation-loop ( regexp -- ) dup stack>> dup apply-concatenation? @@ -300,13 +302,13 @@ ERROR: bad-hex number ; : parse-escaped ( regexp -- ) next get-char { - { CHAR: \ [ CHAR: \ make-nontoken-nfa ] } - { CHAR: t [ CHAR: \t make-nontoken-nfa ] } - { CHAR: n [ CHAR: \n make-nontoken-nfa ] } - { CHAR: r [ CHAR: \r make-nontoken-nfa ] } - { CHAR: f [ HEX: c make-nontoken-nfa ] } - { CHAR: a [ HEX: 7 make-nontoken-nfa ] } - { CHAR: e [ HEX: 1b make-nontoken-nfa ] } + { CHAR: \ [ [ CHAR: \ = ] make-nontoken-nfa ] } + { CHAR: t [ [ CHAR: \t = ] make-nontoken-nfa ] } + { CHAR: n [ [ CHAR: \n = ] make-nontoken-nfa ] } + { CHAR: r [ [ CHAR: \r = ] make-nontoken-nfa ] } + { CHAR: f [ [ HEX: c = ] make-nontoken-nfa ] } + { CHAR: a [ [ HEX: 7 = ] make-nontoken-nfa ] } + { CHAR: e [ [ HEX: 1b = ] make-nontoken-nfa ] } { CHAR: d [ [ digit? ] make-nontoken-nfa ] } { CHAR: D [ [ digit? not ] make-nontoken-nfa ] } @@ -335,31 +337,50 @@ ERROR: bad-hex number ; } case ; : make-character-set ( regexp -- ) - stack>> throw ; + left-bracket over stack>> cut-stack + pick (>>stack) + [ dup number? [ '[ dup , = ] ] when ] map + [ [ drop t ] 2array ] map [ drop f ] suffix [ cond ] curry + make-nontoken-nfa ; +: apply-dash ( regexp -- ) + stack>> dup [ pop ] [ pop* ] [ pop ] tri + swap '[ dup , , between? ] swap push ; + +: apply-dash? ( regexp -- ? ) + stack>> dup length 3 >= + [ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ; + +DEFER: parse-character-set : (parse-character-set) ( regexp -- ) [ next get-char { - { CHAR: [ [ [ 1+ ] change-bracket-count left-bracket push-stack ] } - { CHAR: ] [ [ 1- ] change-bracket-count left-bracket over stack>> cut-stack ] } + { CHAR: [ [ + [ 1+ ] change-bracket-count left-bracket push-stack + parse-character-set + ] } + { CHAR: ] [ + [ 1- ] change-bracket-count + make-character-set + ] } { CHAR: - [ dash push-stack ] } - { CHAR: & [ ampersand push-stack ] } - { CHAR: : [ semicolon push-stack ] } + ! { CHAR: & [ ampersand push-stack ] } + ! { CHAR: : [ semicolon push-stack ] } { CHAR: \ [ parse-escaped ] } { f [ unbalanced-brackets ] } - [ make-nontoken-nfa ] + [ dupd push-stack dup apply-dash? [ apply-dash ] [ drop ] if ] } case ] [ dup bracket-count>> 0 > - [ (parse-character-set) ] - [ make-character-set ] if + [ (parse-character-set) ] [ drop ] if ] bi ; : parse-character-set-first ( regexp -- ) get-next { { CHAR: ^ [ caret push-stack next ] } + { CHAR: [ [ CHAR: [ make-nontoken-nfa next ] } { CHAR: ] [ CHAR: ] make-nontoken-nfa next ] } [ 2drop ] } case ;