Switching basis/globs to regexps (and EBNF for syntax); this exposes a bug in regexp

2009-03-02 15:31:28 -06:00 · 2009-03-02 15:31:28 -06:00 · 0b5ebce339
parent 03ae348e78
commit 0b5ebce339
7 changed files with 44 additions and 30 deletions
--- a/basis/globs/globs-tests.factor
+++ b/basis/globs/globs-tests.factor
@@ -14,5 +14,6 @@ USING: tools.test globs ;
 [ f ] [ "foo.java" "*.{xml,txt}" glob-matches? ] unit-test
 [ t ] [ "foo.txt" "*.{xml,txt}" glob-matches? ] unit-test
 [ t ] [ "foo.xml" "*.{xml,txt}" glob-matches? ] unit-test
-[ f ] [ "foo." "*.{,xml,txt}" glob-matches? ] unit-test
+[ f ] [ "foo." "*.{xml,txt}" glob-matches? ] unit-test
+[ t ] [ "foo." "*.{,xml,txt}" glob-matches? ] unit-test
 [ t ] [ "foo.{" "*.{" glob-matches? ] unit-test
--- a/basis/globs/globs.factor
+++ b/basis/globs/globs.factor
@@ -1,42 +1,42 @@
-! Copyright (C) 2007 Slava Pestov.
+! Copyright (C) 2007, 2009 Slava Pestov, Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: parser-combinators parser-combinators.regexp lists sequences kernel
-promises strings unicode.case ;
+USING: sequences kernel regexp.combinators regexp.matchers strings unicode.case
+peg.ebnf regexp arrays ;
 IN: globs

-<PRIVATE
+EBNF: <glob>

-: 'char' ( -- parser )
-    [ ",*?" member? not ] satisfy ;
+Character = "\\" .:c => [[ c 1string <literal> ]]
+          | !(","|"}") . => [[ 1string <literal> ]]

-: 'string' ( -- parser )
-    'char' <+> [ >lower token ] <@ ;
+RangeCharacter = !("]") .

-: 'escaped-char' ( -- parser )
-    "\\" token any-char-parser &> [ 1token ] <@ ;
+Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <char-range> ]]
+      | RangeCharacter => [[ 1string <literal> ]]

-: 'escaped-string' ( -- parser )
-    'string' 'escaped-char' <|> ;
+StartRange = .:a "-" RangeCharacter:b => [[ a b <char-range> ]]
+           | . => [[ 1string <literal> ]]

-DEFER: 'term'
+Ranges = StartRange:s Range*:r => [[ r s prefix ]]

-: 'glob' ( -- parser )
-    'term' <*> [ <and-parser> ] <@ ;
+CharClass = "^"?:n Ranges:e => [[ e <or> n [ <not> ] when ]]

-: 'union' ( -- parser )
-    'glob' "," token nonempty-list-of "{" "}" surrounded-by
-    [ <or-parser> ] <@ ;
+AlternationBody = Concatenation:c "," AlternationBody:a => [[ a c prefix ]]
+                | Concatenation => [[ 1array ]]

-LAZY: 'term' ( -- parser )
-    'union'
-    'character-class' <|>
-    "?" token [ drop any-char-parser ] <@ <|>
-    "*" token [ drop any-char-parser <*> ] <@ <|>
-    'escaped-string' <|> ;
+Element = "*" => [[ R/ .*/ ]]
+        | "?" => [[ R/ ./ ]]
+        | "[" CharClass:c "]" => [[ c ]]
+        | "{" AlternationBody:b "}" => [[ b <or> ]]
+        | Character

-PRIVATE>
+Concatenation = Element* => [[ <sequence> ]]

-: <glob> ( string -- glob ) 'glob' just parse-1 just ;
+End = !(.)
+
+Main = Concatenation End
+
+;EBNF

 : glob-matches? ( input glob -- ? )
-    [ >lower ] [ <glob> ] bi* parse nil? not ;
+    [ >case-fold ] bi@ <glob> matches? ;
--- a/basis/regexp/combinators/combinators-tests.factor
+++ b/basis/regexp/combinators/combinators-tests.factor
@@ -1,6 +1,6 @@
 ! Copyright (C) 2009 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
-USING: regexp.combinators tools.test regexp kernel sequences ;
+USING: regexp.combinators tools.test regexp kernel sequences regexp.matchers ;
 IN: regexp.combinators.tests

 : strings ( -- regexp )
--- a/basis/regexp/combinators/combinators.factor
+++ b/basis/regexp/combinators/combinators.factor
@@ -1,7 +1,7 @@
 ! Copyright (C) 2009 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
 USING: regexp sequences kernel regexp.negation regexp.ast
-accessors fry ;
+accessors fry regexp.classes ;
 IN: regexp.combinators

 <PRIVATE
@@ -18,6 +18,11 @@ CONSTANT: <nothing> R/ (?~.*)/
 : <literal> ( string -- regexp )
    [ "\\Q" "\\E" surround ] [ <concatenation> ] bi make-regexp ; foldable

+: <char-range> ( char1 char2 -- regexp )
+    [ [ "[" "-" surround ] [ "]" append ] bi* append ]
+    [ <range> ]
+    2bi make-regexp ;
+
 : <or> ( regexps -- disjunction )
    [ [ raw>> "(" ")" surround ] map "|" join ]
    [ [ parse-tree>> ] map <alternation> ] bi
--- a/basis/regexp/matchers/matchers.factor
+++ b/basis/regexp/matchers/matchers.factor
@@ -32,9 +32,13 @@ GENERIC: match-index-from ( i string matcher -- index/f )
 : match-head ( str matcher -- slice/f )
    [ 0 ] 2dip match-from ;

+<PRIVATE
+
 : next-match ( i string matcher -- i match/f )
    match-from [ dup [ to>> ] when ] keep ;

+PRIVATE>
+
 :: all-matches ( string matcher -- seq )
    0 [ dup ] [ string matcher next-match ] [ ] produce nip but-last ;

--- a/basis/regexp/minimize/minimize.factor
+++ b/basis/regexp/minimize/minimize.factor
@@ -19,6 +19,7 @@ IN: regexp.minimize

 : rewrite-transitions ( transition-table assoc quot -- transition-table )
    [
+        [ clone ] dip
        [ '[ _ at ] change-start-state ]
        [ '[ [ _ at ] map-set ] change-final-states ]
        [ ] tri
--- a/basis/regexp/regexp-tests.factor
+++ b/basis/regexp/regexp-tests.factor
@@ -342,6 +342,9 @@ IN: regexp-tests
 [ f ] [ "πc" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
 [ f ] [ "Ab" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test

+[ t ] [ "aaaa" R/ .*a./ matches? ] unit-test
+
+! DFA is compiled when needed, or when literal
 [ f ] [ "foo" <regexp> dfa>> >boolean ] unit-test
 [ t ] [ R/ foo/ dfa>> >boolean ] unit-test