From e2fda2e227e29f4d99497c97ea9edb47f4cf695e Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Tue, 10 Mar 2009 19:34:49 -0500 Subject: [PATCH] Fixing help-lint for regexp; adding first-match and re-contains? --- basis/regexp/regexp-docs.factor | 27 ++++++++++++++++-------- basis/regexp/regexp-tests.factor | 36 ++++++++++++++++++-------------- basis/regexp/regexp.factor | 15 ++++++------- 3 files changed, 46 insertions(+), 32 deletions(-) diff --git a/basis/regexp/regexp-docs.factor b/basis/regexp/regexp-docs.factor index ce4a54df87..1d28e5e92f 100644 --- a/basis/regexp/regexp-docs.factor +++ b/basis/regexp/regexp-docs.factor @@ -39,13 +39,14 @@ ARTICLE: { "regexp" "theory" } "The theory of regular expressions" "The Factor regular expression engine was built with the design decision to support negation and intersection at the expense of backreferences. This lets us have a guaranteed linear-time matching algorithm. Systems like Ragel and Lex also use this algorithm, but in the Factor regular expression engine, all other features of regexps are still present." ; ARTICLE: { "regexp" "operations" } "Matching operations with regular expressions" -{ $subsection all-matches } { $subsection matches? } +{ $subsection re-contains? } +{ $subsection first-match } +{ $subsection all-matches } { $subsection re-split1 } { $subsection re-split } { $subsection re-replace } -{ $subsection count-matches } -{ $subsection re-replace } ; +{ $subsection count-matches } ; HELP: { $values { "string" string } { "regexp" regexp } } @@ -63,25 +64,33 @@ HELP: regexp { $class-description "The class of regular expressions. To construct these, see " { $link { "regexp" "construction" } } "." } ; HELP: matches? -{ $values { "string" string } { "matcher" regexp } { "?" "a boolean" } } +{ $values { "string" string } { "regexp" regexp } { "?" "a boolean" } } { $description "Tests if the string as a whole matches the given regular expression." } ; HELP: re-split1 -{ $values { "string" string } { "matcher" regexp } { "before" string } { "after/f" string } } +{ $values { "string" string } { "regexp" regexp } { "before" string } { "after/f" string } } { $description "Searches the string for a substring which matches the pattern. If found, the input string is split on the leftmost and longest occurence of the match, and the two halves are given as output. If no match is found, then the input string and " { $link f } " are output." } ; HELP: all-matches -{ $values { "string" string } { "matcher" regexp } { "seq" "a sequence of slices of the input" } } +{ $values { "string" string } { "regexp" regexp } { "seq" "a sequence of slices of the input" } } { $description "Finds a sequence of disjoint substrings which each match the pattern. It chooses this by finding the leftmost longest match, and then the leftmost longest match which starts after the end of the previous match, and so on." } ; HELP: count-matches -{ $values { "string" string } { "matcher" regexp } { "n" integer } } +{ $values { "string" string } { "regexp" regexp } { "n" integer } } { $description "Counts how many disjoint matches the regexp has in the string, as made unambiguous by " { $link all-matches } "." } ; HELP: re-split -{ $values { "string" string } { "matcher" regexp } { "seq" "a sequence of slices of the input" } } +{ $values { "string" string } { "regexp" regexp } { "seq" "a sequence of slices of the input" } } { $description "Splits the input string into chunks separated by the regular expression. Each chunk contains no match of the regexp. The chunks are chosen by the strategy of " { $link all-matches } "." } ; HELP: re-replace -{ $values { "string" string } { "matcher" regexp } { "replacement" string } { "result" string } } +{ $values { "string" string } { "regexp" regexp } { "replacement" string } { "result" string } } { $description "Replaces substrings which match the input regexp with the given replacement text. The boundaries of the substring are chosen by the strategy used by " { $link all-matches } "." } ; + +HELP: first-match +{ $values { "string" string } { "regexp" regexp } { "slice/f" "the match, if one exists" } } +{ $description "Finds the first match of the regular expression in the string, and returns it as a slice. If there is no match, then " { $link f } " is returned." } ; + +HELP: re-contains? +{ $values { "string" string } { "regexp" regexp } { "?" "a boolean" } } +{ $description "Determines whether the string has a substring which matches the regular expression given." } ; diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index f7d3dae3f3..f05416ab94 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -211,8 +211,8 @@ IN: regexp-tests [ f ] [ "aaaxb" "a+ab" matches? ] unit-test [ t ] [ "aaacb" "a+cb" matches? ] unit-test -[ "aaa" ] [ "aaacb" "a*" match-head >string ] unit-test -[ "aa" ] [ "aaacb" "aa?" match-head >string ] unit-test +[ "aaa" ] [ "aaacb" "a*" first-match >string ] unit-test +[ "aa" ] [ "aaacb" "aa?" first-match >string ] unit-test [ t ] [ "aaa" R/ AAA/i matches? ] unit-test [ f ] [ "aax" R/ AAA/i matches? ] unit-test @@ -268,13 +268,13 @@ IN: regexp-tests [ ] [ "USING: regexp kernel ; R' \\*[^\s*][^*]*\\*' drop" eval ] unit-test -[ "ab" ] [ "ab" "(a|ab)(bc)?" match-head >string ] unit-test -[ "abc" ] [ "abc" "(a|ab)(bc)?" match-head >string ] unit-test +[ "ab" ] [ "ab" "(a|ab)(bc)?" first-match >string ] unit-test +[ "abc" ] [ "abc" "(a|ab)(bc)?" first-match >string ] unit-test -[ "ab" ] [ "ab" "(ab|a)(bc)?" match-head >string ] unit-test -[ "abc" ] [ "abc" "(ab|a)(bc)?" match-head >string ] unit-test +[ "ab" ] [ "ab" "(ab|a)(bc)?" first-match >string ] unit-test +[ "abc" ] [ "abc" "(ab|a)(bc)?" first-match >string ] unit-test -[ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" match-head >string ] unit-test +[ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" first-match >string ] unit-test [ { "1" "2" "3" "4" } ] [ "1ABC2DEF3GHI4" R/ [A-Z]+/ re-split [ >string ] map ] unit-test @@ -300,18 +300,18 @@ IN: regexp-tests [ "-- title --" ] [ "== title ==" R/ =/ "-" re-replace ] unit-test -[ "" ] [ "ab" "a(?!b)" match-head >string ] unit-test -[ "a" ] [ "ac" "a(?!b)" match-head >string ] unit-test +[ "" ] [ "ab" "a(?!b)" first-match >string ] unit-test +[ "a" ] [ "ac" "a(?!b)" first-match >string ] unit-test [ t ] [ "fxxbar" ".{3}(?!foo)bar" matches? ] unit-test [ t ] [ "foobar" ".{3}(?!foo)bar" matches? ] unit-test [ t ] [ "fxxbar" "(?!foo).{3}bar" matches? ] unit-test [ f ] [ "foobar" "(?!foo).{3}bar" matches? ] unit-test -[ "a" ] [ "ab" "a(?=b)(?=b)" match-head >string ] unit-test -[ "a" ] [ "ba" "(?<=b)(?<=b)a" match-head >string ] unit-test -[ "a" ] [ "cab" "(?<=c)a(?=b)" match-head >string ] unit-test +[ "a" ] [ "ab" "a(?=b)(?=b)" first-match >string ] unit-test +[ "a" ] [ "ba" "(?<=b)(?<=b)a" first-match >string ] unit-test +[ "a" ] [ "cab" "(?<=c)a(?=b)" first-match >string ] unit-test -[ 3 ] [ "foobar" "foo(?=bar)" match-head length ] unit-test -[ f ] [ "foobxr" "foo(?=bar)" match-head ] unit-test +[ 3 ] [ "foobar" "foo(?=bar)" first-match length ] unit-test +[ f ] [ "foobxr" "foo(?=bar)" first-match ] unit-test ! Bug in parsing word [ t ] [ "a" R' a' matches? ] unit-test @@ -424,8 +424,12 @@ IN: regexp-tests [ 1 ] [ "a\r" R/ a$/m count-matches ] unit-test [ 1 ] [ "a\r\n" R/ a$/m count-matches ] unit-test -[ f ] [ "foobxr" "foo\\z" match-head ] unit-test -[ 3 ] [ "foo" "foo\\z" match-head length ] unit-test +[ f ] [ "foobxr" "foo\\z" first-match ] unit-test +[ 3 ] [ "foo" "foo\\z" first-match length ] unit-test + +[ t ] [ "a foo b" R/ foo/ re-contains? ] unit-test +[ f ] [ "a bar b" R/ foo/ re-contains? ] unit-test +[ t ] [ "foo" R/ foo/ re-contains? ] unit-test ! [ t ] [ "foo" "\\bfoo\\b" matches? ] unit-test ! [ t ] [ "afoob" "\\Bfoo\\B" matches? ] unit-test diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index 94bbc2af58..90218e05bd 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -89,16 +89,17 @@ PRIVATE> slices [ from>> ] map string length suffix [ string ] 2map ; -: match-head ( str regexp -- slice/f ) - [ - [ 0 ] [ check-string ] [ dup dfa>> '[ _ _ execute ] ] tri* - match-from - ] call( str regexp -- slice/f ) ; - PRIVATE> +: first-match ( string regexp -- slice/f ) + [ 0 ] [ check-string ] [ ] tri* + do-next-match nip ; + +: re-contains? ( string regexp -- ? ) + first-match >boolean ; + : re-split1 ( string regexp -- before after/f ) - dupd match-head [ 1array split-slices first2 ] [ f ] if* ; + dupd first-match [ 1array split-slices first2 ] [ f ] if* ; : re-split ( string regexp -- seq ) dupd all-matches split-slices ;