regexp: add lookahead groups and zero-width break escapes

Summary of the hunks below:

  * 'group' now chooses between four alternatives inside "(" ... ")":
    "?:" (non-capturing), "?=" (result wrapped with ensure), "?!"
    (result wrapped with ensure-not), and the plain capturing group
    ('simple-group').  'non-capturing-group' no longer consumes the
    parentheses itself, and is no longer an alternative of 'simple'.
  * 'break-escape' adds "$", "\b", "\B" and "\z".  The first three are
    built with 'break', which is `satisfy ensure epsilon just <|>`: it
    tests the next character with the given quotation, with
    `epsilon just` as the alternative.  "\b" tests blank?, "\B" tests
    blank? not, "$" tests membership in "\r\n".  The tests expect both
    "foo\b" and "foo\B" to match the whole string "foo".
  * 'simple' now tries 'character-class' before 'char'.
  * 'term' is defined with MEMO:, so `memoize` is added to USING:.

diff --git a/extra/regexp/regexp-tests.factor b/extra/regexp/regexp-tests.factor
index d76b038ffa..823e7c7f36 100755
--- a/extra/regexp/regexp-tests.factor
+++ b/extra/regexp/regexp-tests.factor
@@ -199,3 +199,26 @@ IN: regexp-tests
     "(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))"
     f <regexp> drop
 ] unit-test
+
+[ t ] [ "fxxbar" "(?!foo).{3}bar" f <regexp> matches? ] unit-test
+[ f ] [ "foobar" "(?!foo).{3}bar" f <regexp> matches? ] unit-test
+
+[ 3 ] [ "foobar" "foo(?=bar)" f <regexp> match-head ] unit-test
+[ f ] [ "foobxr" "foo(?=bar)" f <regexp> match-head ] unit-test
+
+[ f ] [ "foobxr" "foo\\z" f <regexp> match-head ] unit-test
+[ 3 ] [ "foo" "foo\\z" f <regexp> match-head ] unit-test
+
+[ 3 ] [ "foo bar" "foo\\b" f <regexp> match-head ] unit-test
+[ f ] [ "fooxbar" "foo\\b" f <regexp> matches? ] unit-test
+[ t ] [ "foo" "foo\\b" f <regexp> matches? ] unit-test
+[ t ] [ "foo bar" "foo\\b bar" f <regexp> matches? ] unit-test
+[ f ] [ "fooxbar" "foo\\bxbar" f <regexp> matches? ] unit-test
+[ f ] [ "foo" "foo\\bbar" f <regexp> matches? ] unit-test
+
+[ f ] [ "foo bar" "foo\\B" f <regexp> matches? ] unit-test
+[ 3 ] [ "fooxbar" "foo\\B" f <regexp> match-head ] unit-test
+[ t ] [ "foo" "foo\\B" f <regexp> matches? ] unit-test
+[ f ] [ "foo bar" "foo\\B bar" f <regexp> matches? ] unit-test
+[ t ] [ "fooxbar" "foo\\Bxbar" f <regexp> matches? ] unit-test
+[ f ] [ "foo" "foo\\Bbar" f <regexp> matches? ] unit-test
diff --git a/extra/regexp/regexp.factor b/extra/regexp/regexp.factor
index 9d696319fc..c4b60e76e4 100755
--- a/extra/regexp/regexp.factor
+++ b/extra/regexp/regexp.factor
@@ -1,7 +1,7 @@
 USING: arrays combinators kernel lazy-lists math math.parser
 namespaces parser parser-combinators parser-combinators.simple
 promises quotations sequences combinators.lib strings
-assocs prettyprint.backend ;
+assocs prettyprint.backend memoize ;
 USE: io
 IN: regexp
 
@@ -148,10 +148,22 @@ TUPLE: group-result str ;
 C: <group-result> group-result
 
 : 'non-capturing-group' ( -- parser )
-    'regexp' "(?:" ")" surrounded-by ;
+    "?:" token 'regexp' &> ;
+
+: 'positive-lookahead-group' ( -- parser )
+    "?=" token 'regexp' &> [ ensure ] <@ ;
+
+: 'negative-lookahead-group' ( -- parser )
+    "?!" token 'regexp' &> [ ensure-not ] <@ ;
+
+: 'simple-group' ( -- parser )
+    'regexp' [ [ <group-result> ] <@ ] <@ ;
 
 : 'group' ( -- parser )
-    'regexp' [ [ <group-result> ] <@ ] <@
+    'non-capturing-group'
+    'positive-lookahead-group'
+    'negative-lookahead-group'
+    'simple-group' <|> <|> <|>
     "(" ")" surrounded-by ;
 
 : 'range' ( -- parser )
@@ -181,12 +193,21 @@ C: <group-result> group-result
     [ ignore-case? get <token-parser> ] <@
     "\\Q" "\\E" surrounded-by ;
 
+: 'break' ( quot -- parser )
+    satisfy ensure epsilon just <|> ;
+
+: 'break-escape' ( -- parser )
+    "$" token [ "\r\n" member? ] 'break' <@literal
+    "\\b" token [ blank? ] 'break' <@literal <|>
+    "\\B" token [ blank? not ] 'break' <@literal <|>
+    "\\z" token epsilon just <@literal <|> ;
+
 : 'simple' ( -- parser )
     'escaped-seq'
-    'non-capturing-group' <|>
+    'break-escape' <|>
     'group' <|>
-    'char' <|>
-    'character-class' <|> ;
+    'character-class' <|>
+    'char' <|> ;
 
 : 'exactly-n' ( -- parser )
     'integer' [ exactly-n ] <@delay ;
@@ -226,7 +247,7 @@ C: <group-result> group-result
 : 'dummy' ( -- parser )
     epsilon [ ] <@literal ;
 
-: 'term' ( -- parser )
+MEMO: 'term' ( -- parser )
     'simple'
     'repetition' 'interval' 'dummy' <|> <|> <&> [ first2 call ] <@
     <!+> [ <and-parser> ] <@ ;