From 8b286cea4cadbfff3b9d12a7a23c74c400d8468f Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Wed, 11 Mar 2009 15:51:54 -0500 Subject: [PATCH] Adding word breaks to regexp --- basis/regexp/ast/ast.factor | 4 +-- basis/regexp/classes/classes.factor | 2 +- basis/regexp/compiler/compiler.factor | 9 ++++++- basis/regexp/parser/parser.factor | 10 +++++--- basis/regexp/regexp-tests.factor | 32 ++++++++++++------------ basis/regexp/regexp.factor | 13 +++------- basis/unicode/breaks/breaks-tests.factor | 2 ++ basis/unicode/breaks/breaks.factor | 17 +++++++++++++ 8 files changed, 56 insertions(+), 33 deletions(-) diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor index 9288766888..ffaed2db62 100644 --- a/basis/regexp/ast/ast.factor +++ b/basis/regexp/ast/ast.factor @@ -58,8 +58,8 @@ M: from-to : char-class ( ranges ? -- term ) [ ] dip [ ] when ; -TUPLE: lookahead term positive? ; +TUPLE: lookahead term ; C: lookahead -TUPLE: lookbehind term positive? ; +TUPLE: lookbehind term ; C: lookbehind diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index 4ddd470189..1959a91cb5 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -12,7 +12,7 @@ ascii-class punctuation-class java-printable-class blank-class control-character-class hex-digit-class java-blank-class c-identifier-class unmatchable-class terminator-class word-boundary-class ; -SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ; +SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file word-break ; TUPLE: range from to ; C: range diff --git a/basis/regexp/compiler/compiler.factor b/basis/regexp/compiler/compiler.factor index 0e0c0eaae6..c837df0f0f 100644 --- a/basis/regexp/compiler/compiler.factor +++ b/basis/regexp/compiler/compiler.factor @@ -3,7 +3,7 @@ USING: regexp.classes kernel sequences regexp.negation quotations assocs fry math locals combinators accessors words compiler.units kernel.private strings -sequences.private arrays call namespaces +sequences.private arrays call namespaces unicode.breaks regexp.transition-tables combinators.short-circuit ; IN: regexp.compiler @@ -15,6 +15,10 @@ SYMBOL: backwards? quot drop [ 2drop t ] ; +M: f question>quot drop [ 2drop f ] ; + +M: not-class question>quot + class>> question>quot [ not ] compose ; M: beginning-of-input question>quot drop [ drop zero? ] ; @@ -36,6 +40,9 @@ M: $ question>quot M: ^ question>quot drop [ { [ drop zero? ] [ [ 1- ] dip ?nth "\r\n" member? ] } 2|| ] ; +M: word-break question>quot + drop [ word-break-at? ] ; + : (execution-quot) ( next-state -- quot ) ! The conditions here are for lookaround and anchors, etc dup condition? [ diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index adbf0c53d3..c6a69f2508 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -56,6 +56,8 @@ ERROR: bad-class name ; { CHAR: z [ end-of-input ] } { CHAR: Z [ end-of-file ] } { CHAR: A [ beginning-of-input ] } + { CHAR: b [ word-break ] } + { CHAR: B [ word-break ] } [ ] } case ; @@ -138,10 +140,10 @@ Parenthized = "?:" Alternation:a => [[ a ]] => [[ a on off parse-options ]] | "?#" [^)]* => [[ f ]] | "?~" Alternation:a => [[ a ]] - | "?=" Alternation:a => [[ a t ]] - | "?!" Alternation:a => [[ a f ]] - | "?<=" Alternation:a => [[ a t ]] - | "? [[ a f ]] + | "?=" Alternation:a => [[ a ]] + | "?!" Alternation:a => [[ a ]] + | "?<=" Alternation:a => [[ a ]] + | "? [[ a ]] | Alternation Element = "(" Parenthized:p ")" => [[ p ]] diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index e01241552d..0b94f8296d 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -433,24 +433,24 @@ IN: regexp-tests [ { "foo" "fxx" "fab" } ] [ "fab fxx foo" R/ f../r all-matches [ >string ] map ] unit-test -! [ t ] [ "foo" "\\bfoo\\b" matches? ] unit-test -! [ t ] [ "afoob" "\\Bfoo\\B" matches? ] unit-test -! [ t ] [ "afoob" "\\bfoo\\b" matches? ] unit-test -! [ f ] [ "foo" "\\Bfoo\\B" matches? ] unit-test +[ t ] [ "foo" "\\bfoo\\b" re-contains? ] unit-test +[ t ] [ "afoob" "\\Bfoo\\B" re-contains? ] unit-test +[ f ] [ "afoob" "\\bfoo\\b" re-contains? ] unit-test +[ f ] [ "foo" "\\Bfoo\\B" re-contains? ] unit-test -! [ 3 ] [ "foo bar" "foo\\b" match-index-head ] unit-test -! [ f ] [ "fooxbar" "foo\\b" matches? ] unit-test -! [ t ] [ "foo" "foo\\b" matches? ] unit-test -! [ t ] [ "foo bar" "foo\\b bar" matches? ] unit-test -! [ f ] [ "fooxbar" "foo\\bxbar" matches? ] unit-test -! [ f ] [ "foo" "foo\\bbar" matches? ] unit-test +[ 3 ] [ "foo bar" "foo\\b" first-match length ] unit-test +[ f ] [ "fooxbar" "foo\\b" re-contains? ] unit-test +[ t ] [ "foo" "foo\\b" re-contains? ] unit-test +[ t ] [ "foo bar" "foo\\b bar" matches? ] unit-test +[ f ] [ "fooxbar" "foo\\bxbar" matches? ] unit-test +[ f ] [ "foo" "foo\\bbar" matches? ] unit-test -! [ f ] [ "foo bar" "foo\\B" matches? ] unit-test -! [ 3 ] [ "fooxbar" "foo\\B" match-index-head ] unit-test -! [ t ] [ "foo" "foo\\B" matches? ] unit-test -! [ f ] [ "foo bar" "foo\\B bar" matches? ] unit-test -! [ t ] [ "fooxbar" "foo\\Bxbar" matches? ] unit-test -! [ f ] [ "foo" "foo\\Bbar" matches? ] unit-test +[ f ] [ "foo bar" "foo\\B" re-contains? ] unit-test +[ 3 ] [ "fooxbar" "foo\\B" first-match length ] unit-test +[ f ] [ "foo" "foo\\B" re-contains? ] unit-test +[ f ] [ "foo bar" "foo\\B bar" matches? ] unit-test +[ t ] [ "fooxbar" "foo\\Bxbar" matches? ] unit-test +[ f ] [ "foo" "foo\\Bbar" matches? ] unit-test ! [ 1 ] [ "aaacb" "a+?" match-index-head ] unit-test ! [ 1 ] [ "aaacb" "aa??" match-index-head ] unit-test diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index 7f27a13104..a7f2fa4e12 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -17,21 +17,16 @@ TUPLE: reverse-regexp < regexp ; > @ ] [ positive?>> [ ] [ not ] ? ] bi compose ; inline - M: lookahead question>quot ! Returns ( index string -- ? ) - [ ast>dfa dfa>shortest-word '[ f _ execute ] ] maybe-negated ; + term>> ast>dfa dfa>shortest-word '[ f _ execute ] ; : ( ast -- reversed ) "r" string>options ; M: lookbehind question>quot ! Returns ( index string -- ? ) - [ - - ast>dfa dfa>reverse-shortest-word - '[ [ 1- ] dip f _ execute ] - ] maybe-negated ; + term>> + ast>dfa dfa>reverse-shortest-word + '[ [ 1- ] dip f _ execute ] ; : check-string ( string -- string ) ! Make this configurable diff --git a/basis/unicode/breaks/breaks-tests.factor b/basis/unicode/breaks/breaks-tests.factor index d8e220cf18..493c2db0c2 100644 --- a/basis/unicode/breaks/breaks-tests.factor +++ b/basis/unicode/breaks/breaks-tests.factor @@ -37,3 +37,5 @@ IN: unicode.breaks.tests grapheme-break-test parse-test-file [ >graphemes ] test word-break-test parse-test-file [ >words ] test + +[ { t f t t f t } ] [ 6 [ "as df" word-break-at? ] map ] unit-test diff --git a/basis/unicode/breaks/breaks.factor b/basis/unicode/breaks/breaks.factor index ddcb99b829..f2e9454545 100644 --- a/basis/unicode/breaks/breaks.factor +++ b/basis/unicode/breaks/breaks.factor @@ -228,3 +228,20 @@ PRIVATE> : >words ( str -- words ) [ first-word ] >pieces ; + + + +: word-break-at? ( i str -- ? ) + { + [ drop zero? ] + [ length = ] + [ + [ nth-next [ word-break-prop ] dip ] 2keep + word-break-next nip + ] + } 2|| ;