Adding word breaks to regexp

db4
Daniel Ehrenberg 2009-03-11 15:51:54 -05:00
parent c193f1b68a
commit 8b286cea4c
8 changed files with 56 additions and 33 deletions

View File

@ -58,8 +58,8 @@ M: from-to <times>
: char-class ( ranges ? -- term ) : char-class ( ranges ? -- term )
[ <or-class> ] dip [ <not-class> ] when ; [ <or-class> ] dip [ <not-class> ] when ;
TUPLE: lookahead term positive? ; TUPLE: lookahead term ;
C: <lookahead> lookahead C: <lookahead> lookahead
TUPLE: lookbehind term positive? ; TUPLE: lookbehind term ;
C: <lookbehind> lookbehind C: <lookbehind> lookbehind

View File

@ -12,7 +12,7 @@ ascii-class punctuation-class java-printable-class blank-class
control-character-class hex-digit-class java-blank-class c-identifier-class control-character-class hex-digit-class java-blank-class c-identifier-class
unmatchable-class terminator-class word-boundary-class ; unmatchable-class terminator-class word-boundary-class ;
SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ; SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file word-break ;
TUPLE: range from to ; TUPLE: range from to ;
C: <range> range C: <range> range

View File

@ -3,7 +3,7 @@
USING: regexp.classes kernel sequences regexp.negation USING: regexp.classes kernel sequences regexp.negation
quotations assocs fry math locals combinators quotations assocs fry math locals combinators
accessors words compiler.units kernel.private strings accessors words compiler.units kernel.private strings
sequences.private arrays call namespaces sequences.private arrays call namespaces unicode.breaks
regexp.transition-tables combinators.short-circuit ; regexp.transition-tables combinators.short-circuit ;
IN: regexp.compiler IN: regexp.compiler
@ -15,6 +15,10 @@ SYMBOL: backwards?
<PRIVATE <PRIVATE
M: t question>quot drop [ 2drop t ] ; M: t question>quot drop [ 2drop t ] ;
M: f question>quot drop [ 2drop f ] ;
M: not-class question>quot
class>> question>quot [ not ] compose ;
M: beginning-of-input question>quot M: beginning-of-input question>quot
drop [ drop zero? ] ; drop [ drop zero? ] ;
@ -36,6 +40,9 @@ M: $ question>quot
M: ^ question>quot M: ^ question>quot
drop [ { [ drop zero? ] [ [ 1- ] dip ?nth "\r\n" member? ] } 2|| ] ; drop [ { [ drop zero? ] [ [ 1- ] dip ?nth "\r\n" member? ] } 2|| ] ;
M: word-break question>quot
drop [ word-break-at? ] ;
: (execution-quot) ( next-state -- quot ) : (execution-quot) ( next-state -- quot )
! The conditions here are for lookaround and anchors, etc ! The conditions here are for lookaround and anchors, etc
dup condition? [ dup condition? [

View File

@ -56,6 +56,8 @@ ERROR: bad-class name ;
{ CHAR: z [ end-of-input <tagged-epsilon> ] } { CHAR: z [ end-of-input <tagged-epsilon> ] }
{ CHAR: Z [ end-of-file <tagged-epsilon> ] } { CHAR: Z [ end-of-file <tagged-epsilon> ] }
{ CHAR: A [ beginning-of-input <tagged-epsilon> ] } { CHAR: A [ beginning-of-input <tagged-epsilon> ] }
{ CHAR: b [ word-break <tagged-epsilon> ] }
{ CHAR: B [ word-break <not-class> <tagged-epsilon> ] }
[ ] [ ]
} case ; } case ;
@ -138,10 +140,10 @@ Parenthized = "?:" Alternation:a => [[ a ]]
=> [[ a on off parse-options <with-options> ]] => [[ a on off parse-options <with-options> ]]
| "?#" [^)]* => [[ f ]] | "?#" [^)]* => [[ f ]]
| "?~" Alternation:a => [[ a <negation> ]] | "?~" Alternation:a => [[ a <negation> ]]
| "?=" Alternation:a => [[ a t <lookahead> <tagged-epsilon> ]] | "?=" Alternation:a => [[ a <lookahead> <tagged-epsilon> ]]
| "?!" Alternation:a => [[ a f <lookahead> <tagged-epsilon> ]] | "?!" Alternation:a => [[ a <lookahead> <not-class> <tagged-epsilon> ]]
| "?<=" Alternation:a => [[ a t <lookbehind> <tagged-epsilon> ]] | "?<=" Alternation:a => [[ a <lookbehind> <tagged-epsilon> ]]
| "?<!" Alternation:a => [[ a f <lookbehind> <tagged-epsilon> ]] | "?<!" Alternation:a => [[ a <lookbehind> <not-class> <tagged-epsilon> ]]
| Alternation | Alternation
Element = "(" Parenthized:p ")" => [[ p ]] Element = "(" Parenthized:p ")" => [[ p ]]

View File

@ -433,24 +433,24 @@ IN: regexp-tests
[ { "foo" "fxx" "fab" } ] [ "fab fxx foo" R/ f../r all-matches [ >string ] map ] unit-test [ { "foo" "fxx" "fab" } ] [ "fab fxx foo" R/ f../r all-matches [ >string ] map ] unit-test
! [ t ] [ "foo" "\\bfoo\\b" <regexp> matches? ] unit-test [ t ] [ "foo" "\\bfoo\\b" <regexp> re-contains? ] unit-test
! [ t ] [ "afoob" "\\Bfoo\\B" <regexp> matches? ] unit-test [ t ] [ "afoob" "\\Bfoo\\B" <regexp> re-contains? ] unit-test
! [ t ] [ "afoob" "\\bfoo\\b" <regexp> matches? ] unit-test [ f ] [ "afoob" "\\bfoo\\b" <regexp> re-contains? ] unit-test
! [ f ] [ "foo" "\\Bfoo\\B" <regexp> matches? ] unit-test [ f ] [ "foo" "\\Bfoo\\B" <regexp> re-contains? ] unit-test
! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-index-head ] unit-test [ 3 ] [ "foo bar" "foo\\b" <regexp> first-match length ] unit-test
! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test [ f ] [ "fooxbar" "foo\\b" <regexp> re-contains? ] unit-test
! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test [ t ] [ "foo" "foo\\b" <regexp> re-contains? ] unit-test
! [ t ] [ "foo bar" "foo\\b bar" <regexp> matches? ] unit-test [ t ] [ "foo bar" "foo\\b bar" <regexp> matches? ] unit-test
! [ f ] [ "fooxbar" "foo\\bxbar" <regexp> matches? ] unit-test [ f ] [ "fooxbar" "foo\\bxbar" <regexp> matches? ] unit-test
! [ f ] [ "foo" "foo\\bbar" <regexp> matches? ] unit-test [ f ] [ "foo" "foo\\bbar" <regexp> matches? ] unit-test
! [ f ] [ "foo bar" "foo\\B" <regexp> matches? ] unit-test [ f ] [ "foo bar" "foo\\B" <regexp> re-contains? ] unit-test
! [ 3 ] [ "fooxbar" "foo\\B" <regexp> match-index-head ] unit-test [ 3 ] [ "fooxbar" "foo\\B" <regexp> first-match length ] unit-test
! [ t ] [ "foo" "foo\\B" <regexp> matches? ] unit-test [ f ] [ "foo" "foo\\B" <regexp> re-contains? ] unit-test
! [ f ] [ "foo bar" "foo\\B bar" <regexp> matches? ] unit-test [ f ] [ "foo bar" "foo\\B bar" <regexp> matches? ] unit-test
! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
! [ 1 ] [ "aaacb" "a+?" <regexp> match-index-head ] unit-test ! [ 1 ] [ "aaacb" "a+?" <regexp> match-index-head ] unit-test
! [ 1 ] [ "aaacb" "aa??" <regexp> match-index-head ] unit-test ! [ 1 ] [ "aaacb" "aa??" <regexp> match-index-head ] unit-test

View File

@ -17,21 +17,16 @@ TUPLE: reverse-regexp < regexp ;
<PRIVATE <PRIVATE
: maybe-negated ( lookaround quot -- regexp-quot )
'[ term>> @ ] [ positive?>> [ ] [ not ] ? ] bi compose ; inline
M: lookahead question>quot ! Returns ( index string -- ? ) M: lookahead question>quot ! Returns ( index string -- ? )
[ ast>dfa dfa>shortest-word '[ f _ execute ] ] maybe-negated ; term>> ast>dfa dfa>shortest-word '[ f _ execute ] ;
: <reversed-option> ( ast -- reversed ) : <reversed-option> ( ast -- reversed )
"r" string>options <with-options> ; "r" string>options <with-options> ;
M: lookbehind question>quot ! Returns ( index string -- ? ) M: lookbehind question>quot ! Returns ( index string -- ? )
[ term>> <reversed-option>
<reversed-option> ast>dfa dfa>reverse-shortest-word
ast>dfa dfa>reverse-shortest-word '[ [ 1- ] dip f _ execute ] ;
'[ [ 1- ] dip f _ execute ]
] maybe-negated ;
: check-string ( string -- string ) : check-string ( string -- string )
! Make this configurable ! Make this configurable

View File

@ -37,3 +37,5 @@ IN: unicode.breaks.tests
grapheme-break-test parse-test-file [ >graphemes ] test grapheme-break-test parse-test-file [ >graphemes ] test
word-break-test parse-test-file [ >words ] test word-break-test parse-test-file [ >words ] test
[ { t f t t f t } ] [ 6 [ "as df" word-break-at? ] map ] unit-test

View File

@ -228,3 +228,20 @@ PRIVATE>
: >words ( str -- words ) : >words ( str -- words )
[ first-word ] >pieces ; [ first-word ] >pieces ;
<PRIVATE
: nth-next ( i str -- str[i-1] str[i] )
[ [ 1- ] keep ] dip '[ _ nth ] bi@ ;
PRIVATE>
: word-break-at? ( i str -- ? )
{
[ drop zero? ]
[ length = ]
[
[ nth-next [ word-break-prop ] dip ] 2keep
word-break-next nip
]
} 2|| ;