Adding word breaks to regexp
parent
c193f1b68a
commit
8b286cea4c
|
@ -58,8 +58,8 @@ M: from-to <times>
|
||||||
: char-class ( ranges ? -- term )
|
: char-class ( ranges ? -- term )
|
||||||
[ <or-class> ] dip [ <not-class> ] when ;
|
[ <or-class> ] dip [ <not-class> ] when ;
|
||||||
|
|
||||||
TUPLE: lookahead term positive? ;
|
TUPLE: lookahead term ;
|
||||||
C: <lookahead> lookahead
|
C: <lookahead> lookahead
|
||||||
|
|
||||||
TUPLE: lookbehind term positive? ;
|
TUPLE: lookbehind term ;
|
||||||
C: <lookbehind> lookbehind
|
C: <lookbehind> lookbehind
|
||||||
|
|
|
@ -12,7 +12,7 @@ ascii-class punctuation-class java-printable-class blank-class
|
||||||
control-character-class hex-digit-class java-blank-class c-identifier-class
|
control-character-class hex-digit-class java-blank-class c-identifier-class
|
||||||
unmatchable-class terminator-class word-boundary-class ;
|
unmatchable-class terminator-class word-boundary-class ;
|
||||||
|
|
||||||
SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ;
|
SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file word-break ;
|
||||||
|
|
||||||
TUPLE: range from to ;
|
TUPLE: range from to ;
|
||||||
C: <range> range
|
C: <range> range
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
USING: regexp.classes kernel sequences regexp.negation
|
USING: regexp.classes kernel sequences regexp.negation
|
||||||
quotations assocs fry math locals combinators
|
quotations assocs fry math locals combinators
|
||||||
accessors words compiler.units kernel.private strings
|
accessors words compiler.units kernel.private strings
|
||||||
sequences.private arrays call namespaces
|
sequences.private arrays call namespaces unicode.breaks
|
||||||
regexp.transition-tables combinators.short-circuit ;
|
regexp.transition-tables combinators.short-circuit ;
|
||||||
IN: regexp.compiler
|
IN: regexp.compiler
|
||||||
|
|
||||||
|
@ -15,6 +15,10 @@ SYMBOL: backwards?
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
|
||||||
M: t question>quot drop [ 2drop t ] ;
|
M: t question>quot drop [ 2drop t ] ;
|
||||||
|
M: f question>quot drop [ 2drop f ] ;
|
||||||
|
|
||||||
|
M: not-class question>quot
|
||||||
|
class>> question>quot [ not ] compose ;
|
||||||
|
|
||||||
M: beginning-of-input question>quot
|
M: beginning-of-input question>quot
|
||||||
drop [ drop zero? ] ;
|
drop [ drop zero? ] ;
|
||||||
|
@ -36,6 +40,9 @@ M: $ question>quot
|
||||||
M: ^ question>quot
|
M: ^ question>quot
|
||||||
drop [ { [ drop zero? ] [ [ 1- ] dip ?nth "\r\n" member? ] } 2|| ] ;
|
drop [ { [ drop zero? ] [ [ 1- ] dip ?nth "\r\n" member? ] } 2|| ] ;
|
||||||
|
|
||||||
|
M: word-break question>quot
|
||||||
|
drop [ word-break-at? ] ;
|
||||||
|
|
||||||
: (execution-quot) ( next-state -- quot )
|
: (execution-quot) ( next-state -- quot )
|
||||||
! The conditions here are for lookaround and anchors, etc
|
! The conditions here are for lookaround and anchors, etc
|
||||||
dup condition? [
|
dup condition? [
|
||||||
|
|
|
@ -56,6 +56,8 @@ ERROR: bad-class name ;
|
||||||
{ CHAR: z [ end-of-input <tagged-epsilon> ] }
|
{ CHAR: z [ end-of-input <tagged-epsilon> ] }
|
||||||
{ CHAR: Z [ end-of-file <tagged-epsilon> ] }
|
{ CHAR: Z [ end-of-file <tagged-epsilon> ] }
|
||||||
{ CHAR: A [ beginning-of-input <tagged-epsilon> ] }
|
{ CHAR: A [ beginning-of-input <tagged-epsilon> ] }
|
||||||
|
{ CHAR: b [ word-break <tagged-epsilon> ] }
|
||||||
|
{ CHAR: B [ word-break <not-class> <tagged-epsilon> ] }
|
||||||
[ ]
|
[ ]
|
||||||
} case ;
|
} case ;
|
||||||
|
|
||||||
|
@ -138,10 +140,10 @@ Parenthized = "?:" Alternation:a => [[ a ]]
|
||||||
=> [[ a on off parse-options <with-options> ]]
|
=> [[ a on off parse-options <with-options> ]]
|
||||||
| "?#" [^)]* => [[ f ]]
|
| "?#" [^)]* => [[ f ]]
|
||||||
| "?~" Alternation:a => [[ a <negation> ]]
|
| "?~" Alternation:a => [[ a <negation> ]]
|
||||||
| "?=" Alternation:a => [[ a t <lookahead> <tagged-epsilon> ]]
|
| "?=" Alternation:a => [[ a <lookahead> <tagged-epsilon> ]]
|
||||||
| "?!" Alternation:a => [[ a f <lookahead> <tagged-epsilon> ]]
|
| "?!" Alternation:a => [[ a <lookahead> <not-class> <tagged-epsilon> ]]
|
||||||
| "?<=" Alternation:a => [[ a t <lookbehind> <tagged-epsilon> ]]
|
| "?<=" Alternation:a => [[ a <lookbehind> <tagged-epsilon> ]]
|
||||||
| "?<!" Alternation:a => [[ a f <lookbehind> <tagged-epsilon> ]]
|
| "?<!" Alternation:a => [[ a <lookbehind> <not-class> <tagged-epsilon> ]]
|
||||||
| Alternation
|
| Alternation
|
||||||
|
|
||||||
Element = "(" Parenthized:p ")" => [[ p ]]
|
Element = "(" Parenthized:p ")" => [[ p ]]
|
||||||
|
|
|
@ -433,24 +433,24 @@ IN: regexp-tests
|
||||||
|
|
||||||
[ { "foo" "fxx" "fab" } ] [ "fab fxx foo" R/ f../r all-matches [ >string ] map ] unit-test
|
[ { "foo" "fxx" "fab" } ] [ "fab fxx foo" R/ f../r all-matches [ >string ] map ] unit-test
|
||||||
|
|
||||||
! [ t ] [ "foo" "\\bfoo\\b" <regexp> matches? ] unit-test
|
[ t ] [ "foo" "\\bfoo\\b" <regexp> re-contains? ] unit-test
|
||||||
! [ t ] [ "afoob" "\\Bfoo\\B" <regexp> matches? ] unit-test
|
[ t ] [ "afoob" "\\Bfoo\\B" <regexp> re-contains? ] unit-test
|
||||||
! [ t ] [ "afoob" "\\bfoo\\b" <regexp> matches? ] unit-test
|
[ f ] [ "afoob" "\\bfoo\\b" <regexp> re-contains? ] unit-test
|
||||||
! [ f ] [ "foo" "\\Bfoo\\B" <regexp> matches? ] unit-test
|
[ f ] [ "foo" "\\Bfoo\\B" <regexp> re-contains? ] unit-test
|
||||||
|
|
||||||
! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-index-head ] unit-test
|
[ 3 ] [ "foo bar" "foo\\b" <regexp> first-match length ] unit-test
|
||||||
! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test
|
[ f ] [ "fooxbar" "foo\\b" <regexp> re-contains? ] unit-test
|
||||||
! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test
|
[ t ] [ "foo" "foo\\b" <regexp> re-contains? ] unit-test
|
||||||
! [ t ] [ "foo bar" "foo\\b bar" <regexp> matches? ] unit-test
|
[ t ] [ "foo bar" "foo\\b bar" <regexp> matches? ] unit-test
|
||||||
! [ f ] [ "fooxbar" "foo\\bxbar" <regexp> matches? ] unit-test
|
[ f ] [ "fooxbar" "foo\\bxbar" <regexp> matches? ] unit-test
|
||||||
! [ f ] [ "foo" "foo\\bbar" <regexp> matches? ] unit-test
|
[ f ] [ "foo" "foo\\bbar" <regexp> matches? ] unit-test
|
||||||
|
|
||||||
! [ f ] [ "foo bar" "foo\\B" <regexp> matches? ] unit-test
|
[ f ] [ "foo bar" "foo\\B" <regexp> re-contains? ] unit-test
|
||||||
! [ 3 ] [ "fooxbar" "foo\\B" <regexp> match-index-head ] unit-test
|
[ 3 ] [ "fooxbar" "foo\\B" <regexp> first-match length ] unit-test
|
||||||
! [ t ] [ "foo" "foo\\B" <regexp> matches? ] unit-test
|
[ f ] [ "foo" "foo\\B" <regexp> re-contains? ] unit-test
|
||||||
! [ f ] [ "foo bar" "foo\\B bar" <regexp> matches? ] unit-test
|
[ f ] [ "foo bar" "foo\\B bar" <regexp> matches? ] unit-test
|
||||||
! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
|
[ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
|
||||||
! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
|
[ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
|
||||||
|
|
||||||
! [ 1 ] [ "aaacb" "a+?" <regexp> match-index-head ] unit-test
|
! [ 1 ] [ "aaacb" "a+?" <regexp> match-index-head ] unit-test
|
||||||
! [ 1 ] [ "aaacb" "aa??" <regexp> match-index-head ] unit-test
|
! [ 1 ] [ "aaacb" "aa??" <regexp> match-index-head ] unit-test
|
||||||
|
|
|
@ -17,21 +17,16 @@ TUPLE: reverse-regexp < regexp ;
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
|
||||||
: maybe-negated ( lookaround quot -- regexp-quot )
|
|
||||||
'[ term>> @ ] [ positive?>> [ ] [ not ] ? ] bi compose ; inline
|
|
||||||
|
|
||||||
M: lookahead question>quot ! Returns ( index string -- ? )
|
M: lookahead question>quot ! Returns ( index string -- ? )
|
||||||
[ ast>dfa dfa>shortest-word '[ f _ execute ] ] maybe-negated ;
|
term>> ast>dfa dfa>shortest-word '[ f _ execute ] ;
|
||||||
|
|
||||||
: <reversed-option> ( ast -- reversed )
|
: <reversed-option> ( ast -- reversed )
|
||||||
"r" string>options <with-options> ;
|
"r" string>options <with-options> ;
|
||||||
|
|
||||||
M: lookbehind question>quot ! Returns ( index string -- ? )
|
M: lookbehind question>quot ! Returns ( index string -- ? )
|
||||||
[
|
term>> <reversed-option>
|
||||||
<reversed-option>
|
ast>dfa dfa>reverse-shortest-word
|
||||||
ast>dfa dfa>reverse-shortest-word
|
'[ [ 1- ] dip f _ execute ] ;
|
||||||
'[ [ 1- ] dip f _ execute ]
|
|
||||||
] maybe-negated ;
|
|
||||||
|
|
||||||
: check-string ( string -- string )
|
: check-string ( string -- string )
|
||||||
! Make this configurable
|
! Make this configurable
|
||||||
|
|
|
@ -37,3 +37,5 @@ IN: unicode.breaks.tests
|
||||||
|
|
||||||
grapheme-break-test parse-test-file [ >graphemes ] test
|
grapheme-break-test parse-test-file [ >graphemes ] test
|
||||||
word-break-test parse-test-file [ >words ] test
|
word-break-test parse-test-file [ >words ] test
|
||||||
|
|
||||||
|
[ { t f t t f t } ] [ 6 [ "as df" word-break-at? ] map ] unit-test
|
||||||
|
|
|
@ -228,3 +228,20 @@ PRIVATE>
|
||||||
|
|
||||||
: >words ( str -- words )
|
: >words ( str -- words )
|
||||||
[ first-word ] >pieces ;
|
[ first-word ] >pieces ;
|
||||||
|
|
||||||
|
<PRIVATE
|
||||||
|
|
||||||
|
: nth-next ( i str -- str[i-1] str[i] )
|
||||||
|
[ [ 1- ] keep ] dip '[ _ nth ] bi@ ;
|
||||||
|
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
|
: word-break-at? ( i str -- ? )
|
||||||
|
{
|
||||||
|
[ drop zero? ]
|
||||||
|
[ length = ]
|
||||||
|
[
|
||||||
|
[ nth-next [ word-break-prop ] dip ] 2keep
|
||||||
|
word-break-next nip
|
||||||
|
]
|
||||||
|
} 2|| ;
|
||||||
|
|
Loading…
Reference in New Issue