diff --git a/basis/regexp/classes/classes-tests.factor b/basis/regexp/classes/classes-tests.factor index 9a210fb576..520e23c749 100644 --- a/basis/regexp/classes/classes-tests.factor +++ b/basis/regexp/classes/classes-tests.factor @@ -30,6 +30,7 @@ IN: regexp.classes.tests [ T{ or-class { seq { 3 2 } } } ] [ { 2 3 } 1 2array ] unit-test [ f ] [ t ] unit-test [ t ] [ f ] unit-test +[ f ] [ 1 1 t replace-question ] unit-test ! Making classes into nested conditionals @@ -43,7 +44,7 @@ IN: regexp.classes.tests SYMBOL: foo SYMBOL: bar -[ T{ condition f T{ primitive-class f bar } T{ condition f T{ primitive-class f foo } { 1 2 3 } { 1 3 } } T{ condition f T{ primitive-class f foo } { 1 2 } { 1 } } } ] [ { { 1 t } { 2 T{ primitive-class f foo } } { 3 T{ primitive-class f bar } } } table>condition ] unit-test +[ T{ condition f T{ primitive-class f bar } T{ condition f T{ primitive-class f foo } { 1 3 2 } { 1 3 } } T{ condition f T{ primitive-class f foo } { 1 2 } { 1 } } } ] [ { { 1 t } { 3 T{ primitive-class f bar } } { 2 T{ primitive-class f foo } } } table>condition ] unit-test [ t ] [ foo dup t replace-question ] unit-test [ f ] [ foo dup f replace-question ] unit-test diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index f8fce02213..6ea87fbb49 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -12,7 +12,7 @@ ascii-class punctuation-class java-printable-class blank-class control-character-class hex-digit-class java-blank-class c-identifier-class unmatchable-class terminator-class word-boundary-class ; -SINGLETONS: beginning-of-input ^ end-of-input $ ; +SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ; TUPLE: range from to ; C: range @@ -233,7 +233,7 @@ M: or-class replace-question replace-compound ; M: not-class replace-question - class>> replace-question ; + [ class>> ] 2dip replace-question ; : answer ( table question answer -- new-table ) '[ _ _ replace-question ] assoc-map @@ -258,7 +258,7 @@ M: not-class class>questions class>> class>questions ; M: object class>questions 1array ; : table>questions ( table -- questions ) - values class>questions t swap remove ; + values [ class>questions ] gather >array t swap remove ; : table>condition ( table -- condition ) ! input table is state => class @@ -269,3 +269,12 @@ M: object class>questions 1array ; [ [ question>> ] [ yes>> ] [ no>> ] tri ] dip '[ _ condition-map ] bi@ ] [ call ] if ; inline recursive + +: condition-states ( condition -- states ) + dup condition? [ + [ yes>> ] [ no>> ] bi + [ condition-states ] bi@ append prune + ] [ 1array ] if ; + +: condition-at ( condition assoc -- new-condition ) + '[ _ at ] condition-map ; diff --git a/basis/regexp/compiler/compiler.factor b/basis/regexp/compiler/compiler.factor index 30c9a5a5cb..d0f60fc6a2 100644 --- a/basis/regexp/compiler/compiler.factor +++ b/basis/regexp/compiler/compiler.factor @@ -3,27 +3,76 @@ USING: regexp.classes kernel sequences regexp.negation quotations regexp.minimize assocs fry math locals combinators accessors words compiler.units kernel.private strings -sequences.private arrays regexp.matchers call ; +sequences.private arrays regexp.matchers call namespaces +regexp.transition-tables combinators.short-circuit ; IN: regexp.compiler -: literals>cases ( literal-transitions -- case-body ) - [ 1quotation ] assoc-map ; +GENERIC: question>quot ( question -- quot ) + +quot drop [ 2drop t ] ; + +M: beginning-of-input question>quot + drop [ drop zero? ] ; + +M: end-of-input question>quot + drop [ length = ] ; + +M: end-of-file question>quot + drop [ + { + [ length swap - 2 <= ] + [ swap tail { "\n" "\r\n" "\r" "" } member? ] + } 2&& + [ [ nip [ length ] keep ] when ] keep + ] ; + +M: $ question>quot + drop [ { [ length = ] [ ?nth "\r\n" member? ] } 2|| ] ; + +M: ^ question>quot + drop [ { [ drop zero? ] [ [ 1- ] dip ?nth "\r\n" member? ] } 2|| ] ; + +! Maybe the condition>quot things can be combined, given a suitable method +! for question>quot on classes, but maybe that'd make stack shuffling annoying + +: execution-quot ( next-state -- quot ) + ! The conditions here are for lookaround and anchors, etc + dup condition? [ + [ question>> question>quot ] [ yes>> ] [ no>> ] tri + [ execution-quot ] bi@ + '[ 2dup @ _ _ if ] + ] [ + ! There shouldn't be a condition like this! + dup sequence? + [ [ [ 2drop ] ] [ first '[ _ execute ] ] if-empty ] + [ '[ _ execute ] ] if + ] if ; + +TUPLE: box contents ; +C: box : condition>quot ( condition -- quot ) + ! Conditions here are for different classes dup condition? [ [ question>> ] [ yes>> ] [ no>> ] tri [ condition>quot ] bi@ '[ dup _ class-member? _ _ if ] ] [ - [ [ 3drop ] ] [ '[ drop _ execute ] ] if-empty + contents>> + [ [ 3drop ] ] [ execution-quot '[ drop @ ] ] if-empty ] if ; -: new-non-literals>dispatch ( non-literal-transitions -- quot ) - table>condition condition>quot ; - : non-literals>dispatch ( non-literal-transitions -- quot ) - [ [ '[ dup _ class-member? ] ] [ '[ drop _ execute ] ] bi* ] assoc-map - [ 3drop ] suffix '[ _ cond ] ; + [ swap ] assoc-map ! we want state => predicate, and get the opposite as input + table>condition [ ] condition-map condition>quot ; + +: literals>cases ( literal-transitions -- case-body ) + [ execution-quot ] assoc-map ; : expand-one-or ( or-class transition -- alist ) [ seq>> ] dip '[ _ 2array ] map ; @@ -38,17 +87,22 @@ IN: regexp.compiler >alist expand-or [ first integer? ] partition [ literals>cases ] [ non-literals>dispatch ] bi* ; -:: step ( last-match index str case-body final? -- last-index/f ) +:: step ( last-match index str quot final? direction -- last-index/f ) final? index last-match ? index str bounds-check? [ - index 1+ str + index direction + str index str nth-unsafe - case-body case + quot call ] when ; inline +: direction ( -- n ) + backwards? get -1 1 ? ; + : transitions>quot ( transitions final-state? -- quot ) - [ split-literals suffix ] dip - '[ { array-capacity sequence } declare _ _ step ] ; + dup shortest? get and [ 2drop [ drop nip ] ] [ + [ split-literals swap case>quot ] dip direction + '[ { array-capacity string } declare _ _ _ step ] + ] if ; : word>quot ( word dfa -- quot ) [ transitions>> at ] @@ -64,30 +118,37 @@ IN: regexp.compiler ] each ] with-compilation-unit ; -: transitions-at ( transitions assoc -- new-transitions ) - dup '[ - [ _ at ] - [ [ _ at ] assoc-map ] bi* - ] assoc-map ; - : states>words ( dfa -- words dfa ) dup transitions>> keys [ gensym ] H{ } map>assoc - [ [ transitions-at ] rewrite-transitions ] + [ transitions-at ] [ values ] bi swap ; : dfa>word ( dfa -- word ) states>words [ states>code ] keep start-state>> ; -: check-sequence ( string -- string ) +: check-string ( string -- string ) ! Make this configurable - dup sequence? [ "String required" throw ] unless ; + dup string? [ "String required" throw ] unless ; -: run-regexp ( start-index string word -- ? ) - { [ f ] [ >fixnum ] [ check-sequence ] [ execute ] } spread ; inline +: setup-regexp ( start-index string -- f start-index string ) + [ f ] [ >fixnum ] [ check-string ] tri* ; inline + +PRIVATE> + +! The quotation returned is ( start-index string -- i/f ) : dfa>quotation ( dfa -- quot ) - dfa>word '[ _ run-regexp ] ; + dfa>word execution-quot '[ setup-regexp @ ] ; + +: dfa>shortest-quotation ( dfa -- quot ) + t shortest? [ dfa>quotation ] with-variable ; + +: dfa>reverse-quotation ( dfa -- quot ) + t backwards? [ dfa>quotation ] with-variable ; + +: dfa>reverse-shortest-quotation ( dfa -- quot ) + t backwards? [ dfa>shortest-quotation ] with-variable ; TUPLE: quot-matcher quot ; C: quot-matcher diff --git a/basis/regexp/dfa/dfa.factor b/basis/regexp/dfa/dfa.factor index 6ddc0396a7..d137ee3e4f 100644 --- a/basis/regexp/dfa/dfa.factor +++ b/basis/regexp/dfa/dfa.factor @@ -39,21 +39,26 @@ IN: regexp.dfa : find-transitions ( dfa-state nfa -- next-dfa-state ) transitions>> - '[ _ at keys ] gather - epsilon swap remove ; + '[ _ at keys [ condition-states ] map concat ] gather + [ tagged-epsilon? not ] filter ; : add-todo-state ( state visited-states new-states -- ) 3dup drop key? [ 3drop ] [ [ conjoin ] [ push ] bi-curry* bi ] if ; +: add-todo-states ( state/condition visited-states new-states -- ) + [ condition-states ] 2dip + '[ _ _ add-todo-state ] each ; + :: new-transitions ( nfa dfa new-states visited-states -- nfa dfa ) new-states [ nfa dfa ] [ pop :> state + state dfa transitions>> maybe-initialize-key state nfa find-transitions [| trans | state trans nfa find-closure :> new-state - new-state visited-states new-states add-todo-state + new-state visited-states new-states add-todo-states state new-state trans dfa set-transition ] each nfa dfa new-states visited-states new-transitions @@ -73,7 +78,7 @@ IN: regexp.dfa : construct-dfa ( nfa -- dfa ) dup initialize-dfa - dup start-state>> 1vector + dup start-state>> condition-states >vector H{ } clone new-transitions [ set-final-states ] keep ; diff --git a/basis/regexp/disambiguate/disambiguate.factor b/basis/regexp/disambiguate/disambiguate.factor index abfe76d832..eac9c7e81d 100644 --- a/basis/regexp/disambiguate/disambiguate.factor +++ b/basis/regexp/disambiguate/disambiguate.factor @@ -42,6 +42,3 @@ TUPLE: parts in out ; ] preserving-epsilon ] assoc-map ] change-transitions ; - -: nfa>dfa ( nfa -- dfa ) - disambiguate construct-dfa minimize ; diff --git a/basis/regexp/minimize/minimize.factor b/basis/regexp/minimize/minimize.factor index c98cf131cb..822ca68caf 100644 --- a/basis/regexp/minimize/minimize.factor +++ b/basis/regexp/minimize/minimize.factor @@ -5,29 +5,11 @@ accessors locals math sorting arrays sets hashtables regexp.dfa combinators.short-circuit regexp.classes ; IN: regexp.minimize -: number-transitions ( transitions numbering -- new-transitions ) - dup '[ - [ _ at ] - [ [ [ _ at ] condition-map ] assoc-map ] bi* - ] assoc-map ; - : table>state-numbers ( table -- assoc ) transitions>> keys [ swap ] H{ } assoc-map-as ; -: map-set ( assoc quot -- new-assoc ) - '[ drop @ dup ] assoc-map ; inline - -: rewrite-transitions ( transition-table assoc quot -- transition-table ) - [ - [ clone ] dip - [ '[ _ at ] change-start-state ] - [ '[ [ _ at ] map-set ] change-final-states ] - [ ] tri - ] dip '[ _ @ ] change-transitions ; inline - : number-states ( table -- newtable ) - dup table>state-numbers - [ number-transitions ] rewrite-transitions ; + dup table>state-numbers transitions-at ; : no-conditions? ( state transition-table -- ? ) transitions>> at values [ condition? ] any? not ; @@ -103,4 +85,4 @@ IN: regexp.minimize [ combine-transitions ] rewrite-transitions ; : minimize ( table -- minimal-table ) - clone number-states combine-states ; + clone number-states ; ! combine-states ; diff --git a/basis/regexp/negation/negation.factor b/basis/regexp/negation/negation.factor index 0cfcdfc6ea..b03223fabf 100644 --- a/basis/regexp/negation/negation.factor +++ b/basis/regexp/negation/negation.factor @@ -2,11 +2,12 @@ ! See http://factorcode.org/license.txt for BSD license. USING: regexp.nfa regexp.disambiguate kernel sequences assocs regexp.classes hashtables accessors fry vectors -regexp.ast regexp.transition-tables regexp.minimize namespaces ; +regexp.ast regexp.transition-tables regexp.minimize +regexp.dfa namespaces ; IN: regexp.negation : ast>dfa ( parse-tree -- minimal-dfa ) - construct-nfa nfa>dfa ; + construct-nfa disambiguate construct-dfa minimize ; CONSTANT: fail-state -1 @@ -33,15 +34,9 @@ CONSTANT: fail-state -1 [ add-fail-state ] change-transitions dup inverse-final-states >>final-states ; -: renumber-transitions ( transitions numbering -- new-transitions ) - dup '[ - [ _ at ] - [ [ [ _ at ] map ] assoc-map ] bi* - ] assoc-map ; - : renumber-states ( transition-table -- transition-table ) dup transitions>> keys [ next-state ] H{ } map>assoc - [ renumber-transitions ] rewrite-transitions ; + transitions-at ; : box-transitions ( transition-table -- transition-table ) [ [ [ 1vector ] assoc-map ] assoc-map ] change-transitions ; diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index 302b1ebc55..2dc2c1798b 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -56,9 +56,16 @@ M:: star nfa-node ( node -- start end ) s2 s3 ; GENERIC: modify-epsilon ( tag -- newtag ) +! Potential off-by-one errors when lookaround nested in lookbehind M: object modify-epsilon ; +M: $ modify-epsilon + multiline option? [ drop end-of-input ] unless ; + +M: ^ modify-epsilon + multiline option? [ drop beginning-of-input ] unless ; + M: tagged-epsilon nfa-node clone [ modify-epsilon ] change-tag add-simple-entry ; diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index 18aef7fa49..5870395b7c 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -54,6 +54,7 @@ ERROR: bad-class name ; { CHAR: D [ digit-class ] } { CHAR: z [ end-of-input ] } + { CHAR: Z [ end-of-file ] } { CHAR: A [ beginning-of-input ] } [ ] } case ; diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index 9425e38727..488ab8cba3 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -45,9 +45,9 @@ IN: regexp-tests ! Dotall mode -- when on, . matches newlines. ! Off by default. [ f ] [ "\n" "." matches? ] unit-test -! [ t ] [ "\n" "(?s)." matches? ] unit-test +[ t ] [ "\n" "(?s:.)" matches? ] unit-test [ t ] [ "\n" R/ ./s matches? ] unit-test -! [ f ] [ "\n\n" "(?s).(?-s)." matches? ] unit-test +[ f ] [ "\n\n" "(?s:.)." matches? ] unit-test [ f ] [ "" ".+" matches? ] unit-test [ t ] [ "a" ".+" matches? ] unit-test @@ -221,17 +221,15 @@ IN: regexp-tests [ t ] [ "c" R/ [A-Z]/i matches? ] unit-test [ f ] [ "3" R/ [A-Z]/i matches? ] unit-test -/* -[ t ] [ "a" "(?i)a" matches? ] unit-test -[ t ] [ "a" "(?i)a" matches? ] unit-test -[ t ] [ "A" "(?i)a" matches? ] unit-test -[ t ] [ "A" "(?i)a" matches? ] unit-test +[ t ] [ "a" "(?i:a)" matches? ] unit-test +[ t ] [ "a" "(?i:a)" matches? ] unit-test +[ t ] [ "A" "(?i:a)" matches? ] unit-test +[ t ] [ "A" "(?i:a)" matches? ] unit-test -[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test -[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test -[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test -[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test -*/ +[ t ] [ "a" R/ (?-i:a)/i matches? ] unit-test +[ t ] [ "a" R/ (?-i:a)/i matches? ] unit-test +[ f ] [ "A" R/ (?-i:a)/i matches? ] unit-test +[ f ] [ "A" R/ (?-i:a)/i matches? ] unit-test [ f ] [ "A" "[a-z]" matches? ] unit-test [ t ] [ "A" R/ [a-z]/i matches? ] unit-test @@ -242,8 +240,8 @@ IN: regexp-tests [ t ] [ "abc" reverse R/ abc/r matches? ] unit-test [ t ] [ "abc" reverse R/ a[bB][cC]/r matches? ] unit-test -[ t ] [ "xabc" R/ abc/ match-index-head >boolean ] unit-test -[ t ] [ "xabc" R/ a[bB][cC]/ match-index-head >boolean ] unit-test +[ t ] [ 3 "xabc" R/ abc/ match-index-from >boolean ] unit-test +[ t ] [ 3 "xabc" R/ a[bB][cC]/ match-index-from >boolean ] unit-test [ t ] [ "s@f" "[a-z.-]@[a-z]" matches? ] unit-test [ f ] [ "a" "[a-z.-]@[a-z]" matches? ] unit-test @@ -276,10 +274,6 @@ IN: regexp-tests [ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" match-head >string ] unit-test -! [ t ] [ "a:b" ".+:?" matches? ] unit-test - -! [ 1 ] [ "hello" ".+?" match length ] unit-test - [ { "1" "2" "3" "4" } ] [ "1ABC2DEF3GHI4" R/ [A-Z]+/ re-split [ >string ] map ] unit-test @@ -304,18 +298,16 @@ IN: regexp-tests [ "-- title --" ] [ "== title ==" R/ =/ "-" re-replace ] unit-test -/* [ f ] [ "ab" "a(?!b)" match-head ] unit-test [ "a" ] [ "ac" "a(?!b)" match-head >string ] unit-test [ t ] [ "fxxbar" "(?!foo).{3}bar" matches? ] unit-test [ f ] [ "foobar" "(?!foo).{3}bar" matches? ] unit-test [ "a" ] [ "ab" "a(?=b)(?=b)" match-head >string ] unit-test -[ "a" ] [ "ba" "a(?<=b)(?<=b)" match-head >string ] unit-test -[ "a" ] [ "cab" "a(?=b)(?<=c)" match-head >string ] unit-test +[ "a" ] [ "ba" "(?<=b)(?<=b)a" match-head >string ] unit-test +[ "a" ] [ "cab" "(?<=c)a(?=b)" match-head >string ] unit-test [ 3 ] [ "foobar" "foo(?=bar)" match-index-head ] unit-test [ f ] [ "foobxr" "foo(?=bar)" match-index-head ] unit-test -*/ ! Bug in parsing word [ t ] [ "a" R' a' matches? ] unit-test @@ -349,56 +341,70 @@ IN: regexp-tests [ f ] [ "foo" dfa>> >boolean ] unit-test [ t ] [ R/ foo/ dfa>> >boolean ] unit-test -! [ t ] [ "a" R/ ^a/ matches? ] unit-test -! [ f ] [ "\na" R/ ^a/ matches? ] unit-test -! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test -! [ f ] [ "\ra" R/ ^a/ matches? ] unit-test +[ t ] [ "a" R/ ^a/ matches? ] unit-test +[ f ] [ "\na" R/ ^a/ matches? ] unit-test +[ f ] [ "\r\na" R/ ^a/ matches? ] unit-test +[ f ] [ "\ra" R/ ^a/ matches? ] unit-test -! [ t ] [ "a" R/ a$/ matches? ] unit-test -! [ f ] [ "a\n" R/ a$/ matches? ] unit-test -! [ f ] [ "a\r" R/ a$/ matches? ] unit-test -! [ f ] [ "a\r\n" R/ a$/ matches? ] unit-test +[ 1 ] [ "a" R/ ^a/ count-matches ] unit-test +[ 0 ] [ "\na" R/ ^a/ count-matches ] unit-test +[ 0 ] [ "\r\na" R/ ^a/ count-matches ] unit-test +[ 0 ] [ "\ra" R/ ^a/ count-matches ] unit-test -! [ t ] [ "a" R/ a$|b$/ matches? ] unit-test -! [ t ] [ "b" R/ a$|b$/ matches? ] unit-test -! [ t ] [ "ab" R/ a$|b$/ matches? ] unit-test -! [ t ] [ "ba" R/ ba$|b$/ matches? ] unit-test +[ t ] [ "a" R/ a$/ matches? ] unit-test +[ f ] [ "a\n" R/ a$/ matches? ] unit-test +[ f ] [ "a\r" R/ a$/ matches? ] unit-test +[ f ] [ "a\r\n" R/ a$/ matches? ] unit-test -! [ t ] [ "a" R/ \Aa/ matches? ] unit-test -! [ f ] [ "\na" R/ \Aaa/ matches? ] unit-test -! [ f ] [ "\r\na" R/ \Aa/ matches? ] unit-test -! [ f ] [ "\ra" R/ \Aa/ matches? ] unit-test +[ 1 ] [ "a" R/ a$/ count-matches ] unit-test +[ 0 ] [ "a\n" R/ a$/ count-matches ] unit-test +[ 0 ] [ "a\r" R/ a$/ count-matches ] unit-test +[ 0 ] [ "a\r\n" R/ a$/ count-matches ] unit-test -! [ t ] [ "a" R/ \Aa/m matches? ] unit-test -! [ f ] [ "\na" R/ \Aaa/m matches? ] unit-test -! [ f ] [ "\r\na" R/ \Aa/m matches? ] unit-test -! [ f ] [ "\ra" R/ \Aa/m matches? ] unit-test +[ t ] [ "a" R/ a$|b$/ matches? ] unit-test +[ t ] [ "b" R/ a$|b$/ matches? ] unit-test +[ f ] [ "ab" R/ a$|b$/ matches? ] unit-test +[ t ] [ "ba" R/ ba$|b$/ matches? ] unit-test -! [ t ] [ "\r\n\n\n\nam" R/ ^am/m matches? ] unit-test +[ t ] [ "a" R/ \Aa/ matches? ] unit-test +[ f ] [ "\na" R/ \Aaa/ matches? ] unit-test +[ f ] [ "\r\na" R/ \Aa/ matches? ] unit-test +[ f ] [ "\ra" R/ \Aa/ matches? ] unit-test -! [ t ] [ "a" R/ \Aa\z/m matches? ] unit-test -! [ f ] [ "a\n" R/ \Aa\z/m matches? ] unit-test +[ t ] [ "a" R/ \Aa/m matches? ] unit-test +[ f ] [ "\na" R/ \Aaa/m matches? ] unit-test +[ f ] [ "\r\na" R/ \Aa/m matches? ] unit-test +[ f ] [ "\ra" R/ \Aa/m matches? ] unit-test +[ 0 ] [ "\ra" R/ \Aa/m count-matches ] unit-test -! [ t ] [ "a\r\n" R/ \Aa\Z/m matches? ] unit-test -! [ t ] [ "a\n" R/ \Aa\Z/m matches? ] unit-test +[ f ] [ "\r\n\n\n\nam" R/ ^am/m matches? ] unit-test +[ 1 ] [ "\r\n\n\n\nam" R/ ^am/m count-matches ] unit-test -! [ t ] [ "a" R/ \Aa\Z/m matches? ] unit-test -! [ f ] [ "\na" R/ \Aaa\Z/m matches? ] unit-test -! [ f ] [ "\r\na" R/ \Aa\Z/m matches? ] unit-test -! [ f ] [ "\ra" R/ \Aa\Z/m matches? ] unit-test +[ t ] [ "a" R/ \Aa\z/m matches? ] unit-test +[ f ] [ "a\n" R/ \Aa\z/m matches? ] unit-test -! [ t ] [ "a" R/ ^a/m matches? ] unit-test -! [ t ] [ "\na" R/ ^a/m matches? ] unit-test -! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test -! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test +[ t ] [ "a\r\n" R/ \Aa\Z/m matches? ] unit-test +[ t ] [ "a\n" R/ \Aa\Z/m matches? ] unit-test -! [ t ] [ "a" R/ a$/m matches? ] unit-test -! [ t ] [ "a\n" R/ a$/m matches? ] unit-test -! [ t ] [ "a\r" R/ a$/m matches? ] unit-test -! [ t ] [ "a\r\n" R/ a$/m matches? ] unit-test +[ t ] [ "a" R/ \Aa\Z/m matches? ] unit-test +[ f ] [ "\na" R/ \Aaa\Z/m matches? ] unit-test +[ f ] [ "\r\na" R/ \Aa\Z/m matches? ] unit-test +[ f ] [ "\ra" R/ \Aa\Z/m matches? ] unit-test -! [ f ] [ "foobxr" "foo\\z" match-index-head ] unit-test -! [ 3 ] [ "foo" "foo\\z" match-index-head ] unit-test +[ t ] [ "a" R/ ^a/m matches? ] unit-test +[ f ] [ "\na" R/ ^a/m matches? ] unit-test +[ 1 ] [ "\na" R/ ^a/m count-matches ] unit-test +[ 1 ] [ "\r\na" R/ ^a/m count-matches ] unit-test +[ 1 ] [ "\ra" R/ ^a/m count-matches ] unit-test + +[ t ] [ "a" R/ a$/m matches? ] unit-test +[ f ] [ "a\n" R/ a$/m matches? ] unit-test +[ 1 ] [ "a\n" R/ a$/m count-matches ] unit-test +[ 1 ] [ "a\r" R/ a$/m count-matches ] unit-test +[ 1 ] [ "a\r\n" R/ a$/m count-matches ] unit-test + +[ f ] [ "foobxr" "foo\\z" match-index-head ] unit-test +[ 3 ] [ "foo" "foo\\z" match-index-head ] unit-test ! [ t ] [ "foo" "\\bfoo\\b" matches? ] unit-test ! [ t ] [ "afoob" "\\Bfoo\\B" matches? ] unit-test diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index 1bd242315f..6693691ba8 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -3,7 +3,7 @@ USING: accessors combinators kernel math sequences strings sets assocs prettyprint.backend prettyprint.custom make lexer namespaces parser arrays fry locals regexp.minimize -regexp.parser regexp.nfa regexp.dfa +regexp.parser regexp.nfa regexp.dfa regexp.classes regexp.transition-tables splitting sorting regexp.ast regexp.negation regexp.matchers regexp.compiler ; IN: regexp @@ -27,6 +27,7 @@ TUPLE: regexp TUPLE: reverse-matcher regexp ; C: reverse-matcher +! Reverse matchers won't work properly with most combinators, for now reverse-matcher : ( ast -- reversed ) "r" string>options ; +M: lookahead question>quot ! Returns ( index string -- ? ) + term>> ast>dfa dfa>shortest-quotation ; + +M: lookbehind question>quot ! Returns ( index string -- ? ) + term>> + ast>dfa dfa>reverse-shortest-quotation + [ [ 1- ] dip ] prepose ; + : compile-reverse ( regexp -- regexp ) dup '[ [ _ get-ast - ast>dfa dfa>quotation + ast>dfa dfa>reverse-quotation ] unless* ] change-reverse-dfa ; -M: regexp match-index-from ( string regexp -- index/f ) +M: regexp match-index-from compile-regexp dfa>> match-index-from ; -M: reverse-matcher match-index-from ( string regexp -- index/f ) - [ ] [ regexp>> compile-reverse reverse-dfa>> ] bi* +M: reverse-matcher match-index-from + regexp>> compile-reverse reverse-dfa>> match-index-from ; +! The following two should do some caching + : find-regexp-syntax ( string -- prefix suffix ) { { "R/ " "/" } diff --git a/basis/regexp/transition-tables/transition-tables.factor b/basis/regexp/transition-tables/transition-tables.factor index 2fad7451b0..89471d2ce2 100644 --- a/basis/regexp/transition-tables/transition-tables.factor +++ b/basis/regexp/transition-tables/transition-tables.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs fry hashtables kernel sequences -vectors locals ; +vectors locals regexp.classes ; IN: regexp.transition-tables TUPLE: transition-table transitions start-state final-states ; @@ -12,10 +12,11 @@ TUPLE: transition-table transitions start-state final-states ; H{ } clone >>final-states ; : maybe-initialize-key ( key hashtable -- ) + ! Why do we have to do this? 2dup key? [ 2drop ] [ [ H{ } clone ] 2dip set-at ] if ; :: (set-transition) ( from to obj hash -- ) - to hash maybe-initialize-key + to condition? [ to hash maybe-initialize-key ] unless from hash at [ [ to obj ] dip set-at ] [ to obj associate from hash set-at ] if* ; @@ -31,3 +32,23 @@ TUPLE: transition-table transitions start-state final-states ; : add-transition ( from to obj transition-table -- ) transitions>> (add-transition) ; + +: map-set ( assoc quot -- new-assoc ) + '[ drop @ dup ] assoc-map ; inline + +: rewrite-transitions ( transition-table assoc quot -- transition-table ) + [ + [ clone ] dip + [ '[ _ condition-at ] change-start-state ] + [ '[ [ _ at ] map-set ] change-final-states ] + [ ] tri + ] dip '[ _ @ ] change-transitions ; inline + +: number-transitions ( transitions numbering -- new-transitions ) + dup '[ + [ _ at ] + [ [ _ condition-at ] assoc-map ] bi* + ] assoc-map ; + +: transitions-at ( transitions numbering -- transitions ) + [ number-transitions ] rewrite-transitions ;