diff --git a/basis/regexp/compiler/compiler.factor b/basis/regexp/compiler/compiler.factor new file mode 100644 index 0000000000..a322eb2387 --- /dev/null +++ b/basis/regexp/compiler/compiler.factor @@ -0,0 +1,65 @@ +! Copyright (C) 2009 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: regexp regexp.private regexp.classes kernel sequences regexp.negation +quotations regexp.minimize assocs fry math locals combinators +accessors words compiler.units ; +IN: regexp.compiler + +: literals>cases ( literal-transitions -- case-body ) + [ 1quotation ] assoc-map ; + +: non-literals>dispatch ( non-literal-transitions -- quot ) + [ [ '[ dup _ class-member? ] ] [ 1quotation ] bi* ] assoc-map + [ 3drop f ] suffix '[ _ cond ] ; + +: split-literals ( transitions -- case default ) + ! Convert disjunction of literals to literals. Also maybe small ranges. + >alist [ first integer? ] partition + [ literals>cases ] [ non-literals>dispatch ] bi* ; + +USING: kernel.private strings sequences.private ; + +:: step ( index str case-body final? -- match? ) + index str bounds-check? [ + index 1+ str + index str nth-unsafe + case-body case + ] [ final? ] if ; inline + +: transitions>quot ( transitions final-state? -- quot ) + [ split-literals suffix ] dip + '[ { array-capacity string } declare _ _ step ] ; + +: word>quot ( word dfa -- quot ) + [ transitions>> at ] + [ final-states>> key? ] 2bi + transitions>quot ; + +: states>code ( words dfa -- ) + '[ + [ + dup _ word>quot + (( index string -- ? )) define-declared + ] each + ] with-compilation-unit ; + +: transitions-at ( transitions assoc -- new-transitions ) + dup '[ + [ _ at ] + [ [ _ at ] assoc-map ] bi* + ] assoc-map ; + +: states>words ( dfa -- words dfa ) + dup transitions>> keys [ gensym ] H{ } map>assoc + [ [ transitions-at ] rewrite-transitions ] + [ values ] + bi swap ; + +: dfa>word ( dfa -- word ) + states>words [ states>code ] keep start-state>> ; + +: run-regexp ( string word -- ? ) + [ 0 ] 2dip execute ; inline + +: regexp>quotation ( regexp -- quot ) + compile-regexp dfa>> dfa>word '[ _ run-regexp ] ; diff --git a/basis/regexp/dfa/dfa.factor b/basis/regexp/dfa/dfa.factor index acf59b0637..01e3e01119 100644 --- a/basis/regexp/dfa/dfa.factor +++ b/basis/regexp/dfa/dfa.factor @@ -49,7 +49,7 @@ IN: regexp.dfa [| trans | state trans nfa find-closure :> new-state new-state visited-states new-states add-todo-state - state new-state trans transition make-transition dfa add-transition + state new-state trans dfa add-transition ] each nfa dfa new-states visited-states new-transitions ] if-empty ; diff --git a/basis/regexp/negation/negation.factor b/basis/regexp/negation/negation.factor index 67e77ac7ca..0cfcdfc6ea 100644 --- a/basis/regexp/negation/negation.factor +++ b/basis/regexp/negation/negation.factor @@ -48,7 +48,7 @@ CONSTANT: fail-state -1 : unify-final-state ( transition-table -- transition-table ) dup [ final-states>> keys ] keep - '[ -2 epsilon _ add-transition ] each + '[ -2 epsilon _ add-transition ] each H{ { -2 -2 } } >>final-states ; : adjoin-dfa ( transition-table -- start end ) diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index 6362681168..55147a1d26 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -51,12 +51,12 @@ SYMBOL: nfa-table GENERIC: nfa-node ( node -- start-state end-state ) -: add-simple-entry ( obj class -- start-state end-state ) - [ next-state next-state 2dup ] 2dip - make-transition nfa-table get add-transition ; +: add-simple-entry ( obj -- start-state end-state ) + [ next-state next-state 2dup ] dip + nfa-table get add-transition ; : epsilon-transition ( source target -- ) - epsilon nfa-table get add-transition ; + epsilon nfa-table get add-transition ; M:: star nfa-node ( node -- start end ) node term>> nfa-node :> s1 :> s0 @@ -69,7 +69,7 @@ M:: star nfa-node ( node -- start end ) s2 s3 ; M: tagged-epsilon nfa-node - literal-transition add-simple-entry ; + add-simple-entry ; M: concatenation nfa-node ( node -- start end ) [ first>> ] [ second>> ] bi @@ -103,9 +103,7 @@ M: integer modify-class ] when ; M: integer nfa-node ( node -- start end ) - modify-class dup class? - class-transition literal-transition ? - add-simple-entry ; + modify-class add-simple-entry ; M: primitive-class modify-class class>> modify-class ; @@ -141,7 +139,7 @@ M: range modify-class ] when ; M: class nfa-node - modify-class class-transition add-simple-entry ; + modify-class add-simple-entry ; M: with-options nfa-node ( node -- start end ) dup options>> [ tree>> nfa-node ] using-options ; diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index 54bc305b4f..71df08285f 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -240,7 +240,9 @@ IN: regexp-tests [ t ] [ "abc" R/ abc/r matches? ] unit-test [ t ] [ "abc" R/ a[bB][cC]/r matches? ] unit-test -! [ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/ matches? ] unit-test ! FIXME + +[ t ] [ "xabc" R/ abc/ match-from-end >boolean ] unit-test +[ t ] [ "xabc" R/ a[bB][cC]/ match-from-end >boolean ] unit-test [ t ] [ "s@f" "[a-z.-]@[a-z]" matches? ] unit-test [ f ] [ "a" "[a-z.-]@[a-z]" matches? ] unit-test diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index 8d4f948827..e9cd5328e2 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -12,38 +12,48 @@ TUPLE: regexp { raw read-only } { parse-tree read-only } { options read-only } - dfa ; + dfa reverse-dfa ; : make-regexp ( string ast -- regexp ) - f f f regexp boa ; foldable + f f f f regexp boa ; foldable ! Foldable because, when the dfa slot is set, ! it'll be set to the same thing regardless of who sets it : ( string options -- regexp ) [ dup parse-regexp ] [ string>options ] bi* - f regexp boa ; + f f regexp boa ; : ( string -- regexp ) "" ; > ] [ options>> ] bi ; + : compile-regexp ( regexp -- regexp ) - dup dfa>> [ - dup - [ parse-tree>> ] - [ options>> ] bi - ast>dfa - >>dfa - ] unless ; + dup '[ [ _ get-ast ast>dfa ] unless* ] change-dfa ; + +: ( ast -- reversed ) + "r" string>options ; + +: compile-reverse ( regexp -- regexp ) + dup '[ [ _ get-ast ast>dfa ] unless* ] change-reverse-dfa ; : (match) ( string regexp -- dfa-traverser ) - compile-regexp dfa>> do-match ; inline + compile-regexp dfa>> do-match ; + +: (match-reversed) ( string regexp -- dfa-traverser ) + [ ] [ compile-reverse reverse-dfa>> ] bi* + do-match ; PRIVATE> : match ( string regexp -- slice/f ) (match) return-match ; +: match-from-end ( string regexp -- slice/f ) + (match-reversed) return-match ; + : matches? ( string regexp -- ? ) dupd match [ [ length ] bi@ = ] [ drop f ] if* ; @@ -109,11 +119,18 @@ PRIVATE> { "R| " "|" } } swap [ subseq? not nip ] curry assoc-find drop ; +: take-until ( end lexer -- string ) + dup skip-blank [ + [ index-from ] 2keep + [ swapd subseq ] + [ 2drop 1+ ] 3bi + ] change-lexer-column ; + +: parse-noblank-token ( lexer -- str/f ) + dup still-parsing-line? [ (parse-token) ] [ drop f ] if ; + : parsing-regexp ( accum end -- accum ) - lexer get dup skip-blank - [ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column - lexer get dup still-parsing-line? - [ (parse-token) ] [ drop f ] if + lexer get [ take-until ] [ parse-noblank-token ] bi compile-regexp parsed ; PRIVATE> diff --git a/basis/regexp/transition-tables/transition-tables.factor b/basis/regexp/transition-tables/transition-tables.factor index c02ebce91f..2b0a5c2bcc 100644 --- a/basis/regexp/transition-tables/transition-tables.factor +++ b/basis/regexp/transition-tables/transition-tables.factor @@ -1,32 +1,9 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs fry hashtables kernel sequences -vectors ; +vectors locals ; IN: regexp.transition-tables -TUPLE: transition from to obj ; -TUPLE: literal-transition < transition ; -TUPLE: class-transition < transition ; -TUPLE: default-transition < transition ; - -TUPLE: literal obj ; -TUPLE: class obj ; -TUPLE: default ; -: make-transition ( from to obj class -- obj ) - new - swap >>obj - swap >>to - swap >>from ; - -: ( from to obj -- transition ) - literal-transition make-transition ; - -: ( from to obj -- transition ) - class-transition make-transition ; - -: ( from to -- transition ) - t default-transition make-transition ; - TUPLE: transition-table transitions start-state final-states ; : ( -- transition-table ) @@ -37,12 +14,11 @@ TUPLE: transition-table transitions start-state final-states ; : maybe-initialize-key ( key hashtable -- ) 2dup key? [ 2drop ] [ [ H{ } clone ] 2dip set-at ] if ; -: set-transition ( transition hash -- ) - #! set the state as a key - 2dup [ to>> ] dip maybe-initialize-key - [ [ to>> ] [ obj>> ] [ from>> ] tri ] dip - 2dup at* [ 2nip push-at ] - [ drop [ H{ } clone [ push-at ] keep ] 2dip set-at ] if ; +:: set-transition ( from to obj hash -- ) + to hash maybe-initialize-key + from hash at + [ [ to obj ] dip push-at ] + [ to 1vector obj associate from hash set-at ] if* ; -: add-transition ( transition transition-table -- ) +: add-transition ( from to obj transition-table -- ) transitions>> set-transition ; diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor index 5d48353f56..7a0d83051b 100644 --- a/basis/regexp/traversal/traversal.factor +++ b/basis/regexp/traversal/traversal.factor @@ -9,7 +9,6 @@ TUPLE: dfa-traverser dfa-table current-state text - match-failed? start-index current-index matches ; @@ -25,9 +24,6 @@ TUPLE: dfa-traverser [ current-state>> ] [ dfa-table>> final-states>> ] bi key? ; -: beginning-of-text? ( dfa-traverser -- ? ) - current-index>> 0 <= ; inline - : end-of-text? ( dfa-traverser -- ? ) [ current-index>> ] [ text>> length ] bi >= ; inline @@ -35,7 +31,6 @@ TUPLE: dfa-traverser { [ current-state>> not ] [ end-of-text? ] - [ match-failed?>> ] } 1|| ; : save-final-state ( dfa-straverser -- ) @@ -59,7 +54,8 @@ TUPLE: dfa-traverser 1 text-character ; : increment-state ( dfa-traverser state -- dfa-traverser ) - [ [ 1 + ] change-current-index ] dip >>current-state ; + >>current-state + [ 1 + ] change-current-index ; : match-literal ( transition from-state table -- to-state/f ) transitions>> at at ; @@ -69,11 +65,8 @@ TUPLE: dfa-traverser swap '[ drop _ swap class-member? ] assoc-find spin ? ] [ drop ] if ; -: match-default ( transition from-state table -- to-state/f ) - [ drop ] 2dip transitions>> at t swap at ; - : match-transition ( obj from-state dfa -- to-state/f ) - { [ match-literal ] [ match-class ] [ match-default ] } 3|| ; + { [ match-literal ] [ match-class ] } 3|| ; : setup-match ( match -- obj state dfa-table ) [ [ current-index>> ] [ text>> ] bi nth ] @@ -90,6 +83,6 @@ TUPLE: dfa-traverser dup matches>> [ drop f ] [ - [ [ text>> ] [ start-index>> ] bi ] - [ peek ] bi* rot + [ [ start-index>> ] [ text>> ] bi ] + [ peek ] bi* swap ] if-empty ;