diff --git a/basis/regexp/combinators/combinators-tests.factor b/basis/regexp/combinators/combinators-tests.factor index 6690440345..ddfd0dcaad 100644 --- a/basis/regexp/combinators/combinators-tests.factor +++ b/basis/regexp/combinators/combinators-tests.factor @@ -1,6 +1,6 @@ ! Copyright (C) 2009 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. -USING: regexp.combinators tools.test regexp kernel sequences regexp.matchers ; +USING: regexp.combinators tools.test regexp kernel sequences ; IN: regexp.combinators.tests : strings ( -- regexp ) diff --git a/basis/regexp/compiler/compiler.factor b/basis/regexp/compiler/compiler.factor index eedf05a81e..0e0c0eaae6 100644 --- a/basis/regexp/compiler/compiler.factor +++ b/basis/regexp/compiler/compiler.factor @@ -1,19 +1,19 @@ ! Copyright (C) 2009 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: regexp.classes kernel sequences regexp.negation -quotations regexp.minimize assocs fry math locals combinators +quotations assocs fry math locals combinators accessors words compiler.units kernel.private strings -sequences.private arrays regexp.matchers call namespaces +sequences.private arrays call namespaces regexp.transition-tables combinators.short-circuit ; IN: regexp.compiler GENERIC: question>quot ( question -- quot ) -quot drop [ 2drop t ] ; M: beginning-of-input question>quot @@ -122,34 +122,23 @@ C: box [ values ] bi swap ; -: dfa>word ( dfa -- word ) +: dfa>main-word ( dfa -- word ) states>words [ states>code ] keep start-state>> ; -: check-string ( string -- string ) - ! Make this configurable - dup string? [ "String required" throw ] unless ; - -: setup-regexp ( start-index string -- f start-index string ) - [ f ] [ >fixnum ] [ check-string ] tri* ; inline - PRIVATE> -! The quotation returned is ( start-index string -- i/f ) +: simple-define-temp ( quot effect -- word ) + [ [ define-temp ] with-compilation-unit ] call( quot effect -- word ) ; -: dfa>quotation ( dfa -- quot ) - dfa>word execution-quot '[ setup-regexp @ ] ; +: dfa>word ( dfa -- quot ) + dfa>main-word execution-quot '[ drop [ f ] 2dip @ ] + (( start-index string regexp -- i/f )) simple-define-temp ; -: dfa>shortest-quotation ( dfa -- quot ) - t shortest? [ dfa>quotation ] with-variable ; +: dfa>shortest-word ( dfa -- word ) + t shortest? [ dfa>word ] with-variable ; -: dfa>reverse-quotation ( dfa -- quot ) - t backwards? [ dfa>quotation ] with-variable ; +: dfa>reverse-word ( dfa -- word ) + t backwards? [ dfa>word ] with-variable ; -: dfa>reverse-shortest-quotation ( dfa -- quot ) - t backwards? [ dfa>shortest-quotation ] with-variable ; - -TUPLE: quot-matcher quot ; -C: quot-matcher - -M: quot-matcher match-index-from - quot>> call( index string -- i/f ) ; +: dfa>reverse-shortest-word ( dfa -- word ) + t backwards? [ dfa>shortest-word ] with-variable ; diff --git a/basis/regexp/negation/negation.factor b/basis/regexp/negation/negation.factor index 0633dca192..8b0a2f6edf 100644 --- a/basis/regexp/negation/negation.factor +++ b/basis/regexp/negation/negation.factor @@ -6,9 +6,6 @@ regexp.ast regexp.transition-tables regexp.minimize regexp.dfa namespaces ; IN: regexp.negation -: ast>dfa ( parse-tree -- minimal-dfa ) - construct-nfa disambiguate construct-dfa minimize ; - CONSTANT: fail-state -1 : add-default-transition ( state's-transitions -- new-state's-transitions ) @@ -49,5 +46,8 @@ CONSTANT: fail-state -1 [ final-states>> keys first ] [ nfa-table get [ transitions>> ] bi@ swap update ] tri ; +: ast>dfa ( parse-tree -- minimal-dfa ) + construct-nfa disambiguate construct-dfa minimize ; + M: negation nfa-node ( node -- start end ) term>> ast>dfa negate-table adjoin-dfa ; diff --git a/basis/regexp/regexp-docs.factor b/basis/regexp/regexp-docs.factor index d77abe877e..ce4a54df87 100644 --- a/basis/regexp/regexp-docs.factor +++ b/basis/regexp/regexp-docs.factor @@ -1,6 +1,6 @@ ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: kernel strings help.markup help.syntax regexp.matchers math ; +USING: kernel strings help.markup help.syntax math ; IN: regexp ABOUT: "regexp" diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index 99cb8dbd22..fa907011fd 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. USING: regexp tools.test kernel sequences regexp.parser regexp.private -eval strings multiline accessors regexp.matchers ; +eval strings multiline accessors ; IN: regexp-tests \ must-infer @@ -239,11 +239,11 @@ IN: regexp-tests [ f ] [ "A" "\\p{Lower}" matches? ] unit-test [ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test -[ t ] [ "abc" reverse R/ abc/r matches? ] unit-test -[ t ] [ "abc" reverse R/ a[bB][cC]/r matches? ] unit-test +[ t ] [ "abc" R/ abc/r matches? ] unit-test +[ t ] [ "abc" R/ a[bB][cC]/r matches? ] unit-test -[ t ] [ 3 "xabc" R/ abc/ match-index-from >boolean ] unit-test -[ t ] [ 3 "xabc" R/ a[bB][cC]/ match-index-from >boolean ] unit-test +[ t ] [ 3 "xabc" R/ abc/r match-index-from >boolean ] unit-test +[ t ] [ 3 "xabc" R/ a[bB][cC]/r match-index-from >boolean ] unit-test [ t ] [ "s@f" "[a-z.-]@[a-z]" matches? ] unit-test [ f ] [ "a" "[a-z.-]@[a-z]" matches? ] unit-test @@ -341,9 +341,19 @@ IN: regexp-tests [ t ] [ "aaaa" R/ .*a./ matches? ] unit-test +[ f ] [ "ab" R/ (?~ac|\p{Lower}b)/ matches? ] unit-test +[ f ] [ "ab" R/ (?~ac|[a-z]b)/ matches? ] unit-test +[ f ] [ "ac" R/ (?~ac|\p{Lower}b)/ matches? ] unit-test +[ f ] [ "ac" R/ (?~ac|[a-z]b)/ matches? ] unit-test +[ f ] [ "ac" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test +[ f ] [ "ab" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test +[ f ] [ "πb" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test +[ t ] [ "πc" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test +[ t ] [ "Ab" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test + ! DFA is compiled when needed, or when literal -[ f ] [ "foo" dfa>> >boolean ] unit-test -[ t ] [ R/ foo/ dfa>> >boolean ] unit-test +[ regexp-initial-word ] [ "foo" dfa>> ] unit-test +[ f ] [ R/ foo/ dfa>> \ regexp-initial-word = ] unit-test [ t ] [ "a" R/ ^a/ matches? ] unit-test [ f ] [ "\na" R/ ^a/ matches? ] unit-test diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index f938ddf60a..aacd888ccb 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -2,71 +2,162 @@ ! See http://factorcode.org/license.txt for BSD license. USING: accessors combinators kernel math sequences strings sets assocs prettyprint.backend prettyprint.custom make lexer -namespaces parser arrays fry locals regexp.minimize -regexp.parser regexp.nfa regexp.dfa regexp.classes -regexp.transition-tables splitting sorting regexp.ast -regexp.negation regexp.matchers regexp.compiler ; +namespaces parser arrays fry locals regexp.parser splitting +sorting regexp.ast regexp.negation regexp.compiler words +call call.private math.ranges ; IN: regexp TUPLE: regexp { raw read-only } { parse-tree read-only } { options read-only } - dfa reverse-dfa ; + dfa next-match ; -: make-regexp ( string ast -- regexp ) - f f f f regexp boa ; foldable - ! Foldable because, when the dfa slot is set, - ! it'll be set to the same thing regardless of who sets it +TUPLE: reverse-regexp < regexp ; -: ( string options -- regexp ) - [ dup parse-regexp ] [ string>options ] bi* - f f regexp boa ; + ( string -- regexp ) "" ; +: maybe-negated ( lookaround quot -- regexp-quot ) + '[ term>> @ ] [ positive?>> [ ] [ not ] ? ] bi compose ; inline -TUPLE: reverse-matcher regexp ; -C: reverse-matcher -! Reverse matchers won't work properly with most combinators, for now +M: lookahead question>quot ! Returns ( index string -- ? ) + [ ast>dfa dfa>shortest-word '[ f _ execute ] ] maybe-negated ; + +: ( ast -- reversed ) + "r" string>options ; + +M: lookbehind question>quot ! Returns ( index string -- ? ) + [ + + ast>dfa dfa>reverse-shortest-word + '[ [ 1- ] dip f _ execute ] + ] maybe-negated ; + +> execute( index string regexp -- i/f ) ; + +: match-index-head ( string regexp -- index/f ) + [ 0 ] 2dip [ check-string ] dip match-index-from ; + +PRIVATE> + +: matches? ( string regexp -- ? ) + dupd match-index-head + [ swap length = ] [ drop f ] if* ; + + ] [ 2drop f ] if* ; inline + +: match-from ( i string quot -- slice/f ) + [ [ length [a,b) ] keep ] dip + '[ _ _ match-slice ] map-find drop ; inline + +: next-match ( i string quot -- i match/f ) + match-from [ dup [ to>> ] when ] keep ; inline + +: do-next-match ( i string regexp -- i match/f ) + dup next-match>> execute( i string regexp -- i match/f ) ; + +PRIVATE> + +: all-matches ( string regexp -- seq ) + [ check-string ] dip + [ 0 [ dup ] ] 2dip '[ _ _ do-next-match ] produce + nip but-last ; + +: count-matches ( string regexp -- n ) + all-matches length ; + +> ] map 0 prefix + slices [ from>> ] map string length suffix + [ string ] 2map ; + +: match-head ( str regexp -- slice/f ) + [ + [ 0 ] [ check-string ] [ dup dfa>> '[ _ _ execute ] ] tri* + match-from + ] call( str regexp -- slice/f ) ; + +PRIVATE> + +: re-split1 ( string regexp -- before after/f ) + dupd match-head [ 1array split-slices first2 ] [ f ] if* ; + +: re-split ( string regexp -- seq ) + dupd all-matches split-slices ; + +: re-replace ( string regexp replacement -- result ) + [ re-split ] dip join ; > ] [ options>> ] bi ; -: compile-regexp ( regexp -- regexp ) - dup '[ [ _ get-ast ast>dfa dfa>quotation ] unless* ] change-dfa ; +GENERIC: compile-regexp ( regex -- regexp ) -: ( ast -- reversed ) - "r" string>options ; +: regexp-initial-word ( i string regexp -- i/f ) + compile-regexp match-index-from ; -: maybe-negated ( lookaround quot -- regexp-quot ) - '[ term>> @ ] [ positive?>> [ ] [ not ] ? ] bi compose ; inline - -M: lookahead question>quot ! Returns ( index string -- ? ) - [ ast>dfa dfa>shortest-quotation ] maybe-negated ; - -M: lookbehind question>quot ! Returns ( index string -- ? ) - [ - - ast>dfa dfa>reverse-shortest-quotation - [ [ 1- ] dip ] prepose - ] maybe-negated ; - -: compile-reverse ( regexp -- regexp ) +: do-compile-regexp ( regexp -- regexp ) dup '[ - [ - _ get-ast - ast>dfa dfa>reverse-quotation - ] unless* - ] change-reverse-dfa ; + dup \ regexp-initial-word = + [ drop _ get-ast ast>dfa dfa>word ] when + ] change-dfa ; -M: regexp match-index-from - compile-regexp dfa>> match-index-from ; +M: regexp compile-regexp ( regexp -- regexp ) + do-compile-regexp ; -M: reverse-matcher match-index-from - regexp>> compile-reverse reverse-dfa>> - match-index-from ; +M: reverse-regexp compile-regexp ( regexp -- regexp ) + t backwards? [ do-compile-regexp ] with-variable ; + +GENERIC: compile-next-match ( regexp -- regexp ) + +: next-initial-word ( i string regexp -- i slice/f ) + compile-next-match do-next-match ; + +M: regexp compile-next-match ( regexp -- regexp ) + dup '[ + dup \ next-initial-word = [ + drop _ compile-regexp dfa>> + '[ _ '[ _ _ execute ] next-match ] + (( i string -- i match/f )) simple-define-temp + ] when + ] change-next-match ; + +! Write M: reverse-regexp compile-next-match + +PRIVATE> + +: new-regexp ( string ast options class -- regexp ) + [ \ regexp-initial-word \ next-initial-word ] dip boa ; inline + +: make-regexp ( string ast -- regexp ) + f f regexp new-regexp ; + +: ( string options -- regexp ) + [ dup parse-regexp ] [ string>options ] bi* + dup on>> reversed-regexp swap member? + [ reverse-regexp new-regexp ] + [ regexp new-regexp ] if ; + +: ( string -- regexp ) "" ; + + compile-regexp parsed ; + compile-next-match parsed ; PRIVATE> @@ -120,3 +211,4 @@ M: regexp pprint* [ options>> options>string % ] bi ] "" make ] keep present-text ; +