Many regexp changes, improving speed and organization

db4
Daniel Ehrenberg 2009-03-10 18:27:04 -05:00
parent 1d47c232f7
commit 638cef2824
6 changed files with 175 additions and 84 deletions

View File

@ -1,6 +1,6 @@
! Copyright (C) 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: regexp.combinators tools.test regexp kernel sequences regexp.matchers ;
USING: regexp.combinators tools.test regexp kernel sequences ;
IN: regexp.combinators.tests
: strings ( -- regexp )

View File

@ -1,19 +1,19 @@
! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: regexp.classes kernel sequences regexp.negation
quotations regexp.minimize assocs fry math locals combinators
quotations assocs fry math locals combinators
accessors words compiler.units kernel.private strings
sequences.private arrays regexp.matchers call namespaces
sequences.private arrays call namespaces
regexp.transition-tables combinators.short-circuit ;
IN: regexp.compiler
GENERIC: question>quot ( question -- quot )
<PRIVATE
SYMBOL: shortest?
SYMBOL: backwards?
<PRIVATE
M: t question>quot drop [ 2drop t ] ;
M: beginning-of-input question>quot
@ -122,34 +122,23 @@ C: <box> box
[ values ]
bi swap ;
: dfa>word ( dfa -- word )
: dfa>main-word ( dfa -- word )
states>words [ states>code ] keep start-state>> ;
: check-string ( string -- string )
! Make this configurable
dup string? [ "String required" throw ] unless ;
: setup-regexp ( start-index string -- f start-index string )
[ f ] [ >fixnum ] [ check-string ] tri* ; inline
PRIVATE>
! The quotation returned is ( start-index string -- i/f )
: simple-define-temp ( quot effect -- word )
[ [ define-temp ] with-compilation-unit ] call( quot effect -- word ) ;
: dfa>quotation ( dfa -- quot )
dfa>word execution-quot '[ setup-regexp @ ] ;
: dfa>word ( dfa -- quot )
dfa>main-word execution-quot '[ drop [ f ] 2dip @ ]
(( start-index string regexp -- i/f )) simple-define-temp ;
: dfa>shortest-quotation ( dfa -- quot )
t shortest? [ dfa>quotation ] with-variable ;
: dfa>shortest-word ( dfa -- word )
t shortest? [ dfa>word ] with-variable ;
: dfa>reverse-quotation ( dfa -- quot )
t backwards? [ dfa>quotation ] with-variable ;
: dfa>reverse-word ( dfa -- word )
t backwards? [ dfa>word ] with-variable ;
: dfa>reverse-shortest-quotation ( dfa -- quot )
t backwards? [ dfa>shortest-quotation ] with-variable ;
TUPLE: quot-matcher quot ;
C: <quot-matcher> quot-matcher
M: quot-matcher match-index-from
quot>> call( index string -- i/f ) ;
: dfa>reverse-shortest-word ( dfa -- word )
t backwards? [ dfa>shortest-word ] with-variable ;

View File

@ -6,9 +6,6 @@ regexp.ast regexp.transition-tables regexp.minimize
regexp.dfa namespaces ;
IN: regexp.negation
: ast>dfa ( parse-tree -- minimal-dfa )
construct-nfa disambiguate construct-dfa minimize ;
CONSTANT: fail-state -1
: add-default-transition ( state's-transitions -- new-state's-transitions )
@ -49,5 +46,8 @@ CONSTANT: fail-state -1
[ final-states>> keys first ]
[ nfa-table get [ transitions>> ] bi@ swap update ] tri ;
: ast>dfa ( parse-tree -- minimal-dfa )
construct-nfa disambiguate construct-dfa minimize ;
M: negation nfa-node ( node -- start end )
term>> ast>dfa negate-table adjoin-dfa ;

View File

@ -1,6 +1,6 @@
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: kernel strings help.markup help.syntax regexp.matchers math ;
USING: kernel strings help.markup help.syntax math ;
IN: regexp
ABOUT: "regexp"

View File

@ -1,7 +1,7 @@
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: regexp tools.test kernel sequences regexp.parser regexp.private
eval strings multiline accessors regexp.matchers ;
eval strings multiline accessors ;
IN: regexp-tests
\ <regexp> must-infer
@ -239,11 +239,11 @@ IN: regexp-tests
[ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
[ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test
[ t ] [ "abc" reverse R/ abc/r matches? ] unit-test
[ t ] [ "abc" reverse R/ a[bB][cC]/r matches? ] unit-test
[ t ] [ "abc" R/ abc/r matches? ] unit-test
[ t ] [ "abc" R/ a[bB][cC]/r matches? ] unit-test
[ t ] [ 3 "xabc" R/ abc/ <reverse-matcher> match-index-from >boolean ] unit-test
[ t ] [ 3 "xabc" R/ a[bB][cC]/ <reverse-matcher> match-index-from >boolean ] unit-test
[ t ] [ 3 "xabc" R/ abc/r match-index-from >boolean ] unit-test
[ t ] [ 3 "xabc" R/ a[bB][cC]/r match-index-from >boolean ] unit-test
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
@ -341,9 +341,19 @@ IN: regexp-tests
[ t ] [ "aaaa" R/ .*a./ matches? ] unit-test
[ f ] [ "ab" R/ (?~ac|\p{Lower}b)/ matches? ] unit-test
[ f ] [ "ab" R/ (?~ac|[a-z]b)/ matches? ] unit-test
[ f ] [ "ac" R/ (?~ac|\p{Lower}b)/ matches? ] unit-test
[ f ] [ "ac" R/ (?~ac|[a-z]b)/ matches? ] unit-test
[ f ] [ "ac" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test
[ f ] [ "ab" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test
[ f ] [ "πb" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test
[ t ] [ "πc" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test
[ t ] [ "Ab" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test
! DFA is compiled when needed, or when literal
[ f ] [ "foo" <regexp> dfa>> >boolean ] unit-test
[ t ] [ R/ foo/ dfa>> >boolean ] unit-test
[ regexp-initial-word ] [ "foo" <regexp> dfa>> ] unit-test
[ f ] [ R/ foo/ dfa>> \ regexp-initial-word = ] unit-test
[ t ] [ "a" R/ ^a/ matches? ] unit-test
[ f ] [ "\na" R/ ^a/ matches? ] unit-test

View File

@ -2,71 +2,162 @@
! See http://factorcode.org/license.txt for BSD license.
USING: accessors combinators kernel math sequences strings sets
assocs prettyprint.backend prettyprint.custom make lexer
namespaces parser arrays fry locals regexp.minimize
regexp.parser regexp.nfa regexp.dfa regexp.classes
regexp.transition-tables splitting sorting regexp.ast
regexp.negation regexp.matchers regexp.compiler ;
namespaces parser arrays fry locals regexp.parser splitting
sorting regexp.ast regexp.negation regexp.compiler words
call call.private math.ranges ;
IN: regexp
TUPLE: regexp
{ raw read-only }
{ parse-tree read-only }
{ options read-only }
dfa reverse-dfa ;
dfa next-match ;
: make-regexp ( string ast -- regexp )
f f <options> f f regexp boa ; foldable
! Foldable because, when the dfa slot is set,
! it'll be set to the same thing regardless of who sets it
TUPLE: reverse-regexp < regexp ;
: <optioned-regexp> ( string options -- regexp )
[ dup parse-regexp ] [ string>options ] bi*
f f regexp boa ;
<PRIVATE
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
: maybe-negated ( lookaround quot -- regexp-quot )
'[ term>> @ ] [ positive?>> [ ] [ not ] ? ] bi compose ; inline
TUPLE: reverse-matcher regexp ;
C: <reverse-matcher> reverse-matcher
! Reverse matchers won't work properly with most combinators, for now
M: lookahead question>quot ! Returns ( index string -- ? )
[ ast>dfa dfa>shortest-word '[ f _ execute ] ] maybe-negated ;
: <reversed-option> ( ast -- reversed )
"r" string>options <with-options> ;
M: lookbehind question>quot ! Returns ( index string -- ? )
[
<reversed-option>
ast>dfa dfa>reverse-shortest-word
'[ [ 1- ] dip f _ execute ]
] maybe-negated ;
<PRIVATE
: check-string ( string -- string )
! Make this configurable
dup string? [ "String required" throw ] unless ;
: match-index-from ( i string regexp -- index/f )
! This word is unsafe. It assumes that i is a fixnum
! and that string is a string.
dup dfa>> execute( index string regexp -- i/f ) ;
: match-index-head ( string regexp -- index/f )
[ 0 ] 2dip [ check-string ] dip match-index-from ;
PRIVATE>
: matches? ( string regexp -- ? )
dupd match-index-head
[ swap length = ] [ drop f ] if* ;
<PRIVATE
: match-slice ( i string quot -- slice/f )
[ 2dup ] dip call
[ swap <slice> ] [ 2drop f ] if* ; inline
: match-from ( i string quot -- slice/f )
[ [ length [a,b) ] keep ] dip
'[ _ _ match-slice ] map-find drop ; inline
: next-match ( i string quot -- i match/f )
match-from [ dup [ to>> ] when ] keep ; inline
: do-next-match ( i string regexp -- i match/f )
dup next-match>> execute( i string regexp -- i match/f ) ;
PRIVATE>
: all-matches ( string regexp -- seq )
[ check-string ] dip
[ 0 [ dup ] ] 2dip '[ _ _ do-next-match ] produce
nip but-last ;
: count-matches ( string regexp -- n )
all-matches length ;
<PRIVATE
:: split-slices ( string slices -- new-slices )
slices [ to>> ] map 0 prefix
slices [ from>> ] map string length suffix
[ string <slice> ] 2map ;
: match-head ( str regexp -- slice/f )
[
[ 0 ] [ check-string ] [ dup dfa>> '[ _ _ execute ] ] tri*
match-from
] call( str regexp -- slice/f ) ;
PRIVATE>
: re-split1 ( string regexp -- before after/f )
dupd match-head [ 1array split-slices first2 ] [ f ] if* ;
: re-split ( string regexp -- seq )
dupd all-matches split-slices ;
: re-replace ( string regexp replacement -- result )
[ re-split ] dip join ;
<PRIVATE
: get-ast ( regexp -- ast )
[ parse-tree>> ] [ options>> ] bi <with-options> ;
: compile-regexp ( regexp -- regexp )
dup '[ [ _ get-ast ast>dfa dfa>quotation ] unless* ] change-dfa ;
GENERIC: compile-regexp ( regex -- regexp )
: <reversed-option> ( ast -- reversed )
"r" string>options <with-options> ;
: regexp-initial-word ( i string regexp -- i/f )
compile-regexp match-index-from ;
: maybe-negated ( lookaround quot -- regexp-quot )
'[ term>> @ ] [ positive?>> [ ] [ not ] ? ] bi compose ; inline
M: lookahead question>quot ! Returns ( index string -- ? )
[ ast>dfa dfa>shortest-quotation ] maybe-negated ;
M: lookbehind question>quot ! Returns ( index string -- ? )
[
<reversed-option>
ast>dfa dfa>reverse-shortest-quotation
[ [ 1- ] dip ] prepose
] maybe-negated ;
: compile-reverse ( regexp -- regexp )
: do-compile-regexp ( regexp -- regexp )
dup '[
[
_ get-ast <reversed-option>
ast>dfa dfa>reverse-quotation
] unless*
] change-reverse-dfa ;
dup \ regexp-initial-word =
[ drop _ get-ast ast>dfa dfa>word ] when
] change-dfa ;
M: regexp match-index-from
compile-regexp dfa>> <quot-matcher> match-index-from ;
M: regexp compile-regexp ( regexp -- regexp )
do-compile-regexp ;
M: reverse-matcher match-index-from
regexp>> compile-reverse reverse-dfa>>
<quot-matcher> match-index-from ;
M: reverse-regexp compile-regexp ( regexp -- regexp )
t backwards? [ do-compile-regexp ] with-variable ;
GENERIC: compile-next-match ( regexp -- regexp )
: next-initial-word ( i string regexp -- i slice/f )
compile-next-match do-next-match ;
M: regexp compile-next-match ( regexp -- regexp )
dup '[
dup \ next-initial-word = [
drop _ compile-regexp dfa>>
'[ _ '[ _ _ execute ] next-match ]
(( i string -- i match/f )) simple-define-temp
] when
] change-next-match ;
! Write M: reverse-regexp compile-next-match
PRIVATE>
: new-regexp ( string ast options class -- regexp )
[ \ regexp-initial-word \ next-initial-word ] dip boa ; inline
: make-regexp ( string ast -- regexp )
f f <options> regexp new-regexp ;
: <optioned-regexp> ( string options -- regexp )
[ dup parse-regexp ] [ string>options ] bi*
dup on>> reversed-regexp swap member?
[ reverse-regexp new-regexp ]
[ regexp new-regexp ] if ;
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
<PRIVATE
! The following two should do some caching
@ -97,7 +188,7 @@ M: reverse-matcher match-index-from
: parsing-regexp ( accum end -- accum )
lexer get [ take-until ] [ parse-noblank-token ] bi
<optioned-regexp> compile-regexp parsed ;
<optioned-regexp> compile-next-match parsed ;
PRIVATE>
@ -120,3 +211,4 @@ M: regexp pprint*
[ options>> options>string % ] bi
] "" make
] keep present-text ;