Many regexp changes, improving speed and organization

db4
Daniel Ehrenberg 2009-03-10 18:27:04 -05:00
parent 1d47c232f7
commit 638cef2824
6 changed files with 175 additions and 84 deletions

View File

@ -1,6 +1,6 @@
! Copyright (C) 2009 Daniel Ehrenberg ! Copyright (C) 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: regexp.combinators tools.test regexp kernel sequences regexp.matchers ; USING: regexp.combinators tools.test regexp kernel sequences ;
IN: regexp.combinators.tests IN: regexp.combinators.tests
: strings ( -- regexp ) : strings ( -- regexp )

View File

@ -1,19 +1,19 @@
! Copyright (C) 2009 Daniel Ehrenberg. ! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: regexp.classes kernel sequences regexp.negation USING: regexp.classes kernel sequences regexp.negation
quotations regexp.minimize assocs fry math locals combinators quotations assocs fry math locals combinators
accessors words compiler.units kernel.private strings accessors words compiler.units kernel.private strings
sequences.private arrays regexp.matchers call namespaces sequences.private arrays call namespaces
regexp.transition-tables combinators.short-circuit ; regexp.transition-tables combinators.short-circuit ;
IN: regexp.compiler IN: regexp.compiler
GENERIC: question>quot ( question -- quot ) GENERIC: question>quot ( question -- quot )
<PRIVATE
SYMBOL: shortest? SYMBOL: shortest?
SYMBOL: backwards? SYMBOL: backwards?
<PRIVATE
M: t question>quot drop [ 2drop t ] ; M: t question>quot drop [ 2drop t ] ;
M: beginning-of-input question>quot M: beginning-of-input question>quot
@ -122,34 +122,23 @@ C: <box> box
[ values ] [ values ]
bi swap ; bi swap ;
: dfa>word ( dfa -- word ) : dfa>main-word ( dfa -- word )
states>words [ states>code ] keep start-state>> ; states>words [ states>code ] keep start-state>> ;
: check-string ( string -- string )
! Make this configurable
dup string? [ "String required" throw ] unless ;
: setup-regexp ( start-index string -- f start-index string )
[ f ] [ >fixnum ] [ check-string ] tri* ; inline
PRIVATE> PRIVATE>
! The quotation returned is ( start-index string -- i/f ) : simple-define-temp ( quot effect -- word )
[ [ define-temp ] with-compilation-unit ] call( quot effect -- word ) ;
: dfa>quotation ( dfa -- quot ) : dfa>word ( dfa -- quot )
dfa>word execution-quot '[ setup-regexp @ ] ; dfa>main-word execution-quot '[ drop [ f ] 2dip @ ]
(( start-index string regexp -- i/f )) simple-define-temp ;
: dfa>shortest-quotation ( dfa -- quot ) : dfa>shortest-word ( dfa -- word )
t shortest? [ dfa>quotation ] with-variable ; t shortest? [ dfa>word ] with-variable ;
: dfa>reverse-quotation ( dfa -- quot ) : dfa>reverse-word ( dfa -- word )
t backwards? [ dfa>quotation ] with-variable ; t backwards? [ dfa>word ] with-variable ;
: dfa>reverse-shortest-quotation ( dfa -- quot ) : dfa>reverse-shortest-word ( dfa -- word )
t backwards? [ dfa>shortest-quotation ] with-variable ; t backwards? [ dfa>shortest-word ] with-variable ;
TUPLE: quot-matcher quot ;
C: <quot-matcher> quot-matcher
M: quot-matcher match-index-from
quot>> call( index string -- i/f ) ;

View File

@ -6,9 +6,6 @@ regexp.ast regexp.transition-tables regexp.minimize
regexp.dfa namespaces ; regexp.dfa namespaces ;
IN: regexp.negation IN: regexp.negation
: ast>dfa ( parse-tree -- minimal-dfa )
construct-nfa disambiguate construct-dfa minimize ;
CONSTANT: fail-state -1 CONSTANT: fail-state -1
: add-default-transition ( state's-transitions -- new-state's-transitions ) : add-default-transition ( state's-transitions -- new-state's-transitions )
@ -49,5 +46,8 @@ CONSTANT: fail-state -1
[ final-states>> keys first ] [ final-states>> keys first ]
[ nfa-table get [ transitions>> ] bi@ swap update ] tri ; [ nfa-table get [ transitions>> ] bi@ swap update ] tri ;
: ast>dfa ( parse-tree -- minimal-dfa )
construct-nfa disambiguate construct-dfa minimize ;
M: negation nfa-node ( node -- start end ) M: negation nfa-node ( node -- start end )
term>> ast>dfa negate-table adjoin-dfa ; term>> ast>dfa negate-table adjoin-dfa ;

View File

@ -1,6 +1,6 @@
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: kernel strings help.markup help.syntax regexp.matchers math ; USING: kernel strings help.markup help.syntax math ;
IN: regexp IN: regexp
ABOUT: "regexp" ABOUT: "regexp"

View File

@ -1,7 +1,7 @@
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: regexp tools.test kernel sequences regexp.parser regexp.private USING: regexp tools.test kernel sequences regexp.parser regexp.private
eval strings multiline accessors regexp.matchers ; eval strings multiline accessors ;
IN: regexp-tests IN: regexp-tests
\ <regexp> must-infer \ <regexp> must-infer
@ -239,11 +239,11 @@ IN: regexp-tests
[ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test [ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
[ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test [ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test
[ t ] [ "abc" reverse R/ abc/r matches? ] unit-test [ t ] [ "abc" R/ abc/r matches? ] unit-test
[ t ] [ "abc" reverse R/ a[bB][cC]/r matches? ] unit-test [ t ] [ "abc" R/ a[bB][cC]/r matches? ] unit-test
[ t ] [ 3 "xabc" R/ abc/ <reverse-matcher> match-index-from >boolean ] unit-test [ t ] [ 3 "xabc" R/ abc/r match-index-from >boolean ] unit-test
[ t ] [ 3 "xabc" R/ a[bB][cC]/ <reverse-matcher> match-index-from >boolean ] unit-test [ t ] [ 3 "xabc" R/ a[bB][cC]/r match-index-from >boolean ] unit-test
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test [ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test [ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
@ -341,9 +341,19 @@ IN: regexp-tests
[ t ] [ "aaaa" R/ .*a./ matches? ] unit-test [ t ] [ "aaaa" R/ .*a./ matches? ] unit-test
[ f ] [ "ab" R/ (?~ac|\p{Lower}b)/ matches? ] unit-test
[ f ] [ "ab" R/ (?~ac|[a-z]b)/ matches? ] unit-test
[ f ] [ "ac" R/ (?~ac|\p{Lower}b)/ matches? ] unit-test
[ f ] [ "ac" R/ (?~ac|[a-z]b)/ matches? ] unit-test
[ f ] [ "ac" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test
[ f ] [ "ab" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test
[ f ] [ "πb" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test
[ t ] [ "πc" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test
[ t ] [ "Ab" R/ (?~[a-zA-Z]c|\p{Lower}b)/ matches? ] unit-test
! DFA is compiled when needed, or when literal ! DFA is compiled when needed, or when literal
[ f ] [ "foo" <regexp> dfa>> >boolean ] unit-test [ regexp-initial-word ] [ "foo" <regexp> dfa>> ] unit-test
[ t ] [ R/ foo/ dfa>> >boolean ] unit-test [ f ] [ R/ foo/ dfa>> \ regexp-initial-word = ] unit-test
[ t ] [ "a" R/ ^a/ matches? ] unit-test [ t ] [ "a" R/ ^a/ matches? ] unit-test
[ f ] [ "\na" R/ ^a/ matches? ] unit-test [ f ] [ "\na" R/ ^a/ matches? ] unit-test

View File

@ -2,71 +2,162 @@
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors combinators kernel math sequences strings sets USING: accessors combinators kernel math sequences strings sets
assocs prettyprint.backend prettyprint.custom make lexer assocs prettyprint.backend prettyprint.custom make lexer
namespaces parser arrays fry locals regexp.minimize namespaces parser arrays fry locals regexp.parser splitting
regexp.parser regexp.nfa regexp.dfa regexp.classes sorting regexp.ast regexp.negation regexp.compiler words
regexp.transition-tables splitting sorting regexp.ast call call.private math.ranges ;
regexp.negation regexp.matchers regexp.compiler ;
IN: regexp IN: regexp
TUPLE: regexp TUPLE: regexp
{ raw read-only } { raw read-only }
{ parse-tree read-only } { parse-tree read-only }
{ options read-only } { options read-only }
dfa reverse-dfa ; dfa next-match ;
: make-regexp ( string ast -- regexp ) TUPLE: reverse-regexp < regexp ;
f f <options> f f regexp boa ; foldable
! Foldable because, when the dfa slot is set,
! it'll be set to the same thing regardless of who sets it
: <optioned-regexp> ( string options -- regexp ) <PRIVATE
[ dup parse-regexp ] [ string>options ] bi*
f f regexp boa ;
: <regexp> ( string -- regexp ) "" <optioned-regexp> ; : maybe-negated ( lookaround quot -- regexp-quot )
'[ term>> @ ] [ positive?>> [ ] [ not ] ? ] bi compose ; inline
TUPLE: reverse-matcher regexp ; M: lookahead question>quot ! Returns ( index string -- ? )
C: <reverse-matcher> reverse-matcher [ ast>dfa dfa>shortest-word '[ f _ execute ] ] maybe-negated ;
! Reverse matchers won't work properly with most combinators, for now
: <reversed-option> ( ast -- reversed )
"r" string>options <with-options> ;
M: lookbehind question>quot ! Returns ( index string -- ? )
[
<reversed-option>
ast>dfa dfa>reverse-shortest-word
'[ [ 1- ] dip f _ execute ]
] maybe-negated ;
<PRIVATE
: check-string ( string -- string )
! Make this configurable
dup string? [ "String required" throw ] unless ;
: match-index-from ( i string regexp -- index/f )
! This word is unsafe. It assumes that i is a fixnum
! and that string is a string.
dup dfa>> execute( index string regexp -- i/f ) ;
: match-index-head ( string regexp -- index/f )
[ 0 ] 2dip [ check-string ] dip match-index-from ;
PRIVATE>
: matches? ( string regexp -- ? )
dupd match-index-head
[ swap length = ] [ drop f ] if* ;
<PRIVATE
: match-slice ( i string quot -- slice/f )
[ 2dup ] dip call
[ swap <slice> ] [ 2drop f ] if* ; inline
: match-from ( i string quot -- slice/f )
[ [ length [a,b) ] keep ] dip
'[ _ _ match-slice ] map-find drop ; inline
: next-match ( i string quot -- i match/f )
match-from [ dup [ to>> ] when ] keep ; inline
: do-next-match ( i string regexp -- i match/f )
dup next-match>> execute( i string regexp -- i match/f ) ;
PRIVATE>
: all-matches ( string regexp -- seq )
[ check-string ] dip
[ 0 [ dup ] ] 2dip '[ _ _ do-next-match ] produce
nip but-last ;
: count-matches ( string regexp -- n )
all-matches length ;
<PRIVATE
:: split-slices ( string slices -- new-slices )
slices [ to>> ] map 0 prefix
slices [ from>> ] map string length suffix
[ string <slice> ] 2map ;
: match-head ( str regexp -- slice/f )
[
[ 0 ] [ check-string ] [ dup dfa>> '[ _ _ execute ] ] tri*
match-from
] call( str regexp -- slice/f ) ;
PRIVATE>
: re-split1 ( string regexp -- before after/f )
dupd match-head [ 1array split-slices first2 ] [ f ] if* ;
: re-split ( string regexp -- seq )
dupd all-matches split-slices ;
: re-replace ( string regexp replacement -- result )
[ re-split ] dip join ;
<PRIVATE <PRIVATE
: get-ast ( regexp -- ast ) : get-ast ( regexp -- ast )
[ parse-tree>> ] [ options>> ] bi <with-options> ; [ parse-tree>> ] [ options>> ] bi <with-options> ;
: compile-regexp ( regexp -- regexp ) GENERIC: compile-regexp ( regex -- regexp )
dup '[ [ _ get-ast ast>dfa dfa>quotation ] unless* ] change-dfa ;
: <reversed-option> ( ast -- reversed ) : regexp-initial-word ( i string regexp -- i/f )
"r" string>options <with-options> ; compile-regexp match-index-from ;
: maybe-negated ( lookaround quot -- regexp-quot ) : do-compile-regexp ( regexp -- regexp )
'[ term>> @ ] [ positive?>> [ ] [ not ] ? ] bi compose ; inline
M: lookahead question>quot ! Returns ( index string -- ? )
[ ast>dfa dfa>shortest-quotation ] maybe-negated ;
M: lookbehind question>quot ! Returns ( index string -- ? )
[
<reversed-option>
ast>dfa dfa>reverse-shortest-quotation
[ [ 1- ] dip ] prepose
] maybe-negated ;
: compile-reverse ( regexp -- regexp )
dup '[ dup '[
[ dup \ regexp-initial-word =
_ get-ast <reversed-option> [ drop _ get-ast ast>dfa dfa>word ] when
ast>dfa dfa>reverse-quotation ] change-dfa ;
] unless*
] change-reverse-dfa ;
M: regexp match-index-from M: regexp compile-regexp ( regexp -- regexp )
compile-regexp dfa>> <quot-matcher> match-index-from ; do-compile-regexp ;
M: reverse-matcher match-index-from M: reverse-regexp compile-regexp ( regexp -- regexp )
regexp>> compile-reverse reverse-dfa>> t backwards? [ do-compile-regexp ] with-variable ;
<quot-matcher> match-index-from ;
GENERIC: compile-next-match ( regexp -- regexp )
: next-initial-word ( i string regexp -- i slice/f )
compile-next-match do-next-match ;
M: regexp compile-next-match ( regexp -- regexp )
dup '[
dup \ next-initial-word = [
drop _ compile-regexp dfa>>
'[ _ '[ _ _ execute ] next-match ]
(( i string -- i match/f )) simple-define-temp
] when
] change-next-match ;
! Write M: reverse-regexp compile-next-match
PRIVATE>
: new-regexp ( string ast options class -- regexp )
[ \ regexp-initial-word \ next-initial-word ] dip boa ; inline
: make-regexp ( string ast -- regexp )
f f <options> regexp new-regexp ;
: <optioned-regexp> ( string options -- regexp )
[ dup parse-regexp ] [ string>options ] bi*
dup on>> reversed-regexp swap member?
[ reverse-regexp new-regexp ]
[ regexp new-regexp ] if ;
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
<PRIVATE
! The following two should do some caching ! The following two should do some caching
@ -97,7 +188,7 @@ M: reverse-matcher match-index-from
: parsing-regexp ( accum end -- accum ) : parsing-regexp ( accum end -- accum )
lexer get [ take-until ] [ parse-noblank-token ] bi lexer get [ take-until ] [ parse-noblank-token ] bi
<optioned-regexp> compile-regexp parsed ; <optioned-regexp> compile-next-match parsed ;
PRIVATE> PRIVATE>
@ -120,3 +211,4 @@ M: regexp pprint*
[ options>> options>string % ] bi [ options>> options>string % ] bi
] "" make ] "" make
] keep present-text ; ] keep present-text ;