Merge branch 'master' of git://factorcode.org/git/factor

db4
Slava Pestov 2008-09-22 16:54:49 -05:00
commit b81325dfbc
5 changed files with 69 additions and 58 deletions

View File

@ -68,12 +68,16 @@ IN: regexp.dfa
1vector >>new-states drop ; 1vector >>new-states drop ;
: set-traversal-flags ( regexp -- ) : set-traversal-flags ( regexp -- )
[ dfa-table>> transitions>> keys ] dup
[ nfa-traversal-flags>> ] [ nfa-traversal-flags>> ]
bi 2drop ; [ dfa-table>> transitions>> keys ] bi
[ tuck [ swap at ] with map concat ] with H{ } map>assoc
>>dfa-traversal-flags drop ;
: construct-dfa ( regexp -- ) : construct-dfa ( regexp -- )
[ set-initial-state ] {
[ new-transitions ] [ set-initial-state ]
[ set-final-states ] tri ; [ new-transitions ]
! [ set-traversal-flags ] quad ; [ set-final-states ]
[ set-traversal-flags ]
} cleave ;

View File

@ -1,10 +1,10 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators io io.streams.string USING: accessors arrays assocs combinators io io.streams.string
kernel math math.parser multi-methods namespaces qualified sets kernel math math.parser namespaces qualified sets
quotations sequences splitting symbols vectors math.order quotations sequences splitting symbols vectors math.order
unicode.categories strings regexp.backend regexp.utils unicode.categories strings regexp.backend regexp.utils
unicode.case ; unicode.case words ;
IN: regexp.parser IN: regexp.parser
FROM: math.ranges => [a,b] ; FROM: math.ranges => [a,b] ;
@ -25,11 +25,21 @@ TUPLE: reluctant-kleene-star term ; INSTANCE: reluctant-kleene-star node
TUPLE: negation term ; INSTANCE: negation node TUPLE: negation term ; INSTANCE: negation node
TUPLE: constant char ; INSTANCE: constant node TUPLE: constant char ; INSTANCE: constant node
TUPLE: range from to ; INSTANCE: range node TUPLE: range from to ; INSTANCE: range node
MIXIN: parentheses-group
TUPLE: lookahead term ; INSTANCE: lookahead node TUPLE: lookahead term ; INSTANCE: lookahead node
INSTANCE: lookahead parentheses-group
TUPLE: lookbehind term ; INSTANCE: lookbehind node TUPLE: lookbehind term ; INSTANCE: lookbehind node
INSTANCE: lookbehind parentheses-group
TUPLE: capture-group term ; INSTANCE: capture-group node TUPLE: capture-group term ; INSTANCE: capture-group node
INSTANCE: capture-group parentheses-group
TUPLE: non-capture-group term ; INSTANCE: non-capture-group node TUPLE: non-capture-group term ; INSTANCE: non-capture-group node
INSTANCE: non-capture-group parentheses-group
TUPLE: independent-group term ; INSTANCE: independent-group node ! atomic group TUPLE: independent-group term ; INSTANCE: independent-group node ! atomic group
INSTANCE: independent-group parentheses-group
TUPLE: comment-group term ; INSTANCE: comment-group node
INSTANCE: comment-group parentheses-group
TUPLE: character-class-range from to ; INSTANCE: character-class-range node TUPLE: character-class-range from to ; INSTANCE: character-class-range node
SINGLETON: epsilon INSTANCE: epsilon node SINGLETON: epsilon INSTANCE: epsilon node
SINGLETON: any-char INSTANCE: any-char node SINGLETON: any-char INSTANCE: any-char node
@ -98,25 +108,6 @@ left-parenthesis pipe caret dash ;
ERROR: unmatched-parentheses ; ERROR: unmatched-parentheses ;
: make-positive-lookahead ( string -- )
lookahead boa push-stack ;
: make-negative-lookahead ( string -- )
<negation> lookahead boa push-stack ;
: make-independent-group ( string -- )
#! no backtracking
independent-group boa push-stack ;
: make-positive-lookbehind ( string -- )
lookbehind boa push-stack ;
: make-negative-lookbehind ( string -- )
<negation> lookbehind boa push-stack ;
: make-non-capturing-group ( string -- )
non-capture-group boa push-stack ;
ERROR: bad-option ch ; ERROR: bad-option ch ;
: option ( ch -- singleton ) : option ( ch -- singleton )
@ -141,35 +132,35 @@ ERROR: bad-option ch ;
: parse-options ( string -- ) : parse-options ( string -- )
"-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ; "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ;
DEFER: (parse-regexp)
: parse-special-group ( -- )
beginning-of-group push-stack
(parse-regexp) pop-stack make-non-capturing-group ;
ERROR: bad-special-group string ; ERROR: bad-special-group string ;
DEFER: nested-parse-regexp DEFER: (parse-regexp)
: nested-parse-regexp ( token ? -- )
[ push-stack (parse-regexp) pop-stack ] dip
[ <negation> ] when pop-stack boa push-stack ;
! non-capturing groups
: (parse-special-group) ( -- ) : (parse-special-group) ( -- )
read1 { read1 {
{ [ dup CHAR: # = ] { [ dup CHAR: # = ] ! comment
[ drop nested-parse-regexp pop-stack drop ] } [ drop comment-group f nested-parse-regexp pop-stack drop ] }
{ [ dup CHAR: : = ] { [ dup CHAR: : = ]
[ drop nested-parse-regexp pop-stack make-non-capturing-group ] } [ drop non-capture-group f nested-parse-regexp ] }
{ [ dup CHAR: = = ] { [ dup CHAR: = = ]
[ drop nested-parse-regexp pop-stack make-positive-lookahead ] } [ drop lookahead f nested-parse-regexp ] }
{ [ dup CHAR: ! = ] { [ dup CHAR: ! = ]
[ drop nested-parse-regexp pop-stack make-negative-lookahead ] } [ drop lookahead t nested-parse-regexp ] }
{ [ dup CHAR: > = ] { [ dup CHAR: > = ]
[ drop nested-parse-regexp pop-stack make-independent-group ] } [ drop non-capture-group f nested-parse-regexp ] }
{ [ dup CHAR: < = peek1 CHAR: = = and ] { [ dup CHAR: < = peek1 CHAR: = = and ]
[ drop drop1 nested-parse-regexp pop-stack make-positive-lookbehind ] } [ drop drop1 lookbehind f nested-parse-regexp ] }
{ [ dup CHAR: < = peek1 CHAR: ! = and ] { [ dup CHAR: < = peek1 CHAR: ! = and ]
[ drop drop1 nested-parse-regexp pop-stack make-negative-lookbehind ] } [ drop drop1 lookbehind t nested-parse-regexp ] }
[ [
":)" read-until ":)" read-until
[ swap prefix ] dip [ swap prefix ] dip
{ {
{ CHAR: : [ parse-options parse-special-group ] } { CHAR: : [ parse-options non-capture-group f nested-parse-regexp ] }
{ CHAR: ) [ parse-options ] } { CHAR: ) [ parse-options ] }
[ drop bad-special-group ] [ drop bad-special-group ]
} case } case
@ -179,7 +170,7 @@ DEFER: nested-parse-regexp
: handle-left-parenthesis ( -- ) : handle-left-parenthesis ( -- )
peek1 CHAR: ? = peek1 CHAR: ? =
[ drop1 (parse-special-group) ] [ drop1 (parse-special-group) ]
[ nested-parse-regexp ] if ; [ capture-group f nested-parse-regexp ] if ;
: handle-dot ( -- ) any-char push-stack ; : handle-dot ( -- ) any-char push-stack ;
: handle-pipe ( -- ) pipe push-stack ; : handle-pipe ( -- ) pipe push-stack ;
@ -408,14 +399,12 @@ DEFER: handle-left-bracket
[ first|concatenation ] map first|alternation ; [ first|concatenation ] map first|alternation ;
: handle-right-parenthesis ( -- ) : handle-right-parenthesis ( -- )
stack beginning-of-group over last-index cut rest stack dup [ parentheses-group "members" word-prop member? ] find-last -rot cut rest
[ current-regexp get swap >>stack drop ] [ [ push ] keep current-regexp get (>>stack) ]
[ finish-regexp-parse <capture-group> push-stack ] bi* ; [ finish-regexp-parse push-stack ] bi* ;
: nested-parse-regexp ( -- )
beginning-of-group push-stack (parse-regexp) ;
: ((parse-regexp)) ( token -- ? ) : parse-regexp-token ( token -- ? )
{ {
{ CHAR: . [ handle-dot t ] } { CHAR: . [ handle-dot t ] }
{ CHAR: ( [ handle-left-parenthesis t ] } { CHAR: ( [ handle-left-parenthesis t ] }
@ -433,7 +422,7 @@ DEFER: handle-left-bracket
} case ; } case ;
: (parse-regexp) ( -- ) : (parse-regexp) ( -- )
read1 [ ((parse-regexp)) [ (parse-regexp) ] when ] when* ; read1 [ parse-regexp-token [ (parse-regexp) ] when ] when* ;
: parse-regexp ( regexp -- ) : parse-regexp ( regexp -- )
dup current-regexp [ dup current-regexp [

View File

@ -251,8 +251,8 @@ IN: regexp-tests
! [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test ! [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
! [ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test ! [ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
! [ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test [ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test
! [ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test [ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test
! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test ! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test ! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test

View File

@ -1,8 +1,8 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors assocs combinators kernel math math.ranges USING: accessors assocs combinators kernel math math.ranges
quotations sequences regexp.parser regexp.classes quotations sequences regexp.parser regexp.classes fry
combinators.short-circuit regexp.utils ; combinators.short-circuit regexp.utils prettyprint regexp.nfa ;
IN: regexp.traversal IN: regexp.traversal
TUPLE: dfa-traverser TUPLE: dfa-traverser
@ -10,7 +10,7 @@ TUPLE: dfa-traverser
traversal-flags traversal-flags
capture-groups capture-groups
{ capture-group-index integer } { capture-group-index integer }
{ lookahead-counter integer } lookahead-counters
last-state current-state last-state current-state
text text
start-index current-index start-index current-index
@ -26,7 +26,8 @@ TUPLE: dfa-traverser
0 >>start-index 0 >>start-index
0 >>current-index 0 >>current-index
V{ } clone >>matches V{ } clone >>matches
V{ } clone >>capture-groups ; V{ } clone >>capture-groups
V{ } clone >>lookahead-counters ;
: final-state? ( dfa-traverser -- ? ) : final-state? ( dfa-traverser -- ? )
[ current-state>> ] [ dfa-table>> final-states>> ] bi [ current-state>> ] [ dfa-table>> final-states>> ] bi
@ -43,9 +44,21 @@ TUPLE: dfa-traverser
dup save-final-state dup save-final-state
] when text-finished? ; ] when text-finished? ;
: print-flags ( dfa-traverser -- dfa-traverser ) GENERIC: flag-action ( dfa-traverser flag -- )
M: lookahead-on flag-action ( dfa-traverser flag -- )
drop
lookahead-counters>> 0 swap push ;
M: lookahead-off flag-action ( dfa-traverser flag -- )
drop
dup lookahead-counters>> pop
'[ _ - ] change-current-index drop ;
: process-flags ( dfa-traverser -- )
[ [ 1+ ] map ] change-lookahead-counters
dup [ current-state>> ] [ traversal-flags>> ] bi dup [ current-state>> ] [ traversal-flags>> ] bi
; at [ dup . flag-action ] with each ;
: increment-state ( dfa-traverser state -- dfa-traverser ) : increment-state ( dfa-traverser state -- dfa-traverser )
[ [
@ -79,6 +92,7 @@ TUPLE: dfa-traverser
[ nth ] 2dip ; [ nth ] 2dip ;
: do-match ( dfa-traverser -- dfa-traverser ) : do-match ( dfa-traverser -- dfa-traverser )
dup process-flags
dup match-done? [ dup match-done? [
dup setup-match match-transition dup setup-match match-transition
[ increment-state do-match ] when* [ increment-state do-match ] when*

View File

@ -212,11 +212,15 @@ PRIVATE>
[ [ host>> ] [ port>> ] bi <inet> ] [ protocol>> ] bi [ [ host>> ] [ port>> ] bi <inet> ] [ protocol>> ] bi
secure-protocol? [ <secure> ] when ; secure-protocol? [ <secure> ] when ;
ERROR: no-protocol-found protocol ;
: protocol-port ( protocol -- port ) : protocol-port ( protocol -- port )
{ {
{ "http" [ 80 ] } { "http" [ 80 ] }
{ "https" [ 443 ] } { "https" [ 443 ] }
{ "feed" [ 80 ] }
{ "ftp" [ 21 ] } { "ftp" [ 21 ] }
[ no-protocol-found ]
} case ; } case ;
: ensure-port ( url -- url' ) : ensure-port ( url -- url' )