Merge branch 'master' of git://factorcode.org/git/factor
commit
b81325dfbc
|
@ -68,12 +68,16 @@ IN: regexp.dfa
|
|||
1vector >>new-states drop ;
|
||||
|
||||
: set-traversal-flags ( regexp -- )
|
||||
[ dfa-table>> transitions>> keys ]
|
||||
dup
|
||||
[ nfa-traversal-flags>> ]
|
||||
bi 2drop ;
|
||||
[ dfa-table>> transitions>> keys ] bi
|
||||
[ tuck [ swap at ] with map concat ] with H{ } map>assoc
|
||||
>>dfa-traversal-flags drop ;
|
||||
|
||||
: construct-dfa ( regexp -- )
|
||||
[ set-initial-state ]
|
||||
[ new-transitions ]
|
||||
[ set-final-states ] tri ;
|
||||
! [ set-traversal-flags ] quad ;
|
||||
{
|
||||
[ set-initial-state ]
|
||||
[ new-transitions ]
|
||||
[ set-final-states ]
|
||||
[ set-traversal-flags ]
|
||||
} cleave ;
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
! Copyright (C) 2008 Doug Coleman.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: accessors arrays assocs combinators io io.streams.string
|
||||
kernel math math.parser multi-methods namespaces qualified sets
|
||||
kernel math math.parser namespaces qualified sets
|
||||
quotations sequences splitting symbols vectors math.order
|
||||
unicode.categories strings regexp.backend regexp.utils
|
||||
unicode.case ;
|
||||
unicode.case words ;
|
||||
IN: regexp.parser
|
||||
|
||||
FROM: math.ranges => [a,b] ;
|
||||
|
@ -25,11 +25,21 @@ TUPLE: reluctant-kleene-star term ; INSTANCE: reluctant-kleene-star node
|
|||
TUPLE: negation term ; INSTANCE: negation node
|
||||
TUPLE: constant char ; INSTANCE: constant node
|
||||
TUPLE: range from to ; INSTANCE: range node
|
||||
|
||||
MIXIN: parentheses-group
|
||||
TUPLE: lookahead term ; INSTANCE: lookahead node
|
||||
INSTANCE: lookahead parentheses-group
|
||||
TUPLE: lookbehind term ; INSTANCE: lookbehind node
|
||||
INSTANCE: lookbehind parentheses-group
|
||||
TUPLE: capture-group term ; INSTANCE: capture-group node
|
||||
INSTANCE: capture-group parentheses-group
|
||||
TUPLE: non-capture-group term ; INSTANCE: non-capture-group node
|
||||
INSTANCE: non-capture-group parentheses-group
|
||||
TUPLE: independent-group term ; INSTANCE: independent-group node ! atomic group
|
||||
INSTANCE: independent-group parentheses-group
|
||||
TUPLE: comment-group term ; INSTANCE: comment-group node
|
||||
INSTANCE: comment-group parentheses-group
|
||||
|
||||
TUPLE: character-class-range from to ; INSTANCE: character-class-range node
|
||||
SINGLETON: epsilon INSTANCE: epsilon node
|
||||
SINGLETON: any-char INSTANCE: any-char node
|
||||
|
@ -98,25 +108,6 @@ left-parenthesis pipe caret dash ;
|
|||
|
||||
ERROR: unmatched-parentheses ;
|
||||
|
||||
: make-positive-lookahead ( string -- )
|
||||
lookahead boa push-stack ;
|
||||
|
||||
: make-negative-lookahead ( string -- )
|
||||
<negation> lookahead boa push-stack ;
|
||||
|
||||
: make-independent-group ( string -- )
|
||||
#! no backtracking
|
||||
independent-group boa push-stack ;
|
||||
|
||||
: make-positive-lookbehind ( string -- )
|
||||
lookbehind boa push-stack ;
|
||||
|
||||
: make-negative-lookbehind ( string -- )
|
||||
<negation> lookbehind boa push-stack ;
|
||||
|
||||
: make-non-capturing-group ( string -- )
|
||||
non-capture-group boa push-stack ;
|
||||
|
||||
ERROR: bad-option ch ;
|
||||
|
||||
: option ( ch -- singleton )
|
||||
|
@ -141,35 +132,35 @@ ERROR: bad-option ch ;
|
|||
: parse-options ( string -- )
|
||||
"-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ;
|
||||
|
||||
DEFER: (parse-regexp)
|
||||
: parse-special-group ( -- )
|
||||
beginning-of-group push-stack
|
||||
(parse-regexp) pop-stack make-non-capturing-group ;
|
||||
|
||||
ERROR: bad-special-group string ;
|
||||
|
||||
DEFER: nested-parse-regexp
|
||||
DEFER: (parse-regexp)
|
||||
: nested-parse-regexp ( token ? -- )
|
||||
[ push-stack (parse-regexp) pop-stack ] dip
|
||||
[ <negation> ] when pop-stack boa push-stack ;
|
||||
|
||||
! non-capturing groups
|
||||
: (parse-special-group) ( -- )
|
||||
read1 {
|
||||
{ [ dup CHAR: # = ]
|
||||
[ drop nested-parse-regexp pop-stack drop ] }
|
||||
{ [ dup CHAR: # = ] ! comment
|
||||
[ drop comment-group f nested-parse-regexp pop-stack drop ] }
|
||||
{ [ dup CHAR: : = ]
|
||||
[ drop nested-parse-regexp pop-stack make-non-capturing-group ] }
|
||||
[ drop non-capture-group f nested-parse-regexp ] }
|
||||
{ [ dup CHAR: = = ]
|
||||
[ drop nested-parse-regexp pop-stack make-positive-lookahead ] }
|
||||
[ drop lookahead f nested-parse-regexp ] }
|
||||
{ [ dup CHAR: ! = ]
|
||||
[ drop nested-parse-regexp pop-stack make-negative-lookahead ] }
|
||||
[ drop lookahead t nested-parse-regexp ] }
|
||||
{ [ dup CHAR: > = ]
|
||||
[ drop nested-parse-regexp pop-stack make-independent-group ] }
|
||||
[ drop non-capture-group f nested-parse-regexp ] }
|
||||
{ [ dup CHAR: < = peek1 CHAR: = = and ]
|
||||
[ drop drop1 nested-parse-regexp pop-stack make-positive-lookbehind ] }
|
||||
[ drop drop1 lookbehind f nested-parse-regexp ] }
|
||||
{ [ dup CHAR: < = peek1 CHAR: ! = and ]
|
||||
[ drop drop1 nested-parse-regexp pop-stack make-negative-lookbehind ] }
|
||||
[ drop drop1 lookbehind t nested-parse-regexp ] }
|
||||
[
|
||||
":)" read-until
|
||||
[ swap prefix ] dip
|
||||
{
|
||||
{ CHAR: : [ parse-options parse-special-group ] }
|
||||
{ CHAR: : [ parse-options non-capture-group f nested-parse-regexp ] }
|
||||
{ CHAR: ) [ parse-options ] }
|
||||
[ drop bad-special-group ]
|
||||
} case
|
||||
|
@ -179,7 +170,7 @@ DEFER: nested-parse-regexp
|
|||
: handle-left-parenthesis ( -- )
|
||||
peek1 CHAR: ? =
|
||||
[ drop1 (parse-special-group) ]
|
||||
[ nested-parse-regexp ] if ;
|
||||
[ capture-group f nested-parse-regexp ] if ;
|
||||
|
||||
: handle-dot ( -- ) any-char push-stack ;
|
||||
: handle-pipe ( -- ) pipe push-stack ;
|
||||
|
@ -408,14 +399,12 @@ DEFER: handle-left-bracket
|
|||
[ first|concatenation ] map first|alternation ;
|
||||
|
||||
: handle-right-parenthesis ( -- )
|
||||
stack beginning-of-group over last-index cut rest
|
||||
[ current-regexp get swap >>stack drop ]
|
||||
[ finish-regexp-parse <capture-group> push-stack ] bi* ;
|
||||
stack dup [ parentheses-group "members" word-prop member? ] find-last -rot cut rest
|
||||
[ [ push ] keep current-regexp get (>>stack) ]
|
||||
[ finish-regexp-parse push-stack ] bi* ;
|
||||
|
||||
: nested-parse-regexp ( -- )
|
||||
beginning-of-group push-stack (parse-regexp) ;
|
||||
|
||||
: ((parse-regexp)) ( token -- ? )
|
||||
: parse-regexp-token ( token -- ? )
|
||||
{
|
||||
{ CHAR: . [ handle-dot t ] }
|
||||
{ CHAR: ( [ handle-left-parenthesis t ] }
|
||||
|
@ -433,7 +422,7 @@ DEFER: handle-left-bracket
|
|||
} case ;
|
||||
|
||||
: (parse-regexp) ( -- )
|
||||
read1 [ ((parse-regexp)) [ (parse-regexp) ] when ] when* ;
|
||||
read1 [ parse-regexp-token [ (parse-regexp) ] when ] when* ;
|
||||
|
||||
: parse-regexp ( regexp -- )
|
||||
dup current-regexp [
|
||||
|
|
|
@ -251,8 +251,8 @@ IN: regexp-tests
|
|||
! [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
|
||||
! [ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
|
||||
|
||||
! [ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test
|
||||
! [ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test
|
||||
[ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test
|
||||
[ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test
|
||||
|
||||
! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
|
||||
! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
! Copyright (C) 2008 Doug Coleman.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: accessors assocs combinators kernel math math.ranges
|
||||
quotations sequences regexp.parser regexp.classes
|
||||
combinators.short-circuit regexp.utils ;
|
||||
quotations sequences regexp.parser regexp.classes fry
|
||||
combinators.short-circuit regexp.utils prettyprint regexp.nfa ;
|
||||
IN: regexp.traversal
|
||||
|
||||
TUPLE: dfa-traverser
|
||||
|
@ -10,7 +10,7 @@ TUPLE: dfa-traverser
|
|||
traversal-flags
|
||||
capture-groups
|
||||
{ capture-group-index integer }
|
||||
{ lookahead-counter integer }
|
||||
lookahead-counters
|
||||
last-state current-state
|
||||
text
|
||||
start-index current-index
|
||||
|
@ -26,7 +26,8 @@ TUPLE: dfa-traverser
|
|||
0 >>start-index
|
||||
0 >>current-index
|
||||
V{ } clone >>matches
|
||||
V{ } clone >>capture-groups ;
|
||||
V{ } clone >>capture-groups
|
||||
V{ } clone >>lookahead-counters ;
|
||||
|
||||
: final-state? ( dfa-traverser -- ? )
|
||||
[ current-state>> ] [ dfa-table>> final-states>> ] bi
|
||||
|
@ -43,9 +44,21 @@ TUPLE: dfa-traverser
|
|||
dup save-final-state
|
||||
] when text-finished? ;
|
||||
|
||||
: print-flags ( dfa-traverser -- dfa-traverser )
|
||||
GENERIC: flag-action ( dfa-traverser flag -- )
|
||||
|
||||
M: lookahead-on flag-action ( dfa-traverser flag -- )
|
||||
drop
|
||||
lookahead-counters>> 0 swap push ;
|
||||
|
||||
M: lookahead-off flag-action ( dfa-traverser flag -- )
|
||||
drop
|
||||
dup lookahead-counters>> pop
|
||||
'[ _ - ] change-current-index drop ;
|
||||
|
||||
: process-flags ( dfa-traverser -- )
|
||||
[ [ 1+ ] map ] change-lookahead-counters
|
||||
dup [ current-state>> ] [ traversal-flags>> ] bi
|
||||
;
|
||||
at [ dup . flag-action ] with each ;
|
||||
|
||||
: increment-state ( dfa-traverser state -- dfa-traverser )
|
||||
[
|
||||
|
@ -79,6 +92,7 @@ TUPLE: dfa-traverser
|
|||
[ nth ] 2dip ;
|
||||
|
||||
: do-match ( dfa-traverser -- dfa-traverser )
|
||||
dup process-flags
|
||||
dup match-done? [
|
||||
dup setup-match match-transition
|
||||
[ increment-state do-match ] when*
|
||||
|
|
|
@ -212,11 +212,15 @@ PRIVATE>
|
|||
[ [ host>> ] [ port>> ] bi <inet> ] [ protocol>> ] bi
|
||||
secure-protocol? [ <secure> ] when ;
|
||||
|
||||
ERROR: no-protocol-found protocol ;
|
||||
|
||||
: protocol-port ( protocol -- port )
|
||||
{
|
||||
{ "http" [ 80 ] }
|
||||
{ "https" [ 443 ] }
|
||||
{ "feed" [ 80 ] }
|
||||
{ "ftp" [ 21 ] }
|
||||
[ no-protocol-found ]
|
||||
} case ;
|
||||
|
||||
: ensure-port ( url -- url' )
|
||||
|
|
Loading…
Reference in New Issue