fix the parser for groups

db4
Doug Coleman 2008-09-22 10:48:01 -05:00
parent c61c9eb625
commit da35e13153
1 changed files with 35 additions and 41 deletions

View File

@ -1,10 +1,10 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators io io.streams.string USING: accessors arrays assocs combinators io io.streams.string
kernel math math.parser multi-methods namespaces qualified sets kernel math math.parser namespaces qualified sets
quotations sequences splitting symbols vectors math.order quotations sequences splitting symbols vectors math.order
unicode.categories strings regexp.backend regexp.utils unicode.categories strings regexp.backend regexp.utils
unicode.case ; unicode.case words ;
IN: regexp.parser IN: regexp.parser
FROM: math.ranges => [a,b] ; FROM: math.ranges => [a,b] ;
@ -25,11 +25,21 @@ TUPLE: reluctant-kleene-star term ; INSTANCE: reluctant-kleene-star node
TUPLE: negation term ; INSTANCE: negation node TUPLE: negation term ; INSTANCE: negation node
TUPLE: constant char ; INSTANCE: constant node TUPLE: constant char ; INSTANCE: constant node
TUPLE: range from to ; INSTANCE: range node TUPLE: range from to ; INSTANCE: range node
MIXIN: parentheses-group
TUPLE: lookahead term ; INSTANCE: lookahead node TUPLE: lookahead term ; INSTANCE: lookahead node
INSTANCE: lookahead parentheses-group
TUPLE: lookbehind term ; INSTANCE: lookbehind node TUPLE: lookbehind term ; INSTANCE: lookbehind node
INSTANCE: lookbehind parentheses-group
TUPLE: capture-group term ; INSTANCE: capture-group node TUPLE: capture-group term ; INSTANCE: capture-group node
INSTANCE: capture-group parentheses-group
TUPLE: non-capture-group term ; INSTANCE: non-capture-group node TUPLE: non-capture-group term ; INSTANCE: non-capture-group node
INSTANCE: non-capture-group parentheses-group
TUPLE: independent-group term ; INSTANCE: independent-group node ! atomic group TUPLE: independent-group term ; INSTANCE: independent-group node ! atomic group
INSTANCE: independent-group parentheses-group
TUPLE: comment-group term ; INSTANCE: comment-group node
INSTANCE: comment-group parentheses-group
TUPLE: character-class-range from to ; INSTANCE: character-class-range node TUPLE: character-class-range from to ; INSTANCE: character-class-range node
SINGLETON: epsilon INSTANCE: epsilon node SINGLETON: epsilon INSTANCE: epsilon node
SINGLETON: any-char INSTANCE: any-char node SINGLETON: any-char INSTANCE: any-char node
@ -98,25 +108,6 @@ left-parenthesis pipe caret dash ;
ERROR: unmatched-parentheses ; ERROR: unmatched-parentheses ;
: make-positive-lookahead ( string -- )
lookahead boa push-stack ;
: make-negative-lookahead ( string -- )
<negation> lookahead boa push-stack ;
: make-independent-group ( string -- )
#! no backtracking
independent-group boa push-stack ;
: make-positive-lookbehind ( string -- )
lookbehind boa push-stack ;
: make-negative-lookbehind ( string -- )
<negation> lookbehind boa push-stack ;
: make-non-capturing-group ( string -- )
non-capture-group boa push-stack ;
ERROR: bad-option ch ; ERROR: bad-option ch ;
: option ( ch -- singleton ) : option ( ch -- singleton )
@ -143,33 +134,38 @@ ERROR: bad-option ch ;
DEFER: (parse-regexp) DEFER: (parse-regexp)
: parse-special-group ( -- ) : parse-special-group ( -- )
beginning-of-group push-stack ;
(parse-regexp) pop-stack make-non-capturing-group ; ! beginning-of-group push-stack
! (parse-regexp) pop-stack make-non-capturing-group ;
ERROR: bad-special-group string ; ERROR: bad-special-group string ;
DEFER: nested-parse-regexp : nested-parse-regexp ( token ? -- )
[ push-stack (parse-regexp) pop-stack ] dip
[ <negation> ] when pop-stack boa push-stack ;
! non-capturing groups
: (parse-special-group) ( -- ) : (parse-special-group) ( -- )
read1 { read1 {
{ [ dup CHAR: # = ] { [ dup CHAR: # = ] ! comment
[ drop nested-parse-regexp pop-stack drop ] } [ drop comment-group f nested-parse-regexp pop-stack drop ] }
{ [ dup CHAR: : = ] { [ dup CHAR: : = ]
[ drop nested-parse-regexp pop-stack make-non-capturing-group ] } [ drop non-capture-group f nested-parse-regexp ] }
{ [ dup CHAR: = = ] { [ dup CHAR: = = ]
[ drop nested-parse-regexp pop-stack make-positive-lookahead ] } [ drop lookahead f nested-parse-regexp ] }
{ [ dup CHAR: ! = ] { [ dup CHAR: ! = ]
[ drop nested-parse-regexp pop-stack make-negative-lookahead ] } [ drop lookahead t nested-parse-regexp ] }
{ [ dup CHAR: > = ] { [ dup CHAR: > = ]
[ drop nested-parse-regexp pop-stack make-independent-group ] } [ drop non-capture-group f nested-parse-regexp ] }
{ [ dup CHAR: < = peek1 CHAR: = = and ] { [ dup CHAR: < = peek1 CHAR: = = and ]
[ drop drop1 nested-parse-regexp pop-stack make-positive-lookbehind ] } [ drop drop1 lookbehind f nested-parse-regexp ] }
{ [ dup CHAR: < = peek1 CHAR: ! = and ] { [ dup CHAR: < = peek1 CHAR: ! = and ]
[ drop drop1 nested-parse-regexp pop-stack make-negative-lookbehind ] } [ drop drop1 lookbehind t nested-parse-regexp ] }
[ [
":)" read-until ":)" read-until
[ swap prefix ] dip [ swap prefix ] dip
{ {
{ CHAR: : [ parse-options parse-special-group ] } { CHAR: : [ parse-options (parse-special-group) ] }
{ CHAR: ) [ parse-options ] } { CHAR: ) [ parse-options ] }
[ drop bad-special-group ] [ drop bad-special-group ]
} case } case
@ -179,7 +175,7 @@ DEFER: nested-parse-regexp
: handle-left-parenthesis ( -- ) : handle-left-parenthesis ( -- )
peek1 CHAR: ? = peek1 CHAR: ? =
[ drop1 (parse-special-group) ] [ drop1 (parse-special-group) ]
[ nested-parse-regexp ] if ; [ capture-group f nested-parse-regexp ] if ;
: handle-dot ( -- ) any-char push-stack ; : handle-dot ( -- ) any-char push-stack ;
: handle-pipe ( -- ) pipe push-stack ; : handle-pipe ( -- ) pipe push-stack ;
@ -408,14 +404,12 @@ DEFER: handle-left-bracket
[ first|concatenation ] map first|alternation ; [ first|concatenation ] map first|alternation ;
: handle-right-parenthesis ( -- ) : handle-right-parenthesis ( -- )
stack beginning-of-group over last-index cut rest stack dup [ parentheses-group "members" word-prop member? ] find-last -rot cut rest
[ current-regexp get swap >>stack drop ] [ [ push ] keep current-regexp get (>>stack) ]
[ finish-regexp-parse <capture-group> push-stack ] bi* ; [ finish-regexp-parse push-stack ] bi* ;
: nested-parse-regexp ( -- )
beginning-of-group push-stack (parse-regexp) ;
: ((parse-regexp)) ( token -- ? ) : parse-regexp-token ( token -- ? )
{ {
{ CHAR: . [ handle-dot t ] } { CHAR: . [ handle-dot t ] }
{ CHAR: ( [ handle-left-parenthesis t ] } { CHAR: ( [ handle-left-parenthesis t ] }
@ -433,7 +427,7 @@ DEFER: handle-left-bracket
} case ; } case ;
: (parse-regexp) ( -- ) : (parse-regexp) ( -- )
read1 [ ((parse-regexp)) [ (parse-regexp) ] when ] when* ; read1 [ parse-regexp-token [ (parse-regexp) ] when ] when* ;
: parse-regexp ( regexp -- ) : parse-regexp ( regexp -- )
dup current-regexp [ dup current-regexp [