From 46aa56730b4a5c79cc326813026862b5a6e69649 Mon Sep 17 00:00:00 2001
From: Doug Coleman
Date: Thu, 6 Nov 2008 16:53:00 -0600
Subject: [PATCH] better parsing for anchors

---
Notes (reviewer commentary; not part of the commit message, ignored by git-am):

  * The `^` and `$` entries are removed from the `case` table in
    parse-regexp-token.  `^` is now consumed only by the new word
    parse-regexp-beginning, which runs once before (parse-regexp) and
    checks the first character with peek1.
  * `$` is handled in the default branch of the `case`: it becomes a back
    anchor only when `peek1 f =` holds (presumably: no more input follows
    -- confirm against the stream vocabulary).  That branch returns f,
    which stops the (parse-regexp) recursion.  Any other character,
    including a `$` followed by more input, still goes to push-stack.
  * handle-front-anchor / handle-back-anchor choose between the
    beginning-of-line / end-of-line objects and the beginning-of-input /
    end-of-input singletons based on get-multiline.
  * The new helper `newlines` leaves CHAR: \r, CHAR: \n and the two-element
    array of both on the stack; beginning-of-line and end-of-line combine
    them with the respective singleton via 4array into a lookbehind /
    lookahead tuple.
  * Line breaks in this patch were reconstructed to agree with the hunk
    headers and the diffstat; leading indentation of the Factor code
    follows the usual 4-space convention and should be checked against
    the repository before applying with strict whitespace matching.

 basis/regexp/parser/parser.factor | 44 ++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor
index d2ed346bf2..d04016b93a 100644
--- a/basis/regexp/parser/parser.factor
+++ b/basis/regexp/parser/parser.factor
@@ -233,15 +233,22 @@ ERROR: invalid-range a b ;
 SINGLETON: beginning-of-input
 SINGLETON: end-of-input
 
-! : beginning-of-input ( -- obj )
-: handle-front-anchor ( -- ) front-anchor push-stack ;
-: end-of-line ( -- obj )
-    end-of-input
+: newlines ( -- obj1 obj2 obj3 )
     CHAR: \r
     CHAR: \n
-    2dup 2array 4array lookahead boa ;
+    2dup 2array ;
 
-: handle-back-anchor ( -- ) end-of-line push-stack ;
+: beginning-of-line ( -- obj )
+    beginning-of-input newlines 4array lookbehind boa ;
+
+: end-of-line ( -- obj )
+    end-of-input newlines 4array lookahead boa ;
+
+: handle-front-anchor ( -- )
+    get-multiline beginning-of-line beginning-of-input ? push-stack ;
+
+: handle-back-anchor ( -- )
+    get-multiline end-of-line end-of-input ? push-stack ;
 
 ERROR: bad-character-class obj ;
 ERROR: expected-posix-class ;
@@ -412,16 +419,11 @@ DEFER: handle-left-bracket
     [ [ push ] keep current-regexp get (>>stack) ]
     [ finish-regexp-parse push-stack ] bi* ;
 
-
 : parse-regexp-token ( token -- ? )
     {
-! todo: only match these at beginning/end of regexp
-        { CHAR: ^ [ handle-front-anchor t ] }
-        { CHAR: $ [ handle-back-anchor t ] }
-
-        { CHAR: . [ handle-dot t ] }
-        { CHAR: ( [ handle-left-parenthesis t ] }
+        { CHAR: ( [ handle-left-parenthesis t ] } ! handle (?..) at beginning?
         { CHAR: ) [ handle-right-parenthesis f ] }
+        { CHAR: . [ handle-dot t ] }
         { CHAR: | [ handle-pipe t ] }
         { CHAR: ? [ handle-question t ] }
         { CHAR: * [ handle-star t ] }
@@ -429,16 +431,28 @@ DEFER: handle-left-bracket
         { CHAR: { [ handle-left-brace t ] }
         { CHAR: [ [ handle-left-bracket t ] }
         { CHAR: \ [ handle-escape t ] }
-        [ push-stack t ]
+        [
+            dup CHAR: $ = peek1 f = and [
+                drop
+                handle-back-anchor f
+            ] [
+                push-stack t
+            ] if
+        ]
     } case ;
 
 : (parse-regexp) ( -- )
     read1 [ parse-regexp-token [ (parse-regexp) ] when ] when* ;
 
+: parse-regexp-beginning ( -- )
+    peek1 CHAR: ^ = [ drop1 handle-front-anchor ] when ;
+
 : parse-regexp ( regexp -- )
     dup current-regexp [
         raw>> [
-            [ (parse-regexp) ] with-input-stream
+            [
+                parse-regexp-beginning (parse-regexp)
+            ] with-input-stream
         ] unless-empty
         current-regexp get
         stack finish-regexp-parse