From 105ef28433925637e257b4e05a7faa7754c61270 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Mon, 16 Feb 2009 20:23:00 -0600 Subject: [PATCH] Rewriting regexp parser --- basis/regexp/nfa/nfa.factor | 60 +-- basis/regexp/parser/parser-tests.factor | 50 +-- basis/regexp/parser/parser.factor | 538 +++++++----------------- basis/regexp/regexp.factor | 5 +- basis/regexp/traversal/traversal.factor | 2 +- 5 files changed, 167 insertions(+), 488 deletions(-) diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index 44481454fc..c8ee1187bc 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -11,22 +11,10 @@ IN: regexp.nfa ERROR: feature-is-broken feature ; -SYMBOL: negation-mode -: negated? ( -- ? ) negation-mode get 0 or odd? ; +SYMBOL: negated? SINGLETON: eps -MIXIN: traversal-flag -SINGLETON: lookahead-on INSTANCE: lookahead-on traversal-flag -SINGLETON: lookahead-off INSTANCE: lookahead-off traversal-flag -SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag -SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag -SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag -SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag -SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag -SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag -SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag - : options ( -- obj ) current-regexp get options>> ; : option? ( obj -- ? ) options key? ; @@ -53,7 +41,7 @@ GENERIC: nfa-node ( node -- ) s1 [ regexp next-state ] stack [ regexp stack>> ] table [ regexp nfa-table>> ] | - negated? [ + negated? get [ s0 f obj class make-transition table add-transition s0 s1 table add-transition ] [ @@ -62,10 +50,6 @@ GENERIC: nfa-node ( node -- ) s0 s1 2array stack push t s1 table final-states>> set-at ] ; -: add-traversal-flag ( flag -- ) - stack peek second - current-regexp get nfa-traversal-flags>> push-at ; - :: concatenate-nodes ( -- ) [let* | regexp [ current-regexp get ] stack [ regexp stack>> ] @@ -97,7 +81,7 @@ GENERIC: nfa-node ( node -- ) t s5 table final-states>> set-at s4 s5 2array stack push ] ; -M: kleene-star nfa-node ( node -- ) +M: star nfa-node ( node -- ) term>> nfa-node [let* | regexp [ current-regexp get ] stack [ regexp stack>> ] @@ -139,17 +123,12 @@ M: constant nfa-node ( node -- ) char>> literal-transition add-simple-entry ] if ; -M: epsilon nfa-node ( node -- ) - drop eps literal-transition add-simple-entry ; - M: word nfa-node ( node -- ) class-transition add-simple-entry ; M: any-char nfa-node ( node -- ) [ dotall option? ] dip any-char-no-nl ? class-transition add-simple-entry ; -! M: beginning-of-text nfa-node ( node -- ) ; - M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ; M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ; @@ -182,38 +161,6 @@ M: character-class-range nfa-node ( node -- ) class-transition add-simple-entry ] if ; -M: capture-group nfa-node ( node -- ) - term>> nfa-node ; - -M: non-capture-group nfa-node ( node -- ) - term>> nfa-node ; - -M: reluctant-kleene-star nfa-node ( node -- ) - term>> nfa-node ; - -M: negation nfa-node ( node -- ) - negation-mode inc - term>> nfa-node - negation-mode dec ; - -M: lookahead nfa-node ( node -- ) - "lookahead" feature-is-broken - eps literal-transition add-simple-entry - lookahead-on add-traversal-flag - term>> nfa-node - eps literal-transition add-simple-entry - lookahead-off add-traversal-flag - 2 [ concatenate-nodes ] times ; - -M: lookbehind nfa-node ( node -- ) - "lookbehind" feature-is-broken - eps literal-transition add-simple-entry - lookbehind-on add-traversal-flag - term>> nfa-node - eps literal-transition add-simple-entry - lookbehind-off add-traversal-flag - 2 [ concatenate-nodes ] times ; - M: option nfa-node ( node -- ) [ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if eps literal-transition add-simple-entry ; @@ -221,7 +168,6 @@ M: option nfa-node ( node -- ) : construct-nfa ( regexp -- ) [ reset-regexp - negation-mode off [ current-regexp set ] [ parse-tree>> nfa-node ] [ set-start-state ] tri diff --git a/basis/regexp/parser/parser-tests.factor b/basis/regexp/parser/parser-tests.factor index fe4d2f1d1a..d606015f61 100644 --- a/basis/regexp/parser/parser-tests.factor +++ b/basis/regexp/parser/parser-tests.factor @@ -1,34 +1,24 @@ -USING: kernel tools.test regexp.backend regexp ; -IN: regexp.parser +USING: kernel tools.test regexp.parser fry sequences ; +IN: regexp.parser.tests -: test-regexp ( string -- ) - default-regexp parse-regexp ; +: regexp-parses ( string -- ) + [ [ ] ] dip '[ _ parse-regexp drop ] unit-test ; -! [ "(" ] [ unmatched-parentheses? ] must-fail-with +: regexp-fails ( string -- ) + '[ _ parse-regexp ] must-fail ; -[ ] [ "a|b" test-regexp ] unit-test -[ ] [ "a.b" test-regexp ] unit-test -[ ] [ "a|b|c" test-regexp ] unit-test -[ ] [ "abc|b" test-regexp ] unit-test -[ ] [ "a|bcd" test-regexp ] unit-test -[ ] [ "a|(b)" test-regexp ] unit-test -[ ] [ "(a)|b" test-regexp ] unit-test -[ ] [ "(a|b)" test-regexp ] unit-test -[ ] [ "((a)|(b))" test-regexp ] unit-test +{ + "a|b" "a.b" "a|b|c" "abc|b" "a|bcd" "a|(b)" "(?-i:a)" "||" + "(a)|b" "(a|b)" "((a)|(b))" "(?:a)" "(?i:a)" "|b" "b|" + "[abc]" "[a-c]" "[^a-c]" "[^]]" "[]a]" "[[]" "[]-a]" "[a-]" "[-]" + "[--a]" "foo*" "(foo)*" "(a|b)|c" "(foo){2,3}" "(foo){2,}" + "(foo){2}" "{2,3}" "{," "{,}" "}" "foo}" "[^]-a]" "[^-]a]" + "[a-]" "[^a-]" "[^a-]" "a{,2}" "(?#foobar)" + "\\p{Space}" "\\t" "\\[" "[\\]]" "\\P{Space}" + "\\ueeee" "\\0333" "\\xff" "\\\\" "\\w" +} [ regexp-parses ] each -[ ] [ "(?:a)" test-regexp ] unit-test -[ ] [ "(?i:a)" test-regexp ] unit-test -[ ] [ "(?-i:a)" test-regexp ] unit-test -[ "(?z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with -[ "(?-z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with - -[ ] [ "(?=a)" test-regexp ] unit-test - -[ ] [ "[abc]" test-regexp ] unit-test -[ ] [ "[a-c]" test-regexp ] unit-test -[ ] [ "[^a-c]" test-regexp ] unit-test -[ "[^]" test-regexp ] must-fail - -[ ] [ "|b" test-regexp ] unit-test -[ ] [ "b|" test-regexp ] unit-test -[ ] [ "||" test-regexp ] unit-test +{ + "[^]" "[]" "a{foo}" "a{,}" "a{}" "(?)" "\\p{foo}" "\\P{foo}" + "\\ueeeg" "\\0339" "\\xfg" +} [ regexp-fails ] each diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index 377535eccd..65965fdeb9 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -1,437 +1,183 @@ -! Copyright (C) 2008 Doug Coleman. +! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors arrays assocs combinators io io.streams.string -kernel math math.parser namespaces sets -quotations sequences splitting vectors math.order -strings regexp.backend regexp.utils -unicode.case unicode.categories words locals regexp.classes ; +USING: peg.ebnf kernel math.parser sequences assocs arrays +combinators regexp.classes strings splitting peg locals ; IN: regexp.parser -FROM: math.ranges => [a,b] ; +TUPLE: range from to ; +TUPLE: char-class ranges ; +TUPLE: primitive-class class ; +TUPLE: not-char-class ranges ; +TUPLE: not-primitive-class class ; +TUPLE: from-to n m ; +TUPLE: at-least n ; +TUPLE: up-to n ; +TUPLE: exactly n ; +TUPLE: times expression times ; +TUPLE: concatenation seq ; +TUPLE: alternation seq ; +TUPLE: maybe term ; +TUPLE: star term ; +TUPLE: plus term ; +TUPLE: with-options tree options ; +TUPLE: ast ^? $? tree ; +SINGLETON: any-char -TUPLE: concatenation seq ; INSTANCE: concatenation node -TUPLE: alternation seq ; INSTANCE: alternation node -TUPLE: kleene-star term ; INSTANCE: kleene-star node +: allowed-char? ( ch -- ? ) + ".()|[*+?" member? not ; -! !!!!!!!! -TUPLE: possessive-question term ; INSTANCE: possessive-question node -TUPLE: possessive-kleene-star term ; INSTANCE: possessive-kleene-star node +ERROR: bad-number ; -! !!!!!!!! -TUPLE: reluctant-question term ; INSTANCE: reluctant-question node -TUPLE: reluctant-kleene-star term ; INSTANCE: reluctant-kleene-star node +: ensure-number ( n -- n ) + [ bad-number ] unless* ; -TUPLE: negation term ; INSTANCE: negation node -TUPLE: constant char ; INSTANCE: constant node -TUPLE: range from to ; INSTANCE: range node +:: at-error ( key assoc quot: ( key -- replacement ) -- value ) + key assoc at* [ drop key quot call ] unless ; inline -MIXIN: parentheses-group -TUPLE: lookahead term ; INSTANCE: lookahead node -INSTANCE: lookahead parentheses-group -TUPLE: lookbehind term ; INSTANCE: lookbehind node -INSTANCE: lookbehind parentheses-group -TUPLE: capture-group term ; INSTANCE: capture-group node -INSTANCE: capture-group parentheses-group -TUPLE: non-capture-group term ; INSTANCE: non-capture-group node -INSTANCE: non-capture-group parentheses-group -TUPLE: independent-group term ; INSTANCE: independent-group node ! atomic group -INSTANCE: independent-group parentheses-group -TUPLE: comment-group term ; INSTANCE: comment-group node -INSTANCE: comment-group parentheses-group +ERROR: bad-class name ; -SINGLETON: epsilon INSTANCE: epsilon node +: name>class ( name -- class ) + { + { "Lower" letter-class } + { "Upper" LETTER-class } + { "Alpha" Letter-class } + { "ASCII" ascii-class } + { "Digit" digit-class } + { "Alnum" alpha-class } + { "Punct" punctuation-class } + { "Graph" java-printable-class } + { "Print" java-printable-class } + { "Blank" non-newline-blank-class } + { "Cntrl" control-character-class } + { "XDigit" hex-digit-class } + { "Space" java-blank-class } + ! TODO: unicode-character-class + } [ bad-class ] at-error ; -TUPLE: option option on? ; INSTANCE: option node +: lookup-escape ( char -- ast ) + { + { CHAR: t [ CHAR: \t ] } + { CHAR: n [ CHAR: \n ] } + { CHAR: r [ CHAR: \r ] } + { CHAR: f [ HEX: c ] } + { CHAR: a [ HEX: 7 ] } + { CHAR: e [ HEX: 1b ] } + { CHAR: \\ [ CHAR: \\ ] } + + { CHAR: w [ c-identifier-class primitive-class boa ] } + { CHAR: W [ c-identifier-class not-primitive-class boa ] } + { CHAR: s [ java-blank-class primitive-class boa ] } + { CHAR: S [ java-blank-class not-primitive-class boa ] } + { CHAR: d [ digit-class primitive-class boa ] } + { CHAR: D [ digit-class not-primitive-class boa ] } + + [ ] + } case ; + +TUPLE: options on off ; SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case reversed-regexp ; -SINGLETONS: beginning-of-character-class end-of-character-class -left-parenthesis pipe caret dash ; - -: push1 ( obj -- ) input-stream get stream>> push ; -: peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ; -: pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ; -: drop1 ( -- ) read1 drop ; - -: stack ( -- obj ) current-regexp get stack>> ; -: change-whole-stack ( quot -- ) - current-regexp get - [ stack>> swap call ] keep (>>stack) ; inline -: push-stack ( obj -- ) stack push ; -: pop-stack ( -- obj ) stack pop ; -: cut-out ( vector n -- vector' vector ) cut rest ; -ERROR: cut-stack-error ; -: cut-stack ( obj vector -- vector' vector ) - [ nip ] [ last-index ] 2bi [ cut-stack-error ] unless* cut-out swap ; - -: ( obj -- kleene ) possessive-kleene-star boa ; -: ( obj -- kleene ) reluctant-kleene-star boa ; -: ( obj -- kleene ) possessive-question boa ; -: ( obj -- kleene ) reluctant-question boa ; - -: ( obj -- negation ) negation boa ; -: ( seq -- concatenation ) - >vector [ epsilon ] [ concatenation boa ] if-empty ; -: ( seq -- alternation ) >vector alternation boa ; -: ( obj -- capture-group ) capture-group boa ; -: ( obj -- kleene-star ) kleene-star boa ; -: ( obj -- constant ) constant boa ; - -: first|concatenation ( seq -- first/concatenation ) - dup length 1 = [ first ] [ ] if ; - -: first|alternation ( seq -- first/alternation ) - dup length 1 = [ first ] [ ] if ; - -: ( from to -- obj ) - 2dup < - [ character-class-range boa ] [ 2drop unmatchable-class ] if ; - -ERROR: unmatched-parentheses ; - -ERROR: unknown-regexp-option option ; +: options-assoc ( -- assoc ) + H{ + { CHAR: i case-insensitive } + { CHAR: d unix-lines } + { CHAR: m multiline } + { CHAR: n multiline } + { CHAR: r reversed-regexp } + { CHAR: s dotall } + { CHAR: u unicode-case } + { CHAR: x comments } + } ; : ch>option ( ch -- singleton ) - { - { CHAR: i [ case-insensitive ] } - { CHAR: d [ unix-lines ] } - { CHAR: m [ multiline ] } - { CHAR: n [ multiline ] } - { CHAR: r [ reversed-regexp ] } - { CHAR: s [ dotall ] } - { CHAR: u [ unicode-case ] } - { CHAR: x [ comments ] } - [ unknown-regexp-option ] - } case ; + options-assoc at ; : option>ch ( option -- string ) - { - { case-insensitive [ CHAR: i ] } - { multiline [ CHAR: m ] } - { reversed-regexp [ CHAR: r ] } - { dotall [ CHAR: s ] } - [ unknown-regexp-option ] - } case ; + options-assoc value-at ; -: toggle-option ( ch ? -- ) - [ ch>option ] dip option boa push-stack ; +: parse-options ( on off -- options ) + [ [ ch>option ] map ] bi@ options boa ; -: (parse-options) ( string ? -- ) [ toggle-option ] curry each ; +! TODO: make range syntax better (negation, and, etc), +! add syntax for various parenthized things, +! add greedy and nongreedy forms of matching +! (once it's all implemented) -: parse-options ( string -- ) - "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ; +EBNF: (parse-regexp) -ERROR: bad-special-group string ; +CharacterInBracket = !("}") Character -DEFER: (parse-regexp) -: nested-parse-regexp ( token ? -- ) - [ push-stack (parse-regexp) pop-stack ] dip - [ ] when pop-stack new swap >>term push-stack ; +Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class primitive-class boa ]] + | "P{" CharacterInBracket*:s "}" => [[ s >string name>class not-primitive-class boa ]] + | "u" Character:a Character:b Character:c Character:d + => [[ { a b c d } hex> ensure-number ]] + | "x" Character:a Character:b + => [[ { a b } hex> ensure-number ]] + | "0" Character:a Character:b Character:c + => [[ { a b c } oct> ensure-number ]] + | . => [[ lookup-escape ]] -! non-capturing groups -: (parse-special-group) ( -- ) - read1 { - { [ dup CHAR: # = ] ! comment - [ drop comment-group f nested-parse-regexp pop-stack drop ] } - { [ dup CHAR: : = ] - [ drop non-capture-group f nested-parse-regexp ] } - { [ dup CHAR: = = ] - [ drop lookahead f nested-parse-regexp ] } - { [ dup CHAR: ! = ] - [ drop lookahead t nested-parse-regexp ] } - { [ dup CHAR: > = ] - [ drop non-capture-group f nested-parse-regexp ] } - { [ dup CHAR: < = peek1 CHAR: = = and ] - [ drop drop1 lookbehind f nested-parse-regexp ] } - { [ dup CHAR: < = peek1 CHAR: ! = and ] - [ drop drop1 lookbehind t nested-parse-regexp ] } - [ - ":)" read-until - [ swap prefix ] dip - { - { CHAR: : [ parse-options non-capture-group f nested-parse-regexp ] } - { CHAR: ) [ parse-options ] } - [ drop bad-special-group ] - } case - ] - } cond ; +Character = "\\" Escape:e => [[ e ]] + | . ?[ allowed-char? ]? -: handle-left-parenthesis ( -- ) - peek1 CHAR: ? = - [ drop1 (parse-special-group) ] - [ capture-group f nested-parse-regexp ] if ; +AnyRangeCharacter = Character | "[" -: handle-dot ( -- ) any-char push-stack ; -: handle-pipe ( -- ) pipe push-stack ; -: (handle-star) ( obj -- kleene-star ) - peek1 { - { CHAR: + [ drop1 ] } - { CHAR: ? [ drop1 ] } - [ drop ] - } case ; -: handle-star ( -- ) stack pop (handle-star) push-stack ; -: handle-question ( -- ) - stack pop peek1 { - { CHAR: + [ drop1 ] } - { CHAR: ? [ drop1 ] } - [ drop epsilon 2array ] - } case push-stack ; -: handle-plus ( -- ) - stack pop dup (handle-star) - 2array push-stack ; +RangeCharacter = !("]") AnyRangeCharacter -ERROR: unmatched-brace ; -: parse-repetition ( -- start finish ? ) - "}" read-until [ unmatched-brace ] unless - [ "," split1 [ string>number ] bi@ ] - [ CHAR: , swap index >boolean ] bi ; +Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]] + | RangeCharacter -: replicate/concatenate ( n obj -- obj' ) - over zero? [ 2drop epsilon ] - [ first|concatenation ] if ; +StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]] + | AnyRangeCharacter -: exactly-n ( n -- ) - stack pop replicate/concatenate push-stack ; +Ranges = StartRange:s Range*:r => [[ r s prefix ]] -: at-least-n ( n -- ) - stack pop - [ replicate/concatenate ] keep - 2array push-stack ; +CharClass = "^" Ranges:e => [[ e not-char-class boa ]] + | Ranges:e => [[ e char-class boa ]] -: at-most-n ( n -- ) - 1+ - stack pop - [ replicate/concatenate ] curry map push-stack ; +Options = [idmsux]* -: from-m-to-n ( m n -- ) - [a,b] - stack pop - [ replicate/concatenate ] curry map - push-stack ; +Parenthized = "?:" Alternation:a => [[ a ]] + | "?" Options:on "-"? Options:off ":" Alternation:a + => [[ a on off parse-options with-options boa ]] + | "?#" [^)]* => [[ ignore ]] + | Alternation -ERROR: invalid-range a b ; +Element = "(" Parenthized:p ")" => [[ p ]] + | "[" CharClass:r "]" => [[ r ]] + | ".":d => [[ any-char ]] + | Character -: handle-left-brace ( -- ) - parse-repetition - [ 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ ] dip - [ - 2dup and [ from-m-to-n ] - [ [ nip at-most-n ] [ at-least-n ] if* ] if - ] [ drop 0 max exactly-n ] if ; +Number = (!(","|"}").)* => [[ string>number ensure-number ]] -: handle-front-anchor ( -- ) beginning-of-line push-stack ; -: handle-back-anchor ( -- ) end-of-line push-stack ; +Times = "," Number:n "}" => [[ n up-to boa ]] + | Number:n ",}" => [[ n at-least boa ]] + | Number:n "}" => [[ n exactly boa ]] + | "}" => [[ bad-number ]] + | Number:n "," Number:m "}" => [[ n m from-to boa ]] -ERROR: bad-character-class obj ; -ERROR: expected-posix-class ; +Repeated = Element:e "{" Times:t => [[ e t times boa ]] + | Element:e "?" => [[ e maybe boa ]] + | Element:e "*" => [[ e star boa ]] + | Element:e "+" => [[ e plus boa ]] + | Element -: parse-posix-class ( -- obj ) - read1 CHAR: { = [ expected-posix-class ] unless - "}" read-until [ bad-character-class ] unless - { - { "Lower" [ letter-class ] } - { "Upper" [ LETTER-class ] } - { "Alpha" [ Letter-class ] } - { "ASCII" [ ascii-class ] } - { "Digit" [ digit-class ] } - { "Alnum" [ alpha-class ] } - { "Punct" [ punctuation-class ] } - { "Graph" [ java-printable-class ] } - { "Print" [ java-printable-class ] } - { "Blank" [ non-newline-blank-class ] } - { "Cntrl" [ control-character-class ] } - { "XDigit" [ hex-digit-class ] } - { "Space" [ java-blank-class ] } - ! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss - [ bad-character-class ] - } case ; +Concatenation = Repeated*:r => [[ r concatenation boa ]] -: parse-octal ( -- n ) 3 read oct> check-octal ; -: parse-short-hex ( -- n ) 2 read hex> check-hex ; -: parse-long-hex ( -- n ) 6 read hex> check-hex ; -: parse-control-character ( -- n ) read1 ; +Alternation = Concatenation:c ("|" Concatenation)*:a + => [[ a empty? [ c ] [ a values c prefix alternation boa ] if ]] -ERROR: bad-escaped-literals seq ; +End = !(.) -: parse-til-E ( -- obj ) - "\\E" read-until [ bad-escaped-literals ] unless ; - -:: (parse-escaped-literals) ( quot: ( obj -- obj' ) -- obj ) - parse-til-E - drop1 - [ epsilon ] [ - quot call [ ] V{ } map-as - first|concatenation - ] if-empty ; inline +Main = Alternation End +;EBNF -: parse-escaped-literals ( -- obj ) - [ ] (parse-escaped-literals) ; - -: lower-case-literals ( -- obj ) - [ >lower ] (parse-escaped-literals) ; - -: upper-case-literals ( -- obj ) - [ >upper ] (parse-escaped-literals) ; - -: parse-escaped ( -- obj ) - read1 - { - { CHAR: t [ CHAR: \t ] } - { CHAR: n [ CHAR: \n ] } - { CHAR: r [ CHAR: \r ] } - { CHAR: f [ HEX: c ] } - { CHAR: a [ HEX: 7 ] } - { CHAR: e [ HEX: 1b ] } - - { CHAR: w [ c-identifier-class ] } - { CHAR: W [ c-identifier-class ] } - { CHAR: s [ java-blank-class ] } - { CHAR: S [ java-blank-class ] } - { CHAR: d [ digit-class ] } - { CHAR: D [ digit-class ] } - - { CHAR: p [ parse-posix-class ] } - { CHAR: P [ parse-posix-class ] } - { CHAR: x [ parse-short-hex ] } - { CHAR: u [ parse-long-hex ] } - { CHAR: 0 [ parse-octal ] } - { CHAR: c [ parse-control-character ] } - - { CHAR: Q [ parse-escaped-literals ] } - - ! { CHAR: b [ word-boundary-class ] } - ! { CHAR: B [ word-boundary-class ] } - ! { CHAR: A [ handle-beginning-of-input ] } - ! { CHAR: z [ handle-end-of-input ] } - - ! { CHAR: Z [ handle-end-of-input ] } ! plus a final terminator - - ! m//g mode - ! { CHAR: G [ end of previous match ] } - - ! Group capture - ! { CHAR: 1 [ CHAR: 1 ] } - ! { CHAR: 2 [ CHAR: 2 ] } - ! { CHAR: 3 [ CHAR: 3 ] } - ! { CHAR: 4 [ CHAR: 4 ] } - ! { CHAR: 5 [ CHAR: 5 ] } - ! { CHAR: 6 [ CHAR: 6 ] } - ! { CHAR: 7 [ CHAR: 7 ] } - ! { CHAR: 8 [ CHAR: 8 ] } - ! { CHAR: 9 [ CHAR: 9 ] } - - ! Perl extensions - ! can't do \l and \u because \u is already a 4-hex - { CHAR: L [ lower-case-literals ] } - { CHAR: U [ upper-case-literals ] } - - [ ] - } case ; - -: handle-escape ( -- ) parse-escaped push-stack ; - -: handle-dash ( vector -- vector' ) - H{ { dash CHAR: - } } substitute ; - -: character-class>alternation ( seq -- alternation ) - [ dup number? [ ] when ] map first|alternation ; - -: handle-caret ( vector -- vector' ) - dup [ length 2 >= ] [ first caret eq? ] bi and [ - rest-slice character-class>alternation - ] [ - character-class>alternation - ] if ; - -: make-character-class ( -- character-class ) - [ beginning-of-character-class swap cut-stack ] change-whole-stack - handle-dash handle-caret ; - -: apply-dash ( -- ) - stack [ pop3 nip ] keep push ; - -: apply-dash? ( -- ? ) - stack dup length 3 >= - [ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ; - -ERROR: empty-negated-character-class ; -DEFER: handle-left-bracket -: (parse-character-class) ( -- ) - read1 [ empty-negated-character-class ] unless* { - { CHAR: [ [ handle-left-bracket t ] } - { CHAR: ] [ make-character-class push-stack f ] } - { CHAR: - [ dash push-stack t ] } - { CHAR: \ [ parse-escaped push-stack t ] } - [ push-stack apply-dash? [ apply-dash ] when t ] - } case - [ (parse-character-class) ] when ; - -: push-constant ( ch -- ) push-stack ; - -: parse-character-class-second ( -- ) - read1 { - { CHAR: [ [ CHAR: [ push-constant ] } - { CHAR: ] [ CHAR: ] push-constant ] } - { CHAR: - [ CHAR: - push-constant ] } - [ push1 ] - } case ; - -: parse-character-class-first ( -- ) - read1 { - { CHAR: ^ [ caret push-stack parse-character-class-second ] } - { CHAR: [ [ CHAR: [ push-constant ] } - { CHAR: ] [ CHAR: ] push-constant ] } - { CHAR: - [ CHAR: - push-constant ] } - [ push1 ] - } case ; - -: handle-left-bracket ( -- ) - beginning-of-character-class push-stack - parse-character-class-first (parse-character-class) ; - -: finish-regexp-parse ( stack -- obj ) - { pipe } split - [ first|concatenation ] map first|alternation ; - -: handle-right-parenthesis ( -- ) - stack dup [ parentheses-group "members" word-prop member? ] find-last - -rot cut rest - [ [ push ] keep current-regexp get (>>stack) ] - [ finish-regexp-parse push-stack ] bi* ; - -: parse-regexp-token ( token -- ? ) - { - { CHAR: ( [ handle-left-parenthesis t ] } ! handle (?..) at beginning? - { CHAR: ) [ handle-right-parenthesis f ] } - { CHAR: . [ handle-dot t ] } - { CHAR: | [ handle-pipe t ] } - { CHAR: ? [ handle-question t ] } - { CHAR: * [ handle-star t ] } - { CHAR: + [ handle-plus t ] } - { CHAR: { [ handle-left-brace t ] } - { CHAR: [ [ handle-left-bracket t ] } - { CHAR: \ [ handle-escape t ] } - [ - dup CHAR: $ = peek1 f = and - [ drop handle-back-anchor f ] - [ push-constant t ] if - ] - } case ; - -: (parse-regexp) ( -- ) - read1 [ parse-regexp-token [ (parse-regexp) ] when ] when* ; - -: parse-regexp-beginning ( -- ) - peek1 CHAR: ^ = [ drop1 handle-front-anchor ] when ; - -: parse-regexp ( regexp -- ) - dup current-regexp [ - raw>> [ - [ - parse-regexp-beginning (parse-regexp) - ] with-input-stream - ] unless-empty - current-regexp get [ finish-regexp-parse ] change-stack - dup stack>> >>parse-tree drop - ] with-variable ; +: parse-regexp ( string -- regexp ) + ! Hack because I want $ allowable in regexps, + ! but with special behavior at the end + ! This fails if the regexp is stupid, though... + dup first CHAR: ^ = tuck [ rest ] when + dup peek CHAR: $ = tuck [ but-last ] when + (parse-regexp) ast boa ; diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index 86f978373b..62ebaab502 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -21,7 +21,7 @@ IN: regexp : construct-regexp ( regexp -- regexp' ) { - [ parse-regexp ] + [ dup raw>> parse-regexp >>parse-tree drop ] [ construct-nfa ] [ construct-dfa ] [ ] @@ -33,9 +33,6 @@ IN: regexp : match ( string regexp -- slice/f ) (match) return-match ; -: match* ( string regexp -- slice/f captured-groups ) - (match) [ return-match ] [ captured-groups>> ] bi ; - : matches? ( string regexp -- ? ) dupd match [ [ length ] bi@ = ] [ drop f ] if* ; diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor index d0a76a6ddc..394bfe0d52 100644 --- a/basis/regexp/traversal/traversal.factor +++ b/basis/regexp/traversal/traversal.factor @@ -68,7 +68,7 @@ TUPLE: dfa-traverser : match-class ( transition from-state table -- to-state/f ) transitions>> at* [ - '[ drop _ swap class-member? ] assoc-find [ nip ] [ drop ] if + swap '[ drop _ swap class-member? ] assoc-find spin ? ] [ drop ] if ; : match-default ( transition from-state table -- to-state/f )