move things around. the parser no longer adjusts nodes based on options, instead opting for nfa to handle it (case-insensitive, multiline, dotall, reversed..)

2008-11-24 22:17:47 -06:00 · 2008-11-24 22:17:47 -06:00 · 384a11ecee
parent e4a2b671d3
commit 384a11ecee
8 changed files with 158 additions and 156 deletions
--- a/basis/regexp/backend/backend.factor
+++ b/basis/regexp/backend/backend.factor
@ -5,12 +5,13 @@ IN: regexp.backend
 TUPLE: regexp
    raw
    { stack vector }
    parse-tree
    { options hashtable }
    stack
    parse-tree
    nfa-table
    dfa-table
    minimized-table
    matchers
    { nfa-traversal-flags hashtable }
    { dfa-traversal-flags hashtable }
    { state integer }
--- a/basis/regexp/classes/classes.factor
+++ b/basis/regexp/classes/classes.factor
@ -1,12 +1,25 @@
 ! Copyright (C) 2008 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors kernel math math.order symbols regexp.parser
+USING: accessors kernel math math.order symbols 
 words regexp.utils unicode.categories combinators.short-circuit ;
 IN: regexp.classes
 SINGLETONS: any-char any-char-no-nl
 letter-class LETTER-class Letter-class digit-class
 alpha-class non-newline-blank-class
 ascii-class punctuation-class java-printable-class blank-class
 control-character-class hex-digit-class java-blank-class c-identifier-class
 unmatchable-class terminator-class word-boundary-class ;
 SINGLETONS: beginning-of-input beginning-of-line
 end-of-input end-of-line ;
 MIXIN: node
 TUPLE: character-class-range from to ; INSTANCE: character-class-range node
 GENERIC: class-member? ( obj class -- ? )
-M: word class-member? ( obj class -- ? ) 2drop f ;
+M: t class-member? ( obj class -- ? ) 2drop f ;
 M: integer class-member? ( obj class -- ? ) 2drop f ;
@ -70,3 +83,9 @@ M: terminator-class class-member? ( obj class -- ? )
        [ CHAR: \u002028 = ]
        [ CHAR: \u002029 = ]
    } 1|| ;
 M: beginning-of-line class-member? ( obj class -- ? )
    2drop f ;
 M: end-of-line class-member? ( obj class -- ? )
    2drop f ;
--- a/basis/regexp/nfa/nfa.factor
+++ b/basis/regexp/nfa/nfa.factor
@ -3,7 +3,7 @@
 USING: accessors arrays assocs grouping kernel regexp.backend
 locals math namespaces regexp.parser sequences fry quotations
 math.order math.ranges vectors unicode.categories regexp.utils
-regexp.transition-tables words sets ;
+regexp.transition-tables words sets regexp.classes unicode.case ;
 IN: regexp.nfa
 SYMBOL: negation-mode
@ -22,8 +22,13 @@ SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag
 SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag
 SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag
-: add-global-flag ( flag -- )
+: options ( -- obj ) current-regexp get options>> ;
-    current-regexp get nfa-table>> flags>> conjoin ;
+
 : option? ( obj -- ? ) options key? ;
 : option-on ( obj -- ) options conjoin ;
 : option-off ( obj -- ) options delete-at ;
 : next-state ( regexp -- state )
    [ state>> ] [ [ 1+ ] change-state drop ] bi ;
@ -106,6 +111,7 @@ M: kleene-star nfa-node ( node -- )
 M: concatenation nfa-node ( node -- )
    seq>>
    reversed-regexp option? [ <reversed> ] when
    [ [ nfa-node ] each ]
    [ length 1- [ concatenate-nodes ] times ] bi ;
@ -115,16 +121,59 @@ M: alternation nfa-node ( node -- )
    [ length 1- [ alternate-nodes ] times ] bi ;
 M: constant nfa-node ( node -- )
-    char>> literal-transition add-simple-entry ;
+    case-insensitive option? [
        dup char>> [ ch>lower ] [ ch>upper ] bi
        2dup = [
            2drop
            char>> literal-transition add-simple-entry
        ] [
            [ literal-transition add-simple-entry ] bi@
            alternate-nodes drop
        ] if
    ] [
        char>> literal-transition add-simple-entry
    ] if ;
 M: epsilon nfa-node ( node -- )
    drop eps literal-transition add-simple-entry ;
-M: word nfa-node ( node -- )
+M: word nfa-node ( node -- ) class-transition add-simple-entry ;
 M: any-char nfa-node ( node -- )
    [ dotall option? ] dip any-char-no-nl ?
    class-transition add-simple-entry ;
 ! M: beginning-of-text nfa-node ( node -- ) ;
 M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
 M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
 : choose-letter-class ( node -- node' )
    case-insensitive option? Letter-class rot ? ;
 M: letter-class nfa-node ( node -- )
    choose-letter-class class-transition add-simple-entry ;
 M: LETTER-class nfa-node ( node -- )
    choose-letter-class class-transition add-simple-entry ;
 M: character-class-range nfa-node ( node -- )
-    class-transition add-simple-entry ;
+    case-insensitive option? [
        dup [ from>> ] [ to>> ] bi
        2dup [ Letter? ] bi@ and [
            rot drop
            [ [ ch>lower ] bi@ character-class-range boa ]
            [ [ ch>upper ] bi@ character-class-range boa ] 2bi 
            [ class-transition add-simple-entry ] bi@
            alternate-nodes
        ] [
            2drop
            class-transition add-simple-entry
        ] if
    ] [
        class-transition add-simple-entry
    ] if ;
 M: capture-group nfa-node ( node -- )
    eps literal-transition add-simple-entry
@ -141,26 +190,6 @@ M: non-capture-group nfa-node ( node -- )
 M: reluctant-kleene-star nfa-node ( node -- )
    term>> <kleene-star> nfa-node ;
 M: beginning-of-line nfa-node ( node -- )
    drop 
    eps literal-transition add-simple-entry
    beginning-of-line add-global-flag ;
 M: end-of-line nfa-node ( node -- )
    drop
    eps literal-transition add-simple-entry
    end-of-line add-global-flag ;
 M: beginning-of-input nfa-node ( node -- )
    drop
    eps literal-transition add-simple-entry
    beginning-of-input add-global-flag ;
 M: end-of-input nfa-node ( node -- )
    drop
    eps literal-transition add-simple-entry
    end-of-input add-global-flag ;
 M: negation nfa-node ( node -- )
    negation-mode inc
    term>> nfa-node 
@ -182,6 +211,10 @@ M: lookbehind nfa-node ( node -- )
    lookbehind-off add-traversal-flag
    2 [ concatenate-nodes ] times ;
 M: option nfa-node ( node -- )
    [ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if
    eps literal-transition add-simple-entry ;
 : construct-nfa ( regexp -- )
    [
        reset-regexp
--- a/basis/regexp/parser/parser-tests.factor
+++ b/basis/regexp/parser/parser-tests.factor
@ -19,8 +19,8 @@ IN: regexp.parser
 [ ] [ "(?:a)" test-regexp ] unit-test
 [ ] [ "(?i:a)" test-regexp ] unit-test
 [ ] [ "(?-i:a)" test-regexp ] unit-test
-[ "(?z:a)" test-regexp ] [ bad-option? ] must-fail-with
+[ "(?z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with
-[ "(?-z:a)" test-regexp ] [ bad-option? ] must-fail-with
+[ "(?-z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with
 [ ] [ "(?=a)" test-regexp ] unit-test
--- a/basis/regexp/parser/parser.factor
+++ b/basis/regexp/parser/parser.factor
@ -4,12 +4,11 @@ USING: accessors arrays assocs combinators io io.streams.string
 kernel math math.parser namespaces qualified sets
 quotations sequences splitting symbols vectors math.order
 unicode.categories strings regexp.backend regexp.utils
-unicode.case words locals ;
+unicode.case words locals regexp.classes ;
 IN: regexp.parser
 FROM: math.ranges => [a,b] ;
 MIXIN: node
 TUPLE: concatenation seq ; INSTANCE: concatenation node
 TUPLE: alternation seq ; INSTANCE: alternation node
 TUPLE: kleene-star term ; INSTANCE: kleene-star node
@ -40,38 +39,31 @@ INSTANCE: independent-group parentheses-group
 TUPLE: comment-group term ; INSTANCE: comment-group node
 INSTANCE: comment-group parentheses-group
 TUPLE: character-class-range from to ; INSTANCE: character-class-range node
 SINGLETON: epsilon INSTANCE: epsilon node
 SINGLETON: any-char INSTANCE: any-char node
 SINGLETON: any-char-no-nl INSTANCE: any-char-no-nl node
 SINGLETON: beginning-of-input INSTANCE: beginning-of-input node
 SINGLETON: end-of-input INSTANCE: end-of-input node
 SINGLETON: beginning-of-line INSTANCE: beginning-of-line node
 SINGLETON: end-of-line INSTANCE: end-of-line node
-TUPLE: option-on option ; INSTANCE: option-on node
+TUPLE: option option on? ; INSTANCE: option node
-TUPLE: option-off option ; INSTANCE: option-off node
+
 SINGLETONS: unix-lines dotall multiline comments case-insensitive
 unicode-case reversed-regexp ;
-SINGLETONS: letter-class LETTER-class Letter-class digit-class
+SINGLETONS: beginning-of-character-class end-of-character-class
 alpha-class non-newline-blank-class
 ascii-class punctuation-class java-printable-class blank-class
 control-character-class hex-digit-class java-blank-class c-identifier-class
 unmatchable-class terminator-class word-boundary-class ;
 SINGLETONS: beginning-of-group end-of-group
 beginning-of-character-class end-of-character-class
 left-parenthesis pipe caret dash ;
-: get-option ( option -- ? ) current-regexp get options>> at ;
+: push1 ( obj -- ) input-stream get stream>> push ;
-: get-unix-lines ( -- ? ) unix-lines get-option ;
+: peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ;
-: get-dotall ( -- ? ) dotall get-option ;
+: pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ;
-: get-multiline ( -- ? ) multiline get-option ;
+: drop1 ( -- ) read1 drop ;
-: get-comments ( -- ? ) comments get-option ;
+
-: get-case-insensitive ( -- ? ) case-insensitive get-option ;
+: stack ( -- obj ) current-regexp get stack>> ;
-: get-unicode-case ( -- ? ) unicode-case get-option ;
+: change-whole-stack ( quot -- )
-: get-reversed-regexp ( -- ? ) reversed-regexp get-option ;
+    current-regexp get
    [ stack>> swap call ] keep (>>stack) ; inline
 : push-stack ( obj -- ) stack push ;
 : pop-stack ( -- obj ) stack pop ;
 : cut-out ( vector n -- vector' vector ) cut rest ;
 ERROR: cut-stack-error ;
 : cut-stack ( obj vector -- vector' vector )
    tuck last-index [ cut-stack-error ] unless* cut-out swap ;
 : <possessive-kleene-star> ( obj -- kleene ) possessive-kleene-star boa ;
 : <reluctant-kleene-star> ( obj -- kleene ) reluctant-kleene-star boa ;
@ -80,18 +72,11 @@ left-parenthesis pipe caret dash ;
 : <negation> ( obj -- negation ) negation boa ;
 : <concatenation> ( seq -- concatenation )
-    >vector get-reversed-regexp [ reverse ] when
+    >vector [ epsilon ] [ concatenation boa ] if-empty ;
    [ epsilon ] [ concatenation boa ] if-empty ;
 : <alternation> ( seq -- alternation ) >vector alternation boa ;
 : <capture-group> ( obj -- capture-group ) capture-group boa ;
 : <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
-: <constant> ( obj -- constant )
+: <constant> ( obj -- constant ) constant boa ;
    dup Letter? get-case-insensitive and [
        [ ch>lower ] [ ch>upper ] bi
        [ constant boa ] bi@ 2array <alternation>
    ] [
        constant boa
    ] if ;
 : first|concatenation ( seq -- first/concatenation )
    dup length 1 = [ first ] [ <concatenation> ] if ;
@ -100,21 +85,14 @@ left-parenthesis pipe caret dash ;
    dup length 1 = [ first ] [ <alternation> ] if ;
 : <character-class-range> ( from to -- obj )
    2dup [ Letter? ] bi@ or get-case-insensitive and [
        [ [ ch>lower ] bi@ character-class-range boa ]
        [ [ ch>upper ] bi@ character-class-range boa ] 2bi
        2array [ [ from>> ] [ to>> ] bi < ] filter
        [ unmatchable-class ] [ first|alternation ] if-empty
    ] [
    2dup <
-        [ character-class-range boa ] [ 2drop unmatchable-class ] if
+    [ character-class-range boa ] [ 2drop unmatchable-class ] if ;
    ] if ;
 ERROR: unmatched-parentheses ;
-ERROR: bad-option ch ;
+ERROR: unknown-regexp-option option ;
-: option ( ch -- singleton )
+: ch>option ( ch -- singleton )
    {
        { CHAR: i [ case-insensitive ] }
        { CHAR: d [ unix-lines ] }
@ -124,13 +102,21 @@ ERROR: bad-option ch ;
        { CHAR: s [ dotall ] }
        { CHAR: u [ unicode-case ] }
        { CHAR: x [ comments ] }
-        [ bad-option ]
+        [ unknown-regexp-option ]
    } case ;
-: option-on ( option -- ) current-regexp get options>> conjoin ;
+: option>ch ( option -- string )
-: option-off ( option -- ) current-regexp get options>> delete-at ;
+    {
        { case-insensitive [ CHAR: i ] }
        { multiline [ CHAR: m ] }
        { reversed-regexp [ CHAR: r ] }
        { dotall [ CHAR: s ] }
        [ unknown-regexp-option ]
    } case ;
 : toggle-option ( ch ? -- ) 
    [ ch>option ] dip option boa push-stack ;
 : toggle-option ( ch ? -- ) [ option ] dip [ option-on ] [ option-off ] if ;
 : (parse-options) ( string ? -- ) [ toggle-option ] curry each ;
 : parse-options ( string -- )
@ -176,7 +162,7 @@ DEFER: (parse-regexp)
    [ drop1 (parse-special-group) ]
    [ capture-group f nested-parse-regexp ] if ;
-: handle-dot ( -- ) get-dotall any-char any-char-no-nl ? push-stack ;
+: handle-dot ( -- ) any-char push-stack ;
 : handle-pipe ( -- ) pipe push-stack ;
 : (handle-star) ( obj -- kleene-star )
    peek1 {
@ -234,11 +220,8 @@ ERROR: invalid-range a b ;
        [ [ nip at-most-n ] [ at-least-n ] if* ] if
    ] [ drop 0 max exactly-n ] if ;
-: handle-front-anchor ( -- )
+: handle-front-anchor ( -- ) beginning-of-line push-stack ;
-    get-multiline beginning-of-line beginning-of-input ? push-stack ;
+: handle-back-anchor ( -- ) end-of-line push-stack ;
 : handle-back-anchor ( -- )
    get-multiline end-of-line end-of-input ? push-stack ;
 ERROR: bad-character-class obj ;
 ERROR: expected-posix-class ;
@ -247,8 +230,8 @@ ERROR: expected-posix-class ;
    read1 CHAR: { = [ expected-posix-class ] unless
    "}" read-until [ bad-character-class ] unless
    {
-        { "Lower" [ get-case-insensitive Letter-class letter-class ? ] }
+        { "Lower" [ letter-class ] }
-        { "Upper" [ get-case-insensitive Letter-class LETTER-class ? ] }
+        { "Upper" [ LETTER-class ] }
        { "Alpha" [ Letter-class ] }
        { "ASCII" [ ascii-class ] }
        { "Digit" [ digit-class ] }
@ -412,7 +395,8 @@ DEFER: handle-left-bracket
    [ first|concatenation ] map first|alternation ;
 : handle-right-parenthesis ( -- )
-    stack dup [ parentheses-group "members" word-prop member? ] find-last -rot cut rest
+    stack dup [ parentheses-group "members" word-prop member? ] find-last
    -rot cut rest
    [ [ push ] keep current-regexp get (>>stack) ]
    [ finish-regexp-parse push-stack ] bi* ;
@ -429,12 +413,9 @@ DEFER: handle-left-bracket
        { CHAR: [ [ handle-left-bracket t ] }
        { CHAR: \ [ handle-escape t ] }
        [
-            dup CHAR: $ = peek1 f = and [
+            dup CHAR: $ = peek1 f = and
-                drop
+            [ drop handle-back-anchor f ]
-                handle-back-anchor f
+            [ push-constant t ] if
            ] [
                push-constant t
            ] if
        ]
    } case ;
@ -451,7 +432,6 @@ DEFER: handle-left-bracket
                parse-regexp-beginning (parse-regexp)
            ] with-input-stream
        ] unless-empty
-        current-regexp get
+        current-regexp get [ finish-regexp-parse ] change-stack
-        stack finish-regexp-parse
+        dup stack>> >>parse-tree drop
            >>parse-tree drop
    ] with-variable ;
--- a/basis/regexp/regexp-tests.factor
+++ b/basis/regexp/regexp-tests.factor
@ -238,7 +238,7 @@ IN: regexp-tests
 [ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test
 [ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test
-[ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/r matches? ] unit-test
+[ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/ matches? ] unit-test
 [ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
 [ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
@ -307,17 +307,30 @@ IN: regexp-tests
 ! Bug in parsing word
 [ t ] [ "a" R' a' matches?  ] unit-test
 ! Convert to lowercase until E
 [ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
 [ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
 ! Convert to uppercase until E
 [ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
 [ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
 ! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
-[ t ] [ "a" R/ ^a/ matches? ] unit-test
+! [ t ] [ "a" R/ ^a/ matches? ] unit-test
-[ f ] [ "\na" R/ ^a/ matches? ] unit-test
+! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
-[ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
+! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
-[ f ] [ "\ra" R/ ^a/ matches? ] unit-test
+! [ f ] [ "\ra" R/ ^a/ matches? ] unit-test
-[ t ] [ "a" R/ a$/ matches? ] unit-test
+! [ t ] [ "a" R/ a$/ matches? ] unit-test
-[ f ] [ "a\n" R/ a$/ matches? ] unit-test
+! [ f ] [ "a\n" R/ a$/ matches? ] unit-test
-[ f ] [ "a\r" R/ a$/ matches? ] unit-test
+! [ f ] [ "a\r" R/ a$/ matches? ] unit-test
-[ f ] [ "a\r\n" R/ a$/ matches? ] unit-test
+! [ f ] [ "a\r\n" R/ a$/ matches? ] unit-test
 ! [ t ] [ "a" R/ a$|b$/ matches? ] unit-test
 ! [ t ] [ "b" R/ a$|b$/ matches? ] unit-test
 ! [ t ] [ "ab" R/ a$|b$/ matches? ] unit-test
 ! [ t ] [ "ba" R/ ba$|b$/ matches? ] unit-test
 ! [ t ] [ "a" R/ \Aa/ matches? ] unit-test
 ! [ f ] [ "\na" R/ \Aaa/ matches? ] unit-test
@ -347,14 +360,6 @@ IN: regexp-tests
 ! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
 ! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
 ! Convert to lowercase until E
 [ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
 [ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
 ! Convert to uppercase until E
 [ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
 [ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
 ! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test
 ! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test
 ! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test
--- a/basis/regexp/regexp.factor
+++ b/basis/regexp/regexp.factor
@ -16,6 +16,7 @@ IN: regexp
        H{ } clone >>nfa-traversal-flags
        H{ } clone >>dfa-traversal-flags
        H{ } clone >>options
        H{ } clone >>matchers
        reset-regexp ;
 : construct-regexp ( regexp -- regexp' )
@ -93,26 +94,6 @@ IN: regexp
        { "R| "  "|"  }
    } swap [ subseq? not nip ] curry assoc-find drop ;
 ERROR: unknown-regexp-option option ;
 : option>ch ( option -- string )
    {
        { case-insensitive [ CHAR: i ] }
        { multiline [ CHAR: m ] }
        { reversed-regexp [ CHAR: r ] }
        { dotall [ CHAR: s ] }
        [ unknown-regexp-option ]
    } case ;
 : ch>option ( ch -- option )
    {
        { CHAR: i [ case-insensitive ] }
        { CHAR: m [ multiline ] }
        { CHAR: r [ reversed-regexp ] }
        { CHAR: s [ dotall ] }
        [ unknown-regexp-option ]
    } case ;
 : string>options ( string -- options )
    [ ch>option dup ] H{ } map>assoc ;
--- a/basis/regexp/utils/utils.factor
+++ b/basis/regexp/utils/utils.factor
@ -26,23 +26,6 @@ IN: regexp.utils
 : ?insert-at ( value key hash/f -- hash )
    [ H{ } clone ] unless* [ insert-at ] keep ;
 : last-state ( regexp -- range ) stack>> peek first2 [a,b] ;
 : push1 ( obj -- ) input-stream get stream>> push ;
 : peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ;
 : pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ;
 : drop1 ( -- ) read1 drop ;
 : stack ( -- obj ) current-regexp get stack>> ;
 : change-whole-stack ( quot -- )
    current-regexp get
    [ stack>> swap call ] keep (>>stack) ; inline
 : push-stack ( obj -- ) stack push ;
 : pop-stack ( -- obj ) stack pop ;
 : cut-out ( vector n -- vector' vector ) cut rest ;
 ERROR: cut-stack-error ;
 : cut-stack ( obj vector -- vector' vector )
    tuck last-index [ cut-stack-error ] unless* cut-out swap ;
 ERROR: bad-octal number ;
 ERROR: bad-hex number ;
 : check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ;