move things around. the parser no longer adjusts nodes based on options, instead opting for nfa to handle it (case-insensitive, multiline, dotall, reversed..)

db4
Doug Coleman 2008-11-24 22:17:47 -06:00
parent e4a2b671d3
commit 384a11ecee
8 changed files with 158 additions and 156 deletions

View File

@ -5,12 +5,13 @@ IN: regexp.backend
TUPLE: regexp
raw
{ stack vector }
parse-tree
{ options hashtable }
stack
parse-tree
nfa-table
dfa-table
minimized-table
matchers
{ nfa-traversal-flags hashtable }
{ dfa-traversal-flags hashtable }
{ state integer }

View File

@ -1,12 +1,25 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors kernel math math.order symbols regexp.parser
USING: accessors kernel math math.order symbols
words regexp.utils unicode.categories combinators.short-circuit ;
IN: regexp.classes
SINGLETONS: any-char any-char-no-nl
letter-class LETTER-class Letter-class digit-class
alpha-class non-newline-blank-class
ascii-class punctuation-class java-printable-class blank-class
control-character-class hex-digit-class java-blank-class c-identifier-class
unmatchable-class terminator-class word-boundary-class ;
SINGLETONS: beginning-of-input beginning-of-line
end-of-input end-of-line ;
MIXIN: node
TUPLE: character-class-range from to ; INSTANCE: character-class-range node
GENERIC: class-member? ( obj class -- ? )
M: word class-member? ( obj class -- ? ) 2drop f ;
M: t class-member? ( obj class -- ? ) 2drop f ;
M: integer class-member? ( obj class -- ? ) 2drop f ;
@ -18,7 +31,7 @@ M: any-char class-member? ( obj class -- ? )
M: any-char-no-nl class-member? ( obj class -- ? )
drop CHAR: \n = not ;
M: letter-class class-member? ( obj class -- ? )
drop letter? ;
@ -70,3 +83,9 @@ M: terminator-class class-member? ( obj class -- ? )
[ CHAR: \u002028 = ]
[ CHAR: \u002029 = ]
} 1|| ;
M: beginning-of-line class-member? ( obj class -- ? )
2drop f ;
M: end-of-line class-member? ( obj class -- ? )
2drop f ;

View File

@ -3,7 +3,7 @@
USING: accessors arrays assocs grouping kernel regexp.backend
locals math namespaces regexp.parser sequences fry quotations
math.order math.ranges vectors unicode.categories regexp.utils
regexp.transition-tables words sets ;
regexp.transition-tables words sets regexp.classes unicode.case ;
IN: regexp.nfa
SYMBOL: negation-mode
@ -22,8 +22,13 @@ SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag
SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag
SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag
: add-global-flag ( flag -- )
current-regexp get nfa-table>> flags>> conjoin ;
: options ( -- obj ) current-regexp get options>> ;
: option? ( obj -- ? ) options key? ;
: option-on ( obj -- ) options conjoin ;
: option-off ( obj -- ) options delete-at ;
: next-state ( regexp -- state )
[ state>> ] [ [ 1+ ] change-state drop ] bi ;
@ -106,6 +111,7 @@ M: kleene-star nfa-node ( node -- )
M: concatenation nfa-node ( node -- )
seq>>
reversed-regexp option? [ <reversed> ] when
[ [ nfa-node ] each ]
[ length 1- [ concatenate-nodes ] times ] bi ;
@ -115,16 +121,59 @@ M: alternation nfa-node ( node -- )
[ length 1- [ alternate-nodes ] times ] bi ;
M: constant nfa-node ( node -- )
char>> literal-transition add-simple-entry ;
case-insensitive option? [
dup char>> [ ch>lower ] [ ch>upper ] bi
2dup = [
2drop
char>> literal-transition add-simple-entry
] [
[ literal-transition add-simple-entry ] bi@
alternate-nodes drop
] if
] [
char>> literal-transition add-simple-entry
] if ;
M: epsilon nfa-node ( node -- )
drop eps literal-transition add-simple-entry ;
M: word nfa-node ( node -- )
M: word nfa-node ( node -- ) class-transition add-simple-entry ;
M: any-char nfa-node ( node -- )
[ dotall option? ] dip any-char-no-nl ?
class-transition add-simple-entry ;
! M: beginning-of-text nfa-node ( node -- ) ;
M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
: choose-letter-class ( node -- node' )
case-insensitive option? Letter-class rot ? ;
M: letter-class nfa-node ( node -- )
choose-letter-class class-transition add-simple-entry ;
M: LETTER-class nfa-node ( node -- )
choose-letter-class class-transition add-simple-entry ;
M: character-class-range nfa-node ( node -- )
class-transition add-simple-entry ;
case-insensitive option? [
dup [ from>> ] [ to>> ] bi
2dup [ Letter? ] bi@ and [
rot drop
[ [ ch>lower ] bi@ character-class-range boa ]
[ [ ch>upper ] bi@ character-class-range boa ] 2bi
[ class-transition add-simple-entry ] bi@
alternate-nodes
] [
2drop
class-transition add-simple-entry
] if
] [
class-transition add-simple-entry
] if ;
M: capture-group nfa-node ( node -- )
eps literal-transition add-simple-entry
@ -141,26 +190,6 @@ M: non-capture-group nfa-node ( node -- )
M: reluctant-kleene-star nfa-node ( node -- )
term>> <kleene-star> nfa-node ;
M: beginning-of-line nfa-node ( node -- )
drop
eps literal-transition add-simple-entry
beginning-of-line add-global-flag ;
M: end-of-line nfa-node ( node -- )
drop
eps literal-transition add-simple-entry
end-of-line add-global-flag ;
M: beginning-of-input nfa-node ( node -- )
drop
eps literal-transition add-simple-entry
beginning-of-input add-global-flag ;
M: end-of-input nfa-node ( node -- )
drop
eps literal-transition add-simple-entry
end-of-input add-global-flag ;
M: negation nfa-node ( node -- )
negation-mode inc
term>> nfa-node
@ -182,6 +211,10 @@ M: lookbehind nfa-node ( node -- )
lookbehind-off add-traversal-flag
2 [ concatenate-nodes ] times ;
M: option nfa-node ( node -- )
[ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if
eps literal-transition add-simple-entry ;
: construct-nfa ( regexp -- )
[
reset-regexp

View File

@ -19,8 +19,8 @@ IN: regexp.parser
[ ] [ "(?:a)" test-regexp ] unit-test
[ ] [ "(?i:a)" test-regexp ] unit-test
[ ] [ "(?-i:a)" test-regexp ] unit-test
[ "(?z:a)" test-regexp ] [ bad-option? ] must-fail-with
[ "(?-z:a)" test-regexp ] [ bad-option? ] must-fail-with
[ "(?z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with
[ "(?-z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with
[ ] [ "(?=a)" test-regexp ] unit-test

View File

@ -4,12 +4,11 @@ USING: accessors arrays assocs combinators io io.streams.string
kernel math math.parser namespaces qualified sets
quotations sequences splitting symbols vectors math.order
unicode.categories strings regexp.backend regexp.utils
unicode.case words locals ;
unicode.case words locals regexp.classes ;
IN: regexp.parser
FROM: math.ranges => [a,b] ;
MIXIN: node
TUPLE: concatenation seq ; INSTANCE: concatenation node
TUPLE: alternation seq ; INSTANCE: alternation node
TUPLE: kleene-star term ; INSTANCE: kleene-star node
@ -40,38 +39,31 @@ INSTANCE: independent-group parentheses-group
TUPLE: comment-group term ; INSTANCE: comment-group node
INSTANCE: comment-group parentheses-group
TUPLE: character-class-range from to ; INSTANCE: character-class-range node
SINGLETON: epsilon INSTANCE: epsilon node
SINGLETON: any-char INSTANCE: any-char node
SINGLETON: any-char-no-nl INSTANCE: any-char-no-nl node
SINGLETON: beginning-of-input INSTANCE: beginning-of-input node
SINGLETON: end-of-input INSTANCE: end-of-input node
SINGLETON: beginning-of-line INSTANCE: beginning-of-line node
SINGLETON: end-of-line INSTANCE: end-of-line node
TUPLE: option-on option ; INSTANCE: option-on node
TUPLE: option-off option ; INSTANCE: option-off node
TUPLE: option option on? ; INSTANCE: option node
SINGLETONS: unix-lines dotall multiline comments case-insensitive
unicode-case reversed-regexp ;
SINGLETONS: letter-class LETTER-class Letter-class digit-class
alpha-class non-newline-blank-class
ascii-class punctuation-class java-printable-class blank-class
control-character-class hex-digit-class java-blank-class c-identifier-class
unmatchable-class terminator-class word-boundary-class ;
SINGLETONS: beginning-of-group end-of-group
beginning-of-character-class end-of-character-class
SINGLETONS: beginning-of-character-class end-of-character-class
left-parenthesis pipe caret dash ;
: get-option ( option -- ? ) current-regexp get options>> at ;
: get-unix-lines ( -- ? ) unix-lines get-option ;
: get-dotall ( -- ? ) dotall get-option ;
: get-multiline ( -- ? ) multiline get-option ;
: get-comments ( -- ? ) comments get-option ;
: get-case-insensitive ( -- ? ) case-insensitive get-option ;
: get-unicode-case ( -- ? ) unicode-case get-option ;
: get-reversed-regexp ( -- ? ) reversed-regexp get-option ;
: push1 ( obj -- ) input-stream get stream>> push ;
: peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ;
: pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ;
: drop1 ( -- ) read1 drop ;
: stack ( -- obj ) current-regexp get stack>> ;
: change-whole-stack ( quot -- )
current-regexp get
[ stack>> swap call ] keep (>>stack) ; inline
: push-stack ( obj -- ) stack push ;
: pop-stack ( -- obj ) stack pop ;
: cut-out ( vector n -- vector' vector ) cut rest ;
ERROR: cut-stack-error ;
: cut-stack ( obj vector -- vector' vector )
tuck last-index [ cut-stack-error ] unless* cut-out swap ;
: <possessive-kleene-star> ( obj -- kleene ) possessive-kleene-star boa ;
: <reluctant-kleene-star> ( obj -- kleene ) reluctant-kleene-star boa ;
@ -80,18 +72,11 @@ left-parenthesis pipe caret dash ;
: <negation> ( obj -- negation ) negation boa ;
: <concatenation> ( seq -- concatenation )
>vector get-reversed-regexp [ reverse ] when
[ epsilon ] [ concatenation boa ] if-empty ;
>vector [ epsilon ] [ concatenation boa ] if-empty ;
: <alternation> ( seq -- alternation ) >vector alternation boa ;
: <capture-group> ( obj -- capture-group ) capture-group boa ;
: <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
: <constant> ( obj -- constant )
dup Letter? get-case-insensitive and [
[ ch>lower ] [ ch>upper ] bi
[ constant boa ] bi@ 2array <alternation>
] [
constant boa
] if ;
: <constant> ( obj -- constant ) constant boa ;
: first|concatenation ( seq -- first/concatenation )
dup length 1 = [ first ] [ <concatenation> ] if ;
@ -100,21 +85,14 @@ left-parenthesis pipe caret dash ;
dup length 1 = [ first ] [ <alternation> ] if ;
: <character-class-range> ( from to -- obj )
2dup [ Letter? ] bi@ or get-case-insensitive and [
[ [ ch>lower ] bi@ character-class-range boa ]
[ [ ch>upper ] bi@ character-class-range boa ] 2bi
2array [ [ from>> ] [ to>> ] bi < ] filter
[ unmatchable-class ] [ first|alternation ] if-empty
] [
2dup <
[ character-class-range boa ] [ 2drop unmatchable-class ] if
] if ;
2dup <
[ character-class-range boa ] [ 2drop unmatchable-class ] if ;
ERROR: unmatched-parentheses ;
ERROR: bad-option ch ;
ERROR: unknown-regexp-option option ;
: option ( ch -- singleton )
: ch>option ( ch -- singleton )
{
{ CHAR: i [ case-insensitive ] }
{ CHAR: d [ unix-lines ] }
@ -124,13 +102,21 @@ ERROR: bad-option ch ;
{ CHAR: s [ dotall ] }
{ CHAR: u [ unicode-case ] }
{ CHAR: x [ comments ] }
[ bad-option ]
[ unknown-regexp-option ]
} case ;
: option-on ( option -- ) current-regexp get options>> conjoin ;
: option-off ( option -- ) current-regexp get options>> delete-at ;
: option>ch ( option -- string )
{
{ case-insensitive [ CHAR: i ] }
{ multiline [ CHAR: m ] }
{ reversed-regexp [ CHAR: r ] }
{ dotall [ CHAR: s ] }
[ unknown-regexp-option ]
} case ;
: toggle-option ( ch ? -- )
[ ch>option ] dip option boa push-stack ;
: toggle-option ( ch ? -- ) [ option ] dip [ option-on ] [ option-off ] if ;
: (parse-options) ( string ? -- ) [ toggle-option ] curry each ;
: parse-options ( string -- )
@ -176,7 +162,7 @@ DEFER: (parse-regexp)
[ drop1 (parse-special-group) ]
[ capture-group f nested-parse-regexp ] if ;
: handle-dot ( -- ) get-dotall any-char any-char-no-nl ? push-stack ;
: handle-dot ( -- ) any-char push-stack ;
: handle-pipe ( -- ) pipe push-stack ;
: (handle-star) ( obj -- kleene-star )
peek1 {
@ -234,11 +220,8 @@ ERROR: invalid-range a b ;
[ [ nip at-most-n ] [ at-least-n ] if* ] if
] [ drop 0 max exactly-n ] if ;
: handle-front-anchor ( -- )
get-multiline beginning-of-line beginning-of-input ? push-stack ;
: handle-back-anchor ( -- )
get-multiline end-of-line end-of-input ? push-stack ;
: handle-front-anchor ( -- ) beginning-of-line push-stack ;
: handle-back-anchor ( -- ) end-of-line push-stack ;
ERROR: bad-character-class obj ;
ERROR: expected-posix-class ;
@ -247,8 +230,8 @@ ERROR: expected-posix-class ;
read1 CHAR: { = [ expected-posix-class ] unless
"}" read-until [ bad-character-class ] unless
{
{ "Lower" [ get-case-insensitive Letter-class letter-class ? ] }
{ "Upper" [ get-case-insensitive Letter-class LETTER-class ? ] }
{ "Lower" [ letter-class ] }
{ "Upper" [ LETTER-class ] }
{ "Alpha" [ Letter-class ] }
{ "ASCII" [ ascii-class ] }
{ "Digit" [ digit-class ] }
@ -412,7 +395,8 @@ DEFER: handle-left-bracket
[ first|concatenation ] map first|alternation ;
: handle-right-parenthesis ( -- )
stack dup [ parentheses-group "members" word-prop member? ] find-last -rot cut rest
stack dup [ parentheses-group "members" word-prop member? ] find-last
-rot cut rest
[ [ push ] keep current-regexp get (>>stack) ]
[ finish-regexp-parse push-stack ] bi* ;
@ -429,12 +413,9 @@ DEFER: handle-left-bracket
{ CHAR: [ [ handle-left-bracket t ] }
{ CHAR: \ [ handle-escape t ] }
[
dup CHAR: $ = peek1 f = and [
drop
handle-back-anchor f
] [
push-constant t
] if
dup CHAR: $ = peek1 f = and
[ drop handle-back-anchor f ]
[ push-constant t ] if
]
} case ;
@ -451,7 +432,6 @@ DEFER: handle-left-bracket
parse-regexp-beginning (parse-regexp)
] with-input-stream
] unless-empty
current-regexp get
stack finish-regexp-parse
>>parse-tree drop
current-regexp get [ finish-regexp-parse ] change-stack
dup stack>> >>parse-tree drop
] with-variable ;

View File

@ -238,7 +238,7 @@ IN: regexp-tests
[ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test
[ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test
[ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/r matches? ] unit-test
[ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/ matches? ] unit-test
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
@ -307,17 +307,30 @@ IN: regexp-tests
! Bug in parsing word
[ t ] [ "a" R' a' matches? ] unit-test
! Convert to lowercase until E
[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
! Convert to uppercase until E
[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
[ t ] [ "a" R/ ^a/ matches? ] unit-test
[ f ] [ "\na" R/ ^a/ matches? ] unit-test
[ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
[ f ] [ "\ra" R/ ^a/ matches? ] unit-test
! [ t ] [ "a" R/ ^a/ matches? ] unit-test
! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
! [ f ] [ "\ra" R/ ^a/ matches? ] unit-test
[ t ] [ "a" R/ a$/ matches? ] unit-test
[ f ] [ "a\n" R/ a$/ matches? ] unit-test
[ f ] [ "a\r" R/ a$/ matches? ] unit-test
[ f ] [ "a\r\n" R/ a$/ matches? ] unit-test
! [ t ] [ "a" R/ a$/ matches? ] unit-test
! [ f ] [ "a\n" R/ a$/ matches? ] unit-test
! [ f ] [ "a\r" R/ a$/ matches? ] unit-test
! [ f ] [ "a\r\n" R/ a$/ matches? ] unit-test
! [ t ] [ "a" R/ a$|b$/ matches? ] unit-test
! [ t ] [ "b" R/ a$|b$/ matches? ] unit-test
! [ t ] [ "ab" R/ a$|b$/ matches? ] unit-test
! [ t ] [ "ba" R/ ba$|b$/ matches? ] unit-test
! [ t ] [ "a" R/ \Aa/ matches? ] unit-test
! [ f ] [ "\na" R/ \Aaa/ matches? ] unit-test
@ -347,14 +360,6 @@ IN: regexp-tests
! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
! Convert to lowercase until E
[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
! Convert to uppercase until E
[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test
! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test
! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test

View File

@ -16,6 +16,7 @@ IN: regexp
H{ } clone >>nfa-traversal-flags
H{ } clone >>dfa-traversal-flags
H{ } clone >>options
H{ } clone >>matchers
reset-regexp ;
: construct-regexp ( regexp -- regexp' )
@ -93,26 +94,6 @@ IN: regexp
{ "R| " "|" }
} swap [ subseq? not nip ] curry assoc-find drop ;
ERROR: unknown-regexp-option option ;
: option>ch ( option -- string )
{
{ case-insensitive [ CHAR: i ] }
{ multiline [ CHAR: m ] }
{ reversed-regexp [ CHAR: r ] }
{ dotall [ CHAR: s ] }
[ unknown-regexp-option ]
} case ;
: ch>option ( ch -- option )
{
{ CHAR: i [ case-insensitive ] }
{ CHAR: m [ multiline ] }
{ CHAR: r [ reversed-regexp ] }
{ CHAR: s [ dotall ] }
[ unknown-regexp-option ]
} case ;
: string>options ( string -- options )
[ ch>option dup ] H{ } map>assoc ;

View File

@ -26,23 +26,6 @@ IN: regexp.utils
: ?insert-at ( value key hash/f -- hash )
[ H{ } clone ] unless* [ insert-at ] keep ;
: last-state ( regexp -- range ) stack>> peek first2 [a,b] ;
: push1 ( obj -- ) input-stream get stream>> push ;
: peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ;
: pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ;
: drop1 ( -- ) read1 drop ;
: stack ( -- obj ) current-regexp get stack>> ;
: change-whole-stack ( quot -- )
current-regexp get
[ stack>> swap call ] keep (>>stack) ; inline
: push-stack ( obj -- ) stack push ;
: pop-stack ( -- obj ) stack pop ;
: cut-out ( vector n -- vector' vector ) cut rest ;
ERROR: cut-stack-error ;
: cut-stack ( obj vector -- vector' vector )
tuck last-index [ cut-stack-error ] unless* cut-out swap ;
ERROR: bad-octal number ;
ERROR: bad-hex number ;
: check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ;