move things around. the parser no longer adjusts nodes based on options, instead opting for nfa to handle it (case-insensitive, multiline, dotall, reversed..)

db4
Doug Coleman 2008-11-24 22:17:47 -06:00
parent e4a2b671d3
commit 384a11ecee
8 changed files with 158 additions and 156 deletions

View File

@ -5,12 +5,13 @@ IN: regexp.backend
TUPLE: regexp TUPLE: regexp
raw raw
{ stack vector }
parse-tree
{ options hashtable } { options hashtable }
stack
parse-tree
nfa-table nfa-table
dfa-table dfa-table
minimized-table minimized-table
matchers
{ nfa-traversal-flags hashtable } { nfa-traversal-flags hashtable }
{ dfa-traversal-flags hashtable } { dfa-traversal-flags hashtable }
{ state integer } { state integer }

View File

@ -1,12 +1,25 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors kernel math math.order symbols regexp.parser USING: accessors kernel math math.order symbols
words regexp.utils unicode.categories combinators.short-circuit ; words regexp.utils unicode.categories combinators.short-circuit ;
IN: regexp.classes IN: regexp.classes
SINGLETONS: any-char any-char-no-nl
letter-class LETTER-class Letter-class digit-class
alpha-class non-newline-blank-class
ascii-class punctuation-class java-printable-class blank-class
control-character-class hex-digit-class java-blank-class c-identifier-class
unmatchable-class terminator-class word-boundary-class ;
SINGLETONS: beginning-of-input beginning-of-line
end-of-input end-of-line ;
MIXIN: node
TUPLE: character-class-range from to ; INSTANCE: character-class-range node
GENERIC: class-member? ( obj class -- ? ) GENERIC: class-member? ( obj class -- ? )
M: word class-member? ( obj class -- ? ) 2drop f ; M: t class-member? ( obj class -- ? ) 2drop f ;
M: integer class-member? ( obj class -- ? ) 2drop f ; M: integer class-member? ( obj class -- ? ) 2drop f ;
@ -70,3 +83,9 @@ M: terminator-class class-member? ( obj class -- ? )
[ CHAR: \u002028 = ] [ CHAR: \u002028 = ]
[ CHAR: \u002029 = ] [ CHAR: \u002029 = ]
} 1|| ; } 1|| ;
M: beginning-of-line class-member? ( obj class -- ? )
2drop f ;
M: end-of-line class-member? ( obj class -- ? )
2drop f ;

View File

@ -3,7 +3,7 @@
USING: accessors arrays assocs grouping kernel regexp.backend USING: accessors arrays assocs grouping kernel regexp.backend
locals math namespaces regexp.parser sequences fry quotations locals math namespaces regexp.parser sequences fry quotations
math.order math.ranges vectors unicode.categories regexp.utils math.order math.ranges vectors unicode.categories regexp.utils
regexp.transition-tables words sets ; regexp.transition-tables words sets regexp.classes unicode.case ;
IN: regexp.nfa IN: regexp.nfa
SYMBOL: negation-mode SYMBOL: negation-mode
@ -22,8 +22,13 @@ SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag
SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag
SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag
: add-global-flag ( flag -- ) : options ( -- obj ) current-regexp get options>> ;
current-regexp get nfa-table>> flags>> conjoin ;
: option? ( obj -- ? ) options key? ;
: option-on ( obj -- ) options conjoin ;
: option-off ( obj -- ) options delete-at ;
: next-state ( regexp -- state ) : next-state ( regexp -- state )
[ state>> ] [ [ 1+ ] change-state drop ] bi ; [ state>> ] [ [ 1+ ] change-state drop ] bi ;
@ -106,6 +111,7 @@ M: kleene-star nfa-node ( node -- )
M: concatenation nfa-node ( node -- ) M: concatenation nfa-node ( node -- )
seq>> seq>>
reversed-regexp option? [ <reversed> ] when
[ [ nfa-node ] each ] [ [ nfa-node ] each ]
[ length 1- [ concatenate-nodes ] times ] bi ; [ length 1- [ concatenate-nodes ] times ] bi ;
@ -115,16 +121,59 @@ M: alternation nfa-node ( node -- )
[ length 1- [ alternate-nodes ] times ] bi ; [ length 1- [ alternate-nodes ] times ] bi ;
M: constant nfa-node ( node -- ) M: constant nfa-node ( node -- )
char>> literal-transition add-simple-entry ; case-insensitive option? [
dup char>> [ ch>lower ] [ ch>upper ] bi
2dup = [
2drop
char>> literal-transition add-simple-entry
] [
[ literal-transition add-simple-entry ] bi@
alternate-nodes drop
] if
] [
char>> literal-transition add-simple-entry
] if ;
M: epsilon nfa-node ( node -- ) M: epsilon nfa-node ( node -- )
drop eps literal-transition add-simple-entry ; drop eps literal-transition add-simple-entry ;
M: word nfa-node ( node -- ) M: word nfa-node ( node -- ) class-transition add-simple-entry ;
M: any-char nfa-node ( node -- )
[ dotall option? ] dip any-char-no-nl ?
class-transition add-simple-entry ; class-transition add-simple-entry ;
! M: beginning-of-text nfa-node ( node -- ) ;
M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
: choose-letter-class ( node -- node' )
case-insensitive option? Letter-class rot ? ;
M: letter-class nfa-node ( node -- )
choose-letter-class class-transition add-simple-entry ;
M: LETTER-class nfa-node ( node -- )
choose-letter-class class-transition add-simple-entry ;
M: character-class-range nfa-node ( node -- ) M: character-class-range nfa-node ( node -- )
class-transition add-simple-entry ; case-insensitive option? [
dup [ from>> ] [ to>> ] bi
2dup [ Letter? ] bi@ and [
rot drop
[ [ ch>lower ] bi@ character-class-range boa ]
[ [ ch>upper ] bi@ character-class-range boa ] 2bi
[ class-transition add-simple-entry ] bi@
alternate-nodes
] [
2drop
class-transition add-simple-entry
] if
] [
class-transition add-simple-entry
] if ;
M: capture-group nfa-node ( node -- ) M: capture-group nfa-node ( node -- )
eps literal-transition add-simple-entry eps literal-transition add-simple-entry
@ -141,26 +190,6 @@ M: non-capture-group nfa-node ( node -- )
M: reluctant-kleene-star nfa-node ( node -- ) M: reluctant-kleene-star nfa-node ( node -- )
term>> <kleene-star> nfa-node ; term>> <kleene-star> nfa-node ;
M: beginning-of-line nfa-node ( node -- )
drop
eps literal-transition add-simple-entry
beginning-of-line add-global-flag ;
M: end-of-line nfa-node ( node -- )
drop
eps literal-transition add-simple-entry
end-of-line add-global-flag ;
M: beginning-of-input nfa-node ( node -- )
drop
eps literal-transition add-simple-entry
beginning-of-input add-global-flag ;
M: end-of-input nfa-node ( node -- )
drop
eps literal-transition add-simple-entry
end-of-input add-global-flag ;
M: negation nfa-node ( node -- ) M: negation nfa-node ( node -- )
negation-mode inc negation-mode inc
term>> nfa-node term>> nfa-node
@ -182,6 +211,10 @@ M: lookbehind nfa-node ( node -- )
lookbehind-off add-traversal-flag lookbehind-off add-traversal-flag
2 [ concatenate-nodes ] times ; 2 [ concatenate-nodes ] times ;
M: option nfa-node ( node -- )
[ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if
eps literal-transition add-simple-entry ;
: construct-nfa ( regexp -- ) : construct-nfa ( regexp -- )
[ [
reset-regexp reset-regexp

View File

@ -19,8 +19,8 @@ IN: regexp.parser
[ ] [ "(?:a)" test-regexp ] unit-test [ ] [ "(?:a)" test-regexp ] unit-test
[ ] [ "(?i:a)" test-regexp ] unit-test [ ] [ "(?i:a)" test-regexp ] unit-test
[ ] [ "(?-i:a)" test-regexp ] unit-test [ ] [ "(?-i:a)" test-regexp ] unit-test
[ "(?z:a)" test-regexp ] [ bad-option? ] must-fail-with [ "(?z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with
[ "(?-z:a)" test-regexp ] [ bad-option? ] must-fail-with [ "(?-z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with
[ ] [ "(?=a)" test-regexp ] unit-test [ ] [ "(?=a)" test-regexp ] unit-test

View File

@ -4,12 +4,11 @@ USING: accessors arrays assocs combinators io io.streams.string
kernel math math.parser namespaces qualified sets kernel math math.parser namespaces qualified sets
quotations sequences splitting symbols vectors math.order quotations sequences splitting symbols vectors math.order
unicode.categories strings regexp.backend regexp.utils unicode.categories strings regexp.backend regexp.utils
unicode.case words locals ; unicode.case words locals regexp.classes ;
IN: regexp.parser IN: regexp.parser
FROM: math.ranges => [a,b] ; FROM: math.ranges => [a,b] ;
MIXIN: node
TUPLE: concatenation seq ; INSTANCE: concatenation node TUPLE: concatenation seq ; INSTANCE: concatenation node
TUPLE: alternation seq ; INSTANCE: alternation node TUPLE: alternation seq ; INSTANCE: alternation node
TUPLE: kleene-star term ; INSTANCE: kleene-star node TUPLE: kleene-star term ; INSTANCE: kleene-star node
@ -40,38 +39,31 @@ INSTANCE: independent-group parentheses-group
TUPLE: comment-group term ; INSTANCE: comment-group node TUPLE: comment-group term ; INSTANCE: comment-group node
INSTANCE: comment-group parentheses-group INSTANCE: comment-group parentheses-group
TUPLE: character-class-range from to ; INSTANCE: character-class-range node
SINGLETON: epsilon INSTANCE: epsilon node SINGLETON: epsilon INSTANCE: epsilon node
SINGLETON: any-char INSTANCE: any-char node
SINGLETON: any-char-no-nl INSTANCE: any-char-no-nl node
SINGLETON: beginning-of-input INSTANCE: beginning-of-input node
SINGLETON: end-of-input INSTANCE: end-of-input node
SINGLETON: beginning-of-line INSTANCE: beginning-of-line node
SINGLETON: end-of-line INSTANCE: end-of-line node
TUPLE: option-on option ; INSTANCE: option-on node TUPLE: option option on? ; INSTANCE: option node
TUPLE: option-off option ; INSTANCE: option-off node
SINGLETONS: unix-lines dotall multiline comments case-insensitive SINGLETONS: unix-lines dotall multiline comments case-insensitive
unicode-case reversed-regexp ; unicode-case reversed-regexp ;
SINGLETONS: letter-class LETTER-class Letter-class digit-class SINGLETONS: beginning-of-character-class end-of-character-class
alpha-class non-newline-blank-class
ascii-class punctuation-class java-printable-class blank-class
control-character-class hex-digit-class java-blank-class c-identifier-class
unmatchable-class terminator-class word-boundary-class ;
SINGLETONS: beginning-of-group end-of-group
beginning-of-character-class end-of-character-class
left-parenthesis pipe caret dash ; left-parenthesis pipe caret dash ;
: get-option ( option -- ? ) current-regexp get options>> at ; : push1 ( obj -- ) input-stream get stream>> push ;
: get-unix-lines ( -- ? ) unix-lines get-option ; : peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ;
: get-dotall ( -- ? ) dotall get-option ; : pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ;
: get-multiline ( -- ? ) multiline get-option ; : drop1 ( -- ) read1 drop ;
: get-comments ( -- ? ) comments get-option ;
: get-case-insensitive ( -- ? ) case-insensitive get-option ; : stack ( -- obj ) current-regexp get stack>> ;
: get-unicode-case ( -- ? ) unicode-case get-option ; : change-whole-stack ( quot -- )
: get-reversed-regexp ( -- ? ) reversed-regexp get-option ; current-regexp get
[ stack>> swap call ] keep (>>stack) ; inline
: push-stack ( obj -- ) stack push ;
: pop-stack ( -- obj ) stack pop ;
: cut-out ( vector n -- vector' vector ) cut rest ;
ERROR: cut-stack-error ;
: cut-stack ( obj vector -- vector' vector )
tuck last-index [ cut-stack-error ] unless* cut-out swap ;
: <possessive-kleene-star> ( obj -- kleene ) possessive-kleene-star boa ; : <possessive-kleene-star> ( obj -- kleene ) possessive-kleene-star boa ;
: <reluctant-kleene-star> ( obj -- kleene ) reluctant-kleene-star boa ; : <reluctant-kleene-star> ( obj -- kleene ) reluctant-kleene-star boa ;
@ -80,18 +72,11 @@ left-parenthesis pipe caret dash ;
: <negation> ( obj -- negation ) negation boa ; : <negation> ( obj -- negation ) negation boa ;
: <concatenation> ( seq -- concatenation ) : <concatenation> ( seq -- concatenation )
>vector get-reversed-regexp [ reverse ] when >vector [ epsilon ] [ concatenation boa ] if-empty ;
[ epsilon ] [ concatenation boa ] if-empty ;
: <alternation> ( seq -- alternation ) >vector alternation boa ; : <alternation> ( seq -- alternation ) >vector alternation boa ;
: <capture-group> ( obj -- capture-group ) capture-group boa ; : <capture-group> ( obj -- capture-group ) capture-group boa ;
: <kleene-star> ( obj -- kleene-star ) kleene-star boa ; : <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
: <constant> ( obj -- constant ) : <constant> ( obj -- constant ) constant boa ;
dup Letter? get-case-insensitive and [
[ ch>lower ] [ ch>upper ] bi
[ constant boa ] bi@ 2array <alternation>
] [
constant boa
] if ;
: first|concatenation ( seq -- first/concatenation ) : first|concatenation ( seq -- first/concatenation )
dup length 1 = [ first ] [ <concatenation> ] if ; dup length 1 = [ first ] [ <concatenation> ] if ;
@ -100,21 +85,14 @@ left-parenthesis pipe caret dash ;
dup length 1 = [ first ] [ <alternation> ] if ; dup length 1 = [ first ] [ <alternation> ] if ;
: <character-class-range> ( from to -- obj ) : <character-class-range> ( from to -- obj )
2dup [ Letter? ] bi@ or get-case-insensitive and [
[ [ ch>lower ] bi@ character-class-range boa ]
[ [ ch>upper ] bi@ character-class-range boa ] 2bi
2array [ [ from>> ] [ to>> ] bi < ] filter
[ unmatchable-class ] [ first|alternation ] if-empty
] [
2dup < 2dup <
[ character-class-range boa ] [ 2drop unmatchable-class ] if [ character-class-range boa ] [ 2drop unmatchable-class ] if ;
] if ;
ERROR: unmatched-parentheses ; ERROR: unmatched-parentheses ;
ERROR: bad-option ch ; ERROR: unknown-regexp-option option ;
: option ( ch -- singleton ) : ch>option ( ch -- singleton )
{ {
{ CHAR: i [ case-insensitive ] } { CHAR: i [ case-insensitive ] }
{ CHAR: d [ unix-lines ] } { CHAR: d [ unix-lines ] }
@ -124,13 +102,21 @@ ERROR: bad-option ch ;
{ CHAR: s [ dotall ] } { CHAR: s [ dotall ] }
{ CHAR: u [ unicode-case ] } { CHAR: u [ unicode-case ] }
{ CHAR: x [ comments ] } { CHAR: x [ comments ] }
[ bad-option ] [ unknown-regexp-option ]
} case ; } case ;
: option-on ( option -- ) current-regexp get options>> conjoin ; : option>ch ( option -- string )
: option-off ( option -- ) current-regexp get options>> delete-at ; {
{ case-insensitive [ CHAR: i ] }
{ multiline [ CHAR: m ] }
{ reversed-regexp [ CHAR: r ] }
{ dotall [ CHAR: s ] }
[ unknown-regexp-option ]
} case ;
: toggle-option ( ch ? -- )
[ ch>option ] dip option boa push-stack ;
: toggle-option ( ch ? -- ) [ option ] dip [ option-on ] [ option-off ] if ;
: (parse-options) ( string ? -- ) [ toggle-option ] curry each ; : (parse-options) ( string ? -- ) [ toggle-option ] curry each ;
: parse-options ( string -- ) : parse-options ( string -- )
@ -176,7 +162,7 @@ DEFER: (parse-regexp)
[ drop1 (parse-special-group) ] [ drop1 (parse-special-group) ]
[ capture-group f nested-parse-regexp ] if ; [ capture-group f nested-parse-regexp ] if ;
: handle-dot ( -- ) get-dotall any-char any-char-no-nl ? push-stack ; : handle-dot ( -- ) any-char push-stack ;
: handle-pipe ( -- ) pipe push-stack ; : handle-pipe ( -- ) pipe push-stack ;
: (handle-star) ( obj -- kleene-star ) : (handle-star) ( obj -- kleene-star )
peek1 { peek1 {
@ -234,11 +220,8 @@ ERROR: invalid-range a b ;
[ [ nip at-most-n ] [ at-least-n ] if* ] if [ [ nip at-most-n ] [ at-least-n ] if* ] if
] [ drop 0 max exactly-n ] if ; ] [ drop 0 max exactly-n ] if ;
: handle-front-anchor ( -- ) : handle-front-anchor ( -- ) beginning-of-line push-stack ;
get-multiline beginning-of-line beginning-of-input ? push-stack ; : handle-back-anchor ( -- ) end-of-line push-stack ;
: handle-back-anchor ( -- )
get-multiline end-of-line end-of-input ? push-stack ;
ERROR: bad-character-class obj ; ERROR: bad-character-class obj ;
ERROR: expected-posix-class ; ERROR: expected-posix-class ;
@ -247,8 +230,8 @@ ERROR: expected-posix-class ;
read1 CHAR: { = [ expected-posix-class ] unless read1 CHAR: { = [ expected-posix-class ] unless
"}" read-until [ bad-character-class ] unless "}" read-until [ bad-character-class ] unless
{ {
{ "Lower" [ get-case-insensitive Letter-class letter-class ? ] } { "Lower" [ letter-class ] }
{ "Upper" [ get-case-insensitive Letter-class LETTER-class ? ] } { "Upper" [ LETTER-class ] }
{ "Alpha" [ Letter-class ] } { "Alpha" [ Letter-class ] }
{ "ASCII" [ ascii-class ] } { "ASCII" [ ascii-class ] }
{ "Digit" [ digit-class ] } { "Digit" [ digit-class ] }
@ -412,7 +395,8 @@ DEFER: handle-left-bracket
[ first|concatenation ] map first|alternation ; [ first|concatenation ] map first|alternation ;
: handle-right-parenthesis ( -- ) : handle-right-parenthesis ( -- )
stack dup [ parentheses-group "members" word-prop member? ] find-last -rot cut rest stack dup [ parentheses-group "members" word-prop member? ] find-last
-rot cut rest
[ [ push ] keep current-regexp get (>>stack) ] [ [ push ] keep current-regexp get (>>stack) ]
[ finish-regexp-parse push-stack ] bi* ; [ finish-regexp-parse push-stack ] bi* ;
@ -429,12 +413,9 @@ DEFER: handle-left-bracket
{ CHAR: [ [ handle-left-bracket t ] } { CHAR: [ [ handle-left-bracket t ] }
{ CHAR: \ [ handle-escape t ] } { CHAR: \ [ handle-escape t ] }
[ [
dup CHAR: $ = peek1 f = and [ dup CHAR: $ = peek1 f = and
drop [ drop handle-back-anchor f ]
handle-back-anchor f [ push-constant t ] if
] [
push-constant t
] if
] ]
} case ; } case ;
@ -451,7 +432,6 @@ DEFER: handle-left-bracket
parse-regexp-beginning (parse-regexp) parse-regexp-beginning (parse-regexp)
] with-input-stream ] with-input-stream
] unless-empty ] unless-empty
current-regexp get current-regexp get [ finish-regexp-parse ] change-stack
stack finish-regexp-parse dup stack>> >>parse-tree drop
>>parse-tree drop
] with-variable ; ] with-variable ;

View File

@ -238,7 +238,7 @@ IN: regexp-tests
[ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test [ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test
[ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test [ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test
[ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/r matches? ] unit-test [ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/ matches? ] unit-test
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test [ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test [ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
@ -307,17 +307,30 @@ IN: regexp-tests
! Bug in parsing word ! Bug in parsing word
[ t ] [ "a" R' a' matches? ] unit-test [ t ] [ "a" R' a' matches? ] unit-test
! Convert to lowercase until E
[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
! Convert to uppercase until E
[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with ! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
[ t ] [ "a" R/ ^a/ matches? ] unit-test ! [ t ] [ "a" R/ ^a/ matches? ] unit-test
[ f ] [ "\na" R/ ^a/ matches? ] unit-test ! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
[ f ] [ "\r\na" R/ ^a/ matches? ] unit-test ! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
[ f ] [ "\ra" R/ ^a/ matches? ] unit-test ! [ f ] [ "\ra" R/ ^a/ matches? ] unit-test
[ t ] [ "a" R/ a$/ matches? ] unit-test ! [ t ] [ "a" R/ a$/ matches? ] unit-test
[ f ] [ "a\n" R/ a$/ matches? ] unit-test ! [ f ] [ "a\n" R/ a$/ matches? ] unit-test
[ f ] [ "a\r" R/ a$/ matches? ] unit-test ! [ f ] [ "a\r" R/ a$/ matches? ] unit-test
[ f ] [ "a\r\n" R/ a$/ matches? ] unit-test ! [ f ] [ "a\r\n" R/ a$/ matches? ] unit-test
! [ t ] [ "a" R/ a$|b$/ matches? ] unit-test
! [ t ] [ "b" R/ a$|b$/ matches? ] unit-test
! [ t ] [ "ab" R/ a$|b$/ matches? ] unit-test
! [ t ] [ "ba" R/ ba$|b$/ matches? ] unit-test
! [ t ] [ "a" R/ \Aa/ matches? ] unit-test ! [ t ] [ "a" R/ \Aa/ matches? ] unit-test
! [ f ] [ "\na" R/ \Aaa/ matches? ] unit-test ! [ f ] [ "\na" R/ \Aaa/ matches? ] unit-test
@ -347,14 +360,6 @@ IN: regexp-tests
! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test ! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test ! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
! Convert to lowercase until E
[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
! Convert to uppercase until E
[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test ! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test
! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test ! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test
! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test ! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test

View File

@ -16,6 +16,7 @@ IN: regexp
H{ } clone >>nfa-traversal-flags H{ } clone >>nfa-traversal-flags
H{ } clone >>dfa-traversal-flags H{ } clone >>dfa-traversal-flags
H{ } clone >>options H{ } clone >>options
H{ } clone >>matchers
reset-regexp ; reset-regexp ;
: construct-regexp ( regexp -- regexp' ) : construct-regexp ( regexp -- regexp' )
@ -93,26 +94,6 @@ IN: regexp
{ "R| " "|" } { "R| " "|" }
} swap [ subseq? not nip ] curry assoc-find drop ; } swap [ subseq? not nip ] curry assoc-find drop ;
ERROR: unknown-regexp-option option ;
: option>ch ( option -- string )
{
{ case-insensitive [ CHAR: i ] }
{ multiline [ CHAR: m ] }
{ reversed-regexp [ CHAR: r ] }
{ dotall [ CHAR: s ] }
[ unknown-regexp-option ]
} case ;
: ch>option ( ch -- option )
{
{ CHAR: i [ case-insensitive ] }
{ CHAR: m [ multiline ] }
{ CHAR: r [ reversed-regexp ] }
{ CHAR: s [ dotall ] }
[ unknown-regexp-option ]
} case ;
: string>options ( string -- options ) : string>options ( string -- options )
[ ch>option dup ] H{ } map>assoc ; [ ch>option dup ] H{ } map>assoc ;

View File

@ -26,23 +26,6 @@ IN: regexp.utils
: ?insert-at ( value key hash/f -- hash ) : ?insert-at ( value key hash/f -- hash )
[ H{ } clone ] unless* [ insert-at ] keep ; [ H{ } clone ] unless* [ insert-at ] keep ;
: last-state ( regexp -- range ) stack>> peek first2 [a,b] ;
: push1 ( obj -- ) input-stream get stream>> push ;
: peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ;
: pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ;
: drop1 ( -- ) read1 drop ;
: stack ( -- obj ) current-regexp get stack>> ;
: change-whole-stack ( quot -- )
current-regexp get
[ stack>> swap call ] keep (>>stack) ; inline
: push-stack ( obj -- ) stack push ;
: pop-stack ( -- obj ) stack pop ;
: cut-out ( vector n -- vector' vector ) cut rest ;
ERROR: cut-stack-error ;
: cut-stack ( obj vector -- vector' vector )
tuck last-index [ cut-stack-error ] unless* cut-out swap ;
ERROR: bad-octal number ; ERROR: bad-octal number ;
ERROR: bad-hex number ; ERROR: bad-hex number ;
: check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ; : check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ;