move things around. the parser no longer adjusts nodes based on options, instead opting for nfa to handle it (case-insensitive, multiline, dotall, reversed..)
parent
e4a2b671d3
commit
384a11ecee
|
@ -5,12 +5,13 @@ IN: regexp.backend
|
||||||
|
|
||||||
TUPLE: regexp
|
TUPLE: regexp
|
||||||
raw
|
raw
|
||||||
{ stack vector }
|
|
||||||
parse-tree
|
|
||||||
{ options hashtable }
|
{ options hashtable }
|
||||||
|
stack
|
||||||
|
parse-tree
|
||||||
nfa-table
|
nfa-table
|
||||||
dfa-table
|
dfa-table
|
||||||
minimized-table
|
minimized-table
|
||||||
|
matchers
|
||||||
{ nfa-traversal-flags hashtable }
|
{ nfa-traversal-flags hashtable }
|
||||||
{ dfa-traversal-flags hashtable }
|
{ dfa-traversal-flags hashtable }
|
||||||
{ state integer }
|
{ state integer }
|
||||||
|
|
|
@ -1,12 +1,25 @@
|
||||||
! Copyright (C) 2008 Doug Coleman.
|
! Copyright (C) 2008 Doug Coleman.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: accessors kernel math math.order symbols regexp.parser
|
USING: accessors kernel math math.order symbols
|
||||||
words regexp.utils unicode.categories combinators.short-circuit ;
|
words regexp.utils unicode.categories combinators.short-circuit ;
|
||||||
IN: regexp.classes
|
IN: regexp.classes
|
||||||
|
|
||||||
|
SINGLETONS: any-char any-char-no-nl
|
||||||
|
letter-class LETTER-class Letter-class digit-class
|
||||||
|
alpha-class non-newline-blank-class
|
||||||
|
ascii-class punctuation-class java-printable-class blank-class
|
||||||
|
control-character-class hex-digit-class java-blank-class c-identifier-class
|
||||||
|
unmatchable-class terminator-class word-boundary-class ;
|
||||||
|
|
||||||
|
SINGLETONS: beginning-of-input beginning-of-line
|
||||||
|
end-of-input end-of-line ;
|
||||||
|
|
||||||
|
MIXIN: node
|
||||||
|
TUPLE: character-class-range from to ; INSTANCE: character-class-range node
|
||||||
|
|
||||||
GENERIC: class-member? ( obj class -- ? )
|
GENERIC: class-member? ( obj class -- ? )
|
||||||
|
|
||||||
M: word class-member? ( obj class -- ? ) 2drop f ;
|
M: t class-member? ( obj class -- ? ) 2drop f ;
|
||||||
|
|
||||||
M: integer class-member? ( obj class -- ? ) 2drop f ;
|
M: integer class-member? ( obj class -- ? ) 2drop f ;
|
||||||
|
|
||||||
|
@ -70,3 +83,9 @@ M: terminator-class class-member? ( obj class -- ? )
|
||||||
[ CHAR: \u002028 = ]
|
[ CHAR: \u002028 = ]
|
||||||
[ CHAR: \u002029 = ]
|
[ CHAR: \u002029 = ]
|
||||||
} 1|| ;
|
} 1|| ;
|
||||||
|
|
||||||
|
M: beginning-of-line class-member? ( obj class -- ? )
|
||||||
|
2drop f ;
|
||||||
|
|
||||||
|
M: end-of-line class-member? ( obj class -- ? )
|
||||||
|
2drop f ;
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
USING: accessors arrays assocs grouping kernel regexp.backend
|
USING: accessors arrays assocs grouping kernel regexp.backend
|
||||||
locals math namespaces regexp.parser sequences fry quotations
|
locals math namespaces regexp.parser sequences fry quotations
|
||||||
math.order math.ranges vectors unicode.categories regexp.utils
|
math.order math.ranges vectors unicode.categories regexp.utils
|
||||||
regexp.transition-tables words sets ;
|
regexp.transition-tables words sets regexp.classes unicode.case ;
|
||||||
IN: regexp.nfa
|
IN: regexp.nfa
|
||||||
|
|
||||||
SYMBOL: negation-mode
|
SYMBOL: negation-mode
|
||||||
|
@ -22,8 +22,13 @@ SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag
|
||||||
SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag
|
SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag
|
||||||
SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag
|
SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag
|
||||||
|
|
||||||
: add-global-flag ( flag -- )
|
: options ( -- obj ) current-regexp get options>> ;
|
||||||
current-regexp get nfa-table>> flags>> conjoin ;
|
|
||||||
|
: option? ( obj -- ? ) options key? ;
|
||||||
|
|
||||||
|
: option-on ( obj -- ) options conjoin ;
|
||||||
|
|
||||||
|
: option-off ( obj -- ) options delete-at ;
|
||||||
|
|
||||||
: next-state ( regexp -- state )
|
: next-state ( regexp -- state )
|
||||||
[ state>> ] [ [ 1+ ] change-state drop ] bi ;
|
[ state>> ] [ [ 1+ ] change-state drop ] bi ;
|
||||||
|
@ -106,6 +111,7 @@ M: kleene-star nfa-node ( node -- )
|
||||||
|
|
||||||
M: concatenation nfa-node ( node -- )
|
M: concatenation nfa-node ( node -- )
|
||||||
seq>>
|
seq>>
|
||||||
|
reversed-regexp option? [ <reversed> ] when
|
||||||
[ [ nfa-node ] each ]
|
[ [ nfa-node ] each ]
|
||||||
[ length 1- [ concatenate-nodes ] times ] bi ;
|
[ length 1- [ concatenate-nodes ] times ] bi ;
|
||||||
|
|
||||||
|
@ -115,16 +121,59 @@ M: alternation nfa-node ( node -- )
|
||||||
[ length 1- [ alternate-nodes ] times ] bi ;
|
[ length 1- [ alternate-nodes ] times ] bi ;
|
||||||
|
|
||||||
M: constant nfa-node ( node -- )
|
M: constant nfa-node ( node -- )
|
||||||
char>> literal-transition add-simple-entry ;
|
case-insensitive option? [
|
||||||
|
dup char>> [ ch>lower ] [ ch>upper ] bi
|
||||||
|
2dup = [
|
||||||
|
2drop
|
||||||
|
char>> literal-transition add-simple-entry
|
||||||
|
] [
|
||||||
|
[ literal-transition add-simple-entry ] bi@
|
||||||
|
alternate-nodes drop
|
||||||
|
] if
|
||||||
|
] [
|
||||||
|
char>> literal-transition add-simple-entry
|
||||||
|
] if ;
|
||||||
|
|
||||||
M: epsilon nfa-node ( node -- )
|
M: epsilon nfa-node ( node -- )
|
||||||
drop eps literal-transition add-simple-entry ;
|
drop eps literal-transition add-simple-entry ;
|
||||||
|
|
||||||
M: word nfa-node ( node -- )
|
M: word nfa-node ( node -- ) class-transition add-simple-entry ;
|
||||||
|
|
||||||
|
M: any-char nfa-node ( node -- )
|
||||||
|
[ dotall option? ] dip any-char-no-nl ?
|
||||||
class-transition add-simple-entry ;
|
class-transition add-simple-entry ;
|
||||||
|
|
||||||
|
! M: beginning-of-text nfa-node ( node -- ) ;
|
||||||
|
|
||||||
|
M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
|
||||||
|
|
||||||
|
M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
|
||||||
|
|
||||||
|
: choose-letter-class ( node -- node' )
|
||||||
|
case-insensitive option? Letter-class rot ? ;
|
||||||
|
|
||||||
|
M: letter-class nfa-node ( node -- )
|
||||||
|
choose-letter-class class-transition add-simple-entry ;
|
||||||
|
|
||||||
|
M: LETTER-class nfa-node ( node -- )
|
||||||
|
choose-letter-class class-transition add-simple-entry ;
|
||||||
|
|
||||||
M: character-class-range nfa-node ( node -- )
|
M: character-class-range nfa-node ( node -- )
|
||||||
class-transition add-simple-entry ;
|
case-insensitive option? [
|
||||||
|
dup [ from>> ] [ to>> ] bi
|
||||||
|
2dup [ Letter? ] bi@ and [
|
||||||
|
rot drop
|
||||||
|
[ [ ch>lower ] bi@ character-class-range boa ]
|
||||||
|
[ [ ch>upper ] bi@ character-class-range boa ] 2bi
|
||||||
|
[ class-transition add-simple-entry ] bi@
|
||||||
|
alternate-nodes
|
||||||
|
] [
|
||||||
|
2drop
|
||||||
|
class-transition add-simple-entry
|
||||||
|
] if
|
||||||
|
] [
|
||||||
|
class-transition add-simple-entry
|
||||||
|
] if ;
|
||||||
|
|
||||||
M: capture-group nfa-node ( node -- )
|
M: capture-group nfa-node ( node -- )
|
||||||
eps literal-transition add-simple-entry
|
eps literal-transition add-simple-entry
|
||||||
|
@ -141,26 +190,6 @@ M: non-capture-group nfa-node ( node -- )
|
||||||
M: reluctant-kleene-star nfa-node ( node -- )
|
M: reluctant-kleene-star nfa-node ( node -- )
|
||||||
term>> <kleene-star> nfa-node ;
|
term>> <kleene-star> nfa-node ;
|
||||||
|
|
||||||
M: beginning-of-line nfa-node ( node -- )
|
|
||||||
drop
|
|
||||||
eps literal-transition add-simple-entry
|
|
||||||
beginning-of-line add-global-flag ;
|
|
||||||
|
|
||||||
M: end-of-line nfa-node ( node -- )
|
|
||||||
drop
|
|
||||||
eps literal-transition add-simple-entry
|
|
||||||
end-of-line add-global-flag ;
|
|
||||||
|
|
||||||
M: beginning-of-input nfa-node ( node -- )
|
|
||||||
drop
|
|
||||||
eps literal-transition add-simple-entry
|
|
||||||
beginning-of-input add-global-flag ;
|
|
||||||
|
|
||||||
M: end-of-input nfa-node ( node -- )
|
|
||||||
drop
|
|
||||||
eps literal-transition add-simple-entry
|
|
||||||
end-of-input add-global-flag ;
|
|
||||||
|
|
||||||
M: negation nfa-node ( node -- )
|
M: negation nfa-node ( node -- )
|
||||||
negation-mode inc
|
negation-mode inc
|
||||||
term>> nfa-node
|
term>> nfa-node
|
||||||
|
@ -182,6 +211,10 @@ M: lookbehind nfa-node ( node -- )
|
||||||
lookbehind-off add-traversal-flag
|
lookbehind-off add-traversal-flag
|
||||||
2 [ concatenate-nodes ] times ;
|
2 [ concatenate-nodes ] times ;
|
||||||
|
|
||||||
|
M: option nfa-node ( node -- )
|
||||||
|
[ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if
|
||||||
|
eps literal-transition add-simple-entry ;
|
||||||
|
|
||||||
: construct-nfa ( regexp -- )
|
: construct-nfa ( regexp -- )
|
||||||
[
|
[
|
||||||
reset-regexp
|
reset-regexp
|
||||||
|
|
|
@ -19,8 +19,8 @@ IN: regexp.parser
|
||||||
[ ] [ "(?:a)" test-regexp ] unit-test
|
[ ] [ "(?:a)" test-regexp ] unit-test
|
||||||
[ ] [ "(?i:a)" test-regexp ] unit-test
|
[ ] [ "(?i:a)" test-regexp ] unit-test
|
||||||
[ ] [ "(?-i:a)" test-regexp ] unit-test
|
[ ] [ "(?-i:a)" test-regexp ] unit-test
|
||||||
[ "(?z:a)" test-regexp ] [ bad-option? ] must-fail-with
|
[ "(?z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with
|
||||||
[ "(?-z:a)" test-regexp ] [ bad-option? ] must-fail-with
|
[ "(?-z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with
|
||||||
|
|
||||||
[ ] [ "(?=a)" test-regexp ] unit-test
|
[ ] [ "(?=a)" test-regexp ] unit-test
|
||||||
|
|
||||||
|
|
|
@ -4,12 +4,11 @@ USING: accessors arrays assocs combinators io io.streams.string
|
||||||
kernel math math.parser namespaces qualified sets
|
kernel math math.parser namespaces qualified sets
|
||||||
quotations sequences splitting symbols vectors math.order
|
quotations sequences splitting symbols vectors math.order
|
||||||
unicode.categories strings regexp.backend regexp.utils
|
unicode.categories strings regexp.backend regexp.utils
|
||||||
unicode.case words locals ;
|
unicode.case words locals regexp.classes ;
|
||||||
IN: regexp.parser
|
IN: regexp.parser
|
||||||
|
|
||||||
FROM: math.ranges => [a,b] ;
|
FROM: math.ranges => [a,b] ;
|
||||||
|
|
||||||
MIXIN: node
|
|
||||||
TUPLE: concatenation seq ; INSTANCE: concatenation node
|
TUPLE: concatenation seq ; INSTANCE: concatenation node
|
||||||
TUPLE: alternation seq ; INSTANCE: alternation node
|
TUPLE: alternation seq ; INSTANCE: alternation node
|
||||||
TUPLE: kleene-star term ; INSTANCE: kleene-star node
|
TUPLE: kleene-star term ; INSTANCE: kleene-star node
|
||||||
|
@ -40,38 +39,31 @@ INSTANCE: independent-group parentheses-group
|
||||||
TUPLE: comment-group term ; INSTANCE: comment-group node
|
TUPLE: comment-group term ; INSTANCE: comment-group node
|
||||||
INSTANCE: comment-group parentheses-group
|
INSTANCE: comment-group parentheses-group
|
||||||
|
|
||||||
TUPLE: character-class-range from to ; INSTANCE: character-class-range node
|
|
||||||
SINGLETON: epsilon INSTANCE: epsilon node
|
SINGLETON: epsilon INSTANCE: epsilon node
|
||||||
SINGLETON: any-char INSTANCE: any-char node
|
|
||||||
SINGLETON: any-char-no-nl INSTANCE: any-char-no-nl node
|
|
||||||
SINGLETON: beginning-of-input INSTANCE: beginning-of-input node
|
|
||||||
SINGLETON: end-of-input INSTANCE: end-of-input node
|
|
||||||
SINGLETON: beginning-of-line INSTANCE: beginning-of-line node
|
|
||||||
SINGLETON: end-of-line INSTANCE: end-of-line node
|
|
||||||
|
|
||||||
TUPLE: option-on option ; INSTANCE: option-on node
|
TUPLE: option option on? ; INSTANCE: option node
|
||||||
TUPLE: option-off option ; INSTANCE: option-off node
|
|
||||||
SINGLETONS: unix-lines dotall multiline comments case-insensitive
|
SINGLETONS: unix-lines dotall multiline comments case-insensitive
|
||||||
unicode-case reversed-regexp ;
|
unicode-case reversed-regexp ;
|
||||||
|
|
||||||
SINGLETONS: letter-class LETTER-class Letter-class digit-class
|
SINGLETONS: beginning-of-character-class end-of-character-class
|
||||||
alpha-class non-newline-blank-class
|
|
||||||
ascii-class punctuation-class java-printable-class blank-class
|
|
||||||
control-character-class hex-digit-class java-blank-class c-identifier-class
|
|
||||||
unmatchable-class terminator-class word-boundary-class ;
|
|
||||||
|
|
||||||
SINGLETONS: beginning-of-group end-of-group
|
|
||||||
beginning-of-character-class end-of-character-class
|
|
||||||
left-parenthesis pipe caret dash ;
|
left-parenthesis pipe caret dash ;
|
||||||
|
|
||||||
: get-option ( option -- ? ) current-regexp get options>> at ;
|
: push1 ( obj -- ) input-stream get stream>> push ;
|
||||||
: get-unix-lines ( -- ? ) unix-lines get-option ;
|
: peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ;
|
||||||
: get-dotall ( -- ? ) dotall get-option ;
|
: pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ;
|
||||||
: get-multiline ( -- ? ) multiline get-option ;
|
: drop1 ( -- ) read1 drop ;
|
||||||
: get-comments ( -- ? ) comments get-option ;
|
|
||||||
: get-case-insensitive ( -- ? ) case-insensitive get-option ;
|
: stack ( -- obj ) current-regexp get stack>> ;
|
||||||
: get-unicode-case ( -- ? ) unicode-case get-option ;
|
: change-whole-stack ( quot -- )
|
||||||
: get-reversed-regexp ( -- ? ) reversed-regexp get-option ;
|
current-regexp get
|
||||||
|
[ stack>> swap call ] keep (>>stack) ; inline
|
||||||
|
: push-stack ( obj -- ) stack push ;
|
||||||
|
: pop-stack ( -- obj ) stack pop ;
|
||||||
|
: cut-out ( vector n -- vector' vector ) cut rest ;
|
||||||
|
ERROR: cut-stack-error ;
|
||||||
|
: cut-stack ( obj vector -- vector' vector )
|
||||||
|
tuck last-index [ cut-stack-error ] unless* cut-out swap ;
|
||||||
|
|
||||||
: <possessive-kleene-star> ( obj -- kleene ) possessive-kleene-star boa ;
|
: <possessive-kleene-star> ( obj -- kleene ) possessive-kleene-star boa ;
|
||||||
: <reluctant-kleene-star> ( obj -- kleene ) reluctant-kleene-star boa ;
|
: <reluctant-kleene-star> ( obj -- kleene ) reluctant-kleene-star boa ;
|
||||||
|
@ -80,18 +72,11 @@ left-parenthesis pipe caret dash ;
|
||||||
|
|
||||||
: <negation> ( obj -- negation ) negation boa ;
|
: <negation> ( obj -- negation ) negation boa ;
|
||||||
: <concatenation> ( seq -- concatenation )
|
: <concatenation> ( seq -- concatenation )
|
||||||
>vector get-reversed-regexp [ reverse ] when
|
>vector [ epsilon ] [ concatenation boa ] if-empty ;
|
||||||
[ epsilon ] [ concatenation boa ] if-empty ;
|
|
||||||
: <alternation> ( seq -- alternation ) >vector alternation boa ;
|
: <alternation> ( seq -- alternation ) >vector alternation boa ;
|
||||||
: <capture-group> ( obj -- capture-group ) capture-group boa ;
|
: <capture-group> ( obj -- capture-group ) capture-group boa ;
|
||||||
: <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
|
: <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
|
||||||
: <constant> ( obj -- constant )
|
: <constant> ( obj -- constant ) constant boa ;
|
||||||
dup Letter? get-case-insensitive and [
|
|
||||||
[ ch>lower ] [ ch>upper ] bi
|
|
||||||
[ constant boa ] bi@ 2array <alternation>
|
|
||||||
] [
|
|
||||||
constant boa
|
|
||||||
] if ;
|
|
||||||
|
|
||||||
: first|concatenation ( seq -- first/concatenation )
|
: first|concatenation ( seq -- first/concatenation )
|
||||||
dup length 1 = [ first ] [ <concatenation> ] if ;
|
dup length 1 = [ first ] [ <concatenation> ] if ;
|
||||||
|
@ -100,21 +85,14 @@ left-parenthesis pipe caret dash ;
|
||||||
dup length 1 = [ first ] [ <alternation> ] if ;
|
dup length 1 = [ first ] [ <alternation> ] if ;
|
||||||
|
|
||||||
: <character-class-range> ( from to -- obj )
|
: <character-class-range> ( from to -- obj )
|
||||||
2dup [ Letter? ] bi@ or get-case-insensitive and [
|
|
||||||
[ [ ch>lower ] bi@ character-class-range boa ]
|
|
||||||
[ [ ch>upper ] bi@ character-class-range boa ] 2bi
|
|
||||||
2array [ [ from>> ] [ to>> ] bi < ] filter
|
|
||||||
[ unmatchable-class ] [ first|alternation ] if-empty
|
|
||||||
] [
|
|
||||||
2dup <
|
2dup <
|
||||||
[ character-class-range boa ] [ 2drop unmatchable-class ] if
|
[ character-class-range boa ] [ 2drop unmatchable-class ] if ;
|
||||||
] if ;
|
|
||||||
|
|
||||||
ERROR: unmatched-parentheses ;
|
ERROR: unmatched-parentheses ;
|
||||||
|
|
||||||
ERROR: bad-option ch ;
|
ERROR: unknown-regexp-option option ;
|
||||||
|
|
||||||
: option ( ch -- singleton )
|
: ch>option ( ch -- singleton )
|
||||||
{
|
{
|
||||||
{ CHAR: i [ case-insensitive ] }
|
{ CHAR: i [ case-insensitive ] }
|
||||||
{ CHAR: d [ unix-lines ] }
|
{ CHAR: d [ unix-lines ] }
|
||||||
|
@ -124,13 +102,21 @@ ERROR: bad-option ch ;
|
||||||
{ CHAR: s [ dotall ] }
|
{ CHAR: s [ dotall ] }
|
||||||
{ CHAR: u [ unicode-case ] }
|
{ CHAR: u [ unicode-case ] }
|
||||||
{ CHAR: x [ comments ] }
|
{ CHAR: x [ comments ] }
|
||||||
[ bad-option ]
|
[ unknown-regexp-option ]
|
||||||
} case ;
|
} case ;
|
||||||
|
|
||||||
: option-on ( option -- ) current-regexp get options>> conjoin ;
|
: option>ch ( option -- string )
|
||||||
: option-off ( option -- ) current-regexp get options>> delete-at ;
|
{
|
||||||
|
{ case-insensitive [ CHAR: i ] }
|
||||||
|
{ multiline [ CHAR: m ] }
|
||||||
|
{ reversed-regexp [ CHAR: r ] }
|
||||||
|
{ dotall [ CHAR: s ] }
|
||||||
|
[ unknown-regexp-option ]
|
||||||
|
} case ;
|
||||||
|
|
||||||
|
: toggle-option ( ch ? -- )
|
||||||
|
[ ch>option ] dip option boa push-stack ;
|
||||||
|
|
||||||
: toggle-option ( ch ? -- ) [ option ] dip [ option-on ] [ option-off ] if ;
|
|
||||||
: (parse-options) ( string ? -- ) [ toggle-option ] curry each ;
|
: (parse-options) ( string ? -- ) [ toggle-option ] curry each ;
|
||||||
|
|
||||||
: parse-options ( string -- )
|
: parse-options ( string -- )
|
||||||
|
@ -176,7 +162,7 @@ DEFER: (parse-regexp)
|
||||||
[ drop1 (parse-special-group) ]
|
[ drop1 (parse-special-group) ]
|
||||||
[ capture-group f nested-parse-regexp ] if ;
|
[ capture-group f nested-parse-regexp ] if ;
|
||||||
|
|
||||||
: handle-dot ( -- ) get-dotall any-char any-char-no-nl ? push-stack ;
|
: handle-dot ( -- ) any-char push-stack ;
|
||||||
: handle-pipe ( -- ) pipe push-stack ;
|
: handle-pipe ( -- ) pipe push-stack ;
|
||||||
: (handle-star) ( obj -- kleene-star )
|
: (handle-star) ( obj -- kleene-star )
|
||||||
peek1 {
|
peek1 {
|
||||||
|
@ -234,11 +220,8 @@ ERROR: invalid-range a b ;
|
||||||
[ [ nip at-most-n ] [ at-least-n ] if* ] if
|
[ [ nip at-most-n ] [ at-least-n ] if* ] if
|
||||||
] [ drop 0 max exactly-n ] if ;
|
] [ drop 0 max exactly-n ] if ;
|
||||||
|
|
||||||
: handle-front-anchor ( -- )
|
: handle-front-anchor ( -- ) beginning-of-line push-stack ;
|
||||||
get-multiline beginning-of-line beginning-of-input ? push-stack ;
|
: handle-back-anchor ( -- ) end-of-line push-stack ;
|
||||||
|
|
||||||
: handle-back-anchor ( -- )
|
|
||||||
get-multiline end-of-line end-of-input ? push-stack ;
|
|
||||||
|
|
||||||
ERROR: bad-character-class obj ;
|
ERROR: bad-character-class obj ;
|
||||||
ERROR: expected-posix-class ;
|
ERROR: expected-posix-class ;
|
||||||
|
@ -247,8 +230,8 @@ ERROR: expected-posix-class ;
|
||||||
read1 CHAR: { = [ expected-posix-class ] unless
|
read1 CHAR: { = [ expected-posix-class ] unless
|
||||||
"}" read-until [ bad-character-class ] unless
|
"}" read-until [ bad-character-class ] unless
|
||||||
{
|
{
|
||||||
{ "Lower" [ get-case-insensitive Letter-class letter-class ? ] }
|
{ "Lower" [ letter-class ] }
|
||||||
{ "Upper" [ get-case-insensitive Letter-class LETTER-class ? ] }
|
{ "Upper" [ LETTER-class ] }
|
||||||
{ "Alpha" [ Letter-class ] }
|
{ "Alpha" [ Letter-class ] }
|
||||||
{ "ASCII" [ ascii-class ] }
|
{ "ASCII" [ ascii-class ] }
|
||||||
{ "Digit" [ digit-class ] }
|
{ "Digit" [ digit-class ] }
|
||||||
|
@ -412,7 +395,8 @@ DEFER: handle-left-bracket
|
||||||
[ first|concatenation ] map first|alternation ;
|
[ first|concatenation ] map first|alternation ;
|
||||||
|
|
||||||
: handle-right-parenthesis ( -- )
|
: handle-right-parenthesis ( -- )
|
||||||
stack dup [ parentheses-group "members" word-prop member? ] find-last -rot cut rest
|
stack dup [ parentheses-group "members" word-prop member? ] find-last
|
||||||
|
-rot cut rest
|
||||||
[ [ push ] keep current-regexp get (>>stack) ]
|
[ [ push ] keep current-regexp get (>>stack) ]
|
||||||
[ finish-regexp-parse push-stack ] bi* ;
|
[ finish-regexp-parse push-stack ] bi* ;
|
||||||
|
|
||||||
|
@ -429,12 +413,9 @@ DEFER: handle-left-bracket
|
||||||
{ CHAR: [ [ handle-left-bracket t ] }
|
{ CHAR: [ [ handle-left-bracket t ] }
|
||||||
{ CHAR: \ [ handle-escape t ] }
|
{ CHAR: \ [ handle-escape t ] }
|
||||||
[
|
[
|
||||||
dup CHAR: $ = peek1 f = and [
|
dup CHAR: $ = peek1 f = and
|
||||||
drop
|
[ drop handle-back-anchor f ]
|
||||||
handle-back-anchor f
|
[ push-constant t ] if
|
||||||
] [
|
|
||||||
push-constant t
|
|
||||||
] if
|
|
||||||
]
|
]
|
||||||
} case ;
|
} case ;
|
||||||
|
|
||||||
|
@ -451,7 +432,6 @@ DEFER: handle-left-bracket
|
||||||
parse-regexp-beginning (parse-regexp)
|
parse-regexp-beginning (parse-regexp)
|
||||||
] with-input-stream
|
] with-input-stream
|
||||||
] unless-empty
|
] unless-empty
|
||||||
current-regexp get
|
current-regexp get [ finish-regexp-parse ] change-stack
|
||||||
stack finish-regexp-parse
|
dup stack>> >>parse-tree drop
|
||||||
>>parse-tree drop
|
|
||||||
] with-variable ;
|
] with-variable ;
|
||||||
|
|
|
@ -238,7 +238,7 @@ IN: regexp-tests
|
||||||
|
|
||||||
[ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test
|
[ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test
|
||||||
[ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test
|
[ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test
|
||||||
[ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/r matches? ] unit-test
|
[ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/ matches? ] unit-test
|
||||||
|
|
||||||
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
||||||
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
||||||
|
@ -307,17 +307,30 @@ IN: regexp-tests
|
||||||
! Bug in parsing word
|
! Bug in parsing word
|
||||||
[ t ] [ "a" R' a' matches? ] unit-test
|
[ t ] [ "a" R' a' matches? ] unit-test
|
||||||
|
|
||||||
|
! Convert to lowercase until E
|
||||||
|
[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
|
||||||
|
[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
|
||||||
|
|
||||||
|
! Convert to uppercase until E
|
||||||
|
[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
|
||||||
|
[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
|
||||||
|
|
||||||
! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
|
! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
|
||||||
|
|
||||||
[ t ] [ "a" R/ ^a/ matches? ] unit-test
|
! [ t ] [ "a" R/ ^a/ matches? ] unit-test
|
||||||
[ f ] [ "\na" R/ ^a/ matches? ] unit-test
|
! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
|
||||||
[ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
|
! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
|
||||||
[ f ] [ "\ra" R/ ^a/ matches? ] unit-test
|
! [ f ] [ "\ra" R/ ^a/ matches? ] unit-test
|
||||||
|
|
||||||
[ t ] [ "a" R/ a$/ matches? ] unit-test
|
! [ t ] [ "a" R/ a$/ matches? ] unit-test
|
||||||
[ f ] [ "a\n" R/ a$/ matches? ] unit-test
|
! [ f ] [ "a\n" R/ a$/ matches? ] unit-test
|
||||||
[ f ] [ "a\r" R/ a$/ matches? ] unit-test
|
! [ f ] [ "a\r" R/ a$/ matches? ] unit-test
|
||||||
[ f ] [ "a\r\n" R/ a$/ matches? ] unit-test
|
! [ f ] [ "a\r\n" R/ a$/ matches? ] unit-test
|
||||||
|
|
||||||
|
! [ t ] [ "a" R/ a$|b$/ matches? ] unit-test
|
||||||
|
! [ t ] [ "b" R/ a$|b$/ matches? ] unit-test
|
||||||
|
! [ t ] [ "ab" R/ a$|b$/ matches? ] unit-test
|
||||||
|
! [ t ] [ "ba" R/ ba$|b$/ matches? ] unit-test
|
||||||
|
|
||||||
! [ t ] [ "a" R/ \Aa/ matches? ] unit-test
|
! [ t ] [ "a" R/ \Aa/ matches? ] unit-test
|
||||||
! [ f ] [ "\na" R/ \Aaa/ matches? ] unit-test
|
! [ f ] [ "\na" R/ \Aaa/ matches? ] unit-test
|
||||||
|
@ -347,14 +360,6 @@ IN: regexp-tests
|
||||||
! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
|
! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
|
||||||
! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
|
! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
|
||||||
|
|
||||||
! Convert to lowercase until E
|
|
||||||
[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
|
|
||||||
[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
|
|
||||||
|
|
||||||
! Convert to uppercase until E
|
|
||||||
[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
|
|
||||||
[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
|
|
||||||
|
|
||||||
! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test
|
! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test
|
||||||
! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test
|
! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test
|
||||||
! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test
|
! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test
|
||||||
|
|
|
@ -16,6 +16,7 @@ IN: regexp
|
||||||
H{ } clone >>nfa-traversal-flags
|
H{ } clone >>nfa-traversal-flags
|
||||||
H{ } clone >>dfa-traversal-flags
|
H{ } clone >>dfa-traversal-flags
|
||||||
H{ } clone >>options
|
H{ } clone >>options
|
||||||
|
H{ } clone >>matchers
|
||||||
reset-regexp ;
|
reset-regexp ;
|
||||||
|
|
||||||
: construct-regexp ( regexp -- regexp' )
|
: construct-regexp ( regexp -- regexp' )
|
||||||
|
@ -93,26 +94,6 @@ IN: regexp
|
||||||
{ "R| " "|" }
|
{ "R| " "|" }
|
||||||
} swap [ subseq? not nip ] curry assoc-find drop ;
|
} swap [ subseq? not nip ] curry assoc-find drop ;
|
||||||
|
|
||||||
ERROR: unknown-regexp-option option ;
|
|
||||||
|
|
||||||
: option>ch ( option -- string )
|
|
||||||
{
|
|
||||||
{ case-insensitive [ CHAR: i ] }
|
|
||||||
{ multiline [ CHAR: m ] }
|
|
||||||
{ reversed-regexp [ CHAR: r ] }
|
|
||||||
{ dotall [ CHAR: s ] }
|
|
||||||
[ unknown-regexp-option ]
|
|
||||||
} case ;
|
|
||||||
|
|
||||||
: ch>option ( ch -- option )
|
|
||||||
{
|
|
||||||
{ CHAR: i [ case-insensitive ] }
|
|
||||||
{ CHAR: m [ multiline ] }
|
|
||||||
{ CHAR: r [ reversed-regexp ] }
|
|
||||||
{ CHAR: s [ dotall ] }
|
|
||||||
[ unknown-regexp-option ]
|
|
||||||
} case ;
|
|
||||||
|
|
||||||
: string>options ( string -- options )
|
: string>options ( string -- options )
|
||||||
[ ch>option dup ] H{ } map>assoc ;
|
[ ch>option dup ] H{ } map>assoc ;
|
||||||
|
|
||||||
|
|
|
@ -26,23 +26,6 @@ IN: regexp.utils
|
||||||
: ?insert-at ( value key hash/f -- hash )
|
: ?insert-at ( value key hash/f -- hash )
|
||||||
[ H{ } clone ] unless* [ insert-at ] keep ;
|
[ H{ } clone ] unless* [ insert-at ] keep ;
|
||||||
|
|
||||||
: last-state ( regexp -- range ) stack>> peek first2 [a,b] ;
|
|
||||||
: push1 ( obj -- ) input-stream get stream>> push ;
|
|
||||||
: peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ;
|
|
||||||
: pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ;
|
|
||||||
: drop1 ( -- ) read1 drop ;
|
|
||||||
|
|
||||||
: stack ( -- obj ) current-regexp get stack>> ;
|
|
||||||
: change-whole-stack ( quot -- )
|
|
||||||
current-regexp get
|
|
||||||
[ stack>> swap call ] keep (>>stack) ; inline
|
|
||||||
: push-stack ( obj -- ) stack push ;
|
|
||||||
: pop-stack ( -- obj ) stack pop ;
|
|
||||||
: cut-out ( vector n -- vector' vector ) cut rest ;
|
|
||||||
ERROR: cut-stack-error ;
|
|
||||||
: cut-stack ( obj vector -- vector' vector )
|
|
||||||
tuck last-index [ cut-stack-error ] unless* cut-out swap ;
|
|
||||||
|
|
||||||
ERROR: bad-octal number ;
|
ERROR: bad-octal number ;
|
||||||
ERROR: bad-hex number ;
|
ERROR: bad-hex number ;
|
||||||
: check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ;
|
: check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ;
|
||||||
|
|
Loading…
Reference in New Issue