case-insensitive for constants

db4
Doug Coleman 2008-08-21 17:12:26 -05:00
parent 88134e539f
commit 9eba6c0034
4 changed files with 41 additions and 20 deletions

View File

@ -7,10 +7,10 @@ TUPLE: regexp
raw raw
{ stack vector } { stack vector }
parse-tree parse-tree
{ options hashtable }
nfa-table nfa-table
dfa-table dfa-table
minimized-table minimized-table
case-insensitive
{ state integer } { state integer }
{ new-states vector } { new-states vector }
{ visited-states hashtable } ; { visited-states hashtable } ;

View File

@ -1,10 +1,10 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators io io.streams.string USING: accessors arrays assocs combinators io io.streams.string
kernel math math.parser multi-methods namespaces qualified kernel math math.parser multi-methods namespaces qualified sets
quotations sequences sequences.lib splitting symbols vectors quotations sequences sequences.lib splitting symbols vectors
dlists math.order combinators.lib unicode.categories dlists math.order combinators.lib unicode.categories strings
sequences.lib regexp2.backend regexp2.utils ; sequences.lib regexp2.backend regexp2.utils unicode.case ;
IN: regexp2.parser IN: regexp2.parser
FROM: math.ranges => [a,b] ; FROM: math.ranges => [a,b] ;
@ -48,12 +48,26 @@ SINGLETONS: beginning-of-group end-of-group
beginning-of-character-class end-of-character-class beginning-of-character-class end-of-character-class
left-parenthesis pipe caret dash ; left-parenthesis pipe caret dash ;
: <constant> ( obj -- constant ) constant boa ; : get-option ( option -- ? ) current-regexp get options>> at ;
: get-unix-lines ( -- ? ) unix-lines get-option ;
: get-dotall ( -- ? ) dotall get-option ;
: get-multiline ( -- ? ) multiline get-option ;
: get-comments ( -- ? ) comments get-option ;
: get-case-insensitive ( -- ? ) case-insensitive get-option ;
: get-unicode-case ( -- ? ) unicode-case get-option ;
: <negation> ( obj -- negation ) negation boa ; : <negation> ( obj -- negation ) negation boa ;
: <concatenation> ( seq -- concatenation ) >vector concatenation boa ; : <concatenation> ( seq -- concatenation ) >vector concatenation boa ;
: <alternation> ( seq -- alternation ) >vector alternation boa ; : <alternation> ( seq -- alternation ) >vector alternation boa ;
: <capture-group> ( obj -- capture-group ) capture-group boa ; : <capture-group> ( obj -- capture-group ) capture-group boa ;
: <kleene-star> ( obj -- kleene-star ) kleene-star boa ; : <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
: <constant> ( obj -- constant )
dup Letter? get-case-insensitive and [
[ ch>lower constant boa ]
[ ch>upper constant boa ] bi 2array <alternation>
] [
constant boa
] if ;
: first|concatenation ( seq -- first/concatenation ) : first|concatenation ( seq -- first/concatenation )
dup length 1 = [ first ] [ <concatenation> ] if ; dup length 1 = [ first ] [ <concatenation> ] if ;
@ -96,18 +110,19 @@ ERROR: bad-option ch ;
[ bad-option ] [ bad-option ]
} case ; } case ;
: option-on ( ch -- ) option \ option-on boa push-stack ; : option-on ( option -- ) current-regexp get options>> conjoin ;
: option-off ( ch -- ) option \ option-off boa push-stack ; : option-off ( option -- ) current-regexp get options>> delete-at ;
: toggle-option ( ch ? -- ) [ option-on ] [ option-off ] if ;
: toggle-option ( ch ? -- ) [ option ] dip [ option-on ] [ option-off ] if ;
: (parse-options) ( string ? -- ) [ toggle-option ] curry each ; : (parse-options) ( string ? -- ) [ toggle-option ] curry each ;
: parse-options ( string -- ) : parse-options ( string -- )
"-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ; "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ;
DEFER: (parse-regexp) DEFER: (parse-regexp)
: parse-special-group-options ( options -- ) : parse-special-group ( -- )
beginning-of-group push-stack beginning-of-group push-stack
parse-options (parse-regexp) pop-stack make-non-capturing-group ; (parse-regexp) pop-stack make-non-capturing-group ;
ERROR: bad-special-group string ; ERROR: bad-special-group string ;
@ -126,8 +141,13 @@ ERROR: bad-special-group string ;
{ [ dup CHAR: < = peek1 CHAR: ! = and ] { [ dup CHAR: < = peek1 CHAR: ! = and ]
[ drop read1 drop nested-parse-regexp pop-stack make-negative-lookbehind ] } [ drop read1 drop nested-parse-regexp pop-stack make-negative-lookbehind ] }
[ [
":" read-until [ bad-special-group ] unless ":)" read-until
swap prefix parse-special-group-options [ swap prefix ] dip
{
{ CHAR: : [ parse-options parse-special-group ] }
{ CHAR: ) [ parse-options ] }
[ drop bad-special-group ]
} case
] ]
} cond ; } cond ;
@ -312,10 +332,8 @@ DEFER: handle-left-bracket
beginning-of-character-class push-stack beginning-of-character-class push-stack
parse-character-class-first (parse-character-class) ; parse-character-class-first (parse-character-class) ;
ERROR: empty-regexp ;
: finish-regexp-parse ( stack -- obj ) : finish-regexp-parse ( stack -- obj )
dup length { dup length {
{ 0 [ empty-regexp ] }
{ 1 [ first ] } { 1 [ first ] }
[ [
drop { pipe } split drop { pipe } split

View File

@ -151,7 +151,7 @@ IN: regexp2-tests
[ f ] [ "abc" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test [ f ] [ "abc" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test
[ t ] [ "ABC" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test [ t ] [ "ABC" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test
[ t ] [ "" "\\Q\\E" <regexp> matches? ] unit-test [ f ] [ "" "\\Q\\E" <regexp> matches? ] unit-test
[ f ] [ "a" "\\Q\\E" <regexp> matches? ] unit-test [ f ] [ "a" "\\Q\\E" <regexp> matches? ] unit-test
[ t ] [ "|*+" "\\Q|*+\\E" <regexp> matches? ] unit-test [ t ] [ "|*+" "\\Q|*+\\E" <regexp> matches? ] unit-test
[ f ] [ "abc" "\\Q|*+\\E" <regexp> matches? ] unit-test [ f ] [ "abc" "\\Q|*+\\E" <regexp> matches? ] unit-test

View File

@ -1,7 +1,7 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors combinators kernel math math.ranges USING: accessors combinators kernel math math.ranges
sequences regexp2.backend regexp2.utils memoize sequences regexp2.backend regexp2.utils memoize sets
regexp2.parser regexp2.nfa regexp2.dfa regexp2.traversal regexp2.parser regexp2.nfa regexp2.dfa regexp2.traversal
regexp2.transition-tables ; regexp2.transition-tables ;
IN: regexp2 IN: regexp2
@ -30,12 +30,15 @@ IN: regexp2
: match-head ( string regexp -- end ) match length>> 1- ; : match-head ( string regexp -- end ) match length>> 1- ;
MEMO: <regexp> ( string -- regexp ) : initial-option ( regexp option -- regexp' )
over options>> conjoin ;
: <regexp> ( string -- regexp )
default-regexp construct-regexp ; default-regexp construct-regexp ;
MEMO: <iregexp> ( string -- regexp ) : <iregexp> ( string -- regexp )
default-regexp default-regexp
t >>case-insensitive case-insensitive initial-option
construct-regexp ; construct-regexp ;
: R! CHAR: ! <regexp> ; parsing : R! CHAR: ! <regexp> ; parsing