From 9eba6c00345ef7661b4b7eefd63d2fa0773b63d1 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Thu, 21 Aug 2008 17:12:26 -0500 Subject: [PATCH] case-insensitive for constants --- extra/regexp2/backend/backend.factor | 2 +- extra/regexp2/parser/parser.factor | 46 +++++++++++++++++++--------- extra/regexp2/regexp2-tests.factor | 2 +- extra/regexp2/regexp2.factor | 11 ++++--- 4 files changed, 41 insertions(+), 20 deletions(-) diff --git a/extra/regexp2/backend/backend.factor b/extra/regexp2/backend/backend.factor index b12cf20df5..3d4b08d3c0 100644 --- a/extra/regexp2/backend/backend.factor +++ b/extra/regexp2/backend/backend.factor @@ -7,10 +7,10 @@ TUPLE: regexp raw { stack vector } parse-tree + { options hashtable } nfa-table dfa-table minimized-table - case-insensitive { state integer } { new-states vector } { visited-states hashtable } ; diff --git a/extra/regexp2/parser/parser.factor b/extra/regexp2/parser/parser.factor index fc1029db58..e173b71fc1 100644 --- a/extra/regexp2/parser/parser.factor +++ b/extra/regexp2/parser/parser.factor @@ -1,10 +1,10 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs combinators io io.streams.string -kernel math math.parser multi-methods namespaces qualified +kernel math math.parser multi-methods namespaces qualified sets quotations sequences sequences.lib splitting symbols vectors -dlists math.order combinators.lib unicode.categories -sequences.lib regexp2.backend regexp2.utils ; +dlists math.order combinators.lib unicode.categories strings +sequences.lib regexp2.backend regexp2.utils unicode.case ; IN: regexp2.parser FROM: math.ranges => [a,b] ; @@ -48,12 +48,26 @@ SINGLETONS: beginning-of-group end-of-group beginning-of-character-class end-of-character-class left-parenthesis pipe caret dash ; -: ( obj -- constant ) constant boa ; +: get-option ( option -- ? ) current-regexp get options>> at ; +: get-unix-lines ( -- ? ) unix-lines get-option ; +: get-dotall ( -- ? ) dotall get-option ; +: get-multiline ( -- ? ) multiline get-option ; +: get-comments ( -- ? ) comments get-option ; +: get-case-insensitive ( -- ? ) case-insensitive get-option ; +: get-unicode-case ( -- ? ) unicode-case get-option ; + : ( obj -- negation ) negation boa ; : ( seq -- concatenation ) >vector concatenation boa ; : ( seq -- alternation ) >vector alternation boa ; : ( obj -- capture-group ) capture-group boa ; : ( obj -- kleene-star ) kleene-star boa ; +: ( obj -- constant ) + dup Letter? get-case-insensitive and [ + [ ch>lower constant boa ] + [ ch>upper constant boa ] bi 2array + ] [ + constant boa + ] if ; : first|concatenation ( seq -- first/concatenation ) dup length 1 = [ first ] [ ] if ; @@ -95,19 +109,20 @@ ERROR: bad-option ch ; { CHAR: x [ comments ] } [ bad-option ] } case ; - -: option-on ( ch -- ) option \ option-on boa push-stack ; -: option-off ( ch -- ) option \ option-off boa push-stack ; -: toggle-option ( ch ? -- ) [ option-on ] [ option-off ] if ; + +: option-on ( option -- ) current-regexp get options>> conjoin ; +: option-off ( option -- ) current-regexp get options>> delete-at ; + +: toggle-option ( ch ? -- ) [ option ] dip [ option-on ] [ option-off ] if ; : (parse-options) ( string ? -- ) [ toggle-option ] curry each ; : parse-options ( string -- ) "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ; DEFER: (parse-regexp) -: parse-special-group-options ( options -- ) +: parse-special-group ( -- ) beginning-of-group push-stack - parse-options (parse-regexp) pop-stack make-non-capturing-group ; + (parse-regexp) pop-stack make-non-capturing-group ; ERROR: bad-special-group string ; @@ -126,8 +141,13 @@ ERROR: bad-special-group string ; { [ dup CHAR: < = peek1 CHAR: ! = and ] [ drop read1 drop nested-parse-regexp pop-stack make-negative-lookbehind ] } [ - ":" read-until [ bad-special-group ] unless - swap prefix parse-special-group-options + ":)" read-until + [ swap prefix ] dip + { + { CHAR: : [ parse-options parse-special-group ] } + { CHAR: ) [ parse-options ] } + [ drop bad-special-group ] + } case ] } cond ; @@ -312,10 +332,8 @@ DEFER: handle-left-bracket beginning-of-character-class push-stack parse-character-class-first (parse-character-class) ; -ERROR: empty-regexp ; : finish-regexp-parse ( stack -- obj ) dup length { - { 0 [ empty-regexp ] } { 1 [ first ] } [ drop { pipe } split diff --git a/extra/regexp2/regexp2-tests.factor b/extra/regexp2/regexp2-tests.factor index 2b34fe6e77..dd6055117e 100644 --- a/extra/regexp2/regexp2-tests.factor +++ b/extra/regexp2/regexp2-tests.factor @@ -151,7 +151,7 @@ IN: regexp2-tests [ f ] [ "abc" "[\\p{Upper}]{3}" matches? ] unit-test [ t ] [ "ABC" "[\\p{Upper}]{3}" matches? ] unit-test -[ t ] [ "" "\\Q\\E" matches? ] unit-test +[ f ] [ "" "\\Q\\E" matches? ] unit-test [ f ] [ "a" "\\Q\\E" matches? ] unit-test [ t ] [ "|*+" "\\Q|*+\\E" matches? ] unit-test [ f ] [ "abc" "\\Q|*+\\E" matches? ] unit-test diff --git a/extra/regexp2/regexp2.factor b/extra/regexp2/regexp2.factor index 51d4d5b24f..c227218450 100644 --- a/extra/regexp2/regexp2.factor +++ b/extra/regexp2/regexp2.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors combinators kernel math math.ranges -sequences regexp2.backend regexp2.utils memoize +sequences regexp2.backend regexp2.utils memoize sets regexp2.parser regexp2.nfa regexp2.dfa regexp2.traversal regexp2.transition-tables ; IN: regexp2 @@ -30,12 +30,15 @@ IN: regexp2 : match-head ( string regexp -- end ) match length>> 1- ; -MEMO: ( string -- regexp ) +: initial-option ( regexp option -- regexp' ) + over options>> conjoin ; + +: ( string -- regexp ) default-regexp construct-regexp ; -MEMO: ( string -- regexp ) +: ( string -- regexp ) default-regexp - t >>case-insensitive + case-insensitive initial-option construct-regexp ; : R! CHAR: ! ; parsing