diff --git a/basis/regexp/backend/backend.factor b/basis/regexp/backend/backend.factor index 75a010b705..4c82876650 100644 --- a/basis/regexp/backend/backend.factor +++ b/basis/regexp/backend/backend.factor @@ -1,6 +1,6 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors hashtables kernel math state-tables vectors ; +USING: accessors hashtables kernel math vectors ; IN: regexp.backend TUPLE: regexp diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index 50847d6ff9..99d94b4bcb 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -1,9 +1,9 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs grouping kernel regexp.backend -locals math namespaces regexp.parser sequences state-tables fry -quotations math.order math.ranges vectors unicode.categories -regexp.utils regexp.transition-tables words sets ; +locals math namespaces regexp.parser sequences fry quotations +math.order math.ranges vectors unicode.categories regexp.utils +regexp.transition-tables words sets ; IN: regexp.nfa SYMBOL: negation-mode @@ -22,6 +22,9 @@ SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag +: add-global-flag ( flag -- ) + current-regexp get nfa-table>> flags>> conjoin ; + : next-state ( regexp -- state ) [ state>> ] [ [ 1+ ] change-state drop ] bi ; @@ -138,21 +141,25 @@ M: non-capture-group nfa-node ( node -- ) M: reluctant-kleene-star nfa-node ( node -- ) term>> nfa-node ; - -: add-epsilon-flag ( flag -- ) - eps literal-transition add-simple-entry add-traversal-flag ; - M: beginning-of-line nfa-node ( node -- ) - drop beginning-of-line add-epsilon-flag ; + drop + eps literal-transition add-simple-entry + beginning-of-line add-global-flag ; M: end-of-line nfa-node ( node -- ) - drop end-of-line add-epsilon-flag ; + drop + eps literal-transition add-simple-entry + end-of-line add-global-flag ; M: beginning-of-input nfa-node ( node -- ) - drop beginning-of-input add-epsilon-flag ; + drop + eps literal-transition add-simple-entry + beginning-of-input add-global-flag ; M: end-of-input nfa-node ( node -- ) - drop end-of-input add-epsilon-flag ; + drop + eps literal-transition add-simple-entry + end-of-input add-global-flag ; M: negation nfa-node ( node -- ) negation-mode inc diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index ea8aaffcd5..71a3e067f3 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -58,7 +58,7 @@ SINGLETONS: letter-class LETTER-class Letter-class digit-class alpha-class non-newline-blank-class ascii-class punctuation-class java-printable-class blank-class control-character-class hex-digit-class java-blank-class c-identifier-class -terminator-class unmatchable-class word-boundary-class ; +unmatchable-class terminator-class word-boundary-class ; SINGLETONS: beginning-of-group end-of-group beginning-of-character-class end-of-character-class @@ -87,8 +87,8 @@ left-parenthesis pipe caret dash ; : ( obj -- kleene-star ) kleene-star boa ; : ( obj -- constant ) dup Letter? get-case-insensitive and [ - [ ch>lower constant boa ] - [ ch>upper constant boa ] bi 2array + [ ch>lower ] [ ch>upper ] bi + [ constant boa ] bi@ 2array ] [ constant boa ] if ; @@ -384,20 +384,22 @@ DEFER: handle-left-bracket } case [ (parse-character-class) ] when ; +: push-constant ( ch -- ) push-stack ; + : parse-character-class-second ( -- ) read1 { - { CHAR: [ [ CHAR: [ push-stack ] } - { CHAR: ] [ CHAR: ] push-stack ] } - { CHAR: - [ CHAR: - push-stack ] } + { CHAR: [ [ CHAR: [ push-constant ] } + { CHAR: ] [ CHAR: ] push-constant ] } + { CHAR: - [ CHAR: - push-constant ] } [ push1 ] } case ; : parse-character-class-first ( -- ) read1 { { CHAR: ^ [ caret push-stack parse-character-class-second ] } - { CHAR: [ [ CHAR: [ push-stack ] } - { CHAR: ] [ CHAR: ] push-stack ] } - { CHAR: - [ CHAR: - push-stack ] } + { CHAR: [ [ CHAR: [ push-constant ] } + { CHAR: ] [ CHAR: ] push-constant ] } + { CHAR: - [ CHAR: - push-constant ] } [ push1 ] } case ; @@ -431,7 +433,7 @@ DEFER: handle-left-bracket drop handle-back-anchor f ] [ - push-stack t + push-constant t ] if ] } case ; diff --git a/basis/regexp/transition-tables/transition-tables.factor b/basis/regexp/transition-tables/transition-tables.factor index 3050be14fa..80317a1b66 100644 --- a/basis/regexp/transition-tables/transition-tables.factor +++ b/basis/regexp/transition-tables/transition-tables.factor @@ -25,12 +25,13 @@ TUPLE: default ; : ( from to -- transition ) t default-transition make-transition ; -TUPLE: transition-table transitions start-state final-states ; +TUPLE: transition-table transitions start-state final-states flags ; : ( -- transition-table ) transition-table new H{ } clone >>transitions - H{ } clone >>final-states ; + H{ } clone >>final-states + H{ } clone >>flags ; : maybe-initialize-key ( key hashtable -- ) 2dup key? [ 2drop ] [ H{ } clone -rot set-at ] if ; diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor index c880c11c53..d8c25eda18 100644 --- a/basis/regexp/traversal/traversal.factor +++ b/basis/regexp/traversal/traversal.factor @@ -82,6 +82,7 @@ M: end-of-input flag-action ( dfa-traverser flag -- ) drop dup end-of-text? [ t >>match-failed? ] unless drop ; + M: beginning-of-line flag-action ( dfa-traverser flag -- ) drop dup { @@ -96,6 +97,7 @@ M: end-of-line flag-action ( dfa-traverser flag -- ) [ next-text-character terminator-class class-member? ] } 1|| [ t >>match-failed? ] unless drop ; + M: word-boundary flag-action ( dfa-traverser flag -- ) drop dup { @@ -103,6 +105,7 @@ M: word-boundary flag-action ( dfa-traverser flag -- ) [ current-text-character terminator-class class-member? ] } 1|| [ t >>match-failed? ] unless drop ; + M: lookahead-on flag-action ( dfa-traverser flag -- ) drop lookahead-counters>> 0 swap push ;