diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor index d018fa3a36..ad67d76d12 100644 --- a/basis/regexp/ast/ast.factor +++ b/basis/regexp/ast/ast.factor @@ -16,11 +16,17 @@ C: from-to TUPLE: at-least n ; C: at-least -TUPLE: concatenation seq ; -C: concatenation +SINGLETON: epsilon -TUPLE: alternation seq ; -C: alternation +TUPLE: concatenation first second ; + +: ( seq -- concatenation ) + epsilon [ concatenation boa ] reduce ; + +TUPLE: alternation first second ; + +: ( seq -- alternation ) + unclip [ alternation boa ] reduce ; TUPLE: star term ; C: star diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index 7109e8bcbd..44f33f9fcf 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -1,4 +1,4 @@ -! Copyright (C) 2008 Doug Coleman. +! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: accessors kernel math math.order words ascii unicode.categories combinators.short-circuit sequences ; @@ -41,9 +41,10 @@ C: range GENERIC: class-member? ( obj class -- ? ) +! When does t get put in? M: t class-member? ( obj class -- ? ) 2drop f ; -M: integer class-member? ( obj class -- ? ) 2drop f ; +M: integer class-member? ( obj class -- ? ) = ; M: range class-member? ( obj class -- ? ) [ from>> ] [ to>> ] bi between? ; @@ -111,3 +112,15 @@ M: beginning-of-line class-member? ( obj class -- ? ) M: end-of-line class-member? ( obj class -- ? ) 2drop f ; + +TUPLE: or-class seq ; +C: or-class + +TUPLE: not-class class ; +C: not-class + +M: or-class class-member? + seq>> [ class-member? ] with any? ; + +M: not-class class-member? + class>> class-member? not ; diff --git a/basis/regexp/minimize/minimize.factor b/basis/regexp/minimize/minimize.factor index 52a852af50..163e87f2b4 100644 --- a/basis/regexp/minimize/minimize.factor +++ b/basis/regexp/minimize/minimize.factor @@ -1,20 +1,48 @@ ! Copyright (C) 2009 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. USING: kernel sequences regexp.transition-tables fry assocs -accessors locals math sorting arrays sets hashtables regexp.dfa ; +accessors locals math sorting arrays sets hashtables regexp.dfa +combinators.short-circuit ; IN: regexp.minimize +: number-transitions ( transitions numbering -- new-transitions ) + dup '[ + [ _ at ] + [ [ first _ at ] assoc-map ] bi* + ] assoc-map ; + +: table>state-numbers ( table -- assoc ) + transitions>> keys [ swap ] H{ } assoc-map-as ; + +: map-set ( assoc quot -- new-assoc ) + '[ drop @ dup ] assoc-map ; inline + +: rewrite-transitions ( transition-table assoc quot -- transition-table ) + [ + [ '[ _ at ] change-start-state ] + [ '[ [ _ at ] map-set ] change-final-states ] + [ ] tri + ] dip '[ _ @ ] change-transitions ; inline + +: number-states ( table -- newtable ) + dup table>state-numbers + [ number-transitions ] rewrite-transitions ; + +: initially-same? ( s1 s2 transition-table -- ? ) + { + [ drop <= ] + [ transitions>> '[ _ at keys ] bi@ set= ] + [ final-states>> '[ _ key? ] bi@ = ] + } 3&& ; + :: initialize-partitions ( transition-table -- partitions ) ! Partition table is sorted-array => ? H{ } clone :> out transition-table transitions>> keys :> states states [| s1 | states [| s2 | - s1 s2 <= [ - s1 s2 [ transition-table transitions>> at keys ] bi@ set= - s1 s2 [ transition-table final-states>> key? ] bi@ = and - [ t s1 s2 2array out set-at ] when - ] when + s1 s2 transition-table initially-same? + [ s1 s2 2array out conjoin ] when ] each ] each out ; @@ -29,7 +57,6 @@ IN: regexp.minimize '[ _ same-partition? ] assoc-all? ; : partition-more ( partitions transition-table -- partitions ) - ! This is horribly slow! over '[ drop first2 _ _ stay-same? ] assoc-filter ; : partition>classes ( partitions -- synonyms ) ! old-state => new-state @@ -40,7 +67,7 @@ IN: regexp.minimize : state-classes ( transition-table -- synonyms ) [ initialize-partitions ] keep - '[ _ partition-more ] [ ] while-changes + '[ _ partition-more ] [ assoc-size ] while-changes partition>classes ; : canonical-state? ( state state-classes -- ? ) @@ -52,33 +79,12 @@ IN: regexp.minimize : rewrite-duplicates ( new-transitions state-classes -- new-transitions ) '[ [ _ at ] assoc-map ] assoc-map ; -: map-set ( assoc quot -- new-assoc ) - '[ drop @ dup ] assoc-map ; inline +: combine-transitions ( transitions state-classes -- new-transitions ) + [ delete-duplicates ] [ rewrite-duplicates ] bi ; : combine-states ( table -- smaller-table ) dup state-classes - [ - '[ - _ [ delete-duplicates ] - [ rewrite-duplicates ] bi - ] change-transitions - ] - [ '[ [ _ at ] map-set ] change-final-states ] - [ '[ _ at ] change-start-state ] - tri ; - -: number-transitions ( transitions numbering -- new-transitions ) - [ - [ at ] - [ '[ first _ at ] assoc-map ] - bi-curry bi* - ] curry assoc-map ; - -: number-states ( table -- newtable ) - dup transitions>> keys [ swap ] H{ } assoc-map-as - [ '[ _ at ] change-start-state ] - [ '[ [ _ at ] map-set ] change-final-states ] - [ '[ _ number-transitions ] change-transitions ] tri ; + [ combine-transitions ] rewrite-transitions ; : minimize ( table -- minimal-table ) clone number-states combine-states ; diff --git a/basis/regexp/negation/negation-tests.factor b/basis/regexp/negation/negation-tests.factor new file mode 100644 index 0000000000..2dbca2e8d8 --- /dev/null +++ b/basis/regexp/negation/negation-tests.factor @@ -0,0 +1,27 @@ +! Copyright (C) 2009 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: tools.test regexp.negation regexp.transition-tables regexp.classes ; +IN: regexp.negation.tests + +[ + ! R/ |[^a]|.+/ + T{ transition-table + { transitions H{ + { 0 H{ { CHAR: a 1 } { T{ not-class f T{ or-class f { CHAR: a } } } -1 } } } + { 1 H{ { T{ not-class f T{ or-class f { } } } -1 } } } + { -1 H{ { any-char -1 } } } + } } + { start-state 0 } + { final-states H{ { 0 0 } { -1 -1 } } } + } +] [ + ! R/ a/ + T{ transition-table + { transitions H{ + { 0 H{ { CHAR: a 1 } } } + { 1 H{ } } + } } + { start-state 0 } + { final-states H{ { 1 1 } } } + } negate-table +] unit-test diff --git a/basis/regexp/negation/negation.factor b/basis/regexp/negation/negation.factor new file mode 100644 index 0000000000..5a9f772581 --- /dev/null +++ b/basis/regexp/negation/negation.factor @@ -0,0 +1,36 @@ +! Copyright (C) 2009 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: regexp.nfa regexp.dfa regexp.minimize kernel sequences +assocs regexp.classes hashtables accessors ; +IN: regexp.negation + +: ast>dfa ( parse-tree -- minimal-dfa ) + construct-nfa construct-dfa minimize ; + +CONSTANT: fail-state -1 + +: add-default-transition ( state's-transitions -- new-state's-transitions ) + clone dup + [ [ fail-state ] dip keys ] keep set-at ; + +: fail-state-recurses ( transitions -- new-transitions ) + clone dup + [ fail-state any-char associate fail-state ] dip set-at ; + +: add-fail-state ( transitions -- new-transitions ) + [ add-default-transition ] assoc-map + fail-state-recurses ; + +: assoc>set ( assoc -- keys-set ) + [ drop dup ] assoc-map ; + +: inverse-final-states ( transition-table -- final-states ) + [ transitions>> assoc>set ] [ final-states>> ] bi assoc-diff ; + +: negate-table ( transition-table -- transition-table ) + clone + [ add-fail-state ] change-transitions + dup inverse-final-states >>final-states ; + +! M: negation nfa-node ( node -- ) +! ast>dfa negate-table adjoin-dfa ; diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index 4ad5e0314d..c759ffdf98 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -3,15 +3,13 @@ USING: accessors arrays assocs grouping kernel locals math namespaces sequences fry quotations math.order math.ranges vectors unicode.categories -regexp.transition-tables words sets +regexp.transition-tables words sets hashtables unicode.case.private regexp.ast regexp.classes ; ! This uses unicode.case.private for ch>upper and ch>lower ! but case-insensitive matching should be done by case-folding everything ! before processing starts IN: regexp.nfa -ERROR: feature-is-broken feature ; - SYMBOL: negated? : negate ( -- ) @@ -21,14 +19,13 @@ SINGLETON: eps SYMBOL: option-stack -SYMBOL: combine-stack - SYMBOL: state : next-state ( -- state ) state [ get ] [ inc ] bi ; SYMBOL: nfa-table +: table ( -- table ) nfa-table get ; : set-each ( keys value hashtable -- ) '[ _ swap _ set-at ] each ; @@ -46,84 +43,56 @@ SYMBOL: nfa-table : option? ( obj -- ? ) option-stack get assoc-stack ; -: set-start-state ( -- nfa-table ) - nfa-table get - combine-stack get pop first >>start-state ; +GENERIC: nfa-node ( node -- start-state end-state ) -GENERIC: nfa-node ( node -- ) +:: add-simple-entry ( obj class -- start-state end-state ) + next-state :> s0 + next-state :> s1 + negated? get [ + s0 f obj class make-transition table add-transition + s0 s1 table add-transition + ] [ + s0 s1 obj class make-transition table add-transition + ] if + s0 s1 ; -:: add-simple-entry ( obj class -- ) - [let* | s0 [ next-state ] - s1 [ next-state ] - stack [ combine-stack get ] - table [ nfa-table get ] | - negated? get [ - s0 f obj class make-transition table add-transition - s0 s1 table add-transition - ] [ - s0 s1 obj class make-transition table add-transition - ] if - s0 s1 2array stack push - t s1 table final-states>> set-at ] ; +: epsilon-transition ( source target -- ) + eps table add-transition ; -:: concatenate-nodes ( -- ) - [let* | stack [ combine-stack get ] - table [ nfa-table get ] - s2 [ stack peek first ] - s3 [ stack pop second ] - s0 [ stack peek first ] - s1 [ stack pop second ] | - s1 s2 eps table add-transition - s1 table final-states>> delete-at - s0 s3 2array stack push ] ; +M:: star nfa-node ( node -- start end ) + node term>> nfa-node :> s1 :> s0 + next-state :> s2 + next-state :> s3 + s1 s0 epsilon-transition + s2 s0 epsilon-transition + s2 s3 epsilon-transition + s1 s3 epsilon-transition + s2 s3 ; -:: alternate-nodes ( -- ) - [let* | stack [ combine-stack get ] - table [ nfa-table get ] - s2 [ stack peek first ] - s3 [ stack pop second ] - s0 [ stack peek first ] - s1 [ stack pop second ] - s4 [ next-state ] - s5 [ next-state ] | - s4 s0 eps table add-transition - s4 s2 eps table add-transition - s1 s5 eps table add-transition - s3 s5 eps table add-transition - s1 table final-states>> delete-at - s3 table final-states>> delete-at - t s5 table final-states>> set-at - s4 s5 2array stack push ] ; +M: epsilon nfa-node + drop eps literal-transition add-simple-entry ; -M: star nfa-node ( node -- ) - term>> nfa-node - [let* | stack [ combine-stack get ] - s0 [ stack peek first ] - s1 [ stack pop second ] - s2 [ next-state ] - s3 [ next-state ] - table [ nfa-table get ] | - s1 table final-states>> delete-at - t s3 table final-states>> set-at - s1 s0 eps table add-transition - s2 s0 eps table add-transition - s2 s3 eps table add-transition - s1 s3 eps table add-transition - s2 s3 2array stack push ] ; +M: concatenation nfa-node ( node -- start end ) + [ first>> ] [ second>> ] bi + reversed-regexp option? [ swap ] when + [ nfa-node ] bi@ + [ epsilon-transition ] dip ; -M: concatenation nfa-node ( node -- ) - seq>> [ eps literal-transition add-simple-entry ] [ - reversed-regexp option? [ ] when - [ [ nfa-node ] each ] - [ length 1- [ concatenate-nodes ] times ] bi - ] if-empty ; +:: alternate-nodes ( s0 s1 s2 s3 -- start end ) + next-state :> s4 + next-state :> s5 + s4 s0 epsilon-transition + s4 s2 epsilon-transition + s1 s5 epsilon-transition + s3 s5 epsilon-transition + s4 s5 ; -M: alternation nfa-node ( node -- ) - seq>> - [ [ nfa-node ] each ] - [ length 1- [ alternate-nodes ] times ] bi ; +M: alternation nfa-node ( node -- start end ) + [ first>> ] [ second>> ] bi + [ nfa-node ] bi@ + alternate-nodes ; -M: integer nfa-node ( node -- ) +M: integer nfa-node ( node -- start end ) case-insensitive option? [ dup [ ch>lower ] [ ch>upper ] bi 2dup = [ @@ -131,26 +100,26 @@ M: integer nfa-node ( node -- ) literal-transition add-simple-entry ] [ [ literal-transition add-simple-entry ] bi@ - alternate-nodes drop + alternate-nodes [ nip ] dip ] if ] [ literal-transition add-simple-entry ] if ; -M: primitive-class nfa-node ( node -- ) +M: primitive-class nfa-node ( node -- start end ) class>> dup { letter-class LETTER-class } member? case-insensitive option? and [ drop Letter-class ] when class-transition add-simple-entry ; -M: any-char nfa-node ( node -- ) +M: any-char nfa-node ( node -- start end ) [ dotall option? ] dip any-char-no-nl ? class-transition add-simple-entry ; -M: negation nfa-node ( node -- ) +M: negation nfa-node ( node -- start end ) negate term>> nfa-node negate ; -M: range nfa-node ( node -- ) +M: range nfa-node ( node -- start end ) case-insensitive option? [ ! This should be implemented for Unicode by case-folding ! the input and all strings in the regexp. @@ -169,15 +138,16 @@ M: range nfa-node ( node -- ) class-transition add-simple-entry ] if ; -M: with-options nfa-node ( node -- ) +M: with-options nfa-node ( node -- start end ) dup options>> [ tree>> nfa-node ] using-options ; : construct-nfa ( ast -- nfa-table ) [ negated? off - V{ } clone combine-stack set 0 state set clone nfa-table set nfa-node - set-start-state + table + swap dup associate >>final-states + swap >>start-state ] with-scope ; diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index dbd37f2d8e..6b2f28dbf6 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -4,6 +4,7 @@ USING: peg.ebnf kernel math.parser sequences assocs arrays fry math combinators regexp.classes strings splitting peg locals accessors regexp.ast ; IN: regexp.parser + : allowed-char? ( ch -- ? ) ".()|[*+?" member? not ; @@ -130,6 +131,7 @@ Parenthized = "?:" Alternation:a => [[ a ]] | "?" Options:on "-"? Options:off ":" Alternation:a => [[ a on off parse-options ]] | "?#" [^)]* => [[ f ]] + | "?~" Alternation:a => [[ a ]] | Alternation Element = "(" Parenthized:p ")" => [[ p ]] diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index b6fd32a245..189d430d85 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -4,14 +4,15 @@ USING: accessors combinators kernel math sequences strings sets assocs prettyprint.backend prettyprint.custom make lexer namespaces parser arrays fry locals regexp.minimize regexp.parser regexp.nfa regexp.dfa regexp.traversal -regexp.transition-tables splitting sorting regexp.ast ; +regexp.transition-tables splitting sorting regexp.ast +regexp.negation ; IN: regexp TUPLE: regexp raw parse-tree options dfa ; : ( string options -- regexp ) [ dup parse-regexp ] [ string>options ] bi* - 2dup construct-nfa construct-dfa minimize + 2dup ast>dfa regexp boa ; : ( string -- regexp ) "" ;