diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor new file mode 100644 index 0000000000..d018fa3a36 --- /dev/null +++ b/basis/regexp/ast/ast.factor @@ -0,0 +1,53 @@ +! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: kernel arrays accessors fry sequences ; +FROM: math.ranges => [a,b] ; +IN: regexp.ast + +TUPLE: primitive-class class ; +C: primitive-class + +TUPLE: negation term ; +C: negation + +TUPLE: from-to n m ; +C: from-to + +TUPLE: at-least n ; +C: at-least + +TUPLE: concatenation seq ; +C: concatenation + +TUPLE: alternation seq ; +C: alternation + +TUPLE: star term ; +C: star + +TUPLE: with-options tree options ; +C: with-options + +TUPLE: options on off ; +C: options + +SINGLETONS: unix-lines dotall multiline comments case-insensitive +unicode-case reversed-regexp ; + +: ( term -- term' ) + f 2array ; + +: ( term -- term' ) + dup 2array ; + +: repetition ( n term -- term' ) + ; + +GENERIC: ( term times -- term' ) +M: at-least + n>> swap [ repetition ] [ ] bi 2array ; +M: from-to + [ n>> ] [ m>> ] bi [a,b] swap '[ _ repetition ] map ; + +: char-class ( ranges ? -- term ) + [ ] dip [ ] when ; diff --git a/basis/regexp/backend/backend.factor b/basis/regexp/backend/backend.factor deleted file mode 100644 index 5eff0579c8..0000000000 --- a/basis/regexp/backend/backend.factor +++ /dev/null @@ -1,27 +0,0 @@ -! Copyright (C) 2008 Doug Coleman. -! See http://factorcode.org/license.txt for BSD license. -USING: accessors hashtables kernel math vectors ; -IN: regexp.backend - -TUPLE: regexp - raw - { options hashtable } - stack - parse-tree - nfa-table - dfa-table - minimized-table - matchers - { nfa-traversal-flags hashtable } - { dfa-traversal-flags hashtable } - { state integer } - { new-states vector } - { visited-states hashtable } ; - -: reset-regexp ( regexp -- regexp ) - 0 >>state - V{ } clone >>stack - V{ } clone >>new-states - H{ } clone >>visited-states ; - -SYMBOL: current-regexp diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index 94d1b78d59..7109e8bcbd 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -1,9 +1,31 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors kernel math math.order words regexp.utils -ascii unicode.categories combinators.short-circuit ; +USING: accessors kernel math math.order words +ascii unicode.categories combinators.short-circuit sequences ; IN: regexp.classes +: punct? ( ch -- ? ) + "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ; + +: c-identifier-char? ( ch -- ? ) + { [ alpha? ] [ CHAR: _ = ] } 1|| ; + +: java-blank? ( ch -- ? ) + { + CHAR: \s CHAR: \t CHAR: \n + HEX: b HEX: 7 CHAR: \r + } member? ; + +: java-printable? ( ch -- ? ) + [ [ alpha? ] [ punct? ] ] 1|| ; + +: hex-digit? ( ch -- ? ) + { + [ CHAR: A CHAR: F between? ] + [ CHAR: a CHAR: f between? ] + [ CHAR: 0 CHAR: 9 between? ] + } 1|| ; + SINGLETONS: any-char any-char-no-nl letter-class LETTER-class Letter-class digit-class alpha-class non-newline-blank-class @@ -14,8 +36,8 @@ unmatchable-class terminator-class word-boundary-class ; SINGLETONS: beginning-of-input beginning-of-line end-of-input end-of-line ; -MIXIN: node -TUPLE: character-class-range from to ; INSTANCE: character-class-range node +TUPLE: range from to ; +C: range GENERIC: class-member? ( obj class -- ? ) @@ -23,7 +45,7 @@ M: t class-member? ( obj class -- ? ) 2drop f ; M: integer class-member? ( obj class -- ? ) 2drop f ; -M: character-class-range class-member? ( obj class -- ? ) +M: range class-member? ( obj class -- ? ) [ from>> ] [ to>> ] bi between? ; M: any-char class-member? ( obj class -- ? ) diff --git a/basis/regexp/dfa/dfa.factor b/basis/regexp/dfa/dfa.factor index 549669cab7..4dd3713fc2 100644 --- a/basis/regexp/dfa/dfa.factor +++ b/basis/regexp/dfa/dfa.factor @@ -2,83 +2,74 @@ ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs combinators fry kernel locals math math.order regexp.nfa regexp.transition-tables sequences -sets sorting vectors regexp.utils sequences.deep ; +sets sorting vectors sequences.deep ; USING: io prettyprint threads ; IN: regexp.dfa -: find-delta ( states transition regexp -- new-states ) - nfa-table>> transitions>> - rot [ swap at at ] with with gather sift ; +: (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj ) + [ [ dup slip ] dip pick over call ] dip dupd = + [ 3drop ] [ (while-changes) ] if ; inline recursive -: (find-epsilon-closure) ( states regexp -- new-states ) +: while-changes ( obj quot pred -- obj' ) + 3dup nip call (while-changes) ; inline + +: find-delta ( states transition nfa -- new-states ) + transitions>> '[ _ swap _ at at ] gather sift ; + +: (find-epsilon-closure) ( states nfa -- new-states ) eps swap find-delta ; -: find-epsilon-closure ( states regexp -- new-states ) +: find-epsilon-closure ( states nfa -- new-states ) '[ dup _ (find-epsilon-closure) union ] [ length ] while-changes natural-sort ; -: find-closure ( states transition regexp -- new-states ) - [ find-delta ] 2keep nip find-epsilon-closure ; +: find-closure ( states transition nfa -- new-states ) + [ find-delta ] keep find-epsilon-closure ; -: find-start-state ( regexp -- state ) - [ nfa-table>> start-state>> 1vector ] keep find-epsilon-closure ; +: find-start-state ( nfa -- state ) + [ start-state>> 1vector ] keep find-epsilon-closure ; -: find-transitions ( seq1 regexp -- seq2 ) - nfa-table>> transitions>> - [ at keys ] curry gather +: find-transitions ( dfa-state nfa -- next-dfa-state ) + transitions>> + '[ _ at keys ] gather eps swap remove ; -: add-todo-state ( state regexp -- ) - 2dup visited-states>> key? [ - 2drop - ] [ - [ visited-states>> conjoin ] - [ new-states>> push ] 2bi +: add-todo-state ( state visited-states new-states -- ) + 3dup drop key? [ 3drop ] [ + [ conjoin ] [ push ] bi-curry* bi ] if ; -: new-transitions ( regexp -- ) - dup new-states>> [ - drop - ] [ - dupd pop dup pick find-transitions rot - [ - [ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep - [ swapd transition make-transition ] dip - dfa-table>> add-transition - ] curry with each - new-transitions +:: new-transitions ( nfa dfa new-states visited-states -- nfa dfa ) + new-states [ nfa dfa ] [ + new-states pop :> state + state nfa-table find-transitions + [| trans | + state trans nfa find-closure :> new-state + state visited-states new-state add-todo-state + state new-state trans transition make-transition dfa add-transition + ] each + nfa dfa new-states visited-states new-transitions ] if-empty ; : states ( hashtable -- array ) [ keys ] [ values [ values concat ] map concat append ] bi ; -: set-final-states ( regexp -- ) - dup - [ nfa-table>> final-states>> keys ] - [ dfa-table>> transitions>> states ] bi - [ intersects? ] with filter - - swap dfa-table>> final-states>> +: set-final-states ( nfa dfa -- ) + [ + [ final-states>> keys ] + [ transitions>> states ] bi* + [ intersects? ] with filter + ] [ final-states>> ] bi [ conjoin ] curry each ; -: set-initial-state ( regexp -- ) - dup - [ dfa-table>> ] [ find-start-state ] bi - [ >>start-state drop ] keep - 1vector >>new-states drop ; +: initialize-dfa ( nfa -- dfa ) + + swap find-start-state >>start-state ; -: set-traversal-flags ( regexp -- ) - dup - [ nfa-traversal-flags>> ] - [ dfa-table>> transitions>> keys ] bi - [ [ nip ] [ [ swap at ] with map concat ] 2bi ] with H{ } map>assoc - >>dfa-traversal-flags drop ; - -: construct-dfa ( regexp -- ) - { - [ set-initial-state ] - [ new-transitions ] - [ set-final-states ] - [ set-traversal-flags ] - } cleave ; +: construct-dfa ( nfa -- dfa ) + dup initialize-dfa + dup start-state>> 1vector + H{ } clone + new-transitions + [ set-final-states ] keep ; diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index c8ee1187bc..4ad5e0314d 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -1,9 +1,10 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors arrays assocs grouping kernel regexp.backend -locals math namespaces regexp.parser sequences fry quotations -math.order math.ranges vectors unicode.categories regexp.utils -regexp.transition-tables words sets regexp.classes unicode.case.private ; +USING: accessors arrays assocs grouping kernel +locals math namespaces sequences fry quotations +math.order math.ranges vectors unicode.categories +regexp.transition-tables words sets +unicode.case.private regexp.ast regexp.classes ; ! This uses unicode.case.private for ch>upper and ch>lower ! but case-insensitive matching should be done by case-folding everything ! before processing starts @@ -13,34 +14,49 @@ ERROR: feature-is-broken feature ; SYMBOL: negated? +: negate ( -- ) + negated? [ not ] change ; + SINGLETON: eps -: options ( -- obj ) current-regexp get options>> ; +SYMBOL: option-stack -: option? ( obj -- ? ) options key? ; +SYMBOL: combine-stack -: option-on ( obj -- ) options conjoin ; +SYMBOL: state -: option-off ( obj -- ) options delete-at ; +: next-state ( -- state ) + state [ get ] [ inc ] bi ; -: next-state ( regexp -- state ) - [ state>> ] [ [ 1+ ] change-state drop ] bi ; +SYMBOL: nfa-table -: set-start-state ( regexp -- ) - dup stack>> [ - drop - ] [ - [ nfa-table>> ] [ pop first ] bi* >>start-state drop - ] if-empty ; +: set-each ( keys value hashtable -- ) + '[ _ swap _ set-at ] each ; + +: options>hash ( options -- hashtable ) + H{ } clone [ + [ [ on>> t ] dip set-each ] + [ [ off>> f ] dip set-each ] 2bi + ] keep ; + +: using-options ( options quot -- ) + [ options>hash option-stack [ ?push ] change ] dip + call option-stack get pop* ; inline + +: option? ( obj -- ? ) + option-stack get assoc-stack ; + +: set-start-state ( -- nfa-table ) + nfa-table get + combine-stack get pop first >>start-state ; GENERIC: nfa-node ( node -- ) :: add-simple-entry ( obj class -- ) - [let* | regexp [ current-regexp get ] - s0 [ regexp next-state ] - s1 [ regexp next-state ] - stack [ regexp stack>> ] - table [ regexp nfa-table>> ] | + [let* | s0 [ next-state ] + s1 [ next-state ] + stack [ combine-stack get ] + table [ nfa-table get ] | negated? get [ s0 f obj class make-transition table add-transition s0 s1 table add-transition @@ -51,9 +67,8 @@ GENERIC: nfa-node ( node -- ) t s1 table final-states>> set-at ] ; :: concatenate-nodes ( -- ) - [let* | regexp [ current-regexp get ] - stack [ regexp stack>> ] - table [ regexp nfa-table>> ] + [let* | stack [ combine-stack get ] + table [ nfa-table get ] s2 [ stack peek first ] s3 [ stack pop second ] s0 [ stack peek first ] @@ -63,15 +78,14 @@ GENERIC: nfa-node ( node -- ) s0 s3 2array stack push ] ; :: alternate-nodes ( -- ) - [let* | regexp [ current-regexp get ] - stack [ regexp stack>> ] - table [ regexp nfa-table>> ] + [let* | stack [ combine-stack get ] + table [ nfa-table get ] s2 [ stack peek first ] s3 [ stack pop second ] s0 [ stack peek first ] s1 [ stack pop second ] - s4 [ regexp next-state ] - s5 [ regexp next-state ] | + s4 [ next-state ] + s5 [ next-state ] | s4 s0 eps table add-transition s4 s2 eps table add-transition s1 s5 eps table add-transition @@ -83,13 +97,12 @@ GENERIC: nfa-node ( node -- ) M: star nfa-node ( node -- ) term>> nfa-node - [let* | regexp [ current-regexp get ] - stack [ regexp stack>> ] + [let* | stack [ combine-stack get ] s0 [ stack peek first ] s1 [ stack pop second ] - s2 [ regexp next-state ] - s3 [ regexp next-state ] - table [ regexp nfa-table>> ] | + s2 [ next-state ] + s3 [ next-state ] + table [ nfa-table get ] | s1 table final-states>> delete-at t s3 table final-states>> set-at s1 s0 eps table add-transition @@ -99,58 +112,53 @@ M: star nfa-node ( node -- ) s2 s3 2array stack push ] ; M: concatenation nfa-node ( node -- ) - seq>> - reversed-regexp option? [ ] when - [ [ nfa-node ] each ] - [ length 1- [ concatenate-nodes ] times ] bi ; + seq>> [ eps literal-transition add-simple-entry ] [ + reversed-regexp option? [ ] when + [ [ nfa-node ] each ] + [ length 1- [ concatenate-nodes ] times ] bi + ] if-empty ; M: alternation nfa-node ( node -- ) seq>> [ [ nfa-node ] each ] [ length 1- [ alternate-nodes ] times ] bi ; -M: constant nfa-node ( node -- ) +M: integer nfa-node ( node -- ) case-insensitive option? [ - dup char>> [ ch>lower ] [ ch>upper ] bi + dup [ ch>lower ] [ ch>upper ] bi 2dup = [ 2drop - char>> literal-transition add-simple-entry + literal-transition add-simple-entry ] [ [ literal-transition add-simple-entry ] bi@ alternate-nodes drop ] if ] [ - char>> literal-transition add-simple-entry + literal-transition add-simple-entry ] if ; -M: word nfa-node ( node -- ) class-transition add-simple-entry ; +M: primitive-class nfa-node ( node -- ) + class>> dup + { letter-class LETTER-class } member? case-insensitive option? and + [ drop Letter-class ] when + class-transition add-simple-entry ; M: any-char nfa-node ( node -- ) [ dotall option? ] dip any-char-no-nl ? class-transition add-simple-entry ; -M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ; +M: negation nfa-node ( node -- ) + negate term>> nfa-node negate ; -M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ; - -: choose-letter-class ( node -- node' ) - case-insensitive option? Letter-class rot ? ; - -M: letter-class nfa-node ( node -- ) - choose-letter-class class-transition add-simple-entry ; - -M: LETTER-class nfa-node ( node -- ) - choose-letter-class class-transition add-simple-entry ; - -M: character-class-range nfa-node ( node -- ) +M: range nfa-node ( node -- ) case-insensitive option? [ ! This should be implemented for Unicode by case-folding ! the input and all strings in the regexp. dup [ from>> ] [ to>> ] bi 2dup [ Letter? ] bi@ and [ rot drop - [ [ ch>lower ] bi@ character-class-range boa ] - [ [ ch>upper ] bi@ character-class-range boa ] 2bi + [ [ ch>lower ] bi@ ] + [ [ ch>upper ] bi@ ] 2bi [ class-transition add-simple-entry ] bi@ alternate-nodes ] [ @@ -161,14 +169,15 @@ M: character-class-range nfa-node ( node -- ) class-transition add-simple-entry ] if ; -M: option nfa-node ( node -- ) - [ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if - eps literal-transition add-simple-entry ; +M: with-options nfa-node ( node -- ) + dup options>> [ tree>> nfa-node ] using-options ; -: construct-nfa ( regexp -- ) +: construct-nfa ( ast -- nfa-table ) [ - reset-regexp - [ current-regexp set ] - [ parse-tree>> nfa-node ] - [ set-start-state ] tri + negated? off + V{ } clone combine-stack set + 0 state set + clone nfa-table set + nfa-node + set-start-state ] with-scope ; diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index 65965fdeb9..dbd37f2d8e 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -1,28 +1,9 @@ ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: peg.ebnf kernel math.parser sequences assocs arrays -combinators regexp.classes strings splitting peg locals ; +USING: peg.ebnf kernel math.parser sequences assocs arrays fry math +combinators regexp.classes strings splitting peg locals accessors +regexp.ast ; IN: regexp.parser - -TUPLE: range from to ; -TUPLE: char-class ranges ; -TUPLE: primitive-class class ; -TUPLE: not-char-class ranges ; -TUPLE: not-primitive-class class ; -TUPLE: from-to n m ; -TUPLE: at-least n ; -TUPLE: up-to n ; -TUPLE: exactly n ; -TUPLE: times expression times ; -TUPLE: concatenation seq ; -TUPLE: alternation seq ; -TUPLE: maybe term ; -TUPLE: star term ; -TUPLE: plus term ; -TUPLE: with-options tree options ; -TUPLE: ast ^? $? tree ; -SINGLETON: any-char - : allowed-char? ( ch -- ? ) ".()|[*+?" member? not ; @@ -64,21 +45,16 @@ ERROR: bad-class name ; { CHAR: e [ HEX: 1b ] } { CHAR: \\ [ CHAR: \\ ] } - { CHAR: w [ c-identifier-class primitive-class boa ] } - { CHAR: W [ c-identifier-class not-primitive-class boa ] } - { CHAR: s [ java-blank-class primitive-class boa ] } - { CHAR: S [ java-blank-class not-primitive-class boa ] } - { CHAR: d [ digit-class primitive-class boa ] } - { CHAR: D [ digit-class not-primitive-class boa ] } + { CHAR: w [ c-identifier-class ] } + { CHAR: W [ c-identifier-class ] } + { CHAR: s [ java-blank-class ] } + { CHAR: S [ java-blank-class ] } + { CHAR: d [ digit-class ] } + { CHAR: D [ digit-class ] } [ ] } case ; -TUPLE: options on off ; - -SINGLETONS: unix-lines dotall multiline comments case-insensitive -unicode-case reversed-regexp ; - : options-assoc ( -- assoc ) H{ { CHAR: i case-insensitive } @@ -98,19 +74,30 @@ unicode-case reversed-regexp ; options-assoc value-at ; : parse-options ( on off -- options ) - [ [ ch>option ] map ] bi@ options boa ; + [ [ ch>option ] { } map-as ] bi@ ; -! TODO: make range syntax better (negation, and, etc), -! add syntax for various parenthized things, +: string>options ( string -- options ) + "-" split1 parse-options ; + +: options>string ( options -- string ) + [ on>> ] [ off>> ] bi + [ [ option>ch ] map ] bi@ + [ "-" swap 3append ] unless-empty + "" like ; + +! TODO: add syntax for various parenthized things, ! add greedy and nongreedy forms of matching ! (once it's all implemented) -EBNF: (parse-regexp) +EBNF: parse-regexp CharacterInBracket = !("}") Character -Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class primitive-class boa ]] - | "P{" CharacterInBracket*:s "}" => [[ s >string name>class not-primitive-class boa ]] +QuotedCharacter = !("\\E") . + +Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class ]] + | "P{" CharacterInBracket*:s "}" => [[ s >string name>class ]] + | "Q" QuotedCharacter*:s "\\E" => [[ s ]] | "u" Character:a Character:b Character:c Character:d => [[ { a b c d } hex> ensure-number ]] | "x" Character:a Character:b @@ -119,30 +106,30 @@ Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class primitive-cla => [[ { a b c } oct> ensure-number ]] | . => [[ lookup-escape ]] -Character = "\\" Escape:e => [[ e ]] - | . ?[ allowed-char? ]? +EscapeSequence = "\\" Escape:e => [[ e ]] -AnyRangeCharacter = Character | "[" +Character = EscapeSequence | . ?[ allowed-char? ]? + +AnyRangeCharacter = EscapeSequence | . RangeCharacter = !("]") AnyRangeCharacter -Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]] +Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b ]] | RangeCharacter -StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]] +StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b ]] | AnyRangeCharacter Ranges = StartRange:s Range*:r => [[ r s prefix ]] -CharClass = "^" Ranges:e => [[ e not-char-class boa ]] - | Ranges:e => [[ e char-class boa ]] +CharClass = "^"?:n Ranges:e => [[ e n char-class ]] Options = [idmsux]* Parenthized = "?:" Alternation:a => [[ a ]] | "?" Options:on "-"? Options:off ":" Alternation:a - => [[ a on off parse-options with-options boa ]] - | "?#" [^)]* => [[ ignore ]] + => [[ a on off parse-options ]] + | "?#" [^)]* => [[ f ]] | Alternation Element = "(" Parenthized:p ")" => [[ p ]] @@ -152,32 +139,24 @@ Element = "(" Parenthized:p ")" => [[ p ]] Number = (!(","|"}").)* => [[ string>number ensure-number ]] -Times = "," Number:n "}" => [[ n up-to boa ]] - | Number:n ",}" => [[ n at-least boa ]] - | Number:n "}" => [[ n exactly boa ]] +Times = "," Number:n "}" => [[ 0 n ]] + | Number:n ",}" => [[ n ]] + | Number:n "}" => [[ n n ]] | "}" => [[ bad-number ]] - | Number:n "," Number:m "}" => [[ n m from-to boa ]] + | Number:n "," Number:m "}" => [[ n m ]] -Repeated = Element:e "{" Times:t => [[ e t times boa ]] - | Element:e "?" => [[ e maybe boa ]] - | Element:e "*" => [[ e star boa ]] - | Element:e "+" => [[ e plus boa ]] +Repeated = Element:e "{" Times:t => [[ e t ]] + | Element:e "?" => [[ e ]] + | Element:e "*" => [[ e ]] + | Element:e "+" => [[ e ]] | Element -Concatenation = Repeated*:r => [[ r concatenation boa ]] +Concatenation = Repeated*:r => [[ r sift ]] Alternation = Concatenation:c ("|" Concatenation)*:a - => [[ a empty? [ c ] [ a values c prefix alternation boa ] if ]] + => [[ a empty? [ c ] [ a values c prefix ] if ]] End = !(.) Main = Alternation End ;EBNF - -: parse-regexp ( string -- regexp ) - ! Hack because I want $ allowable in regexps, - ! but with special behavior at the end - ! This fails if the regexp is stupid, though... - dup first CHAR: ^ = tuck [ rest ] when - dup peek CHAR: $ = tuck [ but-last ] when - (parse-regexp) ast boa ; diff --git a/basis/regexp/regexp-docs.factor b/basis/regexp/regexp-docs.factor index 378ae503ce..1dc2a22d81 100644 --- a/basis/regexp/regexp-docs.factor +++ b/basis/regexp/regexp-docs.factor @@ -1,6 +1,6 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: kernel strings help.markup help.syntax regexp.backend ; +USING: kernel strings help.markup help.syntax ; IN: regexp HELP: diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index cc9b2cccf1..4331eaa250 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -189,8 +189,8 @@ IN: regexp-tests [ t ] [ "SXY" "\\0123XY" matches? ] unit-test [ t ] [ "x" "\\x78" matches? ] unit-test [ f ] [ "y" "\\x78" matches? ] unit-test -[ t ] [ "x" "\\u000078" matches? ] unit-test -[ f ] [ "y" "\\u000078" matches? ] unit-test +[ t ] [ "x" "\\u0078" matches? ] unit-test +[ f ] [ "y" "\\u0078" matches? ] unit-test [ t ] [ "ab" "a+b" matches? ] unit-test [ f ] [ "b" "a+b" matches? ] unit-test @@ -317,16 +317,6 @@ IN: regexp-tests ! Bug in parsing word [ t ] [ "a" R' a' matches? ] unit-test -! Convert to lowercase until E -[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test -[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test - -! Convert to uppercase until E -[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test -[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test - -! [ "{Lower}" ] [ invalid-range? ] must-fail-with - ! [ t ] [ "a" R/ ^a/ matches? ] unit-test ! [ f ] [ "\na" R/ ^a/ matches? ] unit-test ! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test @@ -370,10 +360,10 @@ IN: regexp-tests ! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test ! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test -! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test -! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test -! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test -! [ t ] [ "a\r\n" "a$" R/ a$/m matches? ] unit-test +! [ t ] [ "a" R/ a$/m matches? ] unit-test +! [ t ] [ "a\n" R/ a$/m matches? ] unit-test +! [ t ] [ "a\r" R/ a$/m matches? ] unit-test +! [ t ] [ "a\r\n" R/ a$/m matches? ] unit-test ! [ f ] [ "foobxr" "foo\\z" match-head ] unit-test ! [ 3 ] [ "foo" "foo\\z" match-head ] unit-test diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index 62ebaab502..8f6edd853e 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -2,33 +2,15 @@ ! See http://factorcode.org/license.txt for BSD license. USING: accessors combinators kernel math sequences strings sets assocs prettyprint.backend prettyprint.custom make lexer -namespaces parser arrays fry regexp.backend regexp.utils +namespaces parser arrays fry locals regexp.parser regexp.nfa regexp.dfa regexp.traversal -regexp.transition-tables splitting sorting ; +regexp.transition-tables splitting sorting regexp.ast ; IN: regexp -: default-regexp ( string -- regexp ) - regexp new - swap >>raw - >>nfa-table - >>dfa-table - >>minimized-table - H{ } clone >>nfa-traversal-flags - H{ } clone >>dfa-traversal-flags - H{ } clone >>options - H{ } clone >>matchers - reset-regexp ; - -: construct-regexp ( regexp -- regexp' ) - { - [ dup raw>> parse-regexp >>parse-tree drop ] - [ construct-nfa ] - [ construct-dfa ] - [ ] - } cleave ; +TUPLE: regexp raw options parse-tree dfa ; : (match) ( string regexp -- dfa-traverser ) - do-match ; inline + dfa>> do-match ; inline : match ( string regexp -- slice/f ) (match) return-match ; @@ -94,17 +76,17 @@ IN: regexp { "R| " "|" } } swap [ subseq? not nip ] curry assoc-find drop ; -: string>options ( string -- options ) - [ ch>option dup ] H{ } map>assoc ; - -: options>string ( options -- string ) - keys [ option>ch ] map natural-sort >string ; - PRIVATE> -: ( string option-string -- regexp ) - [ default-regexp ] [ string>options ] bi* >>options - construct-regexp ; +:: ( string options -- regexp ) + string parse-regexp :> tree + options parse-options :> opt + tree opt :> ast + regexp new + string >>raw + opt >>options + tree >>parse-tree + tree opt construct-nfa construct-dfa >>dfa ; : ( string -- regexp ) "" ; diff --git a/basis/regexp/transition-tables/transition-tables.factor b/basis/regexp/transition-tables/transition-tables.factor index 64d5cdb244..c02ebce91f 100644 --- a/basis/regexp/transition-tables/transition-tables.factor +++ b/basis/regexp/transition-tables/transition-tables.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs fry hashtables kernel sequences -vectors regexp.utils ; +vectors ; IN: regexp.transition-tables TUPLE: transition from to obj ; diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor index 394bfe0d52..e06efa7f80 100644 --- a/basis/regexp/traversal/traversal.factor +++ b/basis/regexp/traversal/traversal.factor @@ -1,8 +1,8 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors assocs combinators kernel math -quotations sequences regexp.parser regexp.classes fry arrays -combinators.short-circuit regexp.utils prettyprint regexp.nfa ; +quotations sequences regexp.classes fry arrays +combinators.short-circuit prettyprint regexp.nfa ; IN: regexp.traversal TUPLE: dfa-traverser @@ -13,8 +13,7 @@ TUPLE: dfa-traverser start-index current-index matches ; -: ( text regexp -- match ) - dfa-table>> +: ( text dfa -- match ) dfa-traverser new swap [ start-state>> >>current-state ] [ >>dfa-table ] bi swap >>text diff --git a/basis/regexp/utils/utils-tests.factor b/basis/regexp/utils/utils-tests.factor deleted file mode 100644 index d048ad4be1..0000000000 --- a/basis/regexp/utils/utils-tests.factor +++ /dev/null @@ -1,4 +0,0 @@ -USING: regexp.utils tools.test ; -IN: regexp.utils.tests - -[ [ ] [ ] while-changes ] must-infer diff --git a/basis/regexp/utils/utils.factor b/basis/regexp/utils/utils.factor deleted file mode 100644 index d1266a6d98..0000000000 --- a/basis/regexp/utils/utils.factor +++ /dev/null @@ -1,42 +0,0 @@ -! Copyright (C) 2008 Doug Coleman. -! See http://factorcode.org/license.txt for BSD license. -USING: accessors arrays assocs io kernel math math.order -namespaces regexp.backend sequences unicode.categories -math.ranges fry combinators.short-circuit vectors ; -IN: regexp.utils - -: (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj ) - [ [ dup slip ] dip pick over call ] dip dupd = - [ 3drop ] [ (while-changes) ] if ; inline recursive - -: while-changes ( obj quot pred -- obj' ) - pick over call (while-changes) ; inline - -ERROR: bad-octal number ; -ERROR: bad-hex number ; -: check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ; -: check-hex ( hex -- hex ) dup number? [ bad-hex ] unless ; - -: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ; - -: hex-digit? ( n -- ? ) - { - [ decimal-digit? ] - [ CHAR: a CHAR: f between? ] - [ CHAR: A CHAR: F between? ] - } 1|| ; - -: punct? ( n -- ? ) - "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ; - -: c-identifier-char? ( ch -- ? ) - { [ alpha? ] [ CHAR: _ = ] } 1|| ; - -: java-blank? ( n -- ? ) - { - CHAR: \s CHAR: \t CHAR: \n - HEX: b HEX: 7 CHAR: \r - } member? ; - -: java-printable? ( n -- ? ) - [ [ alpha? ] [ punct? ] ] 1|| ;