Almost done with regexp cleanup

db4
Daniel Ehrenberg 2009-02-18 12:27:07 -06:00
parent 242cfb5c19
commit b8845cb87e
13 changed files with 271 additions and 319 deletions

View File

@ -0,0 +1,53 @@
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: kernel arrays accessors fry sequences ;
FROM: math.ranges => [a,b] ;
IN: regexp.ast
TUPLE: primitive-class class ;
C: <primitive-class> primitive-class
TUPLE: negation term ;
C: <negation> negation
TUPLE: from-to n m ;
C: <from-to> from-to
TUPLE: at-least n ;
C: <at-least> at-least
TUPLE: concatenation seq ;
C: <concatenation> concatenation
TUPLE: alternation seq ;
C: <alternation> alternation
TUPLE: star term ;
C: <star> star
TUPLE: with-options tree options ;
C: <with-options> with-options
TUPLE: options on off ;
C: <options> options
SINGLETONS: unix-lines dotall multiline comments case-insensitive
unicode-case reversed-regexp ;
: <maybe> ( term -- term' )
f <concatenation> 2array <alternation> ;
: <plus> ( term -- term' )
dup <star> 2array <concatenation> ;
: repetition ( n term -- term' )
<array> <concatenation> ;
GENERIC: <times> ( term times -- term' )
M: at-least <times>
n>> swap [ repetition ] [ <star> ] bi 2array <concatenation> ;
M: from-to <times>
[ n>> ] [ m>> ] bi [a,b] swap '[ _ repetition ] map <alternation> ;
: char-class ( ranges ? -- term )
[ <alternation> ] dip [ <negation> ] when ;

View File

@ -1,27 +0,0 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors hashtables kernel math vectors ;
IN: regexp.backend
TUPLE: regexp
raw
{ options hashtable }
stack
parse-tree
nfa-table
dfa-table
minimized-table
matchers
{ nfa-traversal-flags hashtable }
{ dfa-traversal-flags hashtable }
{ state integer }
{ new-states vector }
{ visited-states hashtable } ;
: reset-regexp ( regexp -- regexp )
0 >>state
V{ } clone >>stack
V{ } clone >>new-states
H{ } clone >>visited-states ;
SYMBOL: current-regexp

View File

@ -1,9 +1,31 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors kernel math math.order words regexp.utils USING: accessors kernel math math.order words
ascii unicode.categories combinators.short-circuit ; ascii unicode.categories combinators.short-circuit sequences ;
IN: regexp.classes IN: regexp.classes
: punct? ( ch -- ? )
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
: c-identifier-char? ( ch -- ? )
{ [ alpha? ] [ CHAR: _ = ] } 1|| ;
: java-blank? ( ch -- ? )
{
CHAR: \s CHAR: \t CHAR: \n
HEX: b HEX: 7 CHAR: \r
} member? ;
: java-printable? ( ch -- ? )
[ [ alpha? ] [ punct? ] ] 1|| ;
: hex-digit? ( ch -- ? )
{
[ CHAR: A CHAR: F between? ]
[ CHAR: a CHAR: f between? ]
[ CHAR: 0 CHAR: 9 between? ]
} 1|| ;
SINGLETONS: any-char any-char-no-nl SINGLETONS: any-char any-char-no-nl
letter-class LETTER-class Letter-class digit-class letter-class LETTER-class Letter-class digit-class
alpha-class non-newline-blank-class alpha-class non-newline-blank-class
@ -14,8 +36,8 @@ unmatchable-class terminator-class word-boundary-class ;
SINGLETONS: beginning-of-input beginning-of-line SINGLETONS: beginning-of-input beginning-of-line
end-of-input end-of-line ; end-of-input end-of-line ;
MIXIN: node TUPLE: range from to ;
TUPLE: character-class-range from to ; INSTANCE: character-class-range node C: <range> range
GENERIC: class-member? ( obj class -- ? ) GENERIC: class-member? ( obj class -- ? )
@ -23,7 +45,7 @@ M: t class-member? ( obj class -- ? ) 2drop f ;
M: integer class-member? ( obj class -- ? ) 2drop f ; M: integer class-member? ( obj class -- ? ) 2drop f ;
M: character-class-range class-member? ( obj class -- ? ) M: range class-member? ( obj class -- ? )
[ from>> ] [ to>> ] bi between? ; [ from>> ] [ to>> ] bi between? ;
M: any-char class-member? ( obj class -- ? ) M: any-char class-member? ( obj class -- ? )

View File

@ -2,83 +2,74 @@
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators fry kernel locals USING: accessors arrays assocs combinators fry kernel locals
math math.order regexp.nfa regexp.transition-tables sequences math math.order regexp.nfa regexp.transition-tables sequences
sets sorting vectors regexp.utils sequences.deep ; sets sorting vectors sequences.deep ;
USING: io prettyprint threads ; USING: io prettyprint threads ;
IN: regexp.dfa IN: regexp.dfa
: find-delta ( states transition regexp -- new-states ) : (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj )
nfa-table>> transitions>> [ [ dup slip ] dip pick over call ] dip dupd =
rot [ swap at at ] with with gather sift ; [ 3drop ] [ (while-changes) ] if ; inline recursive
: (find-epsilon-closure) ( states regexp -- new-states ) : while-changes ( obj quot pred -- obj' )
3dup nip call (while-changes) ; inline
: find-delta ( states transition nfa -- new-states )
transitions>> '[ _ swap _ at at ] gather sift ;
: (find-epsilon-closure) ( states nfa -- new-states )
eps swap find-delta ; eps swap find-delta ;
: find-epsilon-closure ( states regexp -- new-states ) : find-epsilon-closure ( states nfa -- new-states )
'[ dup _ (find-epsilon-closure) union ] [ length ] while-changes '[ dup _ (find-epsilon-closure) union ] [ length ] while-changes
natural-sort ; natural-sort ;
: find-closure ( states transition regexp -- new-states ) : find-closure ( states transition nfa -- new-states )
[ find-delta ] 2keep nip find-epsilon-closure ; [ find-delta ] keep find-epsilon-closure ;
: find-start-state ( regexp -- state ) : find-start-state ( nfa -- state )
[ nfa-table>> start-state>> 1vector ] keep find-epsilon-closure ; [ start-state>> 1vector ] keep find-epsilon-closure ;
: find-transitions ( seq1 regexp -- seq2 ) : find-transitions ( dfa-state nfa -- next-dfa-state )
nfa-table>> transitions>> transitions>>
[ at keys ] curry gather '[ _ at keys ] gather
eps swap remove ; eps swap remove ;
: add-todo-state ( state regexp -- ) : add-todo-state ( state visited-states new-states -- )
2dup visited-states>> key? [ 3dup drop key? [ 3drop ] [
2drop [ conjoin ] [ push ] bi-curry* bi
] [
[ visited-states>> conjoin ]
[ new-states>> push ] 2bi
] if ; ] if ;
: new-transitions ( regexp -- ) :: new-transitions ( nfa dfa new-states visited-states -- nfa dfa )
dup new-states>> [ new-states [ nfa dfa ] [
drop new-states pop :> state
] [ state nfa-table find-transitions
dupd pop dup pick find-transitions rot [| trans |
[ state trans nfa find-closure :> new-state
[ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep state visited-states new-state add-todo-state
[ swapd transition make-transition ] dip state new-state trans transition make-transition dfa add-transition
dfa-table>> add-transition ] each
] curry with each nfa dfa new-states visited-states new-transitions
new-transitions
] if-empty ; ] if-empty ;
: states ( hashtable -- array ) : states ( hashtable -- array )
[ keys ] [ keys ]
[ values [ values concat ] map concat append ] bi ; [ values [ values concat ] map concat append ] bi ;
: set-final-states ( regexp -- ) : set-final-states ( nfa dfa -- )
dup [
[ nfa-table>> final-states>> keys ] [ final-states>> keys ]
[ dfa-table>> transitions>> states ] bi [ transitions>> states ] bi*
[ intersects? ] with filter [ intersects? ] with filter
] [ final-states>> ] bi
swap dfa-table>> final-states>>
[ conjoin ] curry each ; [ conjoin ] curry each ;
: set-initial-state ( regexp -- ) : initialize-dfa ( nfa -- dfa )
dup <transition-table>
[ dfa-table>> ] [ find-start-state ] bi swap find-start-state >>start-state ;
[ >>start-state drop ] keep
1vector >>new-states drop ;
: set-traversal-flags ( regexp -- ) : construct-dfa ( nfa -- dfa )
dup dup initialize-dfa
[ nfa-traversal-flags>> ] dup start-state>> 1vector
[ dfa-table>> transitions>> keys ] bi H{ } clone
[ [ nip ] [ [ swap at ] with map concat ] 2bi ] with H{ } map>assoc new-transitions
>>dfa-traversal-flags drop ; [ set-final-states ] keep ;
: construct-dfa ( regexp -- )
{
[ set-initial-state ]
[ new-transitions ]
[ set-final-states ]
[ set-traversal-flags ]
} cleave ;

View File

@ -1,9 +1,10 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs grouping kernel regexp.backend USING: accessors arrays assocs grouping kernel
locals math namespaces regexp.parser sequences fry quotations locals math namespaces sequences fry quotations
math.order math.ranges vectors unicode.categories regexp.utils math.order math.ranges vectors unicode.categories
regexp.transition-tables words sets regexp.classes unicode.case.private ; regexp.transition-tables words sets
unicode.case.private regexp.ast regexp.classes ;
! This uses unicode.case.private for ch>upper and ch>lower ! This uses unicode.case.private for ch>upper and ch>lower
! but case-insensitive matching should be done by case-folding everything ! but case-insensitive matching should be done by case-folding everything
! before processing starts ! before processing starts
@ -13,34 +14,49 @@ ERROR: feature-is-broken feature ;
SYMBOL: negated? SYMBOL: negated?
: negate ( -- )
negated? [ not ] change ;
SINGLETON: eps SINGLETON: eps
: options ( -- obj ) current-regexp get options>> ; SYMBOL: option-stack
: option? ( obj -- ? ) options key? ; SYMBOL: combine-stack
: option-on ( obj -- ) options conjoin ; SYMBOL: state
: option-off ( obj -- ) options delete-at ; : next-state ( -- state )
state [ get ] [ inc ] bi ;
: next-state ( regexp -- state ) SYMBOL: nfa-table
[ state>> ] [ [ 1+ ] change-state drop ] bi ;
: set-start-state ( regexp -- ) : set-each ( keys value hashtable -- )
dup stack>> [ '[ _ swap _ set-at ] each ;
drop
] [ : options>hash ( options -- hashtable )
[ nfa-table>> ] [ pop first ] bi* >>start-state drop H{ } clone [
] if-empty ; [ [ on>> t ] dip set-each ]
[ [ off>> f ] dip set-each ] 2bi
] keep ;
: using-options ( options quot -- )
[ options>hash option-stack [ ?push ] change ] dip
call option-stack get pop* ; inline
: option? ( obj -- ? )
option-stack get assoc-stack ;
: set-start-state ( -- nfa-table )
nfa-table get
combine-stack get pop first >>start-state ;
GENERIC: nfa-node ( node -- ) GENERIC: nfa-node ( node -- )
:: add-simple-entry ( obj class -- ) :: add-simple-entry ( obj class -- )
[let* | regexp [ current-regexp get ] [let* | s0 [ next-state ]
s0 [ regexp next-state ] s1 [ next-state ]
s1 [ regexp next-state ] stack [ combine-stack get ]
stack [ regexp stack>> ] table [ nfa-table get ] |
table [ regexp nfa-table>> ] |
negated? get [ negated? get [
s0 f obj class make-transition table add-transition s0 f obj class make-transition table add-transition
s0 s1 <default-transition> table add-transition s0 s1 <default-transition> table add-transition
@ -51,9 +67,8 @@ GENERIC: nfa-node ( node -- )
t s1 table final-states>> set-at ] ; t s1 table final-states>> set-at ] ;
:: concatenate-nodes ( -- ) :: concatenate-nodes ( -- )
[let* | regexp [ current-regexp get ] [let* | stack [ combine-stack get ]
stack [ regexp stack>> ] table [ nfa-table get ]
table [ regexp nfa-table>> ]
s2 [ stack peek first ] s2 [ stack peek first ]
s3 [ stack pop second ] s3 [ stack pop second ]
s0 [ stack peek first ] s0 [ stack peek first ]
@ -63,15 +78,14 @@ GENERIC: nfa-node ( node -- )
s0 s3 2array stack push ] ; s0 s3 2array stack push ] ;
:: alternate-nodes ( -- ) :: alternate-nodes ( -- )
[let* | regexp [ current-regexp get ] [let* | stack [ combine-stack get ]
stack [ regexp stack>> ] table [ nfa-table get ]
table [ regexp nfa-table>> ]
s2 [ stack peek first ] s2 [ stack peek first ]
s3 [ stack pop second ] s3 [ stack pop second ]
s0 [ stack peek first ] s0 [ stack peek first ]
s1 [ stack pop second ] s1 [ stack pop second ]
s4 [ regexp next-state ] s4 [ next-state ]
s5 [ regexp next-state ] | s5 [ next-state ] |
s4 s0 eps <literal-transition> table add-transition s4 s0 eps <literal-transition> table add-transition
s4 s2 eps <literal-transition> table add-transition s4 s2 eps <literal-transition> table add-transition
s1 s5 eps <literal-transition> table add-transition s1 s5 eps <literal-transition> table add-transition
@ -83,13 +97,12 @@ GENERIC: nfa-node ( node -- )
M: star nfa-node ( node -- ) M: star nfa-node ( node -- )
term>> nfa-node term>> nfa-node
[let* | regexp [ current-regexp get ] [let* | stack [ combine-stack get ]
stack [ regexp stack>> ]
s0 [ stack peek first ] s0 [ stack peek first ]
s1 [ stack pop second ] s1 [ stack pop second ]
s2 [ regexp next-state ] s2 [ next-state ]
s3 [ regexp next-state ] s3 [ next-state ]
table [ regexp nfa-table>> ] | table [ nfa-table get ] |
s1 table final-states>> delete-at s1 table final-states>> delete-at
t s3 table final-states>> set-at t s3 table final-states>> set-at
s1 s0 eps <literal-transition> table add-transition s1 s0 eps <literal-transition> table add-transition
@ -99,58 +112,53 @@ M: star nfa-node ( node -- )
s2 s3 2array stack push ] ; s2 s3 2array stack push ] ;
M: concatenation nfa-node ( node -- ) M: concatenation nfa-node ( node -- )
seq>> seq>> [ eps literal-transition add-simple-entry ] [
reversed-regexp option? [ <reversed> ] when reversed-regexp option? [ <reversed> ] when
[ [ nfa-node ] each ] [ [ nfa-node ] each ]
[ length 1- [ concatenate-nodes ] times ] bi ; [ length 1- [ concatenate-nodes ] times ] bi
] if-empty ;
M: alternation nfa-node ( node -- ) M: alternation nfa-node ( node -- )
seq>> seq>>
[ [ nfa-node ] each ] [ [ nfa-node ] each ]
[ length 1- [ alternate-nodes ] times ] bi ; [ length 1- [ alternate-nodes ] times ] bi ;
M: constant nfa-node ( node -- ) M: integer nfa-node ( node -- )
case-insensitive option? [ case-insensitive option? [
dup char>> [ ch>lower ] [ ch>upper ] bi dup [ ch>lower ] [ ch>upper ] bi
2dup = [ 2dup = [
2drop 2drop
char>> literal-transition add-simple-entry literal-transition add-simple-entry
] [ ] [
[ literal-transition add-simple-entry ] bi@ [ literal-transition add-simple-entry ] bi@
alternate-nodes drop alternate-nodes drop
] if ] if
] [ ] [
char>> literal-transition add-simple-entry literal-transition add-simple-entry
] if ; ] if ;
M: word nfa-node ( node -- ) class-transition add-simple-entry ; M: primitive-class nfa-node ( node -- )
class>> dup
{ letter-class LETTER-class } member? case-insensitive option? and
[ drop Letter-class ] when
class-transition add-simple-entry ;
M: any-char nfa-node ( node -- ) M: any-char nfa-node ( node -- )
[ dotall option? ] dip any-char-no-nl ? [ dotall option? ] dip any-char-no-nl ?
class-transition add-simple-entry ; class-transition add-simple-entry ;
M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ; M: negation nfa-node ( node -- )
negate term>> nfa-node negate ;
M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ; M: range nfa-node ( node -- )
: choose-letter-class ( node -- node' )
case-insensitive option? Letter-class rot ? ;
M: letter-class nfa-node ( node -- )
choose-letter-class class-transition add-simple-entry ;
M: LETTER-class nfa-node ( node -- )
choose-letter-class class-transition add-simple-entry ;
M: character-class-range nfa-node ( node -- )
case-insensitive option? [ case-insensitive option? [
! This should be implemented for Unicode by case-folding ! This should be implemented for Unicode by case-folding
! the input and all strings in the regexp. ! the input and all strings in the regexp.
dup [ from>> ] [ to>> ] bi dup [ from>> ] [ to>> ] bi
2dup [ Letter? ] bi@ and [ 2dup [ Letter? ] bi@ and [
rot drop rot drop
[ [ ch>lower ] bi@ character-class-range boa ] [ [ ch>lower ] bi@ <range> ]
[ [ ch>upper ] bi@ character-class-range boa ] 2bi [ [ ch>upper ] bi@ <range> ] 2bi
[ class-transition add-simple-entry ] bi@ [ class-transition add-simple-entry ] bi@
alternate-nodes alternate-nodes
] [ ] [
@ -161,14 +169,15 @@ M: character-class-range nfa-node ( node -- )
class-transition add-simple-entry class-transition add-simple-entry
] if ; ] if ;
M: option nfa-node ( node -- ) M: with-options nfa-node ( node -- )
[ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if dup options>> [ tree>> nfa-node ] using-options ;
eps literal-transition add-simple-entry ;
: construct-nfa ( regexp -- ) : construct-nfa ( ast -- nfa-table )
[ [
reset-regexp negated? off
[ current-regexp set ] V{ } clone combine-stack set
[ parse-tree>> nfa-node ] 0 state set
[ set-start-state ] tri <transition-table> clone nfa-table set
nfa-node
set-start-state
] with-scope ; ] with-scope ;

View File

@ -1,28 +1,9 @@
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: peg.ebnf kernel math.parser sequences assocs arrays USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
combinators regexp.classes strings splitting peg locals ; combinators regexp.classes strings splitting peg locals accessors
regexp.ast ;
IN: regexp.parser IN: regexp.parser
TUPLE: range from to ;
TUPLE: char-class ranges ;
TUPLE: primitive-class class ;
TUPLE: not-char-class ranges ;
TUPLE: not-primitive-class class ;
TUPLE: from-to n m ;
TUPLE: at-least n ;
TUPLE: up-to n ;
TUPLE: exactly n ;
TUPLE: times expression times ;
TUPLE: concatenation seq ;
TUPLE: alternation seq ;
TUPLE: maybe term ;
TUPLE: star term ;
TUPLE: plus term ;
TUPLE: with-options tree options ;
TUPLE: ast ^? $? tree ;
SINGLETON: any-char
: allowed-char? ( ch -- ? ) : allowed-char? ( ch -- ? )
".()|[*+?" member? not ; ".()|[*+?" member? not ;
@ -64,21 +45,16 @@ ERROR: bad-class name ;
{ CHAR: e [ HEX: 1b ] } { CHAR: e [ HEX: 1b ] }
{ CHAR: \\ [ CHAR: \\ ] } { CHAR: \\ [ CHAR: \\ ] }
{ CHAR: w [ c-identifier-class primitive-class boa ] } { CHAR: w [ c-identifier-class <primitive-class> ] }
{ CHAR: W [ c-identifier-class not-primitive-class boa ] } { CHAR: W [ c-identifier-class <primitive-class> <negation> ] }
{ CHAR: s [ java-blank-class primitive-class boa ] } { CHAR: s [ java-blank-class <primitive-class> ] }
{ CHAR: S [ java-blank-class not-primitive-class boa ] } { CHAR: S [ java-blank-class <primitive-class> <negation> ] }
{ CHAR: d [ digit-class primitive-class boa ] } { CHAR: d [ digit-class <primitive-class> ] }
{ CHAR: D [ digit-class not-primitive-class boa ] } { CHAR: D [ digit-class <primitive-class> <negation> ] }
[ ] [ ]
} case ; } case ;
TUPLE: options on off ;
SINGLETONS: unix-lines dotall multiline comments case-insensitive
unicode-case reversed-regexp ;
: options-assoc ( -- assoc ) : options-assoc ( -- assoc )
H{ H{
{ CHAR: i case-insensitive } { CHAR: i case-insensitive }
@ -98,19 +74,30 @@ unicode-case reversed-regexp ;
options-assoc value-at ; options-assoc value-at ;
: parse-options ( on off -- options ) : parse-options ( on off -- options )
[ [ ch>option ] map ] bi@ options boa ; [ [ ch>option ] { } map-as ] bi@ <options> ;
! TODO: make range syntax better (negation, and, etc), : string>options ( string -- options )
! add syntax for various parenthized things, "-" split1 parse-options ;
: options>string ( options -- string )
[ on>> ] [ off>> ] bi
[ [ option>ch ] map ] bi@
[ "-" swap 3append ] unless-empty
"" like ;
! TODO: add syntax for various parenthized things,
! add greedy and nongreedy forms of matching ! add greedy and nongreedy forms of matching
! (once it's all implemented) ! (once it's all implemented)
EBNF: (parse-regexp) EBNF: parse-regexp
CharacterInBracket = !("}") Character CharacterInBracket = !("}") Character
Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class primitive-class boa ]] QuotedCharacter = !("\\E") .
| "P{" CharacterInBracket*:s "}" => [[ s >string name>class not-primitive-class boa ]]
Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> ]]
| "P{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> <negation> ]]
| "Q" QuotedCharacter*:s "\\E" => [[ s <concatenation> ]]
| "u" Character:a Character:b Character:c Character:d | "u" Character:a Character:b Character:c Character:d
=> [[ { a b c d } hex> ensure-number ]] => [[ { a b c d } hex> ensure-number ]]
| "x" Character:a Character:b | "x" Character:a Character:b
@ -119,30 +106,30 @@ Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class primitive-cla
=> [[ { a b c } oct> ensure-number ]] => [[ { a b c } oct> ensure-number ]]
| . => [[ lookup-escape ]] | . => [[ lookup-escape ]]
Character = "\\" Escape:e => [[ e ]] EscapeSequence = "\\" Escape:e => [[ e ]]
| . ?[ allowed-char? ]?
AnyRangeCharacter = Character | "[" Character = EscapeSequence | . ?[ allowed-char? ]?
AnyRangeCharacter = EscapeSequence | .
RangeCharacter = !("]") AnyRangeCharacter RangeCharacter = !("]") AnyRangeCharacter
Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]] Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
| RangeCharacter | RangeCharacter
StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]] StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
| AnyRangeCharacter | AnyRangeCharacter
Ranges = StartRange:s Range*:r => [[ r s prefix ]] Ranges = StartRange:s Range*:r => [[ r s prefix ]]
CharClass = "^" Ranges:e => [[ e not-char-class boa ]] CharClass = "^"?:n Ranges:e => [[ e n char-class ]]
| Ranges:e => [[ e char-class boa ]]
Options = [idmsux]* Options = [idmsux]*
Parenthized = "?:" Alternation:a => [[ a ]] Parenthized = "?:" Alternation:a => [[ a ]]
| "?" Options:on "-"? Options:off ":" Alternation:a | "?" Options:on "-"? Options:off ":" Alternation:a
=> [[ a on off parse-options with-options boa ]] => [[ a on off parse-options <with-options> ]]
| "?#" [^)]* => [[ ignore ]] | "?#" [^)]* => [[ f ]]
| Alternation | Alternation
Element = "(" Parenthized:p ")" => [[ p ]] Element = "(" Parenthized:p ")" => [[ p ]]
@ -152,32 +139,24 @@ Element = "(" Parenthized:p ")" => [[ p ]]
Number = (!(","|"}").)* => [[ string>number ensure-number ]] Number = (!(","|"}").)* => [[ string>number ensure-number ]]
Times = "," Number:n "}" => [[ n up-to boa ]] Times = "," Number:n "}" => [[ 0 n <from-to> ]]
| Number:n ",}" => [[ n at-least boa ]] | Number:n ",}" => [[ n <at-least> ]]
| Number:n "}" => [[ n exactly boa ]] | Number:n "}" => [[ n n <from-to> ]]
| "}" => [[ bad-number ]] | "}" => [[ bad-number ]]
| Number:n "," Number:m "}" => [[ n m from-to boa ]] | Number:n "," Number:m "}" => [[ n m <from-to> ]]
Repeated = Element:e "{" Times:t => [[ e t times boa ]] Repeated = Element:e "{" Times:t => [[ e t <times> ]]
| Element:e "?" => [[ e maybe boa ]] | Element:e "?" => [[ e <maybe> ]]
| Element:e "*" => [[ e star boa ]] | Element:e "*" => [[ e <star> ]]
| Element:e "+" => [[ e plus boa ]] | Element:e "+" => [[ e <plus> ]]
| Element | Element
Concatenation = Repeated*:r => [[ r concatenation boa ]] Concatenation = Repeated*:r => [[ r sift <concatenation> ]]
Alternation = Concatenation:c ("|" Concatenation)*:a Alternation = Concatenation:c ("|" Concatenation)*:a
=> [[ a empty? [ c ] [ a values c prefix alternation boa ] if ]] => [[ a empty? [ c ] [ a values c prefix <alternation> ] if ]]
End = !(.) End = !(.)
Main = Alternation End Main = Alternation End
;EBNF ;EBNF
: parse-regexp ( string -- regexp )
! Hack because I want $ allowable in regexps,
! but with special behavior at the end
! This fails if the regexp is stupid, though...
dup first CHAR: ^ = tuck [ rest ] when
dup peek CHAR: $ = tuck [ but-last ] when
(parse-regexp) ast boa ;

View File

@ -1,6 +1,6 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: kernel strings help.markup help.syntax regexp.backend ; USING: kernel strings help.markup help.syntax ;
IN: regexp IN: regexp
HELP: <regexp> HELP: <regexp>

View File

@ -189,8 +189,8 @@ IN: regexp-tests
[ t ] [ "SXY" "\\0123XY" <regexp> matches? ] unit-test [ t ] [ "SXY" "\\0123XY" <regexp> matches? ] unit-test
[ t ] [ "x" "\\x78" <regexp> matches? ] unit-test [ t ] [ "x" "\\x78" <regexp> matches? ] unit-test
[ f ] [ "y" "\\x78" <regexp> matches? ] unit-test [ f ] [ "y" "\\x78" <regexp> matches? ] unit-test
[ t ] [ "x" "\\u000078" <regexp> matches? ] unit-test [ t ] [ "x" "\\u0078" <regexp> matches? ] unit-test
[ f ] [ "y" "\\u000078" <regexp> matches? ] unit-test [ f ] [ "y" "\\u0078" <regexp> matches? ] unit-test
[ t ] [ "ab" "a+b" <regexp> matches? ] unit-test [ t ] [ "ab" "a+b" <regexp> matches? ] unit-test
[ f ] [ "b" "a+b" <regexp> matches? ] unit-test [ f ] [ "b" "a+b" <regexp> matches? ] unit-test
@ -317,16 +317,6 @@ IN: regexp-tests
! Bug in parsing word ! Bug in parsing word
[ t ] [ "a" R' a' matches? ] unit-test [ t ] [ "a" R' a' matches? ] unit-test
! Convert to lowercase until E
[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
! Convert to uppercase until E
[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
! [ t ] [ "a" R/ ^a/ matches? ] unit-test ! [ t ] [ "a" R/ ^a/ matches? ] unit-test
! [ f ] [ "\na" R/ ^a/ matches? ] unit-test ! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test ! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
@ -370,10 +360,10 @@ IN: regexp-tests
! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test ! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test ! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test ! [ t ] [ "a" R/ a$/m matches? ] unit-test
! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test ! [ t ] [ "a\n" R/ a$/m matches? ] unit-test
! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test ! [ t ] [ "a\r" R/ a$/m matches? ] unit-test
! [ t ] [ "a\r\n" "a$" R/ a$/m matches? ] unit-test ! [ t ] [ "a\r\n" R/ a$/m matches? ] unit-test
! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test ! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test ! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test

View File

@ -2,33 +2,15 @@
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors combinators kernel math sequences strings sets USING: accessors combinators kernel math sequences strings sets
assocs prettyprint.backend prettyprint.custom make lexer assocs prettyprint.backend prettyprint.custom make lexer
namespaces parser arrays fry regexp.backend regexp.utils namespaces parser arrays fry locals
regexp.parser regexp.nfa regexp.dfa regexp.traversal regexp.parser regexp.nfa regexp.dfa regexp.traversal
regexp.transition-tables splitting sorting ; regexp.transition-tables splitting sorting regexp.ast ;
IN: regexp IN: regexp
: default-regexp ( string -- regexp ) TUPLE: regexp raw options parse-tree dfa ;
regexp new
swap >>raw
<transition-table> >>nfa-table
<transition-table> >>dfa-table
<transition-table> >>minimized-table
H{ } clone >>nfa-traversal-flags
H{ } clone >>dfa-traversal-flags
H{ } clone >>options
H{ } clone >>matchers
reset-regexp ;
: construct-regexp ( regexp -- regexp' )
{
[ dup raw>> parse-regexp >>parse-tree drop ]
[ construct-nfa ]
[ construct-dfa ]
[ ]
} cleave ;
: (match) ( string regexp -- dfa-traverser ) : (match) ( string regexp -- dfa-traverser )
<dfa-traverser> do-match ; inline dfa>> <dfa-traverser> do-match ; inline
: match ( string regexp -- slice/f ) : match ( string regexp -- slice/f )
(match) return-match ; (match) return-match ;
@ -94,17 +76,17 @@ IN: regexp
{ "R| " "|" } { "R| " "|" }
} swap [ subseq? not nip ] curry assoc-find drop ; } swap [ subseq? not nip ] curry assoc-find drop ;
: string>options ( string -- options )
[ ch>option dup ] H{ } map>assoc ;
: options>string ( options -- string )
keys [ option>ch ] map natural-sort >string ;
PRIVATE> PRIVATE>
: <optioned-regexp> ( string option-string -- regexp ) :: <optioned-regexp> ( string options -- regexp )
[ default-regexp ] [ string>options ] bi* >>options string parse-regexp :> tree
construct-regexp ; options parse-options :> opt
tree opt <with-options> :> ast
regexp new
string >>raw
opt >>options
tree >>parse-tree
tree opt <with-options> construct-nfa construct-dfa >>dfa ;
: <regexp> ( string -- regexp ) "" <optioned-regexp> ; : <regexp> ( string -- regexp ) "" <optioned-regexp> ;

View File

@ -1,7 +1,7 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs fry hashtables kernel sequences USING: accessors arrays assocs fry hashtables kernel sequences
vectors regexp.utils ; vectors ;
IN: regexp.transition-tables IN: regexp.transition-tables
TUPLE: transition from to obj ; TUPLE: transition from to obj ;

View File

@ -1,8 +1,8 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors assocs combinators kernel math USING: accessors assocs combinators kernel math
quotations sequences regexp.parser regexp.classes fry arrays quotations sequences regexp.classes fry arrays
combinators.short-circuit regexp.utils prettyprint regexp.nfa ; combinators.short-circuit prettyprint regexp.nfa ;
IN: regexp.traversal IN: regexp.traversal
TUPLE: dfa-traverser TUPLE: dfa-traverser
@ -13,8 +13,7 @@ TUPLE: dfa-traverser
start-index current-index start-index current-index
matches ; matches ;
: <dfa-traverser> ( text regexp -- match ) : <dfa-traverser> ( text dfa -- match )
dfa-table>>
dfa-traverser new dfa-traverser new
swap [ start-state>> >>current-state ] [ >>dfa-table ] bi swap [ start-state>> >>current-state ] [ >>dfa-table ] bi
swap >>text swap >>text

View File

@ -1,4 +0,0 @@
USING: regexp.utils tools.test ;
IN: regexp.utils.tests
[ [ ] [ ] while-changes ] must-infer

View File

@ -1,42 +0,0 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs io kernel math math.order
namespaces regexp.backend sequences unicode.categories
math.ranges fry combinators.short-circuit vectors ;
IN: regexp.utils
: (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj )
[ [ dup slip ] dip pick over call ] dip dupd =
[ 3drop ] [ (while-changes) ] if ; inline recursive
: while-changes ( obj quot pred -- obj' )
pick over call (while-changes) ; inline
ERROR: bad-octal number ;
ERROR: bad-hex number ;
: check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ;
: check-hex ( hex -- hex ) dup number? [ bad-hex ] unless ;
: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ;
: hex-digit? ( n -- ? )
{
[ decimal-digit? ]
[ CHAR: a CHAR: f between? ]
[ CHAR: A CHAR: F between? ]
} 1|| ;
: punct? ( n -- ? )
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
: c-identifier-char? ( ch -- ? )
{ [ alpha? ] [ CHAR: _ = ] } 1|| ;
: java-blank? ( n -- ? )
{
CHAR: \s CHAR: \t CHAR: \n
HEX: b HEX: 7 CHAR: \r
} member? ;
: java-printable? ( n -- ? )
[ [ alpha? ] [ punct? ] ] 1|| ;