Almost done with regexp cleanup

db4
Daniel Ehrenberg 2009-02-18 12:27:07 -06:00
parent 242cfb5c19
commit b8845cb87e
13 changed files with 271 additions and 319 deletions

View File

@ -0,0 +1,53 @@
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: kernel arrays accessors fry sequences ;
FROM: math.ranges => [a,b] ;
IN: regexp.ast
TUPLE: primitive-class class ;
C: <primitive-class> primitive-class
TUPLE: negation term ;
C: <negation> negation
TUPLE: from-to n m ;
C: <from-to> from-to
TUPLE: at-least n ;
C: <at-least> at-least
TUPLE: concatenation seq ;
C: <concatenation> concatenation
TUPLE: alternation seq ;
C: <alternation> alternation
TUPLE: star term ;
C: <star> star
TUPLE: with-options tree options ;
C: <with-options> with-options
TUPLE: options on off ;
C: <options> options
SINGLETONS: unix-lines dotall multiline comments case-insensitive
unicode-case reversed-regexp ;
: <maybe> ( term -- term' )
f <concatenation> 2array <alternation> ;
: <plus> ( term -- term' )
dup <star> 2array <concatenation> ;
: repetition ( n term -- term' )
<array> <concatenation> ;
GENERIC: <times> ( term times -- term' )
M: at-least <times>
n>> swap [ repetition ] [ <star> ] bi 2array <concatenation> ;
M: from-to <times>
[ n>> ] [ m>> ] bi [a,b] swap '[ _ repetition ] map <alternation> ;
: char-class ( ranges ? -- term )
[ <alternation> ] dip [ <negation> ] when ;

View File

@ -1,27 +0,0 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors hashtables kernel math vectors ;
IN: regexp.backend
TUPLE: regexp
raw
{ options hashtable }
stack
parse-tree
nfa-table
dfa-table
minimized-table
matchers
{ nfa-traversal-flags hashtable }
{ dfa-traversal-flags hashtable }
{ state integer }
{ new-states vector }
{ visited-states hashtable } ;
: reset-regexp ( regexp -- regexp )
0 >>state
V{ } clone >>stack
V{ } clone >>new-states
H{ } clone >>visited-states ;
SYMBOL: current-regexp

View File

@ -1,9 +1,31 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors kernel math math.order words regexp.utils
ascii unicode.categories combinators.short-circuit ;
USING: accessors kernel math math.order words
ascii unicode.categories combinators.short-circuit sequences ;
IN: regexp.classes
: punct? ( ch -- ? )
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
: c-identifier-char? ( ch -- ? )
{ [ alpha? ] [ CHAR: _ = ] } 1|| ;
: java-blank? ( ch -- ? )
{
CHAR: \s CHAR: \t CHAR: \n
HEX: b HEX: 7 CHAR: \r
} member? ;
: java-printable? ( ch -- ? )
[ [ alpha? ] [ punct? ] ] 1|| ;
: hex-digit? ( ch -- ? )
{
[ CHAR: A CHAR: F between? ]
[ CHAR: a CHAR: f between? ]
[ CHAR: 0 CHAR: 9 between? ]
} 1|| ;
SINGLETONS: any-char any-char-no-nl
letter-class LETTER-class Letter-class digit-class
alpha-class non-newline-blank-class
@ -14,8 +36,8 @@ unmatchable-class terminator-class word-boundary-class ;
SINGLETONS: beginning-of-input beginning-of-line
end-of-input end-of-line ;
MIXIN: node
TUPLE: character-class-range from to ; INSTANCE: character-class-range node
TUPLE: range from to ;
C: <range> range
GENERIC: class-member? ( obj class -- ? )
@ -23,7 +45,7 @@ M: t class-member? ( obj class -- ? ) 2drop f ;
M: integer class-member? ( obj class -- ? ) 2drop f ;
M: character-class-range class-member? ( obj class -- ? )
M: range class-member? ( obj class -- ? )
[ from>> ] [ to>> ] bi between? ;
M: any-char class-member? ( obj class -- ? )

View File

@ -2,83 +2,74 @@
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators fry kernel locals
math math.order regexp.nfa regexp.transition-tables sequences
sets sorting vectors regexp.utils sequences.deep ;
sets sorting vectors sequences.deep ;
USING: io prettyprint threads ;
IN: regexp.dfa
: find-delta ( states transition regexp -- new-states )
nfa-table>> transitions>>
rot [ swap at at ] with with gather sift ;
: (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj )
[ [ dup slip ] dip pick over call ] dip dupd =
[ 3drop ] [ (while-changes) ] if ; inline recursive
: (find-epsilon-closure) ( states regexp -- new-states )
: while-changes ( obj quot pred -- obj' )
3dup nip call (while-changes) ; inline
: find-delta ( states transition nfa -- new-states )
transitions>> '[ _ swap _ at at ] gather sift ;
: (find-epsilon-closure) ( states nfa -- new-states )
eps swap find-delta ;
: find-epsilon-closure ( states regexp -- new-states )
: find-epsilon-closure ( states nfa -- new-states )
'[ dup _ (find-epsilon-closure) union ] [ length ] while-changes
natural-sort ;
: find-closure ( states transition regexp -- new-states )
[ find-delta ] 2keep nip find-epsilon-closure ;
: find-closure ( states transition nfa -- new-states )
[ find-delta ] keep find-epsilon-closure ;
: find-start-state ( regexp -- state )
[ nfa-table>> start-state>> 1vector ] keep find-epsilon-closure ;
: find-start-state ( nfa -- state )
[ start-state>> 1vector ] keep find-epsilon-closure ;
: find-transitions ( seq1 regexp -- seq2 )
nfa-table>> transitions>>
[ at keys ] curry gather
: find-transitions ( dfa-state nfa -- next-dfa-state )
transitions>>
'[ _ at keys ] gather
eps swap remove ;
: add-todo-state ( state regexp -- )
2dup visited-states>> key? [
2drop
] [
[ visited-states>> conjoin ]
[ new-states>> push ] 2bi
: add-todo-state ( state visited-states new-states -- )
3dup drop key? [ 3drop ] [
[ conjoin ] [ push ] bi-curry* bi
] if ;
: new-transitions ( regexp -- )
dup new-states>> [
drop
] [
dupd pop dup pick find-transitions rot
[
[ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep
[ swapd transition make-transition ] dip
dfa-table>> add-transition
] curry with each
new-transitions
:: new-transitions ( nfa dfa new-states visited-states -- nfa dfa )
new-states [ nfa dfa ] [
new-states pop :> state
state nfa-table find-transitions
[| trans |
state trans nfa find-closure :> new-state
state visited-states new-state add-todo-state
state new-state trans transition make-transition dfa add-transition
] each
nfa dfa new-states visited-states new-transitions
] if-empty ;
: states ( hashtable -- array )
[ keys ]
[ values [ values concat ] map concat append ] bi ;
: set-final-states ( regexp -- )
dup
[ nfa-table>> final-states>> keys ]
[ dfa-table>> transitions>> states ] bi
[ intersects? ] with filter
swap dfa-table>> final-states>>
: set-final-states ( nfa dfa -- )
[
[ final-states>> keys ]
[ transitions>> states ] bi*
[ intersects? ] with filter
] [ final-states>> ] bi
[ conjoin ] curry each ;
: set-initial-state ( regexp -- )
dup
[ dfa-table>> ] [ find-start-state ] bi
[ >>start-state drop ] keep
1vector >>new-states drop ;
: initialize-dfa ( nfa -- dfa )
<transition-table>
swap find-start-state >>start-state ;
: set-traversal-flags ( regexp -- )
dup
[ nfa-traversal-flags>> ]
[ dfa-table>> transitions>> keys ] bi
[ [ nip ] [ [ swap at ] with map concat ] 2bi ] with H{ } map>assoc
>>dfa-traversal-flags drop ;
: construct-dfa ( regexp -- )
{
[ set-initial-state ]
[ new-transitions ]
[ set-final-states ]
[ set-traversal-flags ]
} cleave ;
: construct-dfa ( nfa -- dfa )
dup initialize-dfa
dup start-state>> 1vector
H{ } clone
new-transitions
[ set-final-states ] keep ;

View File

@ -1,9 +1,10 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs grouping kernel regexp.backend
locals math namespaces regexp.parser sequences fry quotations
math.order math.ranges vectors unicode.categories regexp.utils
regexp.transition-tables words sets regexp.classes unicode.case.private ;
USING: accessors arrays assocs grouping kernel
locals math namespaces sequences fry quotations
math.order math.ranges vectors unicode.categories
regexp.transition-tables words sets
unicode.case.private regexp.ast regexp.classes ;
! This uses unicode.case.private for ch>upper and ch>lower
! but case-insensitive matching should be done by case-folding everything
! before processing starts
@ -13,34 +14,49 @@ ERROR: feature-is-broken feature ;
SYMBOL: negated?
: negate ( -- )
negated? [ not ] change ;
SINGLETON: eps
: options ( -- obj ) current-regexp get options>> ;
SYMBOL: option-stack
: option? ( obj -- ? ) options key? ;
SYMBOL: combine-stack
: option-on ( obj -- ) options conjoin ;
SYMBOL: state
: option-off ( obj -- ) options delete-at ;
: next-state ( -- state )
state [ get ] [ inc ] bi ;
: next-state ( regexp -- state )
[ state>> ] [ [ 1+ ] change-state drop ] bi ;
SYMBOL: nfa-table
: set-start-state ( regexp -- )
dup stack>> [
drop
] [
[ nfa-table>> ] [ pop first ] bi* >>start-state drop
] if-empty ;
: set-each ( keys value hashtable -- )
'[ _ swap _ set-at ] each ;
: options>hash ( options -- hashtable )
H{ } clone [
[ [ on>> t ] dip set-each ]
[ [ off>> f ] dip set-each ] 2bi
] keep ;
: using-options ( options quot -- )
[ options>hash option-stack [ ?push ] change ] dip
call option-stack get pop* ; inline
: option? ( obj -- ? )
option-stack get assoc-stack ;
: set-start-state ( -- nfa-table )
nfa-table get
combine-stack get pop first >>start-state ;
GENERIC: nfa-node ( node -- )
:: add-simple-entry ( obj class -- )
[let* | regexp [ current-regexp get ]
s0 [ regexp next-state ]
s1 [ regexp next-state ]
stack [ regexp stack>> ]
table [ regexp nfa-table>> ] |
[let* | s0 [ next-state ]
s1 [ next-state ]
stack [ combine-stack get ]
table [ nfa-table get ] |
negated? get [
s0 f obj class make-transition table add-transition
s0 s1 <default-transition> table add-transition
@ -51,9 +67,8 @@ GENERIC: nfa-node ( node -- )
t s1 table final-states>> set-at ] ;
:: concatenate-nodes ( -- )
[let* | regexp [ current-regexp get ]
stack [ regexp stack>> ]
table [ regexp nfa-table>> ]
[let* | stack [ combine-stack get ]
table [ nfa-table get ]
s2 [ stack peek first ]
s3 [ stack pop second ]
s0 [ stack peek first ]
@ -63,15 +78,14 @@ GENERIC: nfa-node ( node -- )
s0 s3 2array stack push ] ;
:: alternate-nodes ( -- )
[let* | regexp [ current-regexp get ]
stack [ regexp stack>> ]
table [ regexp nfa-table>> ]
[let* | stack [ combine-stack get ]
table [ nfa-table get ]
s2 [ stack peek first ]
s3 [ stack pop second ]
s0 [ stack peek first ]
s1 [ stack pop second ]
s4 [ regexp next-state ]
s5 [ regexp next-state ] |
s4 [ next-state ]
s5 [ next-state ] |
s4 s0 eps <literal-transition> table add-transition
s4 s2 eps <literal-transition> table add-transition
s1 s5 eps <literal-transition> table add-transition
@ -83,13 +97,12 @@ GENERIC: nfa-node ( node -- )
M: star nfa-node ( node -- )
term>> nfa-node
[let* | regexp [ current-regexp get ]
stack [ regexp stack>> ]
[let* | stack [ combine-stack get ]
s0 [ stack peek first ]
s1 [ stack pop second ]
s2 [ regexp next-state ]
s3 [ regexp next-state ]
table [ regexp nfa-table>> ] |
s2 [ next-state ]
s3 [ next-state ]
table [ nfa-table get ] |
s1 table final-states>> delete-at
t s3 table final-states>> set-at
s1 s0 eps <literal-transition> table add-transition
@ -99,58 +112,53 @@ M: star nfa-node ( node -- )
s2 s3 2array stack push ] ;
M: concatenation nfa-node ( node -- )
seq>>
reversed-regexp option? [ <reversed> ] when
[ [ nfa-node ] each ]
[ length 1- [ concatenate-nodes ] times ] bi ;
seq>> [ eps literal-transition add-simple-entry ] [
reversed-regexp option? [ <reversed> ] when
[ [ nfa-node ] each ]
[ length 1- [ concatenate-nodes ] times ] bi
] if-empty ;
M: alternation nfa-node ( node -- )
seq>>
[ [ nfa-node ] each ]
[ length 1- [ alternate-nodes ] times ] bi ;
M: constant nfa-node ( node -- )
M: integer nfa-node ( node -- )
case-insensitive option? [
dup char>> [ ch>lower ] [ ch>upper ] bi
dup [ ch>lower ] [ ch>upper ] bi
2dup = [
2drop
char>> literal-transition add-simple-entry
literal-transition add-simple-entry
] [
[ literal-transition add-simple-entry ] bi@
alternate-nodes drop
] if
] [
char>> literal-transition add-simple-entry
literal-transition add-simple-entry
] if ;
M: word nfa-node ( node -- ) class-transition add-simple-entry ;
M: primitive-class nfa-node ( node -- )
class>> dup
{ letter-class LETTER-class } member? case-insensitive option? and
[ drop Letter-class ] when
class-transition add-simple-entry ;
M: any-char nfa-node ( node -- )
[ dotall option? ] dip any-char-no-nl ?
class-transition add-simple-entry ;
M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
M: negation nfa-node ( node -- )
negate term>> nfa-node negate ;
M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
: choose-letter-class ( node -- node' )
case-insensitive option? Letter-class rot ? ;
M: letter-class nfa-node ( node -- )
choose-letter-class class-transition add-simple-entry ;
M: LETTER-class nfa-node ( node -- )
choose-letter-class class-transition add-simple-entry ;
M: character-class-range nfa-node ( node -- )
M: range nfa-node ( node -- )
case-insensitive option? [
! This should be implemented for Unicode by case-folding
! the input and all strings in the regexp.
dup [ from>> ] [ to>> ] bi
2dup [ Letter? ] bi@ and [
rot drop
[ [ ch>lower ] bi@ character-class-range boa ]
[ [ ch>upper ] bi@ character-class-range boa ] 2bi
[ [ ch>lower ] bi@ <range> ]
[ [ ch>upper ] bi@ <range> ] 2bi
[ class-transition add-simple-entry ] bi@
alternate-nodes
] [
@ -161,14 +169,15 @@ M: character-class-range nfa-node ( node -- )
class-transition add-simple-entry
] if ;
M: option nfa-node ( node -- )
[ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if
eps literal-transition add-simple-entry ;
M: with-options nfa-node ( node -- )
dup options>> [ tree>> nfa-node ] using-options ;
: construct-nfa ( regexp -- )
: construct-nfa ( ast -- nfa-table )
[
reset-regexp
[ current-regexp set ]
[ parse-tree>> nfa-node ]
[ set-start-state ] tri
negated? off
V{ } clone combine-stack set
0 state set
<transition-table> clone nfa-table set
nfa-node
set-start-state
] with-scope ;

View File

@ -1,28 +1,9 @@
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: peg.ebnf kernel math.parser sequences assocs arrays
combinators regexp.classes strings splitting peg locals ;
USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
combinators regexp.classes strings splitting peg locals accessors
regexp.ast ;
IN: regexp.parser
TUPLE: range from to ;
TUPLE: char-class ranges ;
TUPLE: primitive-class class ;
TUPLE: not-char-class ranges ;
TUPLE: not-primitive-class class ;
TUPLE: from-to n m ;
TUPLE: at-least n ;
TUPLE: up-to n ;
TUPLE: exactly n ;
TUPLE: times expression times ;
TUPLE: concatenation seq ;
TUPLE: alternation seq ;
TUPLE: maybe term ;
TUPLE: star term ;
TUPLE: plus term ;
TUPLE: with-options tree options ;
TUPLE: ast ^? $? tree ;
SINGLETON: any-char
: allowed-char? ( ch -- ? )
".()|[*+?" member? not ;
@ -64,21 +45,16 @@ ERROR: bad-class name ;
{ CHAR: e [ HEX: 1b ] }
{ CHAR: \\ [ CHAR: \\ ] }
{ CHAR: w [ c-identifier-class primitive-class boa ] }
{ CHAR: W [ c-identifier-class not-primitive-class boa ] }
{ CHAR: s [ java-blank-class primitive-class boa ] }
{ CHAR: S [ java-blank-class not-primitive-class boa ] }
{ CHAR: d [ digit-class primitive-class boa ] }
{ CHAR: D [ digit-class not-primitive-class boa ] }
{ CHAR: w [ c-identifier-class <primitive-class> ] }
{ CHAR: W [ c-identifier-class <primitive-class> <negation> ] }
{ CHAR: s [ java-blank-class <primitive-class> ] }
{ CHAR: S [ java-blank-class <primitive-class> <negation> ] }
{ CHAR: d [ digit-class <primitive-class> ] }
{ CHAR: D [ digit-class <primitive-class> <negation> ] }
[ ]
} case ;
TUPLE: options on off ;
SINGLETONS: unix-lines dotall multiline comments case-insensitive
unicode-case reversed-regexp ;
: options-assoc ( -- assoc )
H{
{ CHAR: i case-insensitive }
@ -98,19 +74,30 @@ unicode-case reversed-regexp ;
options-assoc value-at ;
: parse-options ( on off -- options )
[ [ ch>option ] map ] bi@ options boa ;
[ [ ch>option ] { } map-as ] bi@ <options> ;
! TODO: make range syntax better (negation, and, etc),
! add syntax for various parenthized things,
: string>options ( string -- options )
"-" split1 parse-options ;
: options>string ( options -- string )
[ on>> ] [ off>> ] bi
[ [ option>ch ] map ] bi@
[ "-" swap 3append ] unless-empty
"" like ;
! TODO: add syntax for various parenthized things,
! add greedy and nongreedy forms of matching
! (once it's all implemented)
EBNF: (parse-regexp)
EBNF: parse-regexp
CharacterInBracket = !("}") Character
Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class primitive-class boa ]]
| "P{" CharacterInBracket*:s "}" => [[ s >string name>class not-primitive-class boa ]]
QuotedCharacter = !("\\E") .
Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> ]]
| "P{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> <negation> ]]
| "Q" QuotedCharacter*:s "\\E" => [[ s <concatenation> ]]
| "u" Character:a Character:b Character:c Character:d
=> [[ { a b c d } hex> ensure-number ]]
| "x" Character:a Character:b
@ -119,30 +106,30 @@ Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class primitive-cla
=> [[ { a b c } oct> ensure-number ]]
| . => [[ lookup-escape ]]
Character = "\\" Escape:e => [[ e ]]
| . ?[ allowed-char? ]?
EscapeSequence = "\\" Escape:e => [[ e ]]
AnyRangeCharacter = Character | "["
Character = EscapeSequence | . ?[ allowed-char? ]?
AnyRangeCharacter = EscapeSequence | .
RangeCharacter = !("]") AnyRangeCharacter
Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]]
Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
| RangeCharacter
StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]]
StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
| AnyRangeCharacter
Ranges = StartRange:s Range*:r => [[ r s prefix ]]
CharClass = "^" Ranges:e => [[ e not-char-class boa ]]
| Ranges:e => [[ e char-class boa ]]
CharClass = "^"?:n Ranges:e => [[ e n char-class ]]
Options = [idmsux]*
Parenthized = "?:" Alternation:a => [[ a ]]
| "?" Options:on "-"? Options:off ":" Alternation:a
=> [[ a on off parse-options with-options boa ]]
| "?#" [^)]* => [[ ignore ]]
=> [[ a on off parse-options <with-options> ]]
| "?#" [^)]* => [[ f ]]
| Alternation
Element = "(" Parenthized:p ")" => [[ p ]]
@ -152,32 +139,24 @@ Element = "(" Parenthized:p ")" => [[ p ]]
Number = (!(","|"}").)* => [[ string>number ensure-number ]]
Times = "," Number:n "}" => [[ n up-to boa ]]
| Number:n ",}" => [[ n at-least boa ]]
| Number:n "}" => [[ n exactly boa ]]
Times = "," Number:n "}" => [[ 0 n <from-to> ]]
| Number:n ",}" => [[ n <at-least> ]]
| Number:n "}" => [[ n n <from-to> ]]
| "}" => [[ bad-number ]]
| Number:n "," Number:m "}" => [[ n m from-to boa ]]
| Number:n "," Number:m "}" => [[ n m <from-to> ]]
Repeated = Element:e "{" Times:t => [[ e t times boa ]]
| Element:e "?" => [[ e maybe boa ]]
| Element:e "*" => [[ e star boa ]]
| Element:e "+" => [[ e plus boa ]]
Repeated = Element:e "{" Times:t => [[ e t <times> ]]
| Element:e "?" => [[ e <maybe> ]]
| Element:e "*" => [[ e <star> ]]
| Element:e "+" => [[ e <plus> ]]
| Element
Concatenation = Repeated*:r => [[ r concatenation boa ]]
Concatenation = Repeated*:r => [[ r sift <concatenation> ]]
Alternation = Concatenation:c ("|" Concatenation)*:a
=> [[ a empty? [ c ] [ a values c prefix alternation boa ] if ]]
=> [[ a empty? [ c ] [ a values c prefix <alternation> ] if ]]
End = !(.)
Main = Alternation End
;EBNF
: parse-regexp ( string -- regexp )
! Hack because I want $ allowable in regexps,
! but with special behavior at the end
! This fails if the regexp is stupid, though...
dup first CHAR: ^ = tuck [ rest ] when
dup peek CHAR: $ = tuck [ but-last ] when
(parse-regexp) ast boa ;

View File

@ -1,6 +1,6 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: kernel strings help.markup help.syntax regexp.backend ;
USING: kernel strings help.markup help.syntax ;
IN: regexp
HELP: <regexp>

View File

@ -189,8 +189,8 @@ IN: regexp-tests
[ t ] [ "SXY" "\\0123XY" <regexp> matches? ] unit-test
[ t ] [ "x" "\\x78" <regexp> matches? ] unit-test
[ f ] [ "y" "\\x78" <regexp> matches? ] unit-test
[ t ] [ "x" "\\u000078" <regexp> matches? ] unit-test
[ f ] [ "y" "\\u000078" <regexp> matches? ] unit-test
[ t ] [ "x" "\\u0078" <regexp> matches? ] unit-test
[ f ] [ "y" "\\u0078" <regexp> matches? ] unit-test
[ t ] [ "ab" "a+b" <regexp> matches? ] unit-test
[ f ] [ "b" "a+b" <regexp> matches? ] unit-test
@ -317,16 +317,6 @@ IN: regexp-tests
! Bug in parsing word
[ t ] [ "a" R' a' matches? ] unit-test
! Convert to lowercase until E
[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
! Convert to uppercase until E
[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
! [ t ] [ "a" R/ ^a/ matches? ] unit-test
! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
@ -370,10 +360,10 @@ IN: regexp-tests
! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test
! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test
! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test
! [ t ] [ "a\r\n" "a$" R/ a$/m matches? ] unit-test
! [ t ] [ "a" R/ a$/m matches? ] unit-test
! [ t ] [ "a\n" R/ a$/m matches? ] unit-test
! [ t ] [ "a\r" R/ a$/m matches? ] unit-test
! [ t ] [ "a\r\n" R/ a$/m matches? ] unit-test
! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test

View File

@ -2,33 +2,15 @@
! See http://factorcode.org/license.txt for BSD license.
USING: accessors combinators kernel math sequences strings sets
assocs prettyprint.backend prettyprint.custom make lexer
namespaces parser arrays fry regexp.backend regexp.utils
namespaces parser arrays fry locals
regexp.parser regexp.nfa regexp.dfa regexp.traversal
regexp.transition-tables splitting sorting ;
regexp.transition-tables splitting sorting regexp.ast ;
IN: regexp
: default-regexp ( string -- regexp )
regexp new
swap >>raw
<transition-table> >>nfa-table
<transition-table> >>dfa-table
<transition-table> >>minimized-table
H{ } clone >>nfa-traversal-flags
H{ } clone >>dfa-traversal-flags
H{ } clone >>options
H{ } clone >>matchers
reset-regexp ;
: construct-regexp ( regexp -- regexp' )
{
[ dup raw>> parse-regexp >>parse-tree drop ]
[ construct-nfa ]
[ construct-dfa ]
[ ]
} cleave ;
TUPLE: regexp raw options parse-tree dfa ;
: (match) ( string regexp -- dfa-traverser )
<dfa-traverser> do-match ; inline
dfa>> <dfa-traverser> do-match ; inline
: match ( string regexp -- slice/f )
(match) return-match ;
@ -94,17 +76,17 @@ IN: regexp
{ "R| " "|" }
} swap [ subseq? not nip ] curry assoc-find drop ;
: string>options ( string -- options )
[ ch>option dup ] H{ } map>assoc ;
: options>string ( options -- string )
keys [ option>ch ] map natural-sort >string ;
PRIVATE>
: <optioned-regexp> ( string option-string -- regexp )
[ default-regexp ] [ string>options ] bi* >>options
construct-regexp ;
:: <optioned-regexp> ( string options -- regexp )
string parse-regexp :> tree
options parse-options :> opt
tree opt <with-options> :> ast
regexp new
string >>raw
opt >>options
tree >>parse-tree
tree opt <with-options> construct-nfa construct-dfa >>dfa ;
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;

View File

@ -1,7 +1,7 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs fry hashtables kernel sequences
vectors regexp.utils ;
vectors ;
IN: regexp.transition-tables
TUPLE: transition from to obj ;

View File

@ -1,8 +1,8 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors assocs combinators kernel math
quotations sequences regexp.parser regexp.classes fry arrays
combinators.short-circuit regexp.utils prettyprint regexp.nfa ;
quotations sequences regexp.classes fry arrays
combinators.short-circuit prettyprint regexp.nfa ;
IN: regexp.traversal
TUPLE: dfa-traverser
@ -13,8 +13,7 @@ TUPLE: dfa-traverser
start-index current-index
matches ;
: <dfa-traverser> ( text regexp -- match )
dfa-table>>
: <dfa-traverser> ( text dfa -- match )
dfa-traverser new
swap [ start-state>> >>current-state ] [ >>dfa-table ] bi
swap >>text

View File

@ -1,4 +0,0 @@
USING: regexp.utils tools.test ;
IN: regexp.utils.tests
[ [ ] [ ] while-changes ] must-infer

View File

@ -1,42 +0,0 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs io kernel math math.order
namespaces regexp.backend sequences unicode.categories
math.ranges fry combinators.short-circuit vectors ;
IN: regexp.utils
: (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj )
[ [ dup slip ] dip pick over call ] dip dupd =
[ 3drop ] [ (while-changes) ] if ; inline recursive
: while-changes ( obj quot pred -- obj' )
pick over call (while-changes) ; inline
ERROR: bad-octal number ;
ERROR: bad-hex number ;
: check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ;
: check-hex ( hex -- hex ) dup number? [ bad-hex ] unless ;
: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ;
: hex-digit? ( n -- ? )
{
[ decimal-digit? ]
[ CHAR: a CHAR: f between? ]
[ CHAR: A CHAR: F between? ]
} 1|| ;
: punct? ( n -- ? )
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
: c-identifier-char? ( ch -- ? )
{ [ alpha? ] [ CHAR: _ = ] } 1|| ;
: java-blank? ( n -- ? )
{
CHAR: \s CHAR: \t CHAR: \n
HEX: b HEX: 7 CHAR: \r
} member? ;
: java-printable? ( n -- ? )
[ [ alpha? ] [ punct? ] ] 1|| ;