Merge branch 'master' of git://factorcode.org/git/factor

db4
Doug Coleman 2009-03-09 11:32:09 -05:00
commit d6d98a0211
44 changed files with 1787 additions and 1349 deletions

View File

@ -10,7 +10,7 @@ IN: ascii
: LETTER? ( ch -- ? ) CHAR: A CHAR: Z between? ; inline : LETTER? ( ch -- ? ) CHAR: A CHAR: Z between? ; inline
: digit? ( ch -- ? ) CHAR: 0 CHAR: 9 between? ; inline : digit? ( ch -- ? ) CHAR: 0 CHAR: 9 between? ; inline
: printable? ( ch -- ? ) CHAR: \s CHAR: ~ between? ; inline : printable? ( ch -- ? ) CHAR: \s CHAR: ~ between? ; inline
: control? ( ch -- ? ) "\0\e\r\n\t\u000008\u00007f" member? ; inline : control? ( ch -- ? ) { [ 0 HEX: 1F between? ] [ HEX: 7F = ] } 1|| ; inline
: quotable? ( ch -- ? ) { [ printable? ] [ "\"\\" member? not ] } 1&& ; inline : quotable? ( ch -- ? ) { [ printable? ] [ "\"\\" member? not ] } 1&& ; inline
: Letter? ( ch -- ? ) { [ letter? ] [ LETTER? ] } 1|| ; inline : Letter? ( ch -- ? ) { [ letter? ] [ LETTER? ] } 1|| ; inline
: alpha? ( ch -- ? ) { [ Letter? ] [ digit? ] } 1|| ; inline : alpha? ( ch -- ? ) { [ Letter? ] [ digit? ] } 1|| ; inline
@ -20,4 +20,4 @@ IN: ascii
: >upper ( str -- upper ) [ ch>upper ] map ; : >upper ( str -- upper ) [ ch>upper ] map ;
HINTS: >lower string ; HINTS: >lower string ;
HINTS: >upper string ; HINTS: >upper string ;

View File

@ -14,5 +14,6 @@ USING: tools.test globs ;
[ f ] [ "foo.java" "*.{xml,txt}" glob-matches? ] unit-test [ f ] [ "foo.java" "*.{xml,txt}" glob-matches? ] unit-test
[ t ] [ "foo.txt" "*.{xml,txt}" glob-matches? ] unit-test [ t ] [ "foo.txt" "*.{xml,txt}" glob-matches? ] unit-test
[ t ] [ "foo.xml" "*.{xml,txt}" glob-matches? ] unit-test [ t ] [ "foo.xml" "*.{xml,txt}" glob-matches? ] unit-test
[ f ] [ "foo." "*.{,xml,txt}" glob-matches? ] unit-test [ f ] [ "foo." "*.{xml,txt}" glob-matches? ] unit-test
[ t ] [ "foo." "*.{,xml,txt}" glob-matches? ] unit-test
[ t ] [ "foo.{" "*.{" glob-matches? ] unit-test [ t ] [ "foo.{" "*.{" glob-matches? ] unit-test

View File

@ -1,42 +1,42 @@
! Copyright (C) 2007 Slava Pestov. ! Copyright (C) 2007, 2009 Slava Pestov, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: parser-combinators parser-combinators.regexp lists sequences kernel USING: sequences kernel regexp.combinators regexp.matchers strings unicode.case
promises strings unicode.case ; peg.ebnf regexp arrays ;
IN: globs IN: globs
<PRIVATE EBNF: <glob>
: 'char' ( -- parser ) Character = "\\" .:c => [[ c 1string <literal> ]]
[ ",*?" member? not ] satisfy ; | !(","|"}") . => [[ 1string <literal> ]]
: 'string' ( -- parser ) RangeCharacter = !("]") .
'char' <+> [ >lower token ] <@ ;
: 'escaped-char' ( -- parser ) Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <char-range> ]]
"\\" token any-char-parser &> [ 1token ] <@ ; | RangeCharacter => [[ 1string <literal> ]]
: 'escaped-string' ( -- parser ) StartRange = .:a "-" RangeCharacter:b => [[ a b <char-range> ]]
'string' 'escaped-char' <|> ; | . => [[ 1string <literal> ]]
DEFER: 'term' Ranges = StartRange:s Range*:r => [[ r s prefix ]]
: 'glob' ( -- parser ) CharClass = "^"?:n Ranges:e => [[ e <or> n [ <not> ] when ]]
'term' <*> [ <and-parser> ] <@ ;
: 'union' ( -- parser ) AlternationBody = Concatenation:c "," AlternationBody:a => [[ a c prefix ]]
'glob' "," token nonempty-list-of "{" "}" surrounded-by | Concatenation => [[ 1array ]]
[ <or-parser> ] <@ ;
LAZY: 'term' ( -- parser ) Element = "*" => [[ R/ .*/ ]]
'union' | "?" => [[ R/ ./ ]]
'character-class' <|> | "[" CharClass:c "]" => [[ c ]]
"?" token [ drop any-char-parser ] <@ <|> | "{" AlternationBody:b "}" => [[ b <or> ]]
"*" token [ drop any-char-parser <*> ] <@ <|> | Character
'escaped-string' <|> ;
PRIVATE> Concatenation = Element* => [[ <sequence> ]]
: <glob> ( string -- glob ) 'glob' just parse-1 just ; End = !(.)
Main = Concatenation End
;EBNF
: glob-matches? ( input glob -- ? ) : glob-matches? ( input glob -- ? )
[ >lower ] [ <glob> ] bi* parse nil? not ; [ >case-fold ] bi@ <glob> matches? ;

View File

@ -9,6 +9,8 @@ IN: http.tests
[ "text/html" utf8 ] [ "text/html; charset=UTF-8" parse-content-type ] unit-test [ "text/html" utf8 ] [ "text/html; charset=UTF-8" parse-content-type ] unit-test
[ "text/html" utf8 ] [ "text/html; charset=\"utf-8\"" parse-content-type ] unit-test
[ "application/octet-stream" binary ] [ "application/octet-stream" parse-content-type ] unit-test [ "application/octet-stream" binary ] [ "application/octet-stream" parse-content-type ] unit-test
: lf>crlf "\n" split "\r\n" join ; : lf>crlf "\n" split "\r\n" join ;

View File

@ -213,7 +213,10 @@ TUPLE: post-data data params content-type content-encoding ;
swap >>content-type ; swap >>content-type ;
: parse-content-type-attributes ( string -- attributes ) : parse-content-type-attributes ( string -- attributes )
" " split harvest [ "=" split1 [ >lower ] dip ] { } map>assoc ; " " split harvest [
"=" split1
[ >lower ] [ "\"" ?head drop "\"" ?tail drop ] bi*
] { } map>assoc ;
: parse-content-type ( content-type -- type encoding ) : parse-content-type ( content-type -- type encoding )
";" split1 ";" split1

View File

@ -0,0 +1,65 @@
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: kernel arrays accessors fry sequences regexp.classes ;
FROM: math.ranges => [a,b] ;
IN: regexp.ast
TUPLE: negation term ;
C: <negation> negation
TUPLE: from-to n m ;
C: <from-to> from-to
TUPLE: at-least n ;
C: <at-least> at-least
TUPLE: tagged-epsilon tag ;
C: <tagged-epsilon> tagged-epsilon
CONSTANT: epsilon T{ tagged-epsilon { tag t } }
TUPLE: concatenation first second ;
: <concatenation> ( seq -- concatenation )
[ epsilon ] [ unclip [ concatenation boa ] reduce ] if-empty ;
TUPLE: alternation first second ;
: <alternation> ( seq -- alternation )
unclip [ alternation boa ] reduce ;
TUPLE: star term ;
C: <star> star
TUPLE: with-options tree options ;
C: <with-options> with-options
TUPLE: options on off ;
C: <options> options
SINGLETONS: unix-lines dotall multiline comments case-insensitive
unicode-case reversed-regexp ;
: <maybe> ( term -- term' )
f <concatenation> 2array <alternation> ;
: <plus> ( term -- term' )
dup <star> 2array <concatenation> ;
: repetition ( n term -- term' )
<array> <concatenation> ;
GENERIC: <times> ( term times -- term' )
M: at-least <times>
n>> swap [ repetition ] [ <star> ] bi 2array <concatenation> ;
M: from-to <times>
[ n>> ] [ m>> ] bi [a,b] swap '[ _ repetition ] map <alternation> ;
: char-class ( ranges ? -- term )
[ <or-class> ] dip [ <not-class> ] when ;
TUPLE: lookahead term positive? ;
C: <lookahead> lookahead
TUPLE: lookbehind term positive? ;
C: <lookbehind> lookbehind

View File

@ -1,27 +0,0 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors hashtables kernel math vectors ;
IN: regexp.backend
TUPLE: regexp
raw
{ options hashtable }
stack
parse-tree
nfa-table
dfa-table
minimized-table
matchers
{ nfa-traversal-flags hashtable }
{ dfa-traversal-flags hashtable }
{ state integer }
{ new-states vector }
{ visited-states hashtable } ;
: reset-regexp ( regexp -- regexp )
0 >>state
V{ } clone >>stack
V{ } clone >>new-states
H{ } clone >>visited-states ;
SYMBOL: current-regexp

View File

@ -0,0 +1,58 @@
! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: regexp.classes tools.test arrays kernel ;
IN: regexp.classes.tests
! Class algebra
[ f ] [ { 1 2 } <and-class> ] unit-test
[ T{ or-class f { 2 1 } } ] [ { 1 2 } <or-class> ] unit-test
[ 3 ] [ { 1 2 } <and-class> 3 2array <or-class> ] unit-test
[ CHAR: A ] [ CHAR: A LETTER-class <primitive-class> 2array <and-class> ] unit-test
[ CHAR: A ] [ LETTER-class <primitive-class> CHAR: A 2array <and-class> ] unit-test
[ T{ primitive-class { class LETTER-class } } ] [ CHAR: A LETTER-class <primitive-class> 2array <or-class> ] unit-test
[ T{ primitive-class { class LETTER-class } } ] [ LETTER-class <primitive-class> CHAR: A 2array <or-class> ] unit-test
[ t ] [ { t 1 } <or-class> ] unit-test
[ t ] [ { 1 t } <or-class> ] unit-test
[ f ] [ { f 1 } <and-class> ] unit-test
[ f ] [ { 1 f } <and-class> ] unit-test
[ 1 ] [ { f 1 } <or-class> ] unit-test
[ 1 ] [ { 1 f } <or-class> ] unit-test
[ 1 ] [ { t 1 } <and-class> ] unit-test
[ 1 ] [ { 1 t } <and-class> ] unit-test
[ 1 ] [ 1 <not-class> <not-class> ] unit-test
[ 1 ] [ { 1 1 } <and-class> ] unit-test
[ 1 ] [ { 1 1 } <or-class> ] unit-test
[ t ] [ { t t } <or-class> ] unit-test
[ T{ primitive-class { class letter-class } } ] [ letter-class <primitive-class> dup 2array <and-class> ] unit-test
[ T{ primitive-class { class letter-class } } ] [ letter-class <primitive-class> dup 2array <or-class> ] unit-test
[ T{ or-class { seq { 2 3 1 } } } ] [ { 1 2 } <or-class> { 2 3 } <or-class> 2array <or-class> ] unit-test
[ T{ or-class { seq { 3 2 } } } ] [ { 2 3 } <or-class> 1 <not-class> 2array <and-class> ] unit-test
[ f ] [ t <not-class> ] unit-test
[ t ] [ f <not-class> ] unit-test
[ f ] [ 1 <not-class> 1 t replace-question ] unit-test
! Making classes into nested conditionals
[ V{ 1 2 3 4 } ] [ T{ and-class f { 1 T{ not-class f 2 } T{ or-class f { 3 4 } } 2 } } class>questions ] unit-test
[ { 3 } ] [ { { 3 t } } table>condition ] unit-test
[ { T{ primitive-class } } ] [ { { 1 t } { 2 T{ primitive-class } } } table>questions ] unit-test
[ { { 1 t } { 2 t } } ] [ { { 1 t } { 2 T{ primitive-class } } } T{ primitive-class } t answer ] unit-test
[ { { 1 t } } ] [ { { 1 t } { 2 T{ primitive-class } } } T{ primitive-class } f answer ] unit-test
[ T{ condition f T{ primitive-class } { 1 2 } { 1 } } ] [ { { 1 t } { 2 T{ primitive-class } } } table>condition ] unit-test
SYMBOL: foo
SYMBOL: bar
[ T{ condition f T{ primitive-class f bar } T{ condition f T{ primitive-class f foo } { 1 3 2 } { 1 3 } } T{ condition f T{ primitive-class f foo } { 1 2 } { 1 } } } ] [ { { 1 t } { 3 T{ primitive-class f bar } } { 2 T{ primitive-class f foo } } } table>condition ] unit-test
[ t ] [ foo <primitive-class> dup t replace-question ] unit-test
[ f ] [ foo <primitive-class> dup f replace-question ] unit-test
[ T{ primitive-class f foo } ] [ foo <primitive-class> bar <primitive-class> t replace-question ] unit-test
[ T{ primitive-class f foo } ] [ foo <primitive-class> bar <primitive-class> f replace-question ] unit-test
[ T{ primitive-class f foo } ] [ foo <primitive-class> bar <primitive-class> 2array <and-class> bar <primitive-class> t replace-question ] unit-test
[ T{ primitive-class f bar } ] [ foo <primitive-class> bar <primitive-class> 2array <and-class> foo <primitive-class> t replace-question ] unit-test
[ f ] [ foo <primitive-class> bar <primitive-class> 2array <and-class> foo <primitive-class> f replace-question ] unit-test
[ f ] [ foo <primitive-class> bar <primitive-class> 2array <and-class> bar <primitive-class> f replace-question ] unit-test
[ t ] [ foo <primitive-class> bar <primitive-class> 2array <or-class> bar <primitive-class> t replace-question ] unit-test
[ T{ primitive-class f foo } ] [ foo <primitive-class> bar <primitive-class> 2array <or-class> bar <primitive-class> f replace-question ] unit-test

View File

@ -1,7 +1,8 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors kernel math math.order words regexp.utils USING: accessors kernel math math.order words combinators locals
unicode.categories combinators.short-circuit ; ascii unicode.categories combinators.short-circuit sequences
fry macros arrays assocs sets classes ;
IN: regexp.classes IN: regexp.classes
SINGLETONS: any-char any-char-no-nl SINGLETONS: any-char any-char-no-nl
@ -11,19 +12,18 @@ ascii-class punctuation-class java-printable-class blank-class
control-character-class hex-digit-class java-blank-class c-identifier-class control-character-class hex-digit-class java-blank-class c-identifier-class
unmatchable-class terminator-class word-boundary-class ; unmatchable-class terminator-class word-boundary-class ;
SINGLETONS: beginning-of-input beginning-of-line SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ;
end-of-input end-of-line ;
MIXIN: node TUPLE: range from to ;
TUPLE: character-class-range from to ; INSTANCE: character-class-range node C: <range> range
GENERIC: class-member? ( obj class -- ? ) GENERIC: class-member? ( obj class -- ? )
M: t class-member? ( obj class -- ? ) 2drop f ; M: t class-member? ( obj class -- ? ) 2drop t ;
M: integer class-member? ( obj class -- ? ) 2drop f ; M: integer class-member? ( obj class -- ? ) = ;
M: character-class-range class-member? ( obj class -- ? ) M: range class-member? ( obj class -- ? )
[ from>> ] [ to>> ] bi between? ; [ from>> ] [ to>> ] bi between? ;
M: any-char class-member? ( obj class -- ? ) M: any-char class-member? ( obj class -- ? )
@ -47,16 +47,24 @@ M: ascii-class class-member? ( obj class -- ? )
M: digit-class class-member? ( obj class -- ? ) M: digit-class class-member? ( obj class -- ? )
drop digit? ; drop digit? ;
: c-identifier-char? ( ch -- ? )
{ [ alpha? ] [ CHAR: _ = ] } 1|| ;
M: c-identifier-class class-member? ( obj class -- ? ) M: c-identifier-class class-member? ( obj class -- ? )
drop drop c-identifier-char? ;
{ [ digit? ] [ Letter? ] [ CHAR: _ = ] } 1|| ;
M: alpha-class class-member? ( obj class -- ? ) M: alpha-class class-member? ( obj class -- ? )
drop alpha? ; drop alpha? ;
: punct? ( ch -- ? )
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
M: punctuation-class class-member? ( obj class -- ? ) M: punctuation-class class-member? ( obj class -- ? )
drop punct? ; drop punct? ;
: java-printable? ( ch -- ? )
{ [ alpha? ] [ punct? ] } 1|| ;
M: java-printable-class class-member? ( obj class -- ? ) M: java-printable-class class-member? ( obj class -- ? )
drop java-printable? ; drop java-printable? ;
@ -64,11 +72,24 @@ M: non-newline-blank-class class-member? ( obj class -- ? )
drop { [ blank? ] [ CHAR: \n = not ] } 1&& ; drop { [ blank? ] [ CHAR: \n = not ] } 1&& ;
M: control-character-class class-member? ( obj class -- ? ) M: control-character-class class-member? ( obj class -- ? )
drop control-char? ; drop control? ;
: hex-digit? ( ch -- ? )
{
[ CHAR: A CHAR: F between? ]
[ CHAR: a CHAR: f between? ]
[ CHAR: 0 CHAR: 9 between? ]
} 1|| ;
M: hex-digit-class class-member? ( obj class -- ? ) M: hex-digit-class class-member? ( obj class -- ? )
drop hex-digit? ; drop hex-digit? ;
: java-blank? ( ch -- ? )
{
CHAR: \s CHAR: \t CHAR: \n
HEX: b HEX: 7 CHAR: \r
} member? ;
M: java-blank-class class-member? ( obj class -- ? ) M: java-blank-class class-member? ( obj class -- ? )
drop java-blank? ; drop java-blank? ;
@ -76,16 +97,187 @@ M: unmatchable-class class-member? ( obj class -- ? )
2drop f ; 2drop f ;
M: terminator-class class-member? ( obj class -- ? ) M: terminator-class class-member? ( obj class -- ? )
drop { drop "\r\n\u000085\u002029\u002028" member? ;
[ CHAR: \r = ]
[ CHAR: \n = ]
[ CHAR: \u000085 = ]
[ CHAR: \u002028 = ]
[ CHAR: \u002029 = ]
} 1|| ;
M: beginning-of-line class-member? ( obj class -- ? ) M: ^ class-member? ( obj class -- ? )
2drop f ; 2drop f ;
M: end-of-line class-member? ( obj class -- ? ) M: $ class-member? ( obj class -- ? )
2drop f ; 2drop f ;
M: f class-member? 2drop f ;
TUPLE: primitive-class class ;
C: <primitive-class> primitive-class
TUPLE: or-class seq ;
TUPLE: not-class class ;
TUPLE: and-class seq ;
GENERIC: combine-and ( class1 class2 -- combined ? )
: replace-if-= ( object object -- object ? )
over = ;
M: object combine-and replace-if-= ;
M: t combine-and
drop t ;
M: f combine-and
nip t ;
M: not-class combine-and
class>> 2dup = [ 2drop f t ] [
dup integer? [
2dup swap class-member?
[ 2drop f f ]
[ drop t ] if
] [ 2drop f f ] if
] if ;
M: integer combine-and
swap 2dup class-member? [ drop t ] [ 2drop f t ] if ;
GENERIC: combine-or ( class1 class2 -- combined ? )
M: object combine-or replace-if-= ;
M: t combine-or
nip t ;
M: f combine-or
drop t ;
M: not-class combine-or
class>> = [ t t ] [ f f ] if ;
M: integer combine-or
2dup swap class-member? [ drop t ] [ 2drop f f ] if ;
: flatten ( seq class -- newseq )
'[ dup _ instance? [ seq>> ] [ 1array ] if ] map concat ; inline
: try-combine ( elt1 elt2 quot -- combined/f ? )
3dup call [ [ 3drop ] dip t ] [ drop swapd call ] if ; inline
:: prefix-combining ( seq elt quot: ( elt1 elt2 -- combined/f ? ) -- newseq )
f :> combined!
seq [ elt quot try-combine swap combined! ] find drop
[ seq remove-nth combined prefix ]
[ seq elt prefix ] if* ; inline
:: combine ( seq quot: ( elt1 elt2 -- combined/f ? ) empty class -- newseq )
seq class flatten
{ } [ quot prefix-combining ] reduce
dup length {
{ 0 [ drop empty ] }
{ 1 [ first ] }
[ drop class new swap >>seq ]
} case ; inline
: <and-class> ( seq -- class )
[ combine-and ] t and-class combine ;
M: and-class class-member?
seq>> [ class-member? ] with all? ;
: <or-class> ( seq -- class )
[ combine-or ] f or-class combine ;
M: or-class class-member?
seq>> [ class-member? ] with any? ;
GENERIC: <not-class> ( class -- inverse )
M: object <not-class>
not-class boa ;
M: not-class <not-class>
class>> ;
M: and-class <not-class>
seq>> [ <not-class> ] map <or-class> ;
M: or-class <not-class>
seq>> [ <not-class> ] map <and-class> ;
M: t <not-class> drop f ;
M: f <not-class> drop t ;
M: not-class class-member?
class>> class-member? not ;
M: primitive-class class-member?
class>> class-member? ;
UNION: class primitive-class not-class or-class and-class range ;
TUPLE: condition question yes no ;
C: <condition> condition
GENERIC# replace-question 2 ( class from to -- new-class )
M:: object replace-question ( class from to -- new-class )
class from = to class ? ;
: replace-compound ( class from to -- seq )
[ seq>> ] 2dip '[ _ _ replace-question ] map ;
M: and-class replace-question
replace-compound <and-class> ;
M: or-class replace-question
replace-compound <or-class> ;
M: not-class replace-question
[ class>> ] 2dip replace-question <not-class> ;
: answer ( table question answer -- new-table )
'[ _ _ replace-question ] assoc-map
[ nip ] assoc-filter ;
: answers ( table questions answer -- new-table )
'[ _ answer ] each ;
DEFER: make-condition
: (make-condition) ( table questions question -- condition )
[ 2nip ]
[ swap [ t answer ] dip make-condition ]
[ swap [ f answer ] dip make-condition ] 3tri
2dup = [ 2nip ] [ <condition> ] if ;
: make-condition ( table questions -- condition )
[ keys ] [ unclip (make-condition) ] if-empty ;
GENERIC: class>questions ( class -- questions )
: compound-questions ( class -- questions ) seq>> [ class>questions ] gather ;
M: or-class class>questions compound-questions ;
M: and-class class>questions compound-questions ;
M: not-class class>questions class>> class>questions ;
M: object class>questions 1array ;
: table>questions ( table -- questions )
values [ class>questions ] gather >array t swap remove ;
: table>condition ( table -- condition )
! input table is state => class
>alist dup table>questions make-condition ;
: condition-map ( condition quot: ( obj -- obj' ) -- new-condition )
over condition? [
[ [ question>> ] [ yes>> ] [ no>> ] tri ] dip
'[ _ condition-map ] bi@ <condition>
] [ call ] if ; inline recursive
: condition-states ( condition -- states )
dup condition? [
[ yes>> ] [ no>> ] bi
[ condition-states ] bi@ append prune
] [ 1array ] if ;
: condition-at ( condition assoc -- new-condition )
'[ _ at ] condition-map ;

View File

@ -0,0 +1 @@
Daniel Ehrenberg

View File

@ -0,0 +1,54 @@
! Copyright (C) 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: help.syntax help.markup regexp strings ;
IN: regexp.combinators
ABOUT: "regexp.combinators"
ARTICLE: "regexp.combinators" "Regular expression combinators"
"The " { $vocab-link "regexp.combinators" } " vocabulary defines combinators which can be used to build up regular expressions to match strings. This is in addition to the traditional syntax defined in the " { $vocab-link "regexp" } " vocabulary."
{ $subsection <literal> }
{ $subsection <nothing> }
{ $subsection <or> }
{ $subsection <and> }
{ $subsection <not> }
{ $subsection <sequence> }
{ $subsection <zero-or-more> }
{ $subsection <one-or-more> }
{ $subsection <option> } ;
HELP: <literal>
{ $values { "string" string } { "regexp" regexp } }
{ $description "Creates a regular expression which matches the given literal string." } ;
HELP: <nothing>
{ $values { "value" regexp } }
{ $description "The empty regular language." } ;
HELP: <or>
{ $values { "regexps" "a sequence of regular expressions" } { "disjunction" regexp } }
{ $description "Creates a new regular expression which matches the union of what elements of the sequence match." } ;
HELP: <and>
{ $values { "regexps" "a sequence of regular expressions" } { "conjunction" regexp } }
{ $description "Creates a new regular expression which matches the intersection of what elements of the sequence match." } ;
HELP: <sequence>
{ $values { "regexps" "a sequence of regular expressions" } { "regexp" regexp } }
{ $description "Creates a new regular expression which matches strings that match each element of the sequence in order." } ;
HELP: <not>
{ $values { "regexp" regexp } { "not-regexp" regexp } }
{ $description "Creates a new regular expression which matches everything that the given regexp does not match." } ;
HELP: <one-or-more>
{ $values { "regexp" regexp } { "regexp+" regexp } }
{ $description "Creates a new regular expression which matches one or more copies of the given regexp." } ;
HELP: <option>
{ $values { "regexp" regexp } { "regexp?" regexp } }
{ $description "Creates a new regular expression which matches zero or one copies of the given regexp." } ;
HELP: <zero-or-more>
{ $values { "regexp" regexp } { "regexp*" regexp } }
{ $description "Creates a new regular expression which matches zero or more copies of the given regexp." } ;

View File

@ -0,0 +1,32 @@
! Copyright (C) 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: regexp.combinators tools.test regexp kernel sequences regexp.matchers ;
IN: regexp.combinators.tests
: strings ( -- regexp )
{ "foo" "bar" "baz" } <any-of> ;
[ t t t ] [ "foo" "bar" "baz" [ strings matches? ] tri@ ] unit-test
[ f f f ] [ "food" "ibar" "ba" [ strings matches? ] tri@ ] unit-test
USE: multiline
/*
! Why is conjuction broken?
: conj ( -- regexp )
{ R' .*a' R' b.*' } <and> ;
[ t ] [ "bljhasflsda" conj matches? ] unit-test
[ f ] [ "bsdfdfs" conj matches? ] unit-test ! why does this fail?
[ f ] [ "fsfa" conj matches? ] unit-test
[ f ] [ "bljhasflsda" conj <not> matches? ] unit-test
[ t ] [ "bsdfdfs" conj <not> matches? ] unit-test
[ t ] [ "fsfa" conj <not> matches? ] unit-test
*/
[ f f ] [ "" "hi" [ <nothing> matches? ] bi@ ] unit-test
[ t t ] [ "" "hi" [ <nothing> <not> matches? ] bi@ ] unit-test
[ { t t t f } ] [ { "" "a" "aaaaa" "aab" } [ "a" <literal> <zero-or-more> matches? ] map ] unit-test
[ { f t t f } ] [ { "" "a" "aaaaa" "aab" } [ "a" <literal> <one-or-more> matches? ] map ] unit-test
[ { t t f f } ] [ { "" "a" "aaaaa" "aab" } [ "a" <literal> <option> matches? ] map ] unit-test

View File

@ -0,0 +1,56 @@
! Copyright (C) 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: regexp sequences kernel regexp.negation regexp.ast
accessors fry regexp.classes ;
IN: regexp.combinators
<PRIVATE
: modify-regexp ( regexp raw-quot tree-quot -- new-regexp )
[ '[ raw>> @ ] ]
[ '[ parse-tree>> @ ] ] bi* bi
make-regexp ; inline
PRIVATE>
CONSTANT: <nothing> R/ (?~.*)/
: <literal> ( string -- regexp )
[ "\\Q" "\\E" surround ] [ <concatenation> ] bi make-regexp ; foldable
: <char-range> ( char1 char2 -- regexp )
[ [ "[" "-" surround ] [ "]" append ] bi* append ]
[ <range> ]
2bi make-regexp ;
: <or> ( regexps -- disjunction )
[ [ raw>> "(" ")" surround ] map "|" join ]
[ [ parse-tree>> ] map <alternation> ] bi
make-regexp ; foldable
: <any-of> ( strings -- regexp )
[ <literal> ] map <or> ; foldable
: <sequence> ( regexps -- regexp )
[ [ raw>> ] map concat ]
[ [ parse-tree>> ] map <concatenation> ] bi
make-regexp ; foldable
: <not> ( regexp -- not-regexp )
[ "(?~" ")" surround ]
[ <negation> ] modify-regexp ; foldable
: <and> ( regexps -- conjunction )
[ <not> ] map <or> <not> ; foldable
: <zero-or-more> ( regexp -- regexp* )
[ "(" ")*" surround ]
[ <star> ] modify-regexp ; foldable
: <one-or-more> ( regexp -- regexp+ )
[ "(" ")+" surround ]
[ <plus> ] modify-regexp ; foldable
: <option> ( regexp -- regexp? )
[ "(" ")?" surround ]
[ <maybe> ] modify-regexp ; foldable

View File

@ -0,0 +1 @@
Combinators for creating regular expressions

View File

@ -0,0 +1 @@
parsing

View File

@ -0,0 +1,153 @@
! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: regexp.classes kernel sequences regexp.negation
quotations regexp.minimize assocs fry math locals combinators
accessors words compiler.units kernel.private strings
sequences.private arrays regexp.matchers call namespaces
regexp.transition-tables combinators.short-circuit ;
IN: regexp.compiler
GENERIC: question>quot ( question -- quot )
<PRIVATE
SYMBOL: shortest?
SYMBOL: backwards?
M: t question>quot drop [ 2drop t ] ;
M: beginning-of-input question>quot
drop [ drop zero? ] ;
M: end-of-input question>quot
drop [ length = ] ;
M: end-of-file question>quot
drop [
{
[ length swap - 2 <= ]
[ swap tail { "\n" "\r\n" "\r" "" } member? ]
} 2&&
] ;
M: $ question>quot
drop [ { [ length = ] [ ?nth "\r\n" member? ] } 2|| ] ;
M: ^ question>quot
drop [ { [ drop zero? ] [ [ 1- ] dip ?nth "\r\n" member? ] } 2|| ] ;
: (execution-quot) ( next-state -- quot )
! The conditions here are for lookaround and anchors, etc
dup condition? [
[ question>> question>quot ] [ yes>> ] [ no>> ] tri
[ (execution-quot) ] bi@
'[ 2dup @ _ _ if ]
] [ '[ _ execute ] ] if ;
: execution-quot ( next-state -- quot )
dup sequence? [ first ] when
(execution-quot) ;
TUPLE: box contents ;
C: <box> box
: condition>quot ( condition -- quot )
! Conditions here are for different classes
dup condition? [
[ question>> ] [ yes>> ] [ no>> ] tri
[ condition>quot ] bi@
'[ dup _ class-member? _ _ if ]
] [
contents>>
[ [ 3drop ] ] [ execution-quot '[ drop @ ] ] if-empty
] if ;
: non-literals>dispatch ( literals non-literals -- quot )
[ swap ] assoc-map ! we want state => predicate, and get the opposite as input
swap keys f answers
table>condition [ <box> ] condition-map condition>quot ;
: literals>cases ( literal-transitions -- case-body )
[ execution-quot ] assoc-map ;
: expand-one-or ( or-class transition -- alist )
[ seq>> ] dip '[ _ 2array ] map ;
: expand-or ( alist -- new-alist )
[
first2 over or-class?
[ expand-one-or ] [ 2array 1array ] if
] map concat ;
: split-literals ( transitions -- case default )
>alist expand-or [ first integer? ] partition
[ [ literals>cases ] keep ] dip non-literals>dispatch ;
:: step ( last-match index str quot final? direction -- last-index/f )
final? index last-match ?
index str bounds-check? [
index direction + str
index str nth-unsafe
quot call
] when ; inline
: direction ( -- n )
backwards? get -1 1 ? ;
: transitions>quot ( transitions final-state? -- quot )
dup shortest? get and [ 2drop [ drop nip ] ] [
[ split-literals swap case>quot ] dip direction
'[ { array-capacity string } declare _ _ _ step ]
] if ;
: word>quot ( word dfa -- quot )
[ transitions>> at ]
[ final-states>> key? ] 2bi
transitions>quot ;
: states>code ( words dfa -- )
'[
[
dup _ word>quot
(( last-match index string -- ? ))
define-declared
] each
] with-compilation-unit ;
: states>words ( dfa -- words dfa )
dup transitions>> keys [ gensym ] H{ } map>assoc
[ transitions-at ]
[ values ]
bi swap ;
: dfa>word ( dfa -- word )
states>words [ states>code ] keep start-state>> ;
: check-string ( string -- string )
! Make this configurable
dup string? [ "String required" throw ] unless ;
: setup-regexp ( start-index string -- f start-index string )
[ f ] [ >fixnum ] [ check-string ] tri* ; inline
PRIVATE>
! The quotation returned is ( start-index string -- i/f )
: dfa>quotation ( dfa -- quot )
dfa>word execution-quot '[ setup-regexp @ ] ;
: dfa>shortest-quotation ( dfa -- quot )
t shortest? [ dfa>quotation ] with-variable ;
: dfa>reverse-quotation ( dfa -- quot )
t backwards? [ dfa>quotation ] with-variable ;
: dfa>reverse-shortest-quotation ( dfa -- quot )
t backwards? [ dfa>shortest-quotation ] with-variable ;
TUPLE: quot-matcher quot ;
C: <quot-matcher> quot-matcher
M: quot-matcher match-index-from
quot>> call( index string -- i/f ) ;

View File

@ -0,0 +1,3 @@
USING: regexp.dfa tools.test ;
IN: regexp.dfa.tests

View File

@ -1,84 +1,84 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators fry kernel locals USING: accessors arrays assocs combinators fry kernel locals
math math.order regexp.nfa regexp.transition-tables sequences math math.order regexp.nfa regexp.transition-tables sequences
sets sorting vectors regexp.utils sequences.deep ; sets sorting vectors regexp.ast regexp.classes ;
USING: io prettyprint threads ;
IN: regexp.dfa IN: regexp.dfa
: find-delta ( states transition regexp -- new-states ) : find-delta ( states transition nfa -- new-states )
nfa-table>> transitions>> transitions>> '[ _ swap _ at at ] gather sift ;
rot [ swap at at ] with with gather sift ;
: (find-epsilon-closure) ( states regexp -- new-states ) :: epsilon-loop ( state table nfa question -- )
eps swap find-delta ; state table at :> old-value
old-value question 2array <or-class> :> new-question
new-question old-value = [
new-question state table set-at
state nfa transitions>> at
[ drop tagged-epsilon? ] assoc-filter
[| trans to |
to [
table nfa
trans tag>> new-question 2array <and-class>
epsilon-loop
] each
] assoc-each
] unless ;
: find-epsilon-closure ( states regexp -- new-states ) : epsilon-table ( states nfa -- table )
'[ dup _ (find-epsilon-closure) union ] [ length ] while-changes [ H{ } clone tuck ] dip
natural-sort ; '[ _ _ t epsilon-loop ] each ;
: find-closure ( states transition regexp -- new-states ) : find-epsilon-closure ( states nfa -- dfa-state )
[ find-delta ] 2keep nip find-epsilon-closure ; epsilon-table table>condition ;
: find-start-state ( regexp -- state ) : find-closure ( states transition nfa -- new-states )
[ nfa-table>> start-state>> 1vector ] keep find-epsilon-closure ; [ find-delta ] keep find-epsilon-closure ;
: find-transitions ( seq1 regexp -- seq2 ) : find-start-state ( nfa -- state )
nfa-table>> transitions>> [ start-state>> 1array ] keep find-epsilon-closure ;
[ at keys ] curry gather
eps swap remove ;
: add-todo-state ( state regexp -- ) : find-transitions ( dfa-state nfa -- next-dfa-state )
2dup visited-states>> key? [ transitions>>
2drop '[ _ at keys [ condition-states ] map concat ] gather
] [ [ tagged-epsilon? not ] filter ;
[ visited-states>> conjoin ]
[ new-states>> push ] 2bi : add-todo-state ( state visited-states new-states -- )
3dup drop key? [ 3drop ] [
[ conjoin ] [ push ] bi-curry* bi
] if ; ] if ;
: new-transitions ( regexp -- ) : add-todo-states ( state/condition visited-states new-states -- )
dup new-states>> [ [ condition-states ] 2dip
drop '[ _ _ add-todo-state ] each ;
] [
dupd pop dup pick find-transitions rot :: new-transitions ( nfa dfa new-states visited-states -- nfa dfa )
[ new-states [ nfa dfa ] [
[ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep pop :> state
[ swapd transition make-transition ] dip state dfa transitions>> maybe-initialize-key
dfa-table>> add-transition state nfa find-transitions
] curry with each [| trans |
new-transitions state trans nfa find-closure :> new-state
new-state visited-states new-states add-todo-states
state new-state trans dfa set-transition
] each
nfa dfa new-states visited-states new-transitions
] if-empty ; ] if-empty ;
: states ( hashtable -- array ) : set-final-states ( nfa dfa -- )
[ keys ] [
[ values [ values concat ] map concat append ] bi ; [ final-states>> keys ]
[ transitions>> keys ] bi*
[ intersects? ] with filter
unique
] keep (>>final-states) ;
: set-final-states ( regexp -- ) : initialize-dfa ( nfa -- dfa )
dup <transition-table>
[ nfa-table>> final-states>> keys ] swap find-start-state >>start-state ;
[ dfa-table>> transitions>> states ] bi
[ intersects? ] with filter
swap dfa-table>> final-states>> : construct-dfa ( nfa -- dfa )
[ conjoin ] curry each ; dup initialize-dfa
dup start-state>> condition-states >vector
: set-initial-state ( regexp -- ) H{ } clone
dup new-transitions
[ dfa-table>> ] [ find-start-state ] bi [ set-final-states ] keep ;
[ >>start-state drop ] keep
1vector >>new-states drop ;
: set-traversal-flags ( regexp -- )
dup
[ nfa-traversal-flags>> ]
[ dfa-table>> transitions>> keys ] bi
[ [ nip ] [ [ swap at ] with map concat ] 2bi ] with H{ } map>assoc
>>dfa-traversal-flags drop ;
: construct-dfa ( regexp -- )
{
[ set-initial-state ]
[ new-transitions ]
[ set-final-states ]
[ set-traversal-flags ]
} cleave ;

View File

@ -0,0 +1,44 @@
! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: kernel accessors regexp.classes math.bits assocs sequences
arrays sets regexp.dfa math fry regexp.minimize regexp.ast ;
IN: regexp.disambiguate
TUPLE: parts in out ;
: make-partition ( choices classes -- partition )
zip [ first ] partition [ values ] bi@ parts boa ;
: powerset-partition ( classes -- partitions )
[ length [ 2^ ] keep ] keep '[
_ <bits> _ make-partition
] map rest ;
: partition>class ( parts -- class )
[ out>> [ <not-class> ] map ]
[ in>> <and-class> ] bi
prefix <and-class> ;
: get-transitions ( partition state-transitions -- next-states )
[ in>> ] dip '[ _ at ] gather sift ;
: new-transitions ( transitions -- assoc ) ! assoc is class, partition
values [ keys ] gather
[ tagged-epsilon? not ] filter
powerset-partition
[ [ partition>class ] keep ] { } map>assoc
[ drop ] assoc-filter ;
: preserving-epsilon ( state-transitions quot -- new-state-transitions )
[ [ drop tagged-epsilon? ] assoc-filter ] bi
assoc-union H{ } assoc-like ; inline
: disambiguate ( nfa -- nfa )
[
dup new-transitions '[
[
_ swap '[ _ get-transitions ] assoc-map
[ nip empty? not ] assoc-filter
] preserving-epsilon
] assoc-map
] change-transitions ;

View File

@ -0,0 +1,59 @@
! Copyright (C) 2008, 2009 Daniel Ehrenberg, Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: kernel sequences math splitting make fry locals math.ranges
accessors arrays ;
IN: regexp.matchers
! For now, a matcher is just something with a method to do the
! equivalent of match.
GENERIC: match-index-from ( i string matcher -- index/f )
: match-index-head ( string matcher -- index/f )
[ 0 ] 2dip match-index-from ;
: match-slice ( i string matcher -- slice/f )
[ 2dup ] dip match-index-from
[ swap <slice> ] [ 2drop f ] if* ;
: matches? ( string matcher -- ? )
dupd match-index-head
[ swap length = ] [ drop f ] if* ;
: match-from ( i string matcher -- slice/f )
[ [ length [a,b) ] keep ] dip
'[ _ _ match-slice ] map-find drop ;
: match-head ( str matcher -- slice/f )
[ 0 ] 2dip match-from ;
<PRIVATE
: next-match ( i string matcher -- i match/f )
match-from [ dup [ to>> ] when ] keep ;
PRIVATE>
:: all-matches ( string matcher -- seq )
0 [ dup ] [ string matcher next-match ] produce nip but-last ;
: count-matches ( string matcher -- n )
all-matches length ;
<PRIVATE
:: split-slices ( string slices -- new-slices )
slices [ to>> ] map 0 prefix
slices [ from>> ] map string length suffix
[ string <slice> ] 2map ;
PRIVATE>
: re-split1 ( string matcher -- before after/f )
dupd match-head [ 1array split-slices first2 ] [ f ] if* ;
: re-split ( string matcher -- seq )
dupd all-matches split-slices ;
: re-replace ( string matcher replacement -- result )
[ re-split ] dip join ;

View File

@ -0,0 +1,58 @@
! Copyright (C) 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: tools.test regexp.minimize assocs regexp
accessors regexp.transition-tables regexp.parser
regexp.classes regexp.negation ;
IN: regexp.minimize.tests
[ t ] [ 1 2 H{ { { 1 2 } t } } same-partition? ] unit-test
[ t ] [ 2 1 H{ { { 1 2 } t } } same-partition? ] unit-test
[ f ] [ 2 3 H{ { { 1 2 } t } } same-partition? ] unit-test
[ H{ { 1 1 } { 2 1 } { 3 3 } { 4 3 } } ]
[ { { 1 1 } { 1 2 } { 2 2 } { 3 3 } { 3 4 } { 4 4 } } [ t ] H{ } map>assoc partition>classes ] unit-test
[ { { 1 2 } { 3 4 } } ] [ H{ { "elephant" 1 } { "tiger" 3 } } H{ { "elephant" 2 } { "tiger" 4 } } assemble-values ] unit-test
: regexp-states ( string -- n )
parse-regexp ast>dfa transitions>> assoc-size ;
[ 3 ] [ "ab|ac" regexp-states ] unit-test
[ 3 ] [ "a(b|c)" regexp-states ] unit-test
[ 1 ] [ "((aa*)*)*" regexp-states ] unit-test
[ 1 ] [ "a|((aa*)*)*" regexp-states ] unit-test
[ 2 ] [ "ab|((aa*)*)*b" regexp-states ] unit-test
[ 4 ] [ "ab|cd" regexp-states ] unit-test
[ 1 ] [ "(?i:[a-z]*|[A-Z]*)" regexp-states ] unit-test
[
T{ transition-table
{ transitions H{
{ 0 H{ { CHAR: a 1 } { CHAR: b 1 } } }
{ 1 H{ { CHAR: a 2 } { CHAR: b 2 } } }
{ 2 H{ { CHAR: c 3 } } }
{ 3 H{ } }
} }
{ start-state 0 }
{ final-states H{ { 3 3 } } }
}
] [
T{ transition-table
{ transitions H{
{ 0 H{ { CHAR: a 1 } { CHAR: b 4 } } }
{ 1 H{ { CHAR: a 2 } { CHAR: b 5 } } }
{ 2 H{ { CHAR: c 3 } } }
{ 3 H{ } }
{ 4 H{ { CHAR: a 2 } { CHAR: b 5 } } }
{ 5 H{ { CHAR: c 6 } } }
{ 6 H{ } }
} }
{ start-state 0 }
{ final-states H{ { 3 3 } { 6 6 } } }
} combine-states
] unit-test
[ [ ] [ ] while-changes ] must-infer
[ H{ { T{ or-class f { 1 2 } } 3 } { 4 5 } } ]
[ H{ { 1 3 } { 2 3 } { 4 5 } } combine-state-transitions ] unit-test

View File

@ -0,0 +1,99 @@
! Copyright (C) 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: kernel sequences regexp.transition-tables fry assocs
accessors locals math sorting arrays sets hashtables regexp.dfa
combinators.short-circuit regexp.classes ;
IN: regexp.minimize
: table>state-numbers ( table -- assoc )
transitions>> keys <enum> [ swap ] H{ } assoc-map-as ;
: number-states ( table -- newtable )
dup table>state-numbers transitions-at ;
: has-conditions? ( assoc -- ? )
values [ condition? ] any? ;
: initially-same? ( s1 s2 transition-table -- ? )
{
[ drop <= ]
[ transitions>> '[ _ at keys ] bi@ set= ]
[ final-states>> '[ _ key? ] bi@ = ]
} 3&& ;
:: initialize-partitions ( transition-table -- partitions )
! Partition table is sorted-array => ?
H{ } clone :> out
transition-table transitions>> keys :> states
states [| s1 |
states [| s2 |
s1 s2 transition-table initially-same?
[ s1 s2 2array out conjoin ] when
] each
] each out ;
: same-partition? ( s1 s2 partitions -- ? )
{ [ [ 2array natural-sort ] dip key? ] [ drop = ] } 3|| ;
: assemble-values ( assoc1 assoc2 -- values )
dup keys '[ _ swap [ at ] curry map ] bi@ zip ;
: stay-same? ( s1 s2 transition partitions -- ? )
[ '[ _ transitions>> at ] bi@ assemble-values ] dip
'[ _ same-partition? ] assoc-all? ;
: partition-more ( partitions transition-table -- partitions )
over '[ drop first2 _ _ stay-same? ] assoc-filter ;
: partition>classes ( partitions -- synonyms ) ! old-state => new-state
>alist sort-keys
[ drop first2 swap ] assoc-map
<reversed>
>hashtable ;
:: (while-changes) ( obj quot: ( obj -- obj' ) comp: ( obj -- key ) old-key -- obj )
obj quot call :> new-obj
new-obj comp call :> new-key
new-key old-key =
[ new-obj ]
[ new-obj quot comp new-key (while-changes) ]
if ; inline recursive
: while-changes ( obj quot pred -- obj' )
3dup nip call (while-changes) ; inline
: (state-classes) ( transition-table -- partition )
[ initialize-partitions ] keep
'[ _ partition-more ] [ assoc-size ] while-changes ;
: assoc>set ( assoc -- keys-set )
[ drop dup ] assoc-map ;
: state-classes ( transition-table -- synonyms )
clone [ [ nip has-conditions? ] assoc-partition ] change-transitions
[ assoc>set ] [ (state-classes) partition>classes ] bi* assoc-union ;
: canonical-state? ( state transitions state-classes -- ? )
'[ dup _ at = ] swap '[ _ at has-conditions? ] bi or ;
: delete-duplicates ( transitions state-classes -- new-transitions )
dupd '[ drop _ _ canonical-state? ] assoc-filter ;
: combine-states ( table -- smaller-table )
dup state-classes
[ transitions-at ] keep
'[ _ delete-duplicates ] change-transitions ;
: combine-state-transitions ( hash -- hash )
H{ } clone tuck '[
_ [ 2array <or-class> ] change-at
] assoc-each [ swap ] assoc-map ;
: combine-transitions ( table -- table )
[ [ combine-state-transitions ] assoc-map ] change-transitions ;
: minimize ( table -- minimal-table )
clone
number-states
combine-states
combine-transitions ;

View File

@ -0,0 +1,27 @@
! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: tools.test regexp.negation regexp.transition-tables regexp.classes ;
IN: regexp.negation.tests
[
! R/ |[^a]|.+/
T{ transition-table
{ transitions H{
{ 0 H{ { CHAR: a 1 } { T{ not-class f CHAR: a } -1 } } }
{ 1 H{ { t -1 } } }
{ -1 H{ { t -1 } } }
} }
{ start-state 0 }
{ final-states H{ { 0 0 } { -1 -1 } } }
}
] [
! R/ a/
T{ transition-table
{ transitions H{
{ 0 H{ { CHAR: a 1 } } }
{ 1 H{ } }
} }
{ start-state 0 }
{ final-states H{ { 1 1 } } }
} negate-table
] unit-test

View File

@ -0,0 +1,53 @@
! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: regexp.nfa regexp.disambiguate kernel sequences
assocs regexp.classes hashtables accessors fry vectors
regexp.ast regexp.transition-tables regexp.minimize
regexp.dfa namespaces ;
IN: regexp.negation
: ast>dfa ( parse-tree -- minimal-dfa )
construct-nfa disambiguate construct-dfa minimize ;
CONSTANT: fail-state -1
: add-default-transition ( state's-transitions -- new-state's-transitions )
clone dup
[ [ fail-state ] dip keys [ <not-class> ] map <and-class> ] keep set-at ;
: fail-state-recurses ( transitions -- new-transitions )
clone dup
[ fail-state t associate fail-state ] dip set-at ;
: add-fail-state ( transitions -- new-transitions )
[ add-default-transition ] assoc-map
fail-state-recurses ;
: inverse-final-states ( transition-table -- final-states )
[ transitions>> assoc>set ] [ final-states>> ] bi assoc-diff ;
: negate-table ( transition-table -- transition-table )
clone
[ add-fail-state ] change-transitions
dup inverse-final-states >>final-states ;
: renumber-states ( transition-table -- transition-table )
dup transitions>> keys [ next-state ] H{ } map>assoc
transitions-at ;
: box-transitions ( transition-table -- transition-table )
[ [ [ 1vector ] assoc-map ] assoc-map ] change-transitions ;
: unify-final-state ( transition-table -- transition-table )
dup [ final-states>> keys ] keep
'[ -2 epsilon _ set-transition ] each
H{ { -2 -2 } } >>final-states ;
: adjoin-dfa ( transition-table -- start end )
unify-final-state renumber-states box-transitions
[ start-state>> ]
[ final-states>> keys first ]
[ nfa-table get [ transitions>> ] bi@ swap update ] tri ;
M: negation nfa-node ( node -- start end )
term>> ast>dfa negate-table adjoin-dfa ;

View File

@ -1,235 +1,153 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs grouping kernel regexp.backend USING: accessors arrays assocs grouping kernel
locals math namespaces regexp.parser sequences fry quotations locals math namespaces sequences fry quotations
math.order math.ranges vectors unicode.categories regexp.utils math.order math.ranges vectors unicode.categories
regexp.transition-tables words sets regexp.classes unicode.case.private ; regexp.transition-tables words sets hashtables combinators.short-circuit
unicode.case.private regexp.ast regexp.classes ;
IN: regexp.nfa
! This uses unicode.case.private for ch>upper and ch>lower ! This uses unicode.case.private for ch>upper and ch>lower
! but case-insensitive matching should be done by case-folding everything ! but case-insensitive matching should be done by case-folding everything
! before processing starts ! before processing starts
IN: regexp.nfa
ERROR: feature-is-broken feature ; SYMBOL: option-stack
SYMBOL: negation-mode SYMBOL: state
: negated? ( -- ? ) negation-mode get 0 or odd? ;
SINGLETON: eps : next-state ( -- state )
state [ get ] [ inc ] bi ;
MIXIN: traversal-flag SYMBOL: nfa-table
SINGLETON: lookahead-on INSTANCE: lookahead-on traversal-flag
SINGLETON: lookahead-off INSTANCE: lookahead-off traversal-flag
SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag
SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag
SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag
SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag
SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag
SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag
SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag
: options ( -- obj ) current-regexp get options>> ; : set-each ( keys value hashtable -- )
'[ _ swap _ set-at ] each ;
: option? ( obj -- ? ) options key? ; : options>hash ( options -- hashtable )
H{ } clone [
[ [ on>> t ] dip set-each ]
[ [ off>> f ] dip set-each ] 2bi
] keep ;
: option-on ( obj -- ) options conjoin ; : using-options ( options quot -- )
[ options>hash option-stack [ ?push ] change ] dip
call option-stack get pop* ; inline
: option-off ( obj -- ) options delete-at ; : option? ( obj -- ? )
option-stack get assoc-stack ;
: next-state ( regexp -- state ) GENERIC: nfa-node ( node -- start-state end-state )
[ state>> ] [ [ 1+ ] change-state drop ] bi ;
: set-start-state ( regexp -- ) : add-simple-entry ( obj -- start-state end-state )
dup stack>> [ [ next-state next-state 2dup ] dip
drop nfa-table get add-transition ;
] [
[ nfa-table>> ] [ pop first ] bi* >>start-state drop
] if-empty ;
GENERIC: nfa-node ( node -- ) : epsilon-transition ( source target -- )
epsilon nfa-table get add-transition ;
:: add-simple-entry ( obj class -- ) M:: star nfa-node ( node -- start end )
[let* | regexp [ current-regexp get ] node term>> nfa-node :> s1 :> s0
s0 [ regexp next-state ] next-state :> s2
s1 [ regexp next-state ] next-state :> s3
stack [ regexp stack>> ] s1 s0 epsilon-transition
table [ regexp nfa-table>> ] | s2 s0 epsilon-transition
negated? [ s2 s3 epsilon-transition
s0 f obj class make-transition table add-transition s1 s3 epsilon-transition
s0 s1 <default-transition> table add-transition s2 s3 ;
] [
s0 s1 obj class make-transition table add-transition
] if
s0 s1 2array stack push
t s1 table final-states>> set-at ] ;
: add-traversal-flag ( flag -- ) GENERIC: modify-epsilon ( tag -- newtag )
stack peek second ! Potential off-by-one errors when lookaround nested in lookbehind
current-regexp get nfa-traversal-flags>> push-at ;
:: concatenate-nodes ( -- ) M: object modify-epsilon ;
[let* | regexp [ current-regexp get ]
stack [ regexp stack>> ]
table [ regexp nfa-table>> ]
s2 [ stack peek first ]
s3 [ stack pop second ]
s0 [ stack peek first ]
s1 [ stack pop second ] |
s1 s2 eps <literal-transition> table add-transition
s1 table final-states>> delete-at
s0 s3 2array stack push ] ;
:: alternate-nodes ( -- ) M: $ modify-epsilon
[let* | regexp [ current-regexp get ] multiline option? [ drop end-of-input ] unless ;
stack [ regexp stack>> ]
table [ regexp nfa-table>> ]
s2 [ stack peek first ]
s3 [ stack pop second ]
s0 [ stack peek first ]
s1 [ stack pop second ]
s4 [ regexp next-state ]
s5 [ regexp next-state ] |
s4 s0 eps <literal-transition> table add-transition
s4 s2 eps <literal-transition> table add-transition
s1 s5 eps <literal-transition> table add-transition
s3 s5 eps <literal-transition> table add-transition
s1 table final-states>> delete-at
s3 table final-states>> delete-at
t s5 table final-states>> set-at
s4 s5 2array stack push ] ;
M: kleene-star nfa-node ( node -- ) M: ^ modify-epsilon
term>> nfa-node multiline option? [ drop beginning-of-input ] unless ;
[let* | regexp [ current-regexp get ]
stack [ regexp stack>> ]
s0 [ stack peek first ]
s1 [ stack pop second ]
s2 [ regexp next-state ]
s3 [ regexp next-state ]
table [ regexp nfa-table>> ] |
s1 table final-states>> delete-at
t s3 table final-states>> set-at
s1 s0 eps <literal-transition> table add-transition
s2 s0 eps <literal-transition> table add-transition
s2 s3 eps <literal-transition> table add-transition
s1 s3 eps <literal-transition> table add-transition
s2 s3 2array stack push ] ;
M: concatenation nfa-node ( node -- ) M: tagged-epsilon nfa-node
seq>> clone [ modify-epsilon ] change-tag add-simple-entry ;
reversed-regexp option? [ <reversed> ] when
[ [ nfa-node ] each ]
[ length 1- [ concatenate-nodes ] times ] bi ;
M: alternation nfa-node ( node -- ) M: concatenation nfa-node ( node -- start end )
seq>> [ first>> ] [ second>> ] bi
[ [ nfa-node ] each ] reversed-regexp option? [ swap ] when
[ length 1- [ alternate-nodes ] times ] bi ; [ nfa-node ] bi@
[ epsilon-transition ] dip ;
M: constant nfa-node ( node -- ) :: alternate-nodes ( s0 s1 s2 s3 -- start end )
next-state :> s4
next-state :> s5
s4 s0 epsilon-transition
s4 s2 epsilon-transition
s1 s5 epsilon-transition
s3 s5 epsilon-transition
s4 s5 ;
M: alternation nfa-node ( node -- start end )
[ first>> ] [ second>> ] bi
[ nfa-node ] bi@
alternate-nodes ;
GENERIC: modify-class ( char-class -- char-class' )
M: object modify-class ;
M: integer modify-class
case-insensitive option? [ case-insensitive option? [
dup char>> [ ch>lower ] [ ch>upper ] bi dup Letter? [
2dup = [ [ ch>lower ] [ ch>upper ] bi 2array <or-class>
2drop ] when
char>> literal-transition add-simple-entry ] when ;
] [
[ literal-transition add-simple-entry ] bi@
alternate-nodes drop
] if
] [
char>> literal-transition add-simple-entry
] if ;
M: epsilon nfa-node ( node -- ) M: integer nfa-node ( node -- start end )
drop eps literal-transition add-simple-entry ; modify-class add-simple-entry ;
M: word nfa-node ( node -- ) class-transition add-simple-entry ; M: primitive-class modify-class
class>> modify-class <primitive-class> ;
M: any-char nfa-node ( node -- ) M: or-class modify-class
[ dotall option? ] dip any-char-no-nl ? seq>> [ modify-class ] map <or-class> ;
class-transition add-simple-entry ;
! M: beginning-of-text nfa-node ( node -- ) ; M: not-class modify-class
class>> modify-class <not-class> ;
M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ; M: any-char modify-class
drop dotall option? t any-char-no-nl ? ;
M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ; : modify-letter-class ( class -- newclass )
case-insensitive option? [ drop Letter-class ] when ;
M: letter-class modify-class modify-letter-class ;
M: LETTER-class modify-class modify-letter-class ;
: choose-letter-class ( node -- node' ) : cased-range? ( range -- ? )
case-insensitive option? Letter-class rot ? ; [ from>> ] [ to>> ] bi {
[ [ letter? ] bi@ and ]
[ [ LETTER? ] bi@ and ]
} 2|| ;
M: letter-class nfa-node ( node -- ) M: range modify-class
choose-letter-class class-transition add-simple-entry ;
M: LETTER-class nfa-node ( node -- )
choose-letter-class class-transition add-simple-entry ;
M: character-class-range nfa-node ( node -- )
case-insensitive option? [ case-insensitive option? [
! This should be implemented for Unicode by case-folding dup cased-range? [
! the input and all strings in the regexp. [ from>> ] [ to>> ] bi
dup [ from>> ] [ to>> ] bi [ [ ch>lower ] bi@ <range> ]
2dup [ Letter? ] bi@ and [ [ [ ch>upper ] bi@ <range> ] 2bi
rot drop 2array <or-class>
[ [ ch>lower ] bi@ character-class-range boa ] ] when
[ [ ch>upper ] bi@ character-class-range boa ] 2bi ] when ;
[ class-transition add-simple-entry ] bi@
alternate-nodes
] [
2drop
class-transition add-simple-entry
] if
] [
class-transition add-simple-entry
] if ;
M: capture-group nfa-node ( node -- ) M: class nfa-node
"capture-groups" feature-is-broken modify-class add-simple-entry ;
eps literal-transition add-simple-entry
capture-group-on add-traversal-flag
term>> nfa-node
eps literal-transition add-simple-entry
capture-group-off add-traversal-flag
2 [ concatenate-nodes ] times ;
! xyzzy M: with-options nfa-node ( node -- start end )
M: non-capture-group nfa-node ( node -- ) dup options>> [ tree>> nfa-node ] using-options ;
term>> nfa-node ;
M: reluctant-kleene-star nfa-node ( node -- ) : construct-nfa ( ast -- nfa-table )
term>> <kleene-star> nfa-node ;
M: negation nfa-node ( node -- )
negation-mode inc
term>> nfa-node
negation-mode dec ;
M: lookahead nfa-node ( node -- )
"lookahead" feature-is-broken
eps literal-transition add-simple-entry
lookahead-on add-traversal-flag
term>> nfa-node
eps literal-transition add-simple-entry
lookahead-off add-traversal-flag
2 [ concatenate-nodes ] times ;
M: lookbehind nfa-node ( node -- )
"lookbehind" feature-is-broken
eps literal-transition add-simple-entry
lookbehind-on add-traversal-flag
term>> nfa-node
eps literal-transition add-simple-entry
lookbehind-off add-traversal-flag
2 [ concatenate-nodes ] times ;
M: option nfa-node ( node -- )
[ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if
eps literal-transition add-simple-entry ;
: construct-nfa ( regexp -- )
[ [
reset-regexp 0 state set
negation-mode off <transition-table> nfa-table set
[ current-regexp set ] nfa-node
[ parse-tree>> nfa-node ] nfa-table get
[ set-start-state ] tri swap dup associate >>final-states
swap >>start-state
] with-scope ; ] with-scope ;

View File

@ -1,34 +1,24 @@
USING: kernel tools.test regexp.backend regexp ; USING: kernel tools.test regexp.parser fry sequences ;
IN: regexp.parser IN: regexp.parser.tests
: test-regexp ( string -- ) : regexp-parses ( string -- )
default-regexp parse-regexp ; [ [ ] ] dip '[ _ parse-regexp drop ] unit-test ;
! [ "(" ] [ unmatched-parentheses? ] must-fail-with : regexp-fails ( string -- )
'[ _ parse-regexp ] must-fail ;
[ ] [ "a|b" test-regexp ] unit-test {
[ ] [ "a.b" test-regexp ] unit-test "a|b" "a.b" "a|b|c" "abc|b" "a|bcd" "a|(b)" "(?-i:a)" "||"
[ ] [ "a|b|c" test-regexp ] unit-test "(a)|b" "(a|b)" "((a)|(b))" "(?:a)" "(?i:a)" "|b" "b|"
[ ] [ "abc|b" test-regexp ] unit-test "[abc]" "[a-c]" "[^a-c]" "[^]]" "[]a]" "[[]" "[]-a]" "[a-]" "[-]"
[ ] [ "a|bcd" test-regexp ] unit-test "[--a]" "foo*" "(foo)*" "(a|b)|c" "(foo){2,3}" "(foo){2,}"
[ ] [ "a|(b)" test-regexp ] unit-test "(foo){2}" "{2,3}" "{," "{,}" "}" "foo}" "[^]-a]" "[^-]a]"
[ ] [ "(a)|b" test-regexp ] unit-test "[a-]" "[^a-]" "[^a-]" "a{,2}" "(?#foobar)"
[ ] [ "(a|b)" test-regexp ] unit-test "\\p{Space}" "\\t" "\\[" "[\\]]" "\\P{Space}"
[ ] [ "((a)|(b))" test-regexp ] unit-test "\\ueeee" "\\0333" "\\xff" "\\\\" "\\w"
} [ regexp-parses ] each
[ ] [ "(?:a)" test-regexp ] unit-test {
[ ] [ "(?i:a)" test-regexp ] unit-test "[^]" "[]" "a{foo}" "a{,}" "a{}" "(?)" "\\p{foo}" "\\P{foo}"
[ ] [ "(?-i:a)" test-regexp ] unit-test "\\ueeeg" "\\0339" "\\xfg"
[ "(?z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with } [ regexp-fails ] each
[ "(?-z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with
[ ] [ "(?=a)" test-regexp ] unit-test
[ ] [ "[abc]" test-regexp ] unit-test
[ ] [ "[a-c]" test-regexp ] unit-test
[ ] [ "[^a-c]" test-regexp ] unit-test
[ "[^]" test-regexp ] must-fail
[ ] [ "|b" test-regexp ] unit-test
[ ] [ "b|" test-regexp ] unit-test
[ ] [ "||" test-regexp ] unit-test

View File

@ -1,437 +1,177 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators io io.streams.string USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
kernel math math.parser namespaces sets combinators regexp.classes strings splitting peg locals accessors
quotations sequences splitting vectors math.order regexp.ast ;
strings regexp.backend regexp.utils
unicode.case unicode.categories words locals regexp.classes ;
IN: regexp.parser IN: regexp.parser
FROM: math.ranges => [a,b] ; : allowed-char? ( ch -- ? )
".()|[*+?$^" member? not ;
TUPLE: concatenation seq ; INSTANCE: concatenation node ERROR: bad-number ;
TUPLE: alternation seq ; INSTANCE: alternation node
TUPLE: kleene-star term ; INSTANCE: kleene-star node
! !!!!!!!! : ensure-number ( n -- n )
TUPLE: possessive-question term ; INSTANCE: possessive-question node [ bad-number ] unless* ;
TUPLE: possessive-kleene-star term ; INSTANCE: possessive-kleene-star node
! !!!!!!!! :: at-error ( key assoc quot: ( key -- replacement ) -- value )
TUPLE: reluctant-question term ; INSTANCE: reluctant-question node key assoc at* [ drop key quot call ] unless ; inline
TUPLE: reluctant-kleene-star term ; INSTANCE: reluctant-kleene-star node
TUPLE: negation term ; INSTANCE: negation node ERROR: bad-class name ;
TUPLE: constant char ; INSTANCE: constant node
TUPLE: range from to ; INSTANCE: range node
MIXIN: parentheses-group : name>class ( name -- class )
TUPLE: lookahead term ; INSTANCE: lookahead node {
INSTANCE: lookahead parentheses-group { "Lower" letter-class }
TUPLE: lookbehind term ; INSTANCE: lookbehind node { "Upper" LETTER-class }
INSTANCE: lookbehind parentheses-group { "Alpha" Letter-class }
TUPLE: capture-group term ; INSTANCE: capture-group node { "ASCII" ascii-class }
INSTANCE: capture-group parentheses-group { "Digit" digit-class }
TUPLE: non-capture-group term ; INSTANCE: non-capture-group node { "Alnum" alpha-class }
INSTANCE: non-capture-group parentheses-group { "Punct" punctuation-class }
TUPLE: independent-group term ; INSTANCE: independent-group node ! atomic group { "Graph" java-printable-class }
INSTANCE: independent-group parentheses-group { "Print" java-printable-class }
TUPLE: comment-group term ; INSTANCE: comment-group node { "Blank" non-newline-blank-class }
INSTANCE: comment-group parentheses-group { "Cntrl" control-character-class }
{ "XDigit" hex-digit-class }
{ "Space" java-blank-class }
! TODO: unicode-character-class
} [ bad-class ] at-error ;
SINGLETON: epsilon INSTANCE: epsilon node : lookup-escape ( char -- ast )
{
{ CHAR: t [ CHAR: \t ] }
{ CHAR: n [ CHAR: \n ] }
{ CHAR: r [ CHAR: \r ] }
{ CHAR: f [ HEX: c ] }
{ CHAR: a [ HEX: 7 ] }
{ CHAR: e [ HEX: 1b ] }
{ CHAR: \\ [ CHAR: \\ ] }
TUPLE: option option on? ; INSTANCE: option node { CHAR: w [ c-identifier-class <primitive-class> ] }
{ CHAR: W [ c-identifier-class <primitive-class> <not-class> ] }
{ CHAR: s [ java-blank-class <primitive-class> ] }
{ CHAR: S [ java-blank-class <primitive-class> <not-class> ] }
{ CHAR: d [ digit-class <primitive-class> ] }
{ CHAR: D [ digit-class <primitive-class> <not-class> ] }
SINGLETONS: unix-lines dotall multiline comments case-insensitive { CHAR: z [ end-of-input <tagged-epsilon> ] }
unicode-case reversed-regexp ; { CHAR: Z [ end-of-file <tagged-epsilon> ] }
{ CHAR: A [ beginning-of-input <tagged-epsilon> ] }
[ ]
} case ;
SINGLETONS: beginning-of-character-class end-of-character-class : options-assoc ( -- assoc )
left-parenthesis pipe caret dash ; H{
{ CHAR: i case-insensitive }
: push1 ( obj -- ) input-stream get stream>> push ; { CHAR: d unix-lines }
: peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ; { CHAR: m multiline }
: pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ; { CHAR: n multiline }
: drop1 ( -- ) read1 drop ; { CHAR: r reversed-regexp }
{ CHAR: s dotall }
: stack ( -- obj ) current-regexp get stack>> ; { CHAR: u unicode-case }
: change-whole-stack ( quot -- ) { CHAR: x comments }
current-regexp get } ;
[ stack>> swap call ] keep (>>stack) ; inline
: push-stack ( obj -- ) stack push ;
: pop-stack ( -- obj ) stack pop ;
: cut-out ( vector n -- vector' vector ) cut rest ;
ERROR: cut-stack-error ;
: cut-stack ( obj vector -- vector' vector )
[ nip ] [ last-index ] 2bi [ cut-stack-error ] unless* cut-out swap ;
: <possessive-kleene-star> ( obj -- kleene ) possessive-kleene-star boa ;
: <reluctant-kleene-star> ( obj -- kleene ) reluctant-kleene-star boa ;
: <possessive-question> ( obj -- kleene ) possessive-question boa ;
: <reluctant-question> ( obj -- kleene ) reluctant-question boa ;
: <negation> ( obj -- negation ) negation boa ;
: <concatenation> ( seq -- concatenation )
>vector [ epsilon ] [ concatenation boa ] if-empty ;
: <alternation> ( seq -- alternation ) >vector alternation boa ;
: <capture-group> ( obj -- capture-group ) capture-group boa ;
: <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
: <constant> ( obj -- constant ) constant boa ;
: first|concatenation ( seq -- first/concatenation )
dup length 1 = [ first ] [ <concatenation> ] if ;
: first|alternation ( seq -- first/alternation )
dup length 1 = [ first ] [ <alternation> ] if ;
: <character-class-range> ( from to -- obj )
2dup <
[ character-class-range boa ] [ 2drop unmatchable-class ] if ;
ERROR: unmatched-parentheses ;
ERROR: unknown-regexp-option option ;
: ch>option ( ch -- singleton ) : ch>option ( ch -- singleton )
{ options-assoc at ;
{ CHAR: i [ case-insensitive ] }
{ CHAR: d [ unix-lines ] }
{ CHAR: m [ multiline ] }
{ CHAR: n [ multiline ] }
{ CHAR: r [ reversed-regexp ] }
{ CHAR: s [ dotall ] }
{ CHAR: u [ unicode-case ] }
{ CHAR: x [ comments ] }
[ unknown-regexp-option ]
} case ;
: option>ch ( option -- string ) : option>ch ( option -- string )
{ options-assoc value-at ;
{ case-insensitive [ CHAR: i ] }
{ multiline [ CHAR: m ] }
{ reversed-regexp [ CHAR: r ] }
{ dotall [ CHAR: s ] }
[ unknown-regexp-option ]
} case ;
: toggle-option ( ch ? -- ) : parse-options ( on off -- options )
[ ch>option ] dip option boa push-stack ; [ [ ch>option ] { } map-as ] bi@ <options> ;
: (parse-options) ( string ? -- ) [ toggle-option ] curry each ; : string>options ( string -- options )
"-" split1 parse-options ;
: options>string ( options -- string )
[ on>> ] [ off>> ] bi
[ [ option>ch ] map ] bi@
[ "-" glue ] unless-empty
"" like ;
: parse-options ( string -- ) ! TODO: add syntax for various parenthized things,
"-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ; ! add greedy and nongreedy forms of matching
! (once it's all implemented)
ERROR: bad-special-group string ; EBNF: parse-regexp
DEFER: (parse-regexp) CharacterInBracket = !("}") Character
: nested-parse-regexp ( token ? -- )
[ push-stack (parse-regexp) pop-stack ] dip
[ <negation> ] when pop-stack new swap >>term push-stack ;
! non-capturing groups QuotedCharacter = !("\\E") .
: (parse-special-group) ( -- )
read1 {
{ [ dup CHAR: # = ] ! comment
[ drop comment-group f nested-parse-regexp pop-stack drop ] }
{ [ dup CHAR: : = ]
[ drop non-capture-group f nested-parse-regexp ] }
{ [ dup CHAR: = = ]
[ drop lookahead f nested-parse-regexp ] }
{ [ dup CHAR: ! = ]
[ drop lookahead t nested-parse-regexp ] }
{ [ dup CHAR: > = ]
[ drop non-capture-group f nested-parse-regexp ] }
{ [ dup CHAR: < = peek1 CHAR: = = and ]
[ drop drop1 lookbehind f nested-parse-regexp ] }
{ [ dup CHAR: < = peek1 CHAR: ! = and ]
[ drop drop1 lookbehind t nested-parse-regexp ] }
[
":)" read-until
[ swap prefix ] dip
{
{ CHAR: : [ parse-options non-capture-group f nested-parse-regexp ] }
{ CHAR: ) [ parse-options ] }
[ drop bad-special-group ]
} case
]
} cond ;
: handle-left-parenthesis ( -- ) Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> ]]
peek1 CHAR: ? = | "P{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> <negation> ]]
[ drop1 (parse-special-group) ] | "Q" QuotedCharacter*:s "\\E" => [[ s <concatenation> ]]
[ capture-group f nested-parse-regexp ] if ; | "u" Character:a Character:b Character:c Character:d
=> [[ { a b c d } hex> ensure-number ]]
| "x" Character:a Character:b
=> [[ { a b } hex> ensure-number ]]
| "0" Character:a Character:b Character:c
=> [[ { a b c } oct> ensure-number ]]
| . => [[ lookup-escape ]]
: handle-dot ( -- ) any-char push-stack ; EscapeSequence = "\\" Escape:e => [[ e ]]
: handle-pipe ( -- ) pipe push-stack ;
: (handle-star) ( obj -- kleene-star )
peek1 {
{ CHAR: + [ drop1 <possessive-kleene-star> ] }
{ CHAR: ? [ drop1 <reluctant-kleene-star> ] }
[ drop <kleene-star> ]
} case ;
: handle-star ( -- ) stack pop (handle-star) push-stack ;
: handle-question ( -- )
stack pop peek1 {
{ CHAR: + [ drop1 <possessive-question> ] }
{ CHAR: ? [ drop1 <reluctant-question> ] }
[ drop epsilon 2array <alternation> ]
} case push-stack ;
: handle-plus ( -- )
stack pop dup (handle-star)
2array <concatenation> push-stack ;
ERROR: unmatched-brace ; Character = EscapeSequence
: parse-repetition ( -- start finish ? ) | "$" => [[ $ <tagged-epsilon> ]]
"}" read-until [ unmatched-brace ] unless | "^" => [[ ^ <tagged-epsilon> ]]
[ "," split1 [ string>number ] bi@ ] | . ?[ allowed-char? ]?
[ CHAR: , swap index >boolean ] bi ;
: replicate/concatenate ( n obj -- obj' ) AnyRangeCharacter = EscapeSequence | .
over zero? [ 2drop epsilon ]
[ <repetition> first|concatenation ] if ;
: exactly-n ( n -- ) RangeCharacter = !("]") AnyRangeCharacter
stack pop replicate/concatenate push-stack ;
: at-least-n ( n -- ) Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
stack pop | RangeCharacter
[ replicate/concatenate ] keep
<kleene-star> 2array <concatenation> push-stack ;
: at-most-n ( n -- ) StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
1+ | AnyRangeCharacter
stack pop
[ replicate/concatenate ] curry map <alternation> push-stack ;
: from-m-to-n ( m n -- ) Ranges = StartRange:s Range*:r => [[ r s prefix ]]
[a,b]
stack pop
[ replicate/concatenate ] curry map
<alternation> push-stack ;
ERROR: invalid-range a b ; CharClass = "^"?:n Ranges:e => [[ e n char-class ]]
: handle-left-brace ( -- ) Options = [idmsux]*
parse-repetition
[ 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ ] dip
[
2dup and [ from-m-to-n ]
[ [ nip at-most-n ] [ at-least-n ] if* ] if
] [ drop 0 max exactly-n ] if ;
: handle-front-anchor ( -- ) beginning-of-line push-stack ; Parenthized = "?:" Alternation:a => [[ a ]]
: handle-back-anchor ( -- ) end-of-line push-stack ; | "?" Options:on "-"? Options:off ":" Alternation:a
=> [[ a on off parse-options <with-options> ]]
| "?#" [^)]* => [[ f ]]
| "?~" Alternation:a => [[ a <negation> ]]
| "?=" Alternation:a => [[ a t <lookahead> <tagged-epsilon> ]]
| "?!" Alternation:a => [[ a f <lookahead> <tagged-epsilon> ]]
| "?<=" Alternation:a => [[ a t <lookbehind> <tagged-epsilon> ]]
| "?<!" Alternation:a => [[ a f <lookbehind> <tagged-epsilon> ]]
| Alternation
ERROR: bad-character-class obj ; Element = "(" Parenthized:p ")" => [[ p ]]
ERROR: expected-posix-class ; | "[" CharClass:r "]" => [[ r ]]
| ".":d => [[ any-char <primitive-class> ]]
| Character
: parse-posix-class ( -- obj ) Number = (!(","|"}").)* => [[ string>number ensure-number ]]
read1 CHAR: { = [ expected-posix-class ] unless
"}" read-until [ bad-character-class ] unless
{
{ "Lower" [ letter-class ] }
{ "Upper" [ LETTER-class ] }
{ "Alpha" [ Letter-class ] }
{ "ASCII" [ ascii-class ] }
{ "Digit" [ digit-class ] }
{ "Alnum" [ alpha-class ] }
{ "Punct" [ punctuation-class ] }
{ "Graph" [ java-printable-class ] }
{ "Print" [ java-printable-class ] }
{ "Blank" [ non-newline-blank-class ] }
{ "Cntrl" [ control-character-class ] }
{ "XDigit" [ hex-digit-class ] }
{ "Space" [ java-blank-class ] }
! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss
[ bad-character-class ]
} case ;
: parse-octal ( -- n ) 3 read oct> check-octal ; Times = "," Number:n "}" => [[ 0 n <from-to> ]]
: parse-short-hex ( -- n ) 2 read hex> check-hex ; | Number:n ",}" => [[ n <at-least> ]]
: parse-long-hex ( -- n ) 6 read hex> check-hex ; | Number:n "}" => [[ n n <from-to> ]]
: parse-control-character ( -- n ) read1 ; | "}" => [[ bad-number ]]
| Number:n "," Number:m "}" => [[ n m <from-to> ]]
ERROR: bad-escaped-literals seq ; Repeated = Element:e "{" Times:t => [[ e t <times> ]]
| Element:e "??" => [[ e <maybe> ]]
| Element:e "*?" => [[ e <star> ]]
| Element:e "+?" => [[ e <plus> ]]
| Element:e "?" => [[ e <maybe> ]]
| Element:e "*" => [[ e <star> ]]
| Element:e "+" => [[ e <plus> ]]
| Element
: parse-til-E ( -- obj ) Concatenation = Repeated*:r => [[ r sift <concatenation> ]]
"\\E" read-until [ bad-escaped-literals ] unless ;
:: (parse-escaped-literals) ( quot: ( obj -- obj' ) -- obj )
parse-til-E
drop1
[ epsilon ] [
quot call [ <constant> ] V{ } map-as
first|concatenation
] if-empty ; inline
: parse-escaped-literals ( -- obj ) Alternation = Concatenation:c ("|" Concatenation)*:a
[ ] (parse-escaped-literals) ; => [[ a empty? [ c ] [ a values c prefix <alternation> ] if ]]
: lower-case-literals ( -- obj ) End = !(.)
[ >lower ] (parse-escaped-literals) ;
: upper-case-literals ( -- obj ) Main = Alternation End
[ >upper ] (parse-escaped-literals) ; ;EBNF
: parse-escaped ( -- obj )
read1
{
{ CHAR: t [ CHAR: \t <constant> ] }
{ CHAR: n [ CHAR: \n <constant> ] }
{ CHAR: r [ CHAR: \r <constant> ] }
{ CHAR: f [ HEX: c <constant> ] }
{ CHAR: a [ HEX: 7 <constant> ] }
{ CHAR: e [ HEX: 1b <constant> ] }
{ CHAR: w [ c-identifier-class ] }
{ CHAR: W [ c-identifier-class <negation> ] }
{ CHAR: s [ java-blank-class ] }
{ CHAR: S [ java-blank-class <negation> ] }
{ CHAR: d [ digit-class ] }
{ CHAR: D [ digit-class <negation> ] }
{ CHAR: p [ parse-posix-class ] }
{ CHAR: P [ parse-posix-class <negation> ] }
{ CHAR: x [ parse-short-hex <constant> ] }
{ CHAR: u [ parse-long-hex <constant> ] }
{ CHAR: 0 [ parse-octal <constant> ] }
{ CHAR: c [ parse-control-character ] }
{ CHAR: Q [ parse-escaped-literals ] }
! { CHAR: b [ word-boundary-class ] }
! { CHAR: B [ word-boundary-class <negation> ] }
! { CHAR: A [ handle-beginning-of-input ] }
! { CHAR: z [ handle-end-of-input ] }
! { CHAR: Z [ handle-end-of-input ] } ! plus a final terminator
! m//g mode
! { CHAR: G [ end of previous match ] }
! Group capture
! { CHAR: 1 [ CHAR: 1 <constant> ] }
! { CHAR: 2 [ CHAR: 2 <constant> ] }
! { CHAR: 3 [ CHAR: 3 <constant> ] }
! { CHAR: 4 [ CHAR: 4 <constant> ] }
! { CHAR: 5 [ CHAR: 5 <constant> ] }
! { CHAR: 6 [ CHAR: 6 <constant> ] }
! { CHAR: 7 [ CHAR: 7 <constant> ] }
! { CHAR: 8 [ CHAR: 8 <constant> ] }
! { CHAR: 9 [ CHAR: 9 <constant> ] }
! Perl extensions
! can't do \l and \u because \u is already a 4-hex
{ CHAR: L [ lower-case-literals ] }
{ CHAR: U [ upper-case-literals ] }
[ <constant> ]
} case ;
: handle-escape ( -- ) parse-escaped push-stack ;
: handle-dash ( vector -- vector' )
H{ { dash CHAR: - } } substitute ;
: character-class>alternation ( seq -- alternation )
[ dup number? [ <constant> ] when ] map first|alternation ;
: handle-caret ( vector -- vector' )
dup [ length 2 >= ] [ first caret eq? ] bi and [
rest-slice character-class>alternation <negation>
] [
character-class>alternation
] if ;
: make-character-class ( -- character-class )
[ beginning-of-character-class swap cut-stack ] change-whole-stack
handle-dash handle-caret ;
: apply-dash ( -- )
stack [ pop3 nip <character-class-range> ] keep push ;
: apply-dash? ( -- ? )
stack dup length 3 >=
[ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ;
ERROR: empty-negated-character-class ;
DEFER: handle-left-bracket
: (parse-character-class) ( -- )
read1 [ empty-negated-character-class ] unless* {
{ CHAR: [ [ handle-left-bracket t ] }
{ CHAR: ] [ make-character-class push-stack f ] }
{ CHAR: - [ dash push-stack t ] }
{ CHAR: \ [ parse-escaped push-stack t ] }
[ push-stack apply-dash? [ apply-dash ] when t ]
} case
[ (parse-character-class) ] when ;
: push-constant ( ch -- ) <constant> push-stack ;
: parse-character-class-second ( -- )
read1 {
{ CHAR: [ [ CHAR: [ push-constant ] }
{ CHAR: ] [ CHAR: ] push-constant ] }
{ CHAR: - [ CHAR: - push-constant ] }
[ push1 ]
} case ;
: parse-character-class-first ( -- )
read1 {
{ CHAR: ^ [ caret push-stack parse-character-class-second ] }
{ CHAR: [ [ CHAR: [ push-constant ] }
{ CHAR: ] [ CHAR: ] push-constant ] }
{ CHAR: - [ CHAR: - push-constant ] }
[ push1 ]
} case ;
: handle-left-bracket ( -- )
beginning-of-character-class push-stack
parse-character-class-first (parse-character-class) ;
: finish-regexp-parse ( stack -- obj )
{ pipe } split
[ first|concatenation ] map first|alternation ;
: handle-right-parenthesis ( -- )
stack dup [ parentheses-group "members" word-prop member? ] find-last
-rot cut rest
[ [ push ] keep current-regexp get (>>stack) ]
[ finish-regexp-parse push-stack ] bi* ;
: parse-regexp-token ( token -- ? )
{
{ CHAR: ( [ handle-left-parenthesis t ] } ! handle (?..) at beginning?
{ CHAR: ) [ handle-right-parenthesis f ] }
{ CHAR: . [ handle-dot t ] }
{ CHAR: | [ handle-pipe t ] }
{ CHAR: ? [ handle-question t ] }
{ CHAR: * [ handle-star t ] }
{ CHAR: + [ handle-plus t ] }
{ CHAR: { [ handle-left-brace t ] }
{ CHAR: [ [ handle-left-bracket t ] }
{ CHAR: \ [ handle-escape t ] }
[
dup CHAR: $ = peek1 f = and
[ drop handle-back-anchor f ]
[ push-constant t ] if
]
} case ;
: (parse-regexp) ( -- )
read1 [ parse-regexp-token [ (parse-regexp) ] when ] when* ;
: parse-regexp-beginning ( -- )
peek1 CHAR: ^ = [ drop1 handle-front-anchor ] when ;
: parse-regexp ( regexp -- )
dup current-regexp [
raw>> [
<string-reader> [
parse-regexp-beginning (parse-regexp)
] with-input-stream
] unless-empty
current-regexp get [ finish-regexp-parse ] change-stack
dup stack>> >>parse-tree drop
] with-variable ;

View File

@ -1,8 +1,87 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: kernel strings help.markup help.syntax regexp.backend ; USING: kernel strings help.markup help.syntax regexp.matchers math ;
IN: regexp IN: regexp
ABOUT: "regexp"
ARTICLE: "regexp" "Regular expressions"
"The " { $vocab-link "regexp" } " vocabulary provides word for creating and using regular expressions."
{ $subsection { "regexp" "syntax" } }
{ $subsection { "regexp" "construction" } }
{ $vocab-subsection "regexp.combinators" "Regular expression combinators" }
{ $subsection { "regexp" "operations" } }
{ $subsection regexp }
{ $subsection { "regexp" "theory" } } ;
ARTICLE: { "regexp" "construction" } "Constructing regular expressions"
"Words which are useful for creating regular expressions:"
{ $subsection POSTPONE: R/ }
{ $subsection <regexp> }
{ $subsection <optioned-regexp> }
{ $heading "See also" }
{ $vocab-link "regexp.combinators" } ;
ARTICLE: { "regexp" "syntax" } "Regular expression syntax"
"Regexp syntax is largely compatible with Perl, Java and extended POSTFIX regexps, but not completely." $nl
"A new addition is the inclusion of a negation operator, with the syntax " { $snippet "(?~foo)" } " to match everything that does not match " { $snippet "foo" } "." $nl
"One missing feature is backreferences. This is because of a design decision to allow only regular expressions following the formal theory of regular languages. For more information, see " { $link { "regexp" "theory" } } ". You can create a new regular expression to match a particular string using " { $vocab-link "regexp.combinators" } " and group capture is available to extract parts of a regular expression match." $nl
"A distinction from Perl is that " { $snippet "\\G" } ", which references the previous match, is not included. This is because that sequence is inherently stateful, and Factor regexps don't hold state." $nl
"Additionally, none of the operations which embed code into a regexp are supported, as this would require the inclusion of the Factor parser and compiler in any application which wants to expose regexps to the user. None of the casing operations are included, for simplicity." ; ! Also describe syntax, from the beginning
ARTICLE: { "regexp" "theory" } "The theory of regular expressions"
"Far from being just a practical tool invented by Unix hackers, regular expressions were studied formally before computer programs were written to process them." $nl
"A regular language is a set of strings that is matched by a regular expression, which is defined to have characters and the empty string, along with the operations concatenation, disjunction and Kleene star. Another way to define the class of regular languages is as the class of languages which can be recognized with constant space overhead, ie with a DFA. These two definitions are provably equivalent." $nl
"One basic result in the theory of regular language is that the complement of a regular language is regular. In other words, for any regular expression, there exists another regular expression which matches exactly the strings that the first one doesn't match." $nl
"This implies, by DeMorgan's law, that, if you have two regular languages, their intersection is also regular. That is, for any two regular expressions, there exists a regular expression which matches strings that match both inputs." $nl
"Traditionally, regular expressions on computer support an additional operation: backreferences. For example, the Perl regexp " { $snippet "/(.*)$1/" } " matches a string repated twice. If a backreference refers to a string with a predetermined maximum length, then the resulting language is still regular." $nl
"But, if not, the language is not regular. There is strong evidence that there is no efficient way to parse with backreferences in the general case. Perl uses a naive backtracking algorithm which has pathological behavior in some cases, taking exponential time to match even if backreferences aren't used. Additionally, expressions with backreferences don't have the properties with negation and intersection described above." $nl
"The Factor regular expression engine was built with the design decision to support negation and intersection at the expense of backreferences. This lets us have a guaranteed linear-time matching algorithm. Systems like Ragel and Lex also use this algorithm, but in the Factor regular expression engine, all other features of regexps are still present." ;
ARTICLE: { "regexp" "operations" } "Matching operations with regular expressions"
{ $subsection all-matches }
{ $subsection matches? }
{ $subsection re-split1 }
{ $subsection re-split }
{ $subsection re-replace }
{ $subsection count-matches }
{ $subsection re-replace } ;
HELP: <regexp> HELP: <regexp>
{ $values { "string" string } { "regexp" regexp } } { $values { "string" string } { "regexp" regexp } }
{ $description "Compiles a regular expression into a DFA and returns this object. Regular expressions only have to be compiled once and can then be used multiple times to match input strings." } ; { $description "Creates a regular expression object, given a string in regular expression syntax. When it is first used for matching, a DFA is compiled, and this DFA is stored for reuse so it is only compiled once." } ;
HELP: <optioned-regexp>
{ $values { "string" string } { "options" string } { "regexp" regexp } }
{ $description "Given a string in regular expression syntax, and a string of options, creates a regular expression object. When it is first used for matching, a DFA is compiled, and this DFA is stored for reuse so it is only compiled once." } ;
HELP: R/
{ $syntax "R/ foo.*|[a-zA-Z]bar/i" }
{ $description "Literal syntax for a regular expression. When this syntax is used, the DFA is compiled at compile-time, rather than on first use." } ;
HELP: regexp
{ $class-description "The class of regular expressions. To construct these, see " { $link { "regexp" "construction" } } "." } ;
HELP: matches?
{ $values { "string" string } { "matcher" regexp } { "?" "a boolean" } }
{ $description "Tests if the string as a whole matches the given regular expression." } ;
HELP: re-split1
{ $values { "string" string } { "matcher" regexp } { "before" string } { "after/f" string } }
{ $description "Searches the string for a substring which matches the pattern. If found, the input string is split on the leftmost and longest occurence of the match, and the two halves are given as output. If no match is found, then the input string and " { $link f } " are output." } ;
HELP: all-matches
{ $values { "string" string } { "matcher" regexp } { "seq" "a sequence of slices of the input" } }
{ $description "Finds a sequence of disjoint substrings which each match the pattern. It chooses this by finding the leftmost longest match, and then the leftmost longest match which starts after the end of the previous match, and so on." } ;
HELP: count-matches
{ $values { "string" string } { "matcher" regexp } { "n" integer } }
{ $description "Counts how many disjoint matches the regexp has in the string, as made unambiguous by " { $link all-matches } "." } ;
HELP: re-split
{ $values { "string" string } { "matcher" regexp } { "seq" "a sequence of slices of the input" } }
{ $description "Splits the input string into chunks separated by the regular expression. Each chunk contains no match of the regexp. The chunks are chosen by the strategy of " { $link all-matches } "." } ;
HELP: re-replace
{ $values { "string" string } { "matcher" regexp } { "replacement" string } { "result" string } }
{ $description "Replaces substrings which match the input regexp with the given replacement text. The boundaries of the substring are chosen by the strategy used by " { $link all-matches } "." } ;

View File

@ -1,9 +1,13 @@
USING: regexp tools.test kernel sequences regexp.parser ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg
regexp.traversal eval strings multiline ; ! See http://factorcode.org/license.txt for BSD license.
USING: regexp tools.test kernel sequences regexp.parser regexp.private
eval strings multiline accessors regexp.matchers ;
IN: regexp-tests IN: regexp-tests
\ <regexp> must-infer \ <regexp> must-infer
\ matches? must-infer ! the following don't compile because [ ] with-compilation-unit doesn't compile
! \ compile-regexp must-infer
! \ matches? must-infer
[ f ] [ "b" "a*" <regexp> matches? ] unit-test [ f ] [ "b" "a*" <regexp> matches? ] unit-test
[ t ] [ "" "a*" <regexp> matches? ] unit-test [ t ] [ "" "a*" <regexp> matches? ] unit-test
@ -21,8 +25,8 @@ IN: regexp-tests
[ t ] [ "b" "b|" <regexp> matches? ] unit-test [ t ] [ "b" "b|" <regexp> matches? ] unit-test
[ t ] [ "" "b|" <regexp> matches? ] unit-test [ t ] [ "" "b|" <regexp> matches? ] unit-test
[ t ] [ "" "b|" <regexp> matches? ] unit-test [ t ] [ "" "b|" <regexp> matches? ] unit-test
[ f ] [ "" "|" <regexp> matches? ] unit-test [ t ] [ "" "|" <regexp> matches? ] unit-test
[ f ] [ "" "|||||||" <regexp> matches? ] unit-test [ t ] [ "" "|||||||" <regexp> matches? ] unit-test
[ f ] [ "aa" "a|b|c" <regexp> matches? ] unit-test [ f ] [ "aa" "a|b|c" <regexp> matches? ] unit-test
[ f ] [ "bb" "a|b|c" <regexp> matches? ] unit-test [ f ] [ "bb" "a|b|c" <regexp> matches? ] unit-test
@ -44,9 +48,9 @@ IN: regexp-tests
! Dotall mode -- when on, . matches newlines. ! Dotall mode -- when on, . matches newlines.
! Off by default. ! Off by default.
[ f ] [ "\n" "." <regexp> matches? ] unit-test [ f ] [ "\n" "." <regexp> matches? ] unit-test
[ t ] [ "\n" "(?s)." <regexp> matches? ] unit-test [ t ] [ "\n" "(?s:.)" <regexp> matches? ] unit-test
[ t ] [ "\n" R/ ./s matches? ] unit-test [ t ] [ "\n" R/ ./s matches? ] unit-test
[ f ] [ "\n\n" "(?s).(?-s)." <regexp> matches? ] unit-test [ f ] [ "\n\n" "(?s:.)." <regexp> matches? ] unit-test
[ f ] [ "" ".+" <regexp> matches? ] unit-test [ f ] [ "" ".+" <regexp> matches? ] unit-test
[ t ] [ "a" ".+" <regexp> matches? ] unit-test [ t ] [ "a" ".+" <regexp> matches? ] unit-test
@ -76,8 +80,6 @@ IN: regexp-tests
[ t ] [ "bar" "foo|bar" <regexp> matches? ] unit-test [ t ] [ "bar" "foo|bar" <regexp> matches? ] unit-test
[ f ] [ "foobar" "foo|bar" <regexp> matches? ] unit-test [ f ] [ "foobar" "foo|bar" <regexp> matches? ] unit-test
/*
! FIXME
[ f ] [ "" "(a)" <regexp> matches? ] unit-test [ f ] [ "" "(a)" <regexp> matches? ] unit-test
[ t ] [ "a" "(a)" <regexp> matches? ] unit-test [ t ] [ "a" "(a)" <regexp> matches? ] unit-test
[ f ] [ "aa" "(a)" <regexp> matches? ] unit-test [ f ] [ "aa" "(a)" <regexp> matches? ] unit-test
@ -85,7 +87,6 @@ IN: regexp-tests
[ f ] [ "aababaaabbac" "(a|b)+" <regexp> matches? ] unit-test [ f ] [ "aababaaabbac" "(a|b)+" <regexp> matches? ] unit-test
[ t ] [ "ababaaabba" "(a|b)+" <regexp> matches? ] unit-test [ t ] [ "ababaaabba" "(a|b)+" <regexp> matches? ] unit-test
*/
[ f ] [ "" "a{1}" <regexp> matches? ] unit-test [ f ] [ "" "a{1}" <regexp> matches? ] unit-test
[ t ] [ "a" "a{1}" <regexp> matches? ] unit-test [ t ] [ "a" "a{1}" <regexp> matches? ] unit-test
@ -168,12 +169,9 @@ IN: regexp-tests
[ f ] [ "0" "[^\\d]" <regexp> matches? ] unit-test [ f ] [ "0" "[^\\d]" <regexp> matches? ] unit-test
[ t ] [ "a" "[^\\d]" <regexp> matches? ] unit-test [ t ] [ "a" "[^\\d]" <regexp> matches? ] unit-test
/*
! FIXME
[ t ] [ "a" "[a-z]{1,}|[A-Z]{2,4}|b*|c|(f|g)*" <regexp> matches? ] unit-test [ t ] [ "a" "[a-z]{1,}|[A-Z]{2,4}|b*|c|(f|g)*" <regexp> matches? ] unit-test
[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}|b*|c|(f|g)*" <regexp> matches? ] unit-test [ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}|b*|c|(f|g)*" <regexp> matches? ] unit-test
[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}" <regexp> matches? ] unit-test [ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}" <regexp> matches? ] unit-test
*/
[ t ] [ "1000" "\\d{4,6}" <regexp> matches? ] unit-test [ t ] [ "1000" "\\d{4,6}" <regexp> matches? ] unit-test
[ t ] [ "1000" "[0-9]{4,6}" <regexp> matches? ] unit-test [ t ] [ "1000" "[0-9]{4,6}" <regexp> matches? ] unit-test
@ -185,7 +183,7 @@ IN: regexp-tests
[ f ] [ "abc" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test [ f ] [ "abc" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test
[ t ] [ "ABC" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test [ t ] [ "ABC" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test
[ f ] [ "" "\\Q\\E" <regexp> matches? ] unit-test [ t ] [ "" "\\Q\\E" <regexp> matches? ] unit-test
[ f ] [ "a" "\\Q\\E" <regexp> matches? ] unit-test [ f ] [ "a" "\\Q\\E" <regexp> matches? ] unit-test
[ t ] [ "|*+" "\\Q|*+\\E" <regexp> matches? ] unit-test [ t ] [ "|*+" "\\Q|*+\\E" <regexp> matches? ] unit-test
[ f ] [ "abc" "\\Q|*+\\E" <regexp> matches? ] unit-test [ f ] [ "abc" "\\Q|*+\\E" <regexp> matches? ] unit-test
@ -195,8 +193,8 @@ IN: regexp-tests
[ t ] [ "SXY" "\\0123XY" <regexp> matches? ] unit-test [ t ] [ "SXY" "\\0123XY" <regexp> matches? ] unit-test
[ t ] [ "x" "\\x78" <regexp> matches? ] unit-test [ t ] [ "x" "\\x78" <regexp> matches? ] unit-test
[ f ] [ "y" "\\x78" <regexp> matches? ] unit-test [ f ] [ "y" "\\x78" <regexp> matches? ] unit-test
[ t ] [ "x" "\\u000078" <regexp> matches? ] unit-test [ t ] [ "x" "\\u0078" <regexp> matches? ] unit-test
[ f ] [ "y" "\\u000078" <regexp> matches? ] unit-test [ f ] [ "y" "\\u0078" <regexp> matches? ] unit-test
[ t ] [ "ab" "a+b" <regexp> matches? ] unit-test [ t ] [ "ab" "a+b" <regexp> matches? ] unit-test
[ f ] [ "b" "a+b" <regexp> matches? ] unit-test [ f ] [ "b" "a+b" <regexp> matches? ] unit-test
@ -214,8 +212,8 @@ IN: regexp-tests
[ f ] [ "aaaxb" "a+ab" <regexp> matches? ] unit-test [ f ] [ "aaaxb" "a+ab" <regexp> matches? ] unit-test
[ t ] [ "aaacb" "a+cb" <regexp> matches? ] unit-test [ t ] [ "aaacb" "a+cb" <regexp> matches? ] unit-test
[ 3 ] [ "aaacb" "a*" <regexp> match-head ] unit-test [ 3 ] [ "aaacb" "a*" <regexp> match-index-head ] unit-test
[ 2 ] [ "aaacb" "aa?" <regexp> match-head ] unit-test [ 2 ] [ "aaacb" "aa?" <regexp> match-index-head ] unit-test
[ t ] [ "aaa" R/ AAA/i matches? ] unit-test [ t ] [ "aaa" R/ AAA/i matches? ] unit-test
[ f ] [ "aax" R/ AAA/i matches? ] unit-test [ f ] [ "aax" R/ AAA/i matches? ] unit-test
@ -226,15 +224,15 @@ IN: regexp-tests
[ t ] [ "c" R/ [A-Z]/i matches? ] unit-test [ t ] [ "c" R/ [A-Z]/i matches? ] unit-test
[ f ] [ "3" R/ [A-Z]/i matches? ] unit-test [ f ] [ "3" R/ [A-Z]/i matches? ] unit-test
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test [ t ] [ "a" "(?i:a)" <regexp> matches? ] unit-test
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test [ t ] [ "a" "(?i:a)" <regexp> matches? ] unit-test
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test [ t ] [ "A" "(?i:a)" <regexp> matches? ] unit-test
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test [ t ] [ "A" "(?i:a)" <regexp> matches? ] unit-test
[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test [ t ] [ "a" R/ (?-i:a)/i matches? ] unit-test
[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test [ t ] [ "a" R/ (?-i:a)/i matches? ] unit-test
[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test [ f ] [ "A" R/ (?-i:a)/i matches? ] unit-test
[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test [ f ] [ "A" R/ (?-i:a)/i matches? ] unit-test
[ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test [ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test
[ t ] [ "A" R/ [a-z]/i matches? ] unit-test [ t ] [ "A" R/ [a-z]/i matches? ] unit-test
@ -242,9 +240,11 @@ IN: regexp-tests
[ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test [ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
[ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test [ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test
[ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test [ t ] [ "abc" reverse R/ abc/r matches? ] unit-test
[ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test [ t ] [ "abc" reverse R/ a[bB][cC]/r matches? ] unit-test
! [ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/ matches? ] unit-test ! FIXME
[ t ] [ 3 "xabc" R/ abc/ <reverse-matcher> match-index-from >boolean ] unit-test
[ t ] [ 3 "xabc" R/ a[bB][cC]/ <reverse-matcher> match-index-from >boolean ] unit-test
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test [ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test [ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
@ -253,8 +253,6 @@ IN: regexp-tests
[ t ] [ "abc*" "[^\\*]*\\*" <regexp> matches? ] unit-test [ t ] [ "abc*" "[^\\*]*\\*" <regexp> matches? ] unit-test
[ t ] [ "bca" "[^a]*a" <regexp> matches? ] unit-test [ t ] [ "bca" "[^a]*a" <regexp> matches? ] unit-test
/*
! FIXME
[ ] [ [ ] [
"(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))" "(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))"
<regexp> drop <regexp> drop
@ -271,18 +269,13 @@ IN: regexp-tests
[ ] [ "USING: regexp kernel ; R' \\*[^\s*][^*]*\\*' drop" eval ] unit-test [ ] [ "USING: regexp kernel ; R' \\*[^\s*][^*]*\\*' drop" eval ] unit-test
[ "ab" ] [ "ab" "(a|ab)(bc)?" <regexp> first-match >string ] unit-test [ "ab" ] [ "ab" "(a|ab)(bc)?" <regexp> match-head >string ] unit-test
[ "abc" ] [ "abc" "(a|ab)(bc)?" <regexp> first-match >string ] unit-test [ "abc" ] [ "abc" "(a|ab)(bc)?" <regexp> match-head >string ] unit-test
[ "ab" ] [ "ab" "(ab|a)(bc)?" <regexp> first-match >string ] unit-test [ "ab" ] [ "ab" "(ab|a)(bc)?" <regexp> match-head >string ] unit-test
[ "abc" ] [ "abc" "(ab|a)(bc)?" <regexp> first-match >string ] unit-test [ "abc" ] [ "abc" "(ab|a)(bc)?" <regexp> match-head >string ] unit-test
[ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" <regexp> first-match >string ] unit-test [ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" <regexp> match-head >string ] unit-test
*/
! [ t ] [ "a:b" ".+:?" <regexp> matches? ] unit-test
! [ 1 ] [ "hello" ".+?" <regexp> match length ] unit-test
[ { "1" "2" "3" "4" } ] [ { "1" "2" "3" "4" } ]
[ "1ABC2DEF3GHI4" R/ [A-Z]+/ re-split [ >string ] map ] unit-test [ "1ABC2DEF3GHI4" R/ [A-Z]+/ re-split [ >string ] map ] unit-test
@ -308,90 +301,129 @@ IN: regexp-tests
[ "-- title --" ] [ "== title ==" R/ =/ "-" re-replace ] unit-test [ "-- title --" ] [ "== title ==" R/ =/ "-" re-replace ] unit-test
/* [ "" ] [ "ab" "a(?!b)" <regexp> match-head >string ] unit-test
! FIXME [ "a" ] [ "ac" "a(?!b)" <regexp> match-head >string ] unit-test
[ f ] [ "ab" "a(?!b)" <regexp> first-match ] unit-test [ t ] [ "fxxbar" ".{3}(?!foo)bar" <regexp> matches? ] unit-test
[ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test [ t ] [ "foobar" ".{3}(?!foo)bar" <regexp> matches? ] unit-test
[ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
[ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test [ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
[ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> first-match >string ] unit-test [ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> match-head >string ] unit-test
[ "a" ] [ "ba" "a(?<=b)(?<=b)" <regexp> first-match >string ] unit-test [ "a" ] [ "ba" "(?<=b)(?<=b)a" <regexp> match-head >string ] unit-test
[ "a" ] [ "cab" "a(?=b)(?<=c)" <regexp> first-match >string ] unit-test [ "a" ] [ "cab" "(?<=c)a(?=b)" <regexp> match-head >string ] unit-test
[ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test [ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-index-head ] unit-test
[ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test [ f ] [ "foobxr" "foo(?=bar)" <regexp> match-index-head ] unit-test
*/
! Bug in parsing word ! Bug in parsing word
[ t ] [ "a" R' a' matches? ] unit-test [ t ] [ "a" R' a' matches? ] unit-test
! Convert to lowercase until E ! Testing negation
[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test [ f ] [ "a" R/ (?~a)/ matches? ] unit-test
[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test [ t ] [ "aa" R/ (?~a)/ matches? ] unit-test
[ t ] [ "bb" R/ (?~a)/ matches? ] unit-test
[ t ] [ "" R/ (?~a)/ matches? ] unit-test
! Convert to uppercase until E [ f ] [ "a" R/ (?~a+|b)/ matches? ] unit-test
[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test [ f ] [ "aa" R/ (?~a+|b)/ matches? ] unit-test
[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test [ t ] [ "bb" R/ (?~a+|b)/ matches? ] unit-test
[ f ] [ "b" R/ (?~a+|b)/ matches? ] unit-test
[ t ] [ "" R/ (?~a+|b)/ matches? ] unit-test
! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with ! Intersecting classes
[ t ] [ "ab" R/ ac|\p{Lower}b/ matches? ] unit-test
[ t ] [ "ab" R/ ac|[a-z]b/ matches? ] unit-test
[ t ] [ "ac" R/ ac|\p{Lower}b/ matches? ] unit-test
[ t ] [ "ac" R/ ac|[a-z]b/ matches? ] unit-test
[ t ] [ "ac" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
[ t ] [ "ab" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
[ t ] [ "πb" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
[ f ] [ "πc" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
[ f ] [ "Ab" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
! [ t ] [ "a" R/ ^a/ matches? ] unit-test [ t ] [ "aaaa" R/ .*a./ matches? ] unit-test
! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
! [ f ] [ "\ra" R/ ^a/ matches? ] unit-test
! [ t ] [ "a" R/ a$/ matches? ] unit-test ! DFA is compiled when needed, or when literal
! [ f ] [ "a\n" R/ a$/ matches? ] unit-test [ f ] [ "foo" <regexp> dfa>> >boolean ] unit-test
! [ f ] [ "a\r" R/ a$/ matches? ] unit-test [ t ] [ R/ foo/ dfa>> >boolean ] unit-test
! [ f ] [ "a\r\n" R/ a$/ matches? ] unit-test
! [ t ] [ "a" R/ a$|b$/ matches? ] unit-test [ t ] [ "a" R/ ^a/ matches? ] unit-test
! [ t ] [ "b" R/ a$|b$/ matches? ] unit-test [ f ] [ "\na" R/ ^a/ matches? ] unit-test
! [ t ] [ "ab" R/ a$|b$/ matches? ] unit-test [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
! [ t ] [ "ba" R/ ba$|b$/ matches? ] unit-test [ f ] [ "\ra" R/ ^a/ matches? ] unit-test
! [ t ] [ "a" R/ \Aa/ matches? ] unit-test [ 1 ] [ "a" R/ ^a/ count-matches ] unit-test
! [ f ] [ "\na" R/ \Aaa/ matches? ] unit-test [ 0 ] [ "\na" R/ ^a/ count-matches ] unit-test
! [ f ] [ "\r\na" R/ \Aa/ matches? ] unit-test [ 0 ] [ "\r\na" R/ ^a/ count-matches ] unit-test
! [ f ] [ "\ra" R/ \Aa/ matches? ] unit-test [ 0 ] [ "\ra" R/ ^a/ count-matches ] unit-test
! [ t ] [ "a" R/ \Aa/m matches? ] unit-test [ t ] [ "a" R/ a$/ matches? ] unit-test
! [ f ] [ "\na" R/ \Aaa/m matches? ] unit-test [ f ] [ "a\n" R/ a$/ matches? ] unit-test
! [ f ] [ "\r\na" R/ \Aa/m matches? ] unit-test [ f ] [ "a\r" R/ a$/ matches? ] unit-test
! [ f ] [ "\ra" R/ \Aa/m matches? ] unit-test [ f ] [ "a\r\n" R/ a$/ matches? ] unit-test
! [ t ] [ "\r\n\n\n\nam" R/ ^am/m matches? ] unit-test [ 1 ] [ "a" R/ a$/ count-matches ] unit-test
[ 0 ] [ "a\n" R/ a$/ count-matches ] unit-test
[ 0 ] [ "a\r" R/ a$/ count-matches ] unit-test
[ 0 ] [ "a\r\n" R/ a$/ count-matches ] unit-test
! [ t ] [ "a" R/ \Aa\z/m matches? ] unit-test [ t ] [ "a" R/ a$|b$/ matches? ] unit-test
! [ f ] [ "a\n" R/ \Aa\z/m matches? ] unit-test [ t ] [ "b" R/ a$|b$/ matches? ] unit-test
[ f ] [ "ab" R/ a$|b$/ matches? ] unit-test
[ t ] [ "ba" R/ ba$|b$/ matches? ] unit-test
! [ t ] [ "a\r\n" R/ \Aa\Z/m matches? ] unit-test [ t ] [ "a" R/ \Aa/ matches? ] unit-test
! [ t ] [ "a\n" R/ \Aa\Z/m matches? ] unit-test [ f ] [ "\na" R/ \Aaa/ matches? ] unit-test
[ f ] [ "\r\na" R/ \Aa/ matches? ] unit-test
[ f ] [ "\ra" R/ \Aa/ matches? ] unit-test
! [ t ] [ "a" R/ \Aa\Z/m matches? ] unit-test [ t ] [ "a" R/ \Aa/m matches? ] unit-test
! [ f ] [ "\na" R/ \Aaa\Z/m matches? ] unit-test [ f ] [ "\na" R/ \Aaa/m matches? ] unit-test
! [ f ] [ "\r\na" R/ \Aa\Z/m matches? ] unit-test [ f ] [ "\r\na" R/ \Aa/m matches? ] unit-test
! [ f ] [ "\ra" R/ \Aa\Z/m matches? ] unit-test [ f ] [ "\ra" R/ \Aa/m matches? ] unit-test
[ 0 ] [ "\ra" R/ \Aa/m count-matches ] unit-test
! [ t ] [ "a" R/ ^a/m matches? ] unit-test [ f ] [ "\r\n\n\n\nam" R/ ^am/m matches? ] unit-test
! [ t ] [ "\na" R/ ^a/m matches? ] unit-test [ 1 ] [ "\r\n\n\n\nam" R/ ^am/m count-matches ] unit-test
! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test [ t ] [ "a" R/ \Aa\z/m matches? ] unit-test
! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test [ f ] [ "a\n" R/ \Aa\z/m matches? ] unit-test
! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test
! [ t ] [ "a\r\n" "a$" R/ a$/m matches? ] unit-test
! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test [ f ] [ "a\r\n" R/ \Aa\Z/m matches? ] unit-test
! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test [ f ] [ "a\n" R/ \Aa\Z/m matches? ] unit-test
[ 1 ] [ "a\r\n" R/ \Aa\Z/m count-matches ] unit-test
[ 1 ] [ "a\n" R/ \Aa\Z/m count-matches ] unit-test
[ t ] [ "a" R/ \Aa\Z/m matches? ] unit-test
[ f ] [ "\na" R/ \Aaa\Z/m matches? ] unit-test
[ f ] [ "\r\na" R/ \Aa\Z/m matches? ] unit-test
[ f ] [ "\ra" R/ \Aa\Z/m matches? ] unit-test
[ 1 ] [ "a" R/ \Aa\Z/m count-matches ] unit-test
[ 0 ] [ "\na" R/ \Aaa\Z/m count-matches ] unit-test
[ 0 ] [ "\r\na" R/ \Aa\Z/m count-matches ] unit-test
[ 0 ] [ "\ra" R/ \Aa\Z/m count-matches ] unit-test
[ t ] [ "a" R/ ^a/m matches? ] unit-test
[ f ] [ "\na" R/ ^a/m matches? ] unit-test
[ 1 ] [ "\na" R/ ^a/m count-matches ] unit-test
[ 1 ] [ "\r\na" R/ ^a/m count-matches ] unit-test
[ 1 ] [ "\ra" R/ ^a/m count-matches ] unit-test
[ t ] [ "a" R/ a$/m matches? ] unit-test
[ f ] [ "a\n" R/ a$/m matches? ] unit-test
[ 1 ] [ "a\n" R/ a$/m count-matches ] unit-test
[ 1 ] [ "a\r" R/ a$/m count-matches ] unit-test
[ 1 ] [ "a\r\n" R/ a$/m count-matches ] unit-test
[ f ] [ "foobxr" "foo\\z" <regexp> match-index-head ] unit-test
[ 3 ] [ "foo" "foo\\z" <regexp> match-index-head ] unit-test
! [ t ] [ "foo" "\\bfoo\\b" <regexp> matches? ] unit-test ! [ t ] [ "foo" "\\bfoo\\b" <regexp> matches? ] unit-test
! [ t ] [ "afoob" "\\Bfoo\\B" <regexp> matches? ] unit-test ! [ t ] [ "afoob" "\\Bfoo\\B" <regexp> matches? ] unit-test
! [ t ] [ "afoob" "\\bfoo\\b" <regexp> matches? ] unit-test ! [ t ] [ "afoob" "\\bfoo\\b" <regexp> matches? ] unit-test
! [ f ] [ "foo" "\\Bfoo\\B" <regexp> matches? ] unit-test ! [ f ] [ "foo" "\\Bfoo\\B" <regexp> matches? ] unit-test
! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-head ] unit-test ! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-index-head ] unit-test
! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test ! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test
! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test ! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test
! [ t ] [ "foo bar" "foo\\b bar" <regexp> matches? ] unit-test ! [ t ] [ "foo bar" "foo\\b bar" <regexp> matches? ] unit-test
@ -399,18 +431,18 @@ IN: regexp-tests
! [ f ] [ "foo" "foo\\bbar" <regexp> matches? ] unit-test ! [ f ] [ "foo" "foo\\bbar" <regexp> matches? ] unit-test
! [ f ] [ "foo bar" "foo\\B" <regexp> matches? ] unit-test ! [ f ] [ "foo bar" "foo\\B" <regexp> matches? ] unit-test
! [ 3 ] [ "fooxbar" "foo\\B" <regexp> match-head ] unit-test ! [ 3 ] [ "fooxbar" "foo\\B" <regexp> match-index-head ] unit-test
! [ t ] [ "foo" "foo\\B" <regexp> matches? ] unit-test ! [ t ] [ "foo" "foo\\B" <regexp> matches? ] unit-test
! [ f ] [ "foo bar" "foo\\B bar" <regexp> matches? ] unit-test ! [ f ] [ "foo bar" "foo\\B bar" <regexp> matches? ] unit-test
! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test ! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test ! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
! [ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test ! [ 1 ] [ "aaacb" "a+?" <regexp> match-index-head ] unit-test
! [ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test ! [ 1 ] [ "aaacb" "aa??" <regexp> match-index-head ] unit-test
! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test ! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test ! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
! [ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test ! [ 3 ] [ "aacb" "aa?c" <regexp> match-index-head ] unit-test
! [ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test ! [ 3 ] [ "aacb" "aa??c" <regexp> match-index-head ] unit-test
! "ab" "a(?=b*)" <regexp> match ! "ab" "a(?=b*)" <regexp> match
! "abbbbbc" "a(?=b*c)" <regexp> match ! "abbbbbc" "a(?=b*c)" <regexp> match

View File

@ -1,87 +1,75 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors combinators kernel math sequences strings sets USING: accessors combinators kernel math sequences strings sets
assocs prettyprint.backend prettyprint.custom make lexer assocs prettyprint.backend prettyprint.custom make lexer
namespaces parser arrays fry regexp.backend regexp.utils namespaces parser arrays fry locals regexp.minimize
regexp.parser regexp.nfa regexp.dfa regexp.traversal regexp.parser regexp.nfa regexp.dfa regexp.classes
regexp.transition-tables splitting sorting ; regexp.transition-tables splitting sorting regexp.ast
regexp.negation regexp.matchers regexp.compiler ;
IN: regexp IN: regexp
: default-regexp ( string -- regexp ) TUPLE: regexp
regexp new { raw read-only }
swap >>raw { parse-tree read-only }
<transition-table> >>nfa-table { options read-only }
<transition-table> >>dfa-table dfa reverse-dfa ;
<transition-table> >>minimized-table
H{ } clone >>nfa-traversal-flags
H{ } clone >>dfa-traversal-flags
H{ } clone >>options
H{ } clone >>matchers
reset-regexp ;
: construct-regexp ( regexp -- regexp' ) : make-regexp ( string ast -- regexp )
{ f f <options> f f regexp boa ; foldable
[ parse-regexp ] ! Foldable because, when the dfa slot is set,
[ construct-nfa ] ! it'll be set to the same thing regardless of who sets it
[ construct-dfa ]
[ ]
} cleave ;
: (match) ( string regexp -- dfa-traverser ) : <optioned-regexp> ( string options -- regexp )
<dfa-traverser> do-match ; inline [ dup parse-regexp ] [ string>options ] bi*
f f regexp boa ;
: match ( string regexp -- slice/f ) : <regexp> ( string -- regexp ) "" <optioned-regexp> ;
(match) return-match ;
: match* ( string regexp -- slice/f captured-groups ) TUPLE: reverse-matcher regexp ;
(match) [ return-match ] [ captured-groups>> ] bi ; C: <reverse-matcher> reverse-matcher
! Reverse matchers won't work properly with most combinators, for now
: matches? ( string regexp -- ? )
dupd match
[ [ length ] bi@ = ] [ drop f ] if* ;
: match-head ( string regexp -- end/f ) match [ length ] [ f ] if* ;
: match-at ( string m regexp -- n/f finished? )
[
2dup swap length > [ 2drop f f ] [ tail-slice t ] if
] dip swap [ match-head f ] [ 2drop f t ] if ;
: match-range ( string m regexp -- a/f b/f )
3dup match-at over [
drop nip rot drop dupd +
] [
[ 3drop drop f f ] [ drop [ 1+ ] dip match-range ] if
] if ;
: first-match ( string regexp -- slice/f )
dupd 0 swap match-range rot over [ <slice> ] [ 3drop f ] if ;
: re-cut ( string regexp -- end/f start )
dupd first-match
[ split1-slice swap ] [ "" like f swap ] if* ;
: (re-split) ( string regexp -- )
over [ [ re-cut , ] keep (re-split) ] [ 2drop ] if ;
: re-split ( string regexp -- seq )
[ (re-split) ] { } make ;
: re-replace ( string regexp replacement -- result )
[ re-split ] dip join ;
: next-match ( string regexp -- end/f match/f )
dupd first-match dup
[ [ split1-slice nip ] keep ] [ 2drop f f ] if ;
: all-matches ( string regexp -- seq )
[ dup ] swap '[ _ next-match ] produce nip harvest ;
: count-matches ( string regexp -- n )
all-matches length ;
<PRIVATE <PRIVATE
: get-ast ( regexp -- ast )
[ parse-tree>> ] [ options>> ] bi <with-options> ;
: compile-regexp ( regexp -- regexp )
dup '[ [ _ get-ast ast>dfa dfa>quotation ] unless* ] change-dfa ;
: <reversed-option> ( ast -- reversed )
"r" string>options <with-options> ;
: maybe-negated ( lookaround quot -- regexp-quot )
'[ term>> @ ] [ positive?>> [ ] [ not ] ? ] bi compose ; inline
M: lookahead question>quot ! Returns ( index string -- ? )
[ ast>dfa dfa>shortest-quotation ] maybe-negated ;
M: lookbehind question>quot ! Returns ( index string -- ? )
[
<reversed-option>
ast>dfa dfa>reverse-shortest-quotation
[ [ 1- ] dip ] prepose
] maybe-negated ;
: compile-reverse ( regexp -- regexp )
dup '[
[
_ get-ast <reversed-option>
ast>dfa dfa>reverse-quotation
] unless*
] change-reverse-dfa ;
M: regexp match-index-from
compile-regexp dfa>> <quot-matcher> match-index-from ;
M: reverse-matcher match-index-from
regexp>> compile-reverse reverse-dfa>>
<quot-matcher> match-index-from ;
! The following two should do some caching
: find-regexp-syntax ( string -- prefix suffix ) : find-regexp-syntax ( string -- prefix suffix )
{ {
{ "R/ " "/" } { "R/ " "/" }
@ -97,28 +85,19 @@ IN: regexp
{ "R| " "|" } { "R| " "|" }
} swap [ subseq? not nip ] curry assoc-find drop ; } swap [ subseq? not nip ] curry assoc-find drop ;
: string>options ( string -- options ) : take-until ( end lexer -- string )
[ ch>option dup ] H{ } map>assoc ; dup skip-blank [
[ index-from ] 2keep
[ swapd subseq ]
[ 2drop 1+ ] 3bi
] change-lexer-column ;
: options>string ( options -- string ) : parse-noblank-token ( lexer -- str/f )
keys [ option>ch ] map natural-sort >string ; dup still-parsing-line? [ (parse-token) ] [ drop f ] if ;
PRIVATE>
: <optioned-regexp> ( string option-string -- regexp )
[ default-regexp ] [ string>options ] bi* >>options
construct-regexp ;
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
<PRIVATE
: parsing-regexp ( accum end -- accum ) : parsing-regexp ( accum end -- accum )
lexer get dup skip-blank lexer get [ take-until ] [ parse-noblank-token ] bi
[ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column <optioned-regexp> compile-regexp parsed ;
lexer get dup still-parsing-line?
[ (parse-token) ] [ drop f ] if
<optioned-regexp> parsed ;
PRIVATE> PRIVATE>

View File

@ -1,32 +1,9 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs fry hashtables kernel sequences USING: accessors arrays assocs fry hashtables kernel sequences
vectors regexp.utils ; vectors locals regexp.classes ;
IN: regexp.transition-tables IN: regexp.transition-tables
TUPLE: transition from to obj ;
TUPLE: literal-transition < transition ;
TUPLE: class-transition < transition ;
TUPLE: default-transition < transition ;
TUPLE: literal obj ;
TUPLE: class obj ;
TUPLE: default ;
: make-transition ( from to obj class -- obj )
new
swap >>obj
swap >>to
swap >>from ;
: <literal-transition> ( from to obj -- transition )
literal-transition make-transition ;
: <class-transition> ( from to obj -- transition )
class-transition make-transition ;
: <default-transition> ( from to -- transition )
t default-transition make-transition ;
TUPLE: transition-table transitions start-state final-states ; TUPLE: transition-table transitions start-state final-states ;
: <transition-table> ( -- transition-table ) : <transition-table> ( -- transition-table )
@ -35,14 +12,38 @@ TUPLE: transition-table transitions start-state final-states ;
H{ } clone >>final-states ; H{ } clone >>final-states ;
: maybe-initialize-key ( key hashtable -- ) : maybe-initialize-key ( key hashtable -- )
! Why do we have to do this?
2dup key? [ 2drop ] [ [ H{ } clone ] 2dip set-at ] if ; 2dup key? [ 2drop ] [ [ H{ } clone ] 2dip set-at ] if ;
: set-transition ( transition hash -- ) :: (set-transition) ( from to obj hash -- )
#! set the state as a key to condition? [ to hash maybe-initialize-key ] unless
2dup [ to>> ] dip maybe-initialize-key from hash at
[ [ to>> ] [ obj>> ] [ from>> ] tri ] dip [ [ to obj ] dip set-at ]
2dup at* [ 2nip insert-at ] [ to obj associate from hash set-at ] if* ;
[ drop [ H{ } clone [ insert-at ] keep ] 2dip set-at ] if ;
: add-transition ( transition transition-table -- ) : set-transition ( from to obj transition-table -- )
transitions>> set-transition ; transitions>> (set-transition) ;
:: (add-transition) ( from to obj hash -- )
to hash maybe-initialize-key
from hash at
[ [ to obj ] dip push-at ]
[ to 1vector obj associate from hash set-at ] if* ;
: add-transition ( from to obj transition-table -- )
transitions>> (add-transition) ;
: map-set ( assoc quot -- new-assoc )
'[ drop @ dup ] assoc-map ; inline
: number-transitions ( transitions numbering -- new-transitions )
dup '[
[ _ at ]
[ [ _ condition-at ] assoc-map ] bi*
] assoc-map ;
: transitions-at ( transition-table assoc -- transition-table )
[ clone ] dip
[ '[ _ condition-at ] change-start-state ]
[ '[ [ _ at ] map-set ] change-final-states ]
[ '[ _ number-transitions ] change-transitions ] tri ;

View File

@ -1,178 +1,56 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors assocs combinators kernel math USING: accessors assocs combinators kernel math
quotations sequences regexp.parser regexp.classes fry arrays quotations sequences regexp.classes fry arrays regexp.matchers
combinators.short-circuit regexp.utils prettyprint regexp.nfa ; combinators.short-circuit prettyprint regexp.nfa ;
IN: regexp.traversal IN: regexp.traversal
TUPLE: dfa-traverser TUPLE: dfa-traverser
dfa-table dfa-table
traversal-flags current-state
traverse-forward
lookahead-counters
lookbehind-counters
capture-counters
captured-groups
capture-group-index
last-state current-state
text text
match-failed? current-index
start-index current-index match-index ;
matches ;
: <dfa-traverser> ( text regexp -- match ) : <dfa-traverser> ( start-index text dfa -- match )
[ dfa-table>> ] [ dfa-traversal-flags>> ] bi
dfa-traverser new dfa-traverser new
swap >>traversal-flags
swap [ start-state>> >>current-state ] [ >>dfa-table ] bi swap [ start-state>> >>current-state ] [ >>dfa-table ] bi
swap >>text swap >>text
t >>traverse-forward swap >>current-index ;
0 >>start-index
0 >>current-index
0 >>capture-group-index
V{ } clone >>matches
V{ } clone >>capture-counters
V{ } clone >>lookbehind-counters
V{ } clone >>lookahead-counters
H{ } clone >>captured-groups ;
: final-state? ( dfa-traverser -- ? ) : final-state? ( dfa-traverser -- ? )
[ current-state>> ] [ current-state>> ]
[ dfa-table>> final-states>> ] bi key? ; [ dfa-table>> final-states>> ] bi key? ;
: beginning-of-text? ( dfa-traverser -- ? )
current-index>> 0 <= ; inline
: end-of-text? ( dfa-traverser -- ? ) : end-of-text? ( dfa-traverser -- ? )
[ current-index>> ] [ text>> length ] bi >= ; inline [ current-index>> ] [ text>> length ] bi >= ; inline
: text-finished? ( dfa-traverser -- ? ) : text-finished? ( dfa-traverser -- ? )
{ {
[ current-state>> empty? ] [ current-state>> not ]
[ end-of-text? ] [ end-of-text? ]
[ match-failed?>> ]
} 1|| ; } 1|| ;
: save-final-state ( dfa-straverser -- ) : save-final-state ( dfa-traverser -- dfa-traverser )
[ current-index>> ] [ matches>> ] bi push ; dup current-index>> >>match-index ;
: match-done? ( dfa-traverser -- ? ) : match-done? ( dfa-traverser -- ? )
dup final-state? [ dup final-state? [ save-final-state ] when text-finished? ;
dup save-final-state
] when text-finished? ;
: previous-text-character ( dfa-traverser -- ch )
[ text>> ] [ current-index>> 1- ] bi nth ;
: current-text-character ( dfa-traverser -- ch )
[ text>> ] [ current-index>> ] bi nth ;
: next-text-character ( dfa-traverser -- ch )
[ text>> ] [ current-index>> 1+ ] bi nth ;
GENERIC: flag-action ( dfa-traverser flag -- )
M: beginning-of-input flag-action ( dfa-traverser flag -- )
drop
dup beginning-of-text? [ t >>match-failed? ] unless drop ;
M: end-of-input flag-action ( dfa-traverser flag -- )
drop
dup end-of-text? [ t >>match-failed? ] unless drop ;
M: beginning-of-line flag-action ( dfa-traverser flag -- )
drop
dup {
[ beginning-of-text? ]
[ previous-text-character terminator-class class-member? ]
} 1|| [ t >>match-failed? ] unless drop ;
M: end-of-line flag-action ( dfa-traverser flag -- )
drop
dup {
[ end-of-text? ]
[ next-text-character terminator-class class-member? ]
} 1|| [ t >>match-failed? ] unless drop ;
M: word-boundary flag-action ( dfa-traverser flag -- )
drop
dup {
[ end-of-text? ]
[ current-text-character terminator-class class-member? ]
} 1|| [ t >>match-failed? ] unless drop ;
M: lookahead-on flag-action ( dfa-traverser flag -- )
drop
lookahead-counters>> 0 swap push ;
M: lookahead-off flag-action ( dfa-traverser flag -- )
drop
dup lookahead-counters>>
[ drop ] [ pop '[ _ - ] change-current-index drop ] if-empty ;
M: lookbehind-on flag-action ( dfa-traverser flag -- )
drop
f >>traverse-forward
[ 2 - ] change-current-index
lookbehind-counters>> 0 swap push ;
M: lookbehind-off flag-action ( dfa-traverser flag -- )
drop
t >>traverse-forward
dup lookbehind-counters>>
[ drop ] [ pop '[ _ + 2 + ] change-current-index drop ] if-empty ;
M: capture-group-on flag-action ( dfa-traverser flag -- )
drop
[ current-index>> 0 2array ]
[ capture-counters>> ] bi push ;
M: capture-group-off flag-action ( dfa-traverser flag -- )
drop
dup capture-counters>> empty? [
drop
] [
{
[ capture-counters>> pop first2 dupd + ]
[ text>> <slice> ]
[ [ 1+ ] change-capture-group-index capture-group-index>> ]
[ captured-groups>> set-at ]
} cleave
] if ;
: process-flags ( dfa-traverser -- )
[ [ 1+ ] map ] change-lookahead-counters
[ [ 1+ ] map ] change-lookbehind-counters
[ [ first2 1+ 2array ] map ] change-capture-counters
! dup current-state>> .
dup [ current-state>> ] [ traversal-flags>> ] bi
at [ flag-action ] with each ;
: increment-state ( dfa-traverser state -- dfa-traverser ) : increment-state ( dfa-traverser state -- dfa-traverser )
[ >>current-state
dup traverse-forward>> [ 1 + ] change-current-index ;
[ [ 1+ ] change-current-index ]
[ [ 1- ] change-current-index ] if
dup current-state>> >>last-state
] [ first ] bi* >>current-state ;
: match-literal ( transition from-state table -- to-state/f ) : match-literal ( transition from-state table -- to-state/f )
transitions>> at at ; transitions>> at at ;
: match-class ( transition from-state table -- to-state/f ) : match-class ( transition from-state table -- to-state/f )
transitions>> at* [ transitions>> at* [
[ drop class-member? ] assoc-with assoc-find [ nip ] [ drop ] if swap '[ drop _ swap class-member? ] assoc-find spin ?
] [ drop ] if ; ] [ drop ] if ;
: match-default ( transition from-state table -- to-state/f )
[ drop ] 2dip transitions>> at t swap at ;
: match-transition ( obj from-state dfa -- to-state/f ) : match-transition ( obj from-state dfa -- to-state/f )
{ [ match-literal ] [ match-class ] [ match-default ] } 3|| ; { [ match-literal ] [ match-class ] } 3|| ;
: setup-match ( match -- obj state dfa-table ) : setup-match ( match -- obj state dfa-table )
[ [ current-index>> ] [ text>> ] bi nth ] [ [ current-index>> ] [ text>> ] bi nth ]
@ -180,16 +58,12 @@ M: capture-group-off flag-action ( dfa-traverser flag -- )
[ dfa-table>> ] tri ; [ dfa-table>> ] tri ;
: do-match ( dfa-traverser -- dfa-traverser ) : do-match ( dfa-traverser -- dfa-traverser )
dup process-flags
dup match-done? [ dup match-done? [
dup setup-match match-transition dup setup-match match-transition
[ increment-state do-match ] when* [ increment-state do-match ] when*
] unless ; ] unless ;
: return-match ( dfa-traverser -- slice/f ) TUPLE: dfa-matcher dfa ;
dup matches>> C: <dfa-matcher> dfa-matcher
[ drop f ] M: dfa-matcher match-index-from
[ dfa>> <dfa-traverser> do-match match-index>> ;
[ [ text>> ] [ start-index>> ] bi ]
[ peek ] bi* rot <slice>
] if-empty ;

View File

@ -1,4 +0,0 @@
USING: regexp.utils tools.test ;
IN: regexp.utils.tests
[ [ ] [ ] while-changes ] must-infer

View File

@ -1,64 +0,0 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs io kernel math math.order
namespaces regexp.backend sequences unicode.categories
math.ranges fry combinators.short-circuit vectors ;
IN: regexp.utils
: (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj )
[ [ dup slip ] dip pick over call ] dip dupd =
[ 3drop ] [ (while-changes) ] if ; inline recursive
: while-changes ( obj quot pred -- obj' )
pick over call (while-changes) ; inline
: assoc-with ( param assoc quot -- assoc curry )
swapd [ [ -rot ] dip call ] 2curry ; inline
: insert-at ( value key hash -- )
2dup at* [
2nip push
] [
drop
[ dup vector? [ 1vector ] unless ] 2dip set-at
] if ;
: ?insert-at ( value key hash/f -- hash )
[ H{ } clone ] unless* [ insert-at ] keep ;
ERROR: bad-octal number ;
ERROR: bad-hex number ;
: check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ;
: check-hex ( hex -- hex ) dup number? [ bad-hex ] unless ;
: ascii? ( n -- ? ) 0 HEX: 7f between? ;
: octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ;
: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ;
: hex-digit? ( n -- ? )
[
[ decimal-digit? ]
[ CHAR: a CHAR: f between? ]
[ CHAR: A CHAR: F between? ]
] 1|| ;
: control-char? ( n -- ? )
[
[ 0 HEX: 1f between? ]
[ HEX: 7f = ]
] 1|| ;
: punct? ( n -- ? )
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
: c-identifier-char? ( ch -- ? )
[ [ alpha? ] [ CHAR: _ = ] ] 1|| ;
: java-blank? ( n -- ? )
{
CHAR: \s CHAR: \t CHAR: \n
HEX: b HEX: 7 CHAR: \r
} member? ;
: java-printable? ( n -- ? )
[ [ alpha? ] [ punct? ] ] 1|| ;

View File

@ -29,11 +29,14 @@ selection-color caret mark selecting? ;
: init-current ( pane -- pane ) : init-current ( pane -- pane )
dup prototype>> clone >>current ; inline dup prototype>> clone >>current ; inline
: focus-input ( pane -- )
input>> [ request-focus ] when* ;
: next-line ( pane -- ) : next-line ( pane -- )
clear-selection clear-selection
[ input>> unparent ] [ input>> unparent ]
[ init-current prepare-last-line ] [ init-current prepare-last-line ]
[ input>> [ request-focus ] when* ] tri ; [ focus-input ] tri ;
: pane-caret&mark ( pane -- caret mark ) : pane-caret&mark ( pane -- caret mark )
[ caret>> ] [ mark>> ] bi ; inline [ caret>> ] [ mark>> ] bi ; inline
@ -364,9 +367,8 @@ M: paragraph stream-format
interleave interleave
] if ; ] if ;
: caret>mark ( pane -- pane ) : caret>mark ( pane -- )
dup caret>> >>mark dup caret>> >>mark relayout-1 ;
dup relayout-1 ;
GENERIC: sloppy-pick-up* ( loc gadget -- n ) GENERIC: sloppy-pick-up* ( loc gadget -- n )
@ -388,45 +390,46 @@ M: f sloppy-pick-up*
[ 3drop { } ] [ 3drop { } ]
if ; if ;
: move-caret ( pane loc -- pane ) : move-caret ( pane loc -- )
over screen-loc v- over sloppy-pick-up >>caret over screen-loc v- over sloppy-pick-up >>caret
dup relayout-1 ; relayout-1 ;
: begin-selection ( pane -- ) : begin-selection ( pane -- )
f >>selecting? f >>selecting?
hand-loc get move-caret dup hand-loc get move-caret
f >>mark f >>mark
drop ; drop ;
: extend-selection ( pane -- ) : extend-selection ( pane -- )
hand-moved? [ hand-moved? [
dup selecting?>> [ [
hand-loc get move-caret dup selecting?>> [
] [ hand-loc get move-caret
dup hand-clicked get child? [ ] [
t >>selecting? dup hand-clicked get child? [
dup hand-clicked set-global t >>selecting?
hand-click-loc get move-caret [ hand-clicked set-global ]
caret>mark [ hand-click-loc get move-caret ]
] when [ caret>mark ]
] if tri
dup dup caret>> gadget-at-path scroll>gadget ] [ drop ] if
] when drop ; ] if
] [ dup caret>> gadget-at-path scroll>gadget ] bi
] [ drop ] if ;
: end-selection ( pane -- ) : end-selection ( pane -- )
f >>selecting? f >>selecting?
hand-moved? [ hand-moved?
[ com-copy-selection ] [ request-focus ] bi [ [ com-copy-selection ] [ request-focus ] bi ]
] [ [ [ relayout-1 ] [ focus-input ] bi ]
relayout-1 if ;
] if ;
: select-to-caret ( pane -- ) : select-to-caret ( pane -- )
t >>selecting? t >>selecting?
dup mark>> [ caret>mark ] unless [ dup mark>> [ dup caret>mark ] unless hand-loc get move-caret ]
hand-loc get move-caret [ com-copy-selection ]
dup request-focus [ request-focus ]
com-copy-selection ; tri ;
: pane-menu ( pane -- ) { com-copy } show-commands-menu ; : pane-menu ( pane -- ) { com-copy } show-commands-menu ;

View File

@ -1,13 +1,14 @@
USING: xmode.loader xmode.utilities xmode.rules namespaces USING: xmode.loader xmode.utilities xmode.rules namespaces
strings splitting assocs sequences kernel io.files xml memoize strings splitting assocs sequences kernel io.files xml memoize
words globs combinators io.encodings.utf8 sorting accessors xml.data ; words globs combinators io.encodings.utf8 sorting accessors xml.data
xml.traversal xml.syntax ;
IN: xmode.catalog IN: xmode.catalog
TUPLE: mode file file-name-glob first-line-glob ; TUPLE: mode file file-name-glob first-line-glob ;
<TAGS: parse-mode-tag ( modes tag -- ) TAGS: parse-mode-tag ( modes tag -- )
TAG: MODE TAG: MODE parse-mode-tag
dup "NAME" attr [ dup "NAME" attr [
mode new { mode new {
{ "FILE" f (>>file) } { "FILE" f (>>file) }
@ -17,11 +18,9 @@ TAG: MODE
] dip ] dip
rot set-at ; rot set-at ;
TAGS>
: parse-modes-tag ( tag -- modes ) : parse-modes-tag ( tag -- modes )
H{ } clone [ H{ } clone [
swap child-tags [ parse-mode-tag ] with each swap children-tags [ parse-mode-tag ] with each
] keep ; ] keep ;
MEMO: modes ( -- modes ) MEMO: modes ( -- modes )
@ -97,8 +96,8 @@ ERROR: mutually-recursive-rulesets ruleset ;
] if ; ] if ;
: finalize-mode ( rulesets -- ) : finalize-mode ( rulesets -- )
rule-sets [ dup rule-sets [
dup [ nip finalize-rule-set ] assoc-each [ nip finalize-rule-set ] assoc-each
] with-variable ; ] with-variable ;
: load-mode ( name -- rule-sets ) : load-mode ( name -- rule-sets )

View File

@ -1,56 +1,54 @@
USING: xmode.loader.syntax xmode.tokens xmode.rules USING: xmode.loader.syntax xmode.tokens xmode.rules
xmode.keyword-map xml.data xml.traversal xml assocs kernel xmode.keyword-map xml.data xml.traversal xml assocs kernel
combinators sequences math.parser namespaces parser combinators sequences math.parser namespaces parser
xmode.utilities parser-combinators.regexp io.files accessors ; xmode.utilities regexp io.files accessors xml.syntax ;
IN: xmode.loader IN: xmode.loader
! Based on org.gjt.sp.jedit.XModeHandler ! Based on org.gjt.sp.jedit.XModeHandler
! RULES and its children ! RULES and its children
<TAGS: parse-rule-tag ( rule-set tag -- ) TAGS: parse-rule-tag ( rule-set tag -- )
TAG: PROPS TAG: PROPS parse-rule-tag
parse-props-tag >>props drop ; parse-props-tag >>props drop ;
TAG: IMPORT TAG: IMPORT parse-rule-tag
"DELEGATE" attr swap import-rule-set ; "DELEGATE" attr swap import-rule-set ;
TAG: TERMINATE TAG: TERMINATE parse-rule-tag
"AT_CHAR" attr string>number >>terminate-char drop ; "AT_CHAR" attr string>number >>terminate-char drop ;
RULE: SEQ seq-rule RULE: SEQ seq-rule parse-rule-tag
shared-tag-attrs delegate-attr literal-start ; shared-tag-attrs delegate-attr literal-start ;
RULE: SEQ_REGEXP seq-rule RULE: SEQ_REGEXP seq-rule parse-rule-tag
shared-tag-attrs delegate-attr regexp-attr regexp-start ; shared-tag-attrs delegate-attr regexp-attr regexp-start ;
RULE: SPAN span-rule RULE: SPAN span-rule parse-rule-tag
shared-tag-attrs delegate-attr match-type-attr span-attrs parse-begin/end-tags init-span-tag ; shared-tag-attrs delegate-attr match-type-attr span-attrs parse-begin/end-tags init-span-tag ;
RULE: SPAN_REGEXP span-rule RULE: SPAN_REGEXP span-rule parse-rule-tag
shared-tag-attrs delegate-attr match-type-attr span-attrs regexp-attr parse-begin/end-tags init-span-tag ; shared-tag-attrs delegate-attr match-type-attr span-attrs regexp-attr parse-begin/end-tags init-span-tag ;
RULE: EOL_SPAN eol-span-rule RULE: EOL_SPAN eol-span-rule parse-rule-tag
shared-tag-attrs delegate-attr match-type-attr literal-start init-eol-span-tag ; shared-tag-attrs delegate-attr match-type-attr literal-start init-eol-span-tag ;
RULE: EOL_SPAN_REGEXP eol-span-rule RULE: EOL_SPAN_REGEXP eol-span-rule parse-rule-tag
shared-tag-attrs delegate-attr match-type-attr regexp-attr regexp-start init-eol-span-tag ; shared-tag-attrs delegate-attr match-type-attr regexp-attr regexp-start init-eol-span-tag ;
RULE: MARK_FOLLOWING mark-following-rule RULE: MARK_FOLLOWING mark-following-rule parse-rule-tag
shared-tag-attrs match-type-attr literal-start ; shared-tag-attrs match-type-attr literal-start ;
RULE: MARK_PREVIOUS mark-previous-rule RULE: MARK_PREVIOUS mark-previous-rule parse-rule-tag
shared-tag-attrs match-type-attr literal-start ; shared-tag-attrs match-type-attr literal-start ;
TAG: KEYWORDS ( rule-set tag -- key value ) TAG: KEYWORDS parse-rule-tag
rule-set get ignore-case?>> <keyword-map> rule-set get ignore-case?>> <keyword-map>
swap child-tags [ over parse-keyword-tag ] each swap children-tags [ over parse-keyword-tag ] each
swap (>>keywords) ; swap (>>keywords) ;
TAGS>
: ?<regexp> ( string/f -- regexp/f ) : ?<regexp> ( string/f -- regexp/f )
dup [ rule-set get ignore-case?>> <regexp> ] when ; dup [ rule-set get ignore-case?>> <?insensitive-regexp> ] when ;
: (parse-rules-tag) ( tag -- rule-set ) : (parse-rules-tag) ( tag -- rule-set )
<rule-set> dup rule-set set <rule-set> dup rule-set set
@ -66,7 +64,7 @@ TAGS>
: parse-rules-tag ( tag -- rule-set ) : parse-rules-tag ( tag -- rule-set )
[ [
[ (parse-rules-tag) ] [ child-tags ] bi [ (parse-rules-tag) ] [ children-tags ] bi
[ parse-rule-tag ] with each [ parse-rule-tag ] with each
rule-set get rule-set get
] with-scope ; ] with-scope ;

View File

@ -3,7 +3,7 @@
USING: accessors xmode.tokens xmode.rules xmode.keyword-map USING: accessors xmode.tokens xmode.rules xmode.keyword-map
xml.data xml.traversal xml assocs kernel combinators sequences xml.data xml.traversal xml assocs kernel combinators sequences
math.parser namespaces make parser lexer xmode.utilities math.parser namespaces make parser lexer xmode.utilities
parser-combinators.regexp io.files splitting arrays ; regexp io.files splitting arrays xml.syntax xml.syntax.private ;
IN: xmode.loader.syntax IN: xmode.loader.syntax
! Rule tag parsing utilities ! Rule tag parsing utilities
@ -11,9 +11,10 @@ IN: xmode.loader.syntax
new swap init-from-tag swap add-rule ; inline new swap init-from-tag swap add-rule ; inline
: RULE: : RULE:
scan scan-word scan scan-word scan-word [
parse-definition { } make parse-definition { } make
swap [ (parse-rule-tag) ] 2curry (TAG:) ; parsing swap [ (parse-rule-tag) ] 2curry
] dip swap define-tag ; parsing
! Attribute utilities ! Attribute utilities
: string>boolean ( string -- ? ) "TRUE" = ; : string>boolean ( string -- ? ) "TRUE" = ;
@ -32,7 +33,7 @@ IN: xmode.loader.syntax
[ "NAME" attr ] [ "VALUE" attr ] bi ; [ "NAME" attr ] [ "VALUE" attr ] bi ;
: parse-props-tag ( tag -- assoc ) : parse-props-tag ( tag -- assoc )
child-tags children-tags
[ parse-prop-tag ] H{ } map>assoc ; [ parse-prop-tag ] H{ } map>assoc ;
: position-attrs ( tag -- at-line-start? at-whitespace-end? at-word-start? ) : position-attrs ( tag -- at-line-start? at-whitespace-end? at-word-start? )
@ -46,7 +47,8 @@ IN: xmode.loader.syntax
swap position-attrs <matcher> ; swap position-attrs <matcher> ;
: parse-regexp-matcher ( tag -- matcher ) : parse-regexp-matcher ( tag -- matcher )
dup children>string rule-set get ignore-case?>> <regexp> dup children>string
rule-set get ignore-case?>> <?insensitive-regexp>
swap position-attrs <matcher> ; swap position-attrs <matcher> ;
: shared-tag-attrs ( -- ) : shared-tag-attrs ( -- )
@ -79,22 +81,20 @@ IN: xmode.loader.syntax
[ parse-literal-matcher >>end drop ] , ; [ parse-literal-matcher >>end drop ] , ;
! SPAN's children ! SPAN's children
<TAGS: parse-begin/end-tag ( rule tag -- ) TAGS: parse-begin/end-tag ( rule tag -- )
TAG: BEGIN TAG: BEGIN parse-begin/end-tag
! XXX ! XXX
parse-literal-matcher >>start drop ; parse-literal-matcher >>start drop ;
TAG: END TAG: END parse-begin/end-tag
! XXX ! XXX
parse-literal-matcher >>end drop ; parse-literal-matcher >>end drop ;
TAGS>
: parse-begin/end-tags ( -- ) : parse-begin/end-tags ( -- )
[ [
! XXX: handle position attrs on span tag itself ! XXX: handle position attrs on span tag itself
child-tags [ parse-begin/end-tag ] with each children-tags [ parse-begin/end-tag ] with each
] , ; ] , ;
: init-span-tag ( -- ) [ drop init-span ] , ; : init-span-tag ( -- ) [ drop init-span ] , ;

View File

@ -4,9 +4,24 @@ IN: xmode.marker
USING: kernel namespaces make xmode.rules xmode.tokens USING: kernel namespaces make xmode.rules xmode.tokens
xmode.marker.state xmode.marker.context xmode.utilities xmode.marker.state xmode.marker.context xmode.utilities
xmode.catalog sequences math assocs combinators strings xmode.catalog sequences math assocs combinators strings
parser-combinators.regexp splitting parser-combinators ascii regexp splitting ascii unicode.case regexp.matchers
ascii combinators.short-circuit accessors ; ascii combinators.short-circuit accessors ;
! Next two words copied from parser-combinators
! Just like head?, but they optionally ignore case
: string= ( str1 str2 ignore-case -- ? )
[ [ >upper ] bi@ ] when sequence= ;
: string-head? ( str1 str2 ignore-case -- ? )
2over shorter?
[ 3drop f ] [
[
[ nip ]
[ length head-slice ] 2bi
] dip string=
] if ;
! Based on org.gjt.sp.jedit.syntax.TokenMarker ! Based on org.gjt.sp.jedit.syntax.TokenMarker
: current-keyword ( -- string ) : current-keyword ( -- string )
@ -150,7 +165,7 @@ M: escape-rule handle-rule-start
process-escape? get [ process-escape? get [
escaped? [ not ] change escaped? [ not ] change
position [ + ] change position [ + ] change
] [ 2drop ] if ; ] [ drop ] if ;
M: seq-rule handle-rule-start M: seq-rule handle-rule-start
?end-rule ?end-rule

View File

@ -1,6 +1,8 @@
! Copyright (C) 2008 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors xmode.tokens xmode.keyword-map kernel USING: accessors xmode.tokens xmode.keyword-map kernel
sequences vectors assocs strings memoize unicode.case sequences vectors assocs strings memoize unicode.case
parser-combinators.regexp ; regexp ;
IN: xmode.rules IN: xmode.rules
TUPLE: string-matcher string ignore-case? ; TUPLE: string-matcher string ignore-case? ;

View File

@ -1,45 +1,2 @@
USING: assocs xmode.utilities tools.test ;
IN: xmode.utilities.tests IN: xmode.utilities.tests
USING: accessors xmode.utilities tools.test xml xml.data kernel
strings vectors sequences io.files prettyprint assocs
unicode.case ;
TUPLE: company employees type ;
: <company> V{ } clone f company boa ;
: add-employee employees>> push ;
<TAGS: parse-employee-tag
TUPLE: employee name description ;
TAG: employee
employee new
{ { "name" f (>>name) } { f (>>description) } }
init-from-tag swap add-employee ;
TAGS>
\ parse-employee-tag see
: parse-company-tag
[
<company>
{ { "type" >upper (>>type) } }
init-from-tag dup
] keep
children>> [ tag? ] filter
[ parse-employee-tag ] with each ;
[
T{ company f
V{
T{ employee f "Joe" "VP Sales" }
T{ employee f "Jane" "CFO" }
}
"PUBLIC"
}
] [
"vocab:xmode/utilities/test.xml"
file>xml parse-company-tag
] unit-test

View File

@ -1,11 +1,10 @@
USING: accessors sequences assocs kernel quotations namespaces USING: accessors sequences assocs kernel quotations namespaces
xml.data xml.traversal combinators macros parser lexer words fry ; xml.data xml.traversal combinators macros parser lexer words fry
regexp ;
IN: xmode.utilities IN: xmode.utilities
: implies ( x y -- z ) [ not ] dip or ; inline : implies ( x y -- z ) [ not ] dip or ; inline
: child-tags ( tag -- seq ) children>> [ tag? ] filter ;
: tag-init-form ( spec -- quot ) : tag-init-form ( spec -- quot )
{ {
{ [ dup quotation? ] [ [ object get tag get ] prepose ] } { [ dup quotation? ] [ [ object get tag get ] prepose ] }
@ -33,20 +32,5 @@ MACRO: (init-from-tag) ( specs -- )
: init-from-tag ( tag tuple specs -- tuple ) : init-from-tag ( tag tuple specs -- tuple )
over [ (init-from-tag) ] dip ; inline over [ (init-from-tag) ] dip ; inline
SYMBOL: tag-handlers : <?insensitive-regexp> ( string ? -- regexp )
SYMBOL: tag-handler-word "i" "" ? <optioned-regexp> ;
: <TAGS:
CREATE tag-handler-word set
H{ } clone tag-handlers set ; parsing
: (TAG:) ( name quot -- ) swap tag-handlers get set-at ;
: TAG:
scan parse-definition
(TAG:) ; parsing
: TAGS>
tag-handler-word get
tag-handlers get >alist [ [ dup main>> ] dip case ] curry
define ; parsing

View File

@ -41,7 +41,7 @@ M: assoc assoc-like drop ;
: substituter ( assoc -- quot ) : substituter ( assoc -- quot )
[ ?at drop ] curry ; inline [ ?at drop ] curry ; inline
: with-assoc ( assoc quot: ( value key -- assoc ) -- quot: ( key value -- ) ) : with-assoc ( assoc quot: ( value key assoc -- ) -- quot: ( key value -- ) )
curry [ swap ] prepose ; inline curry [ swap ] prepose ; inline
PRIVATE> PRIVATE>

View File

@ -1,7 +1,7 @@
! Copyright (C) 2008 Slava Pestov. ! Copyright (C) 2008 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors regexp prettyprint io io.encodings.ascii USING: accessors regexp.matchers prettyprint io io.encodings.ascii
io.files kernel sequences assocs namespaces ; io.files kernel sequences assocs namespaces regexp ;
IN: benchmark.regex-dna IN: benchmark.regex-dna
! Based on http://shootout.alioth.debian.org/gp4/benchmark.php?test=regexdna&lang=ruby&id=1 ! Based on http://shootout.alioth.debian.org/gp4/benchmark.php?test=regexdna&lang=ruby&id=1