Merge branch 'master' of git://factorcode.org/git/factor

db4
Bruno Deferrari 2008-08-27 02:42:15 -03:00
commit 9ccb68a883
18 changed files with 168 additions and 65 deletions

View File

@ -147,6 +147,7 @@ M: float >base
{ [ dup fp-nan? ] [ drop "0.0/0.0" ] }
{ [ dup 1.0/0.0 = ] [ drop "1.0/0.0" ] }
{ [ dup -1.0/0.0 = ] [ drop "-1.0/0.0" ] }
{ [ dup double>bits HEX: 8000000000000000 = ] [ drop "-0.0" ] }
[ float>string fix-float ]
} cond ;

View File

@ -7,6 +7,7 @@ TUPLE: regexp
raw
{ stack vector }
parse-tree
{ options hashtable }
nfa-table
dfa-table
minimized-table
@ -18,6 +19,7 @@ TUPLE: regexp
0 >>state
V{ } clone >>stack
V{ } clone >>new-states
H{ } clone >>options
H{ } clone >>visited-states ;
SYMBOL: current-regexp

View File

@ -21,6 +21,9 @@ M: letter-class class-member? ( obj class -- ? )
M: LETTER-class class-member? ( obj class -- ? )
drop LETTER? ;
M: Letter-class class-member? ( obj class -- ? )
drop Letter? ;
M: ascii-class class-member? ( obj class -- ? )
drop ascii? ;
@ -47,3 +50,6 @@ M: hex-digit-class class-member? ( obj class -- ? )
M: java-blank-class class-member? ( obj class -- ? )
drop java-blank? ;
M: unmatchable-class class-member? ( obj class -- ? )
2drop f ;

View File

@ -1,10 +1,10 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators io io.streams.string
kernel math math.parser multi-methods namespaces qualified
kernel math math.parser multi-methods namespaces qualified sets
quotations sequences sequences.lib splitting symbols vectors
dlists math.order combinators.lib unicode.categories
sequences.lib regexp2.backend regexp2.utils ;
dlists math.order combinators.lib unicode.categories strings
sequences.lib regexp2.backend regexp2.utils unicode.case ;
IN: regexp2.parser
FROM: math.ranges => [a,b] ;
@ -30,30 +30,41 @@ SINGLETON: back-anchor INSTANCE: back-anchor node
TUPLE: option-on option ; INSTANCE: option-on node
TUPLE: option-off option ; INSTANCE: option-off node
SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case ;
MIXIN: regexp-option
INSTANCE: unix-lines regexp-option
INSTANCE: dotall regexp-option
INSTANCE: multiline regexp-option
INSTANCE: comments regexp-option
INSTANCE: case-insensitive regexp-option
INSTANCE: unicode-case regexp-option
SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case reversed-regexp ;
SINGLETONS: letter-class LETTER-class Letter-class digit-class
alpha-class non-newline-blank-class
ascii-class punctuation-class java-printable-class blank-class
control-character-class hex-digit-class java-blank-class c-identifier-class ;
control-character-class hex-digit-class java-blank-class c-identifier-class
unmatchable-class ;
SINGLETONS: beginning-of-group end-of-group
beginning-of-character-class end-of-character-class
left-parenthesis pipe caret dash ;
: <constant> ( obj -- constant ) constant boa ;
: get-option ( option -- ? ) current-regexp get options>> at ;
: get-unix-lines ( -- ? ) unix-lines get-option ;
: get-dotall ( -- ? ) dotall get-option ;
: get-multiline ( -- ? ) multiline get-option ;
: get-comments ( -- ? ) comments get-option ;
: get-case-insensitive ( -- ? ) case-insensitive get-option ;
: get-unicode-case ( -- ? ) unicode-case get-option ;
: get-reversed-regexp ( -- ? ) reversed-regexp get-option ;
: <negation> ( obj -- negation ) negation boa ;
: <concatenation> ( seq -- concatenation ) >vector concatenation boa ;
: <concatenation> ( seq -- concatenation )
>vector get-reversed-regexp [ reverse ] when
concatenation boa ;
: <alternation> ( seq -- alternation ) >vector alternation boa ;
: <capture-group> ( obj -- capture-group ) capture-group boa ;
: <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
: <constant> ( obj -- constant )
dup Letter? get-case-insensitive and [
[ ch>lower constant boa ]
[ ch>upper constant boa ] bi 2array <alternation>
] [
constant boa
] if ;
: first|concatenation ( seq -- first/concatenation )
dup length 1 = [ first ] [ <concatenation> ] if ;
@ -61,6 +72,17 @@ left-parenthesis pipe caret dash ;
: first|alternation ( seq -- first/alternation )
dup length 1 = [ first ] [ <alternation> ] if ;
: <character-class-range> ( from to -- obj )
2dup [ Letter? ] bi@ or get-case-insensitive and [
[ [ ch>lower ] bi@ character-class-range boa ]
[ [ ch>upper ] bi@ character-class-range boa ] 2bi
2array [ [ from>> ] [ to>> ] bi < ] filter
[ unmatchable-class ] [ first|alternation ] if-empty
] [
2dup <
[ character-class-range boa ] [ 2drop unmatchable-class ] if
] if ;
ERROR: unmatched-parentheses ;
: make-positive-lookahead ( string -- )
@ -90,24 +112,26 @@ ERROR: bad-option ch ;
{ CHAR: i [ case-insensitive ] }
{ CHAR: d [ unix-lines ] }
{ CHAR: m [ multiline ] }
{ CHAR: r [ reversed-regexp ] }
{ CHAR: s [ dotall ] }
{ CHAR: u [ unicode-case ] }
{ CHAR: x [ comments ] }
[ bad-option ]
} case ;
: option-on ( ch -- ) option \ option-on boa push-stack ;
: option-off ( ch -- ) option \ option-off boa push-stack ;
: toggle-option ( ch ? -- ) [ option-on ] [ option-off ] if ;
: option-on ( option -- ) current-regexp get options>> conjoin ;
: option-off ( option -- ) current-regexp get options>> delete-at ;
: toggle-option ( ch ? -- ) [ option ] dip [ option-on ] [ option-off ] if ;
: (parse-options) ( string ? -- ) [ toggle-option ] curry each ;
: parse-options ( string -- )
"-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ;
DEFER: (parse-regexp)
: parse-special-group-options ( options -- )
: parse-special-group ( -- )
beginning-of-group push-stack
parse-options (parse-regexp) pop-stack make-non-capturing-group ;
(parse-regexp) pop-stack make-non-capturing-group ;
ERROR: bad-special-group string ;
@ -126,8 +150,13 @@ ERROR: bad-special-group string ;
{ [ dup CHAR: < = peek1 CHAR: ! = and ]
[ drop read1 drop nested-parse-regexp pop-stack make-negative-lookbehind ] }
[
":" read-until [ bad-special-group ] unless
swap prefix parse-special-group-options
":)" read-until
[ swap prefix ] dip
{
{ CHAR: : [ parse-options parse-special-group ] }
{ CHAR: ) [ parse-options ] }
[ drop bad-special-group ]
} case
]
} cond ;
@ -193,10 +222,10 @@ ERROR: expected-posix-class ;
read1 CHAR: { = [ expected-posix-class ] unless
"}" read-until [ bad-character-class ] unless
{
{ "Lower" [ letter-class ] }
{ "Upper" [ LETTER-class ] }
{ "ASCII" [ ascii-class ] }
{ "Lower" [ get-case-insensitive Letter-class letter-class ? ] }
{ "Upper" [ get-case-insensitive Letter-class LETTER-class ? ] }
{ "Alpha" [ Letter-class ] }
{ "ASCII" [ ascii-class ] }
{ "Digit" [ digit-class ] }
{ "Alnum" [ alpha-class ] }
{ "Punct" [ punctuation-class ] }
@ -250,6 +279,13 @@ ERROR: bad-escaped-literals seq ;
{ CHAR: 0 [ parse-octal <constant> ] }
{ CHAR: c [ parse-control-character ] }
! { CHAR: b [ handle-word-boundary ] }
! { CHAR: B [ handle-word-boundary <negation> ] }
! { CHAR: A [ handle-beginning-of-input ] }
! { CHAR: G [ end of previous match ] }
! { CHAR: Z [ handle-end-of-input ] }
! { CHAR: z [ handle-end-of-input ] } ! except for terminator
{ CHAR: Q [ parse-escaped-literals ] }
} case ;
@ -273,7 +309,7 @@ ERROR: bad-escaped-literals seq ;
handle-dash handle-caret ;
: apply-dash ( -- )
stack [ pop3 nip character-class-range boa ] keep push ;
stack [ pop3 nip <character-class-range> ] keep push ;
: apply-dash? ( -- ? )
stack dup length 3 >=
@ -312,16 +348,9 @@ DEFER: handle-left-bracket
beginning-of-character-class push-stack
parse-character-class-first (parse-character-class) ;
ERROR: empty-regexp ;
: finish-regexp-parse ( stack -- obj )
dup length {
{ 0 [ empty-regexp ] }
{ 1 [ first ] }
[
drop { pipe } split
[ first|concatenation ] map first|alternation
]
} case ;
{ pipe } split
[ first|concatenation ] map first|alternation ;
: handle-right-parenthesis ( -- )
stack beginning-of-group over last-index cut rest

View File

@ -0,0 +1,14 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: kernel strings help.markup help.syntax regexp2.backend ;
IN: regexp2
HELP: <regexp>
{ $values { "string" string } { "regexp" regexp } }
{ $description "Compiles a regular expression into a DFA and returns this object. Regular expressions only have to be compiled once and can then be used multiple times to match input strings." } ;
HELP: <iregexp>
{ $values { "string" string } { "regexp" regexp } }
{ $description "Compiles a case-insensitive regular expression into a DFA and returns this object. Otherwise, exactly the same as " { $link <regexp> } } ;
{ <regexp> <iregexp> } related-words

View File

@ -1,4 +1,5 @@
USING: regexp2 tools.test kernel regexp2.traversal ;
USING: regexp2 tools.test kernel sequences regexp2.parser
regexp2.traversal ;
IN: regexp2-tests
[ f ] [ "b" "a*" <regexp> matches? ] unit-test
@ -151,7 +152,7 @@ IN: regexp2-tests
[ f ] [ "abc" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test
[ t ] [ "ABC" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test
[ t ] [ "" "\\Q\\E" <regexp> matches? ] unit-test
[ f ] [ "" "\\Q\\E" <regexp> matches? ] unit-test
[ f ] [ "a" "\\Q\\E" <regexp> matches? ] unit-test
[ t ] [ "|*+" "\\Q|*+\\E" <regexp> matches? ] unit-test
[ f ] [ "abc" "\\Q|*+\\E" <regexp> matches? ] unit-test
@ -203,6 +204,8 @@ IN: regexp2-tests
<regexp> drop
] unit-test
[ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
[ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
[ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
@ -226,9 +229,29 @@ IN: regexp2-tests
! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
! [ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
! [ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
! [ t ] [ ".o" "\\.[a-z]" <regexp> matches? ] unit-test
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
[ t ] [ ".o" "\\.[a-z]" <regexp> matches? ] unit-test
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
[ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test
[ t ] [ "A" "[a-z]" <iregexp> matches? ] unit-test
[ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
[ t ] [ "A" "\\p{Lower}" <iregexp> matches? ] unit-test
[ t ] [ "abc" <reversed> "abc" <rregexp> matches? ] unit-test
[ t ] [ "abc" <reversed> "a[bB][cC]" <rregexp> matches? ] unit-test
[ t ] [ "adcbe" "a(?r)(bcd)(?-r)e" <rregexp> matches? ] unit-test
! Bug in parsing word
! [ t ] [ "a" R' a' matches? ] unit-test

View File

@ -1,7 +1,8 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors combinators kernel regexp2.backend regexp2.utils
regexp2.parser regexp2.nfa regexp2.dfa regexp2.traversal state-tables
USING: accessors combinators kernel math math.ranges
sequences regexp2.backend regexp2.utils memoize sets
regexp2.parser regexp2.nfa regexp2.dfa regexp2.traversal
regexp2.transition-tables ;
IN: regexp2
@ -13,8 +14,7 @@ IN: regexp2
<transition-table> >>minimized-table
reset-regexp ;
: <regexp> ( string -- regexp )
default-regexp
: construct-regexp ( regexp -- regexp' )
{
[ parse-regexp ]
[ construct-nfa ]
@ -22,6 +22,30 @@ IN: regexp2
[ ]
} cleave ;
: match ( string regexp -- pair )
<dfa-traverser> do-match return-match ;
: matches? ( string regexp -- ? )
dupd match [ [ length ] [ range-length 1- ] bi* = ] [ drop f ] if* ;
: match-head ( string regexp -- end ) match length>> 1- ;
: initial-option ( regexp option -- regexp' )
over options>> conjoin ;
: <regexp> ( string -- regexp )
default-regexp construct-regexp ;
: <iregexp> ( string -- regexp )
default-regexp
case-insensitive initial-option
construct-regexp ;
: <rregexp> ( string -- regexp )
default-regexp
reversed-regexp initial-option
construct-regexp ;
: R! CHAR: ! <regexp> ; parsing
: R" CHAR: " <regexp> ; parsing
: R# CHAR: # <regexp> ; parsing

View File

@ -78,11 +78,3 @@ TUPLE: dfa-traverser
dup matches>>
[ drop f ]
[ [ start-index>> ] [ peek ] bi* 1 <range> ] if-empty ;
: match ( string regexp -- pair )
<dfa-traverser> do-match return-match ;
: matches? ( string regexp -- ? )
dupd match [ [ length ] [ range-length 1- ] bi* = ] [ drop f ] if* ;
: match-head ( string regexp -- end ) match length>> 1- ;

View File

@ -8,7 +8,7 @@ SYMBOL: errored
SYMBOL: before
SYMBOL: after
SYMBOL: quot
TUPLE: random-tester-error ;
ERROR: random-tester-error ;
: setup-test ( #data #code -- data... quot )
#! Variable stack effect
@ -35,7 +35,7 @@ TUPLE: random-tester-error ;
"--" print
[ . ] each
quot get .
random-tester-error construct-empty throw
random-tester-error
] if
] unless clear ;

View File

@ -1,5 +1,7 @@
USING: kernel namespaces sequences sorting vocabs ;
USING: arrays assocs generic hashtables math math.intervals math.parser math.functions refs shuffle vectors words ;
USING: kernel namespaces sequences sets sorting vocabs ;
USING: arrays assocs generic hashtables
math math.intervals math.parser math.order math.functions
refs shuffle vectors words ;
IN: random-tester.safe-words
: ?-words
@ -16,7 +18,11 @@ IN: random-tester.safe-words
array? integer? complex? value-ref? ref? key-ref?
interval? number?
wrapper? tuple?
[-1,1]? between? bignum? both? either? eq? equal? even? fixnum? float? fp-nan? hashtable? interval-contains? interval-subset? interval? key-ref? key? number? odd? pair? power-of-2? ratio? rational? real? subassoc? zero? assoc? curry? vector? callstack? ! clear 3.14 [ <vector> assoc? ] compile-1
[-1,1]? between? bignum? both? either? eq? equal? even? fixnum?
float? fp-nan? hashtable? interval-contains? interval-subset?
interval? key-ref? key? number? odd? pair? power-of-2?
ratio? rational? real? zero? assoc? curry? vector? callstack?
2^ not
! arrays
resize-array <array>
@ -64,6 +70,9 @@ IN: random-tester.safe-words
retainstack callstack
datastack
callstack>array
curry 2curry 3curry compose 3compose
(assoc-each)
} ;
: exit-words
@ -83,28 +92,31 @@ IN: random-tester.safe-words
] { } make ;
: safe-words ( -- array )
bad-words {
"alists" "arrays" "assocs" ! "bit-arrays" "byte-arrays"
{
! "accessors"
"alists" "arrays" "assocs" "bit-arrays" "byte-arrays"
! "classes" "combinators" "compiler" "continuations"
! "core-foundation" "definitions" "documents"
! "float-arrays" "generic" "graphs" "growable"
"hashtables" ! io.*
"kernel" "math"
"kernel" "math"
"math.bitfields" "math.complex" "math.constants" "math.floats"
"math.functions" "math.integers" "math.intervals" "math.libm"
"math.parser" "math.ratios" "math.vectors"
! "namespaces" "quotations" "sbufs"
"math.parser" "math.order" "math.ratios" "math.vectors"
! "namespaces"
"quotations" "sbufs"
! "queues" "strings" "sequences"
"sets"
"vectors"
! "words"
} [ words ] map concat seq-diff natural-sort ;
} [ words ] map concat bad-words diff natural-sort ;
safe-words \ safe-words set-global
! foo dup (clone) = .
! foo dup clone = .
! f [ byte-array>bignum assoc-clone-like ] compile-1
! 2 3.14 [ construct-empty number= ] compile-1
! 2 3.14 [ number= ] compile-1
! 3.14 [ <vector> assoc? ] compile-1
! -3 [ ] 2 [ byte-array>bignum denominator ] compile-1
! : foo ( x -- y ) euler bitand ; { foo } compile 20 foo