Merge branch 'master' of git://factorcode.org/git/factor

2008-08-27 02:42:15 -03:00 · 2008-08-27 02:42:15 -03:00 · 9ccb68a883
parent 4778e90d7d c9ea133b16
commit 9ccb68a883
18 changed files with 168 additions and 65 deletions
--- a/core/math/parser/parser.factor
+++ b/core/math/parser/parser.factor
@ -147,6 +147,7 @@ M: float >base
        { [ dup fp-nan? ] [ drop "0.0/0.0" ] }
        { [ dup 1.0/0.0 = ] [ drop "1.0/0.0" ] }
        { [ dup -1.0/0.0 = ] [ drop "-1.0/0.0" ] }
+        { [ dup double>bits HEX: 8000000000000000 = ] [ drop "-0.0" ] }
        [ float>string fix-float ]
    } cond ;

--- a/unfinished/regexp2/authors.txt
+++ b/unfinished/regexp2/authors.txt
--- a/unfinished/regexp2/backend/backend.factor
+++ b/unfinished/regexp2/backend/backend.factor
@ -7,6 +7,7 @@ TUPLE: regexp
    raw
    { stack vector }
    parse-tree
+    { options hashtable }
    nfa-table
    dfa-table
    minimized-table
@ -18,6 +19,7 @@ TUPLE: regexp
    0 >>state
    V{ } clone >>stack
    V{ } clone >>new-states
+    H{ } clone >>options
    H{ } clone >>visited-states ;

 SYMBOL: current-regexp
--- a/unfinished/regexp2/classes/classes.factor
+++ b/unfinished/regexp2/classes/classes.factor
@ -21,6 +21,9 @@ M: letter-class class-member? ( obj class -- ? )
 M: LETTER-class class-member? ( obj class -- ? )
    drop LETTER? ;

+M: Letter-class class-member? ( obj class -- ? )
+    drop Letter? ;
+
 M: ascii-class class-member? ( obj class -- ? )
    drop ascii? ;

@ -47,3 +50,6 @@ M: hex-digit-class class-member? ( obj class -- ? )

 M: java-blank-class class-member? ( obj class -- ? )
    drop java-blank? ;
+
+M: unmatchable-class class-member? ( obj class -- ? )
+    2drop f ;
--- a/unfinished/regexp2/dfa/dfa.factor
+++ b/unfinished/regexp2/dfa/dfa.factor
--- a/unfinished/regexp2/nfa/nfa.factor
+++ b/unfinished/regexp2/nfa/nfa.factor
--- a/unfinished/regexp2/parser/parser-tests.factor
+++ b/unfinished/regexp2/parser/parser-tests.factor
--- a/unfinished/regexp2/parser/parser.factor
+++ b/unfinished/regexp2/parser/parser.factor
@ -1,10 +1,10 @@
 ! Copyright (C) 2008 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors arrays assocs combinators io io.streams.string
-kernel math math.parser multi-methods namespaces qualified
+kernel math math.parser multi-methods namespaces qualified sets
 quotations sequences sequences.lib splitting symbols vectors
-dlists math.order combinators.lib unicode.categories
-sequences.lib regexp2.backend regexp2.utils ;
+dlists math.order combinators.lib unicode.categories strings
+sequences.lib regexp2.backend regexp2.utils unicode.case ;
 IN: regexp2.parser

 FROM: math.ranges => [a,b] ;
@ -30,30 +30,41 @@ SINGLETON: back-anchor INSTANCE: back-anchor node

 TUPLE: option-on option ; INSTANCE: option-on node
 TUPLE: option-off option ; INSTANCE: option-off node
-SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case ;
-MIXIN: regexp-option
-INSTANCE: unix-lines regexp-option
-INSTANCE: dotall regexp-option
-INSTANCE: multiline regexp-option
-INSTANCE: comments regexp-option
-INSTANCE: case-insensitive regexp-option
-INSTANCE: unicode-case regexp-option
+SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case reversed-regexp ;

 SINGLETONS: letter-class LETTER-class Letter-class digit-class
 alpha-class non-newline-blank-class
 ascii-class punctuation-class java-printable-class blank-class
-control-character-class hex-digit-class java-blank-class c-identifier-class ;
+control-character-class hex-digit-class java-blank-class c-identifier-class
+unmatchable-class ;

 SINGLETONS: beginning-of-group end-of-group
 beginning-of-character-class end-of-character-class
 left-parenthesis pipe caret dash ;

-: <constant> ( obj -- constant ) constant boa ;
+: get-option ( option -- ? ) current-regexp get options>> at ;
+: get-unix-lines ( -- ? ) unix-lines get-option ;
+: get-dotall ( -- ? ) dotall get-option ;
+: get-multiline ( -- ? ) multiline get-option ;
+: get-comments ( -- ? ) comments get-option ;
+: get-case-insensitive ( -- ? ) case-insensitive get-option ;
+: get-unicode-case ( -- ? ) unicode-case get-option ;
+: get-reversed-regexp ( -- ? ) reversed-regexp get-option ;
+
 : <negation> ( obj -- negation ) negation boa ;
-: <concatenation> ( seq -- concatenation ) >vector concatenation boa ;
+: <concatenation> ( seq -- concatenation )
+    >vector get-reversed-regexp [ reverse ] when
+    concatenation boa ;
 : <alternation> ( seq -- alternation ) >vector alternation boa ;
 : <capture-group> ( obj -- capture-group ) capture-group boa ;
 : <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
+: <constant> ( obj -- constant )
+    dup Letter? get-case-insensitive and [
+        [ ch>lower constant boa ]
+        [ ch>upper constant boa ] bi 2array <alternation>
+    ] [
+        constant boa
+    ] if ;

 : first|concatenation ( seq -- first/concatenation )
    dup length 1 = [ first ] [ <concatenation> ] if ;
@ -61,6 +72,17 @@ left-parenthesis pipe caret dash ;
 : first|alternation ( seq -- first/alternation )
    dup length 1 = [ first ] [ <alternation> ] if ;

+: <character-class-range> ( from to -- obj )
+    2dup [ Letter? ] bi@ or get-case-insensitive and [
+        [ [ ch>lower ] bi@ character-class-range boa ]
+        [ [ ch>upper ] bi@ character-class-range boa ] 2bi
+        2array [ [ from>> ] [ to>> ] bi < ] filter
+        [ unmatchable-class ] [ first|alternation ] if-empty
+    ] [
+        2dup <
+        [ character-class-range boa ] [ 2drop unmatchable-class ] if
+    ] if ;
+
 ERROR: unmatched-parentheses ;

 : make-positive-lookahead ( string -- )
@ -90,24 +112,26 @@ ERROR: bad-option ch ;
        { CHAR: i [ case-insensitive ] }
        { CHAR: d [ unix-lines ] }
        { CHAR: m [ multiline ] }
+        { CHAR: r [ reversed-regexp ] }
        { CHAR: s [ dotall ] }
        { CHAR: u [ unicode-case ] }
        { CHAR: x [ comments ] }
        [ bad-option ]
    } case ;
-    
-: option-on ( ch -- ) option \ option-on boa push-stack ;
-: option-off ( ch -- ) option \ option-off boa push-stack ;
-: toggle-option ( ch ? -- ) [ option-on ] [ option-off ] if ;
+
+: option-on ( option -- ) current-regexp get options>> conjoin ;
+: option-off ( option -- ) current-regexp get options>> delete-at ;
+
+: toggle-option ( ch ? -- ) [ option ] dip [ option-on ] [ option-off ] if ;
 : (parse-options) ( string ? -- ) [ toggle-option ] curry each ;

 : parse-options ( string -- )
    "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ;

 DEFER: (parse-regexp)
-: parse-special-group-options ( options -- )
+: parse-special-group ( -- )
    beginning-of-group push-stack
-    parse-options (parse-regexp) pop-stack make-non-capturing-group ;
+    (parse-regexp) pop-stack make-non-capturing-group ;

 ERROR: bad-special-group string ;

@ -126,8 +150,13 @@ ERROR: bad-special-group string ;
        { [ dup CHAR: < = peek1 CHAR: ! = and ]
            [ drop read1 drop nested-parse-regexp pop-stack make-negative-lookbehind ] }
        [
-            ":" read-until [ bad-special-group ] unless
-            swap prefix parse-special-group-options
+            ":)" read-until
+            [ swap prefix ] dip
+            {
+                { CHAR: : [ parse-options parse-special-group ] }
+                { CHAR: ) [ parse-options ] }
+                [ drop bad-special-group ]
+            } case
        ]
    } cond ;

@ -193,10 +222,10 @@ ERROR: expected-posix-class ;
    read1 CHAR: { = [ expected-posix-class ] unless
    "}" read-until [ bad-character-class ] unless
    {
-        { "Lower" [ letter-class ] }
-        { "Upper" [ LETTER-class ] }
-        { "ASCII" [ ascii-class ] }
+        { "Lower" [ get-case-insensitive Letter-class letter-class ? ] }
+        { "Upper" [ get-case-insensitive Letter-class LETTER-class ? ] }
        { "Alpha" [ Letter-class ] }
+        { "ASCII" [ ascii-class ] }
        { "Digit" [ digit-class ] }
        { "Alnum" [ alpha-class ] }
        { "Punct" [ punctuation-class ] }
@ -250,6 +279,13 @@ ERROR: bad-escaped-literals seq ;
        { CHAR: 0 [ parse-octal <constant> ] }
        { CHAR: c [ parse-control-character ] }

+        ! { CHAR: b [ handle-word-boundary ] }
+        ! { CHAR: B [ handle-word-boundary <negation> ] }
+        ! { CHAR: A [ handle-beginning-of-input ] }
+        ! { CHAR: G [ end of previous match ] }
+        ! { CHAR: Z [ handle-end-of-input ] }
+        ! { CHAR: z [ handle-end-of-input ] } ! except for terminator
+
        { CHAR: Q [ parse-escaped-literals ] }
    } case ;

@ -273,7 +309,7 @@ ERROR: bad-escaped-literals seq ;
    handle-dash handle-caret ;

 : apply-dash ( -- )
-    stack [ pop3 nip character-class-range boa ] keep push ;
+    stack [ pop3 nip <character-class-range> ] keep push ;

 : apply-dash? ( -- ? )
    stack dup length 3 >=
@ -312,16 +348,9 @@ DEFER: handle-left-bracket
    beginning-of-character-class push-stack
    parse-character-class-first (parse-character-class) ;

-ERROR: empty-regexp ;
 : finish-regexp-parse ( stack -- obj )
-    dup length {
-        { 0 [ empty-regexp ] }
-        { 1 [ first ] }
-        [
-            drop { pipe } split
-            [ first|concatenation ] map first|alternation
-        ]
-    } case ;
+    { pipe } split
+    [ first|concatenation ] map first|alternation ;

 : handle-right-parenthesis ( -- )
    stack beginning-of-group over last-index cut rest
--- a/unfinished/regexp2/regexp2-docs.factor
+++ b/unfinished/regexp2/regexp2-docs.factor
@ -0,0 +1,14 @@
+! Copyright (C) 2008 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: kernel strings help.markup help.syntax regexp2.backend ;
+IN: regexp2
+
+HELP: <regexp>
+{ $values { "string" string } { "regexp" regexp } }
+{ $description "Compiles a regular expression into a DFA and returns this object.  Regular expressions only have to be compiled once and can then be used multiple times to match input strings." } ;
+
+HELP: <iregexp>
+{ $values { "string" string } { "regexp" regexp } }
+{ $description "Compiles a case-insensitive regular expression into a DFA and returns this object.  Otherwise, exactly the same as " { $link <regexp> } } ;
+
+{ <regexp> <iregexp> } related-words
--- a/unfinished/regexp2/regexp2-tests.factor
+++ b/unfinished/regexp2/regexp2-tests.factor
@ -1,4 +1,5 @@
-USING: regexp2 tools.test kernel regexp2.traversal ;
+USING: regexp2 tools.test kernel sequences regexp2.parser
+regexp2.traversal ;
 IN: regexp2-tests

 [ f ] [ "b" "a*" <regexp> matches? ] unit-test
@ -151,7 +152,7 @@ IN: regexp2-tests
 [ f ] [ "abc" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test
 [ t ] [ "ABC" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test

-[ t ] [ "" "\\Q\\E" <regexp> matches? ] unit-test
+[ f ] [ "" "\\Q\\E" <regexp> matches? ] unit-test
 [ f ] [ "a" "\\Q\\E" <regexp> matches? ] unit-test
 [ t ] [ "|*+" "\\Q|*+\\E" <regexp> matches? ] unit-test
 [ f ] [ "abc" "\\Q|*+\\E" <regexp> matches? ] unit-test
@ -203,6 +204,8 @@ IN: regexp2-tests
    <regexp> drop
 ] unit-test

+[ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
+
 [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
 [ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test

@ -226,9 +229,29 @@ IN: regexp2-tests
 ! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
 ! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test

-! [ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
-! [ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
-! [ t ] [ ".o" "\\.[a-z]" <regexp> matches? ] unit-test
+[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
+[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
+[ t ] [ ".o" "\\.[a-z]" <regexp> matches? ] unit-test
+
+[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
+[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
+[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
+[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
+
+[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
+[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
+[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
+[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
+
+[ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test
+[ t ] [ "A" "[a-z]" <iregexp> matches? ] unit-test
+
+[ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
+[ t ] [ "A" "\\p{Lower}" <iregexp> matches? ] unit-test
+
+[ t ] [ "abc" <reversed> "abc" <rregexp> matches? ] unit-test
+[ t ] [ "abc" <reversed> "a[bB][cC]" <rregexp> matches? ] unit-test
+[ t ] [ "adcbe" "a(?r)(bcd)(?-r)e" <rregexp> matches? ] unit-test

 ! Bug in parsing word
 ! [ t ] [ "a" R' a' matches?  ] unit-test
--- a/unfinished/regexp2/regexp2.factor
+++ b/unfinished/regexp2/regexp2.factor
@ -1,7 +1,8 @@
 ! Copyright (C) 2008 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors combinators kernel regexp2.backend regexp2.utils
-regexp2.parser regexp2.nfa regexp2.dfa regexp2.traversal state-tables
+USING: accessors combinators kernel math math.ranges
+sequences regexp2.backend regexp2.utils memoize sets
+regexp2.parser regexp2.nfa regexp2.dfa regexp2.traversal
 regexp2.transition-tables ;
 IN: regexp2

@ -13,8 +14,7 @@ IN: regexp2
        <transition-table> >>minimized-table
        reset-regexp ;

-: <regexp> ( string -- regexp )
-    default-regexp
+: construct-regexp ( regexp -- regexp' )
    {
        [ parse-regexp ]
        [ construct-nfa ]
@ -22,6 +22,30 @@ IN: regexp2
        [ ]
    } cleave ;

+: match ( string regexp -- pair )
+    <dfa-traverser> do-match return-match ;
+
+: matches? ( string regexp -- ? )
+    dupd match [ [ length ] [ range-length 1- ] bi* = ] [ drop f ] if* ;
+
+: match-head ( string regexp -- end ) match length>> 1- ;
+
+: initial-option ( regexp option -- regexp' )
+    over options>> conjoin ;
+
+: <regexp> ( string -- regexp )
+    default-regexp construct-regexp ;
+
+: <iregexp> ( string -- regexp )
+    default-regexp
+    case-insensitive initial-option
+    construct-regexp ;
+
+: <rregexp> ( string -- regexp )
+    default-regexp
+    reversed-regexp initial-option
+    construct-regexp ;
+
 : R! CHAR: ! <regexp> ; parsing
 : R" CHAR: " <regexp> ; parsing
 : R# CHAR: # <regexp> ; parsing
--- a/unfinished/regexp2/summary.txt
+++ b/unfinished/regexp2/summary.txt
--- a/unfinished/regexp2/tags.txt
+++ b/unfinished/regexp2/tags.txt
--- a/unfinished/regexp2/transition-tables/transition-tables.factor
+++ b/unfinished/regexp2/transition-tables/transition-tables.factor
--- a/unfinished/regexp2/traversal/traversal.factor
+++ b/unfinished/regexp2/traversal/traversal.factor
@ -78,11 +78,3 @@ TUPLE: dfa-traverser
    dup matches>>
    [ drop f ]
    [ [ start-index>> ] [ peek ] bi* 1 <range> ] if-empty ;
-
-: match ( string regexp -- pair )
-    <dfa-traverser> do-match return-match ;
-
-: matches? ( string regexp -- ? )
-    dupd match [ [ length ] [ range-length 1- ] bi* = ] [ drop f ] if* ;
-
-: match-head ( string regexp -- end ) match length>> 1- ;
--- a/unfinished/regexp2/utils/utils.factor
+++ b/unfinished/regexp2/utils/utils.factor
--- a/unmaintained/random-tester/random-tester.factor
+++ b/unmaintained/random-tester/random-tester.factor
@ -8,7 +8,7 @@ SYMBOL: errored
 SYMBOL: before
 SYMBOL: after
 SYMBOL: quot
-TUPLE: random-tester-error ;
+ERROR: random-tester-error ;

 : setup-test ( #data #code -- data... quot )
    #! Variable stack effect
@ -35,7 +35,7 @@ TUPLE: random-tester-error ;
            "--" print
            [ . ] each
            quot get .
-            random-tester-error construct-empty throw
+            random-tester-error
        ] if
    ] unless clear ;

--- a/unmaintained/random-tester/safe-words/safe-words.factor
+++ b/unmaintained/random-tester/safe-words/safe-words.factor
@ -1,5 +1,7 @@
-USING: kernel namespaces sequences sorting vocabs ;
-USING: arrays assocs generic hashtables  math math.intervals math.parser math.functions refs shuffle vectors words ;
+USING: kernel namespaces sequences sets sorting vocabs ;
+USING: arrays assocs generic hashtables 
+math math.intervals math.parser math.order math.functions
+refs shuffle vectors words ;
 IN: random-tester.safe-words

 : ?-words
@ -16,7 +18,11 @@ IN: random-tester.safe-words
        array? integer? complex? value-ref? ref? key-ref?
        interval? number?
        wrapper? tuple?
-        [-1,1]? between? bignum? both? either? eq? equal? even? fixnum? float? fp-nan? hashtable? interval-contains? interval-subset? interval? key-ref? key? number? odd? pair? power-of-2? ratio? rational? real? subassoc? zero? assoc? curry? vector? callstack? ! clear 3.14 [ <vector> assoc? ] compile-1
+        [-1,1]? between? bignum? both? either? eq? equal? even? fixnum?
+        float? fp-nan? hashtable? interval-contains? interval-subset?
+        interval? key-ref? key? number? odd? pair? power-of-2?
+        ratio? rational? real? zero? assoc? curry? vector? callstack?
+
        2^ not
        ! arrays
        resize-array <array>
@ -64,6 +70,9 @@ IN: random-tester.safe-words
        retainstack callstack
        datastack
        callstack>array
+
+        curry 2curry 3curry compose 3compose
+        (assoc-each)
    } ;

 : exit-words
@ -83,28 +92,31 @@ IN: random-tester.safe-words
    ] { } make ;

 : safe-words ( -- array )
-    bad-words {
-        "alists" "arrays" "assocs" ! "bit-arrays" "byte-arrays"
+    {
+        ! "accessors"
+        "alists" "arrays" "assocs" "bit-arrays" "byte-arrays"
        ! "classes" "combinators" "compiler" "continuations"
        ! "core-foundation" "definitions" "documents"
        ! "float-arrays" "generic" "graphs" "growable"
        "hashtables"  ! io.*
-        "kernel" "math" 
+        "kernel" "math"
        "math.bitfields" "math.complex" "math.constants" "math.floats"
        "math.functions" "math.integers" "math.intervals" "math.libm"
-        "math.parser" "math.ratios" "math.vectors"
-        ! "namespaces" "quotations" "sbufs"
+        "math.parser" "math.order" "math.ratios" "math.vectors"
+        ! "namespaces"
+        "quotations" "sbufs"
        ! "queues" "strings" "sequences"
+        "sets"
        "vectors"
        ! "words"
-    } [ words ] map concat seq-diff natural-sort ;
+    } [ words ] map concat bad-words diff natural-sort ;
    
 safe-words \ safe-words set-global

 ! foo dup (clone) = .
 ! foo dup clone = .
 ! f [ byte-array>bignum assoc-clone-like ] compile-1
-! 2 3.14 [ construct-empty number= ] compile-1
+! 2 3.14 [ number= ] compile-1
 ! 3.14 [ <vector> assoc? ] compile-1
 ! -3 [ ] 2 [ byte-array>bignum denominator ] compile-1
-
+! : foo ( x -- y ) euler bitand ; { foo } compile 20 foo