From 21518ff78e52325e76aaf0c57e4009cc2d33c12e Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sat, 16 Aug 2008 14:14:20 -0500 Subject: [PATCH 01/16] better error handling in smtp --- basis/smtp/smtp.factor | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/basis/smtp/smtp.factor b/basis/smtp/smtp.factor index 16a13eafe8..5351194598 100755 --- a/basis/smtp/smtp.factor +++ b/basis/smtp/smtp.factor @@ -58,19 +58,37 @@ LOG: log-smtp-connection NOTICE ( addrspec -- ) LOG: smtp-response DEBUG +ERROR: smtp-error message ; +ERROR: smtp-server-busy < smtp-error ; +ERROR: smtp-syntax-error < smtp-error ; +ERROR: smtp-command-not-implemented < smtp-error ; +ERROR: smtp-bad-authentication < smtp-error ; +ERROR: smtp-mailbox-unavailable < smtp-error ; +ERROR: smtp-user-not-local < smtp-error ; +ERROR: smtp-exceeded-storage-allocation < smtp-error ; +ERROR: smtp-bad-mailbox-name < smtp-error ; +ERROR: smtp-transaction-failed < smtp-error ; + : check-response ( response -- ) + dup smtp-response { - { [ dup "220" head? ] [ smtp-response ] } - { [ dup "235" swap subseq? ] [ smtp-response ] } - { [ dup "250" head? ] [ smtp-response ] } - { [ dup "221" head? ] [ smtp-response ] } - { [ dup "bye" head? ] [ smtp-response ] } - { [ dup "4" head? ] [ "server busy" throw ] } - { [ dup "354" head? ] [ smtp-response ] } - { [ dup "50" head? ] [ smtp-response "syntax error" throw ] } - { [ dup "53" head? ] [ smtp-response "invalid authentication data" throw ] } - { [ dup "55" head? ] [ smtp-response "fatal error" throw ] } - [ "unknown error" throw ] + { [ dup "bye" head? ] [ drop ] } + { [ dup "220" head? ] [ drop ] } + { [ dup "235" swap subseq? ] [ drop ] } + { [ dup "250" head? ] [ drop ] } + { [ dup "221" head? ] [ drop ] } + { [ dup "354" head? ] [ drop ] } + { [ dup "4" head? ] [ smtp-server-busy ] } + { [ dup "500" head? ] [ smtp-syntax-error ] } + { [ dup "501" head? ] [ smtp-command-not-implemented ] } + { [ dup "50" head? ] [ smtp-syntax-error ] } + { [ dup "53" head? ] [ smtp-bad-authentication ] } + { [ dup "550" head? ] [ smtp-mailbox-unavailable ] } + { [ dup "551" head? ] [ smtp-user-not-local ] } + { [ dup "552" head? ] [ smtp-exceeded-storage-allocation ] } + { [ dup "553" head? ] [ smtp-bad-mailbox-name ] } + { [ dup "554" head? ] [ smtp-transaction-failed ] } + [ smtp-error ] } cond ; : multiline? ( response -- boolean ) From fa5cff02e857cbf9a6f0cd42a018b3fab82848db Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sat, 16 Aug 2008 14:20:13 -0500 Subject: [PATCH 02/16] move more throws to ERROR: --- basis/smtp/smtp.factor | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/basis/smtp/smtp.factor b/basis/smtp/smtp.factor index 5351194598..472fabf8b2 100755 --- a/basis/smtp/smtp.factor +++ b/basis/smtp/smtp.factor @@ -3,7 +3,7 @@ ! See http://factorcode.org/license.txt for BSD license. USING: namespaces io io.timeouts kernel logging io.sockets sequences combinators sequences.lib splitting assocs strings -math.parser random system calendar io.encodings.ascii +math.parser random system calendar io.encodings.ascii summary calendar.format accessors sets ; IN: smtp @@ -30,10 +30,12 @@ LOG: log-smtp-connection NOTICE ( addrspec -- ) : helo ( -- ) esmtp get "EHLO " "HELO " ? host-name append command ; +ERROR: bad-email-address email ; + : validate-address ( string -- string' ) #! Make sure we send funky stuff to the server by accident. dup "\r\n>" intersect empty? - [ "Bad e-mail address: " prepend throw ] unless ; + [ bad-email-address ] unless ; : mail-from ( fromaddr -- ) "MAIL FROM:<" swap validate-address ">" 3append command ; @@ -44,8 +46,15 @@ LOG: log-smtp-connection NOTICE ( addrspec -- ) : data ( -- ) "DATA" command ; +ERROR: message-contains-dot message ; + +M: message-contains-dot summary ( obj -- string ) + drop + "Message cannot contain . on a line by itself" ; + : validate-message ( msg -- msg' ) - "." over member? [ "Message cannot contain . on a line by itself" throw ] when ; + "." over member? + [ message-contains-dot ] when ; : send-body ( body -- ) string-lines @@ -107,9 +116,11 @@ ERROR: smtp-transaction-failed < smtp-error ; : get-ok ( -- ) receive-response check-response ; +ERROR: invalid-header-string string ; + : validate-header ( string -- string' ) dup "\r\n" intersect empty? - [ "Invalid header string: " prepend throw ] unless ; + [ invalid-header-string ] unless ; : write-header ( key value -- ) swap From bd8dcd4e0a0f5bcc78cd405ee0d3460ee154e231 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sat, 16 Aug 2008 14:20:36 -0500 Subject: [PATCH 03/16] move quad to quadruple in utf8 --- core/io/encodings/utf8/utf8.factor | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/io/encodings/utf8/utf8.factor b/core/io/encodings/utf8/utf8.factor index ae8a455c71..8030d6265e 100755 --- a/core/io/encodings/utf8/utf8.factor +++ b/core/io/encodings/utf8/utf8.factor @@ -24,7 +24,7 @@ SINGLETON: utf8 : triple ( stream byte -- stream char ) BIN: 1111 bitand append-nums append-nums ; inline -: quad ( stream byte -- stream char ) +: quadruple ( stream byte -- stream char ) BIN: 111 bitand append-nums append-nums append-nums ; inline : begin-utf8 ( stream byte -- stream char ) @@ -32,7 +32,7 @@ SINGLETON: utf8 { [ dup -7 shift zero? ] [ ] } { [ dup -5 shift BIN: 110 number= ] [ double ] } { [ dup -4 shift BIN: 1110 number= ] [ triple ] } - { [ dup -3 shift BIN: 11110 number= ] [ quad ] } + { [ dup -3 shift BIN: 11110 number= ] [ quadruple ] } [ drop replacement-char ] } cond ; inline From 107cadf604bade7bbcdeebc0c008025f75038581 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sat, 16 Aug 2008 15:20:34 -0500 Subject: [PATCH 04/16] add the %chance combinator --- extra/combinators/lib/lib-docs.factor | 9 +++++++++ extra/combinators/lib/lib.factor | 4 +++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/extra/combinators/lib/lib-docs.factor b/extra/combinators/lib/lib-docs.factor index fe2f3556ef..cde3b4d259 100755 --- a/extra/combinators/lib/lib-docs.factor +++ b/extra/combinators/lib/lib-docs.factor @@ -11,3 +11,12 @@ HELP: generate "[ 20 random-prime ] [ 4 mod 3 = ] generate ." "526367" } ; + +HELP: %chance +{ $values { "quot" quotation } { "n" integer } } +{ $description "Calls the quotation " { $snippet "n" } " percent of the time." } +{ $unchecked-example + "USING: io ;" + "[ \"hello, world! maybe.\" print ] 50 %chance" + "" +} ; diff --git a/extra/combinators/lib/lib.factor b/extra/combinators/lib/lib.factor index a7d5e4cf58..bbf6644b41 100755 --- a/extra/combinators/lib/lib.factor +++ b/extra/combinators/lib/lib.factor @@ -4,7 +4,7 @@ ! See http://factorcode.org/license.txt for BSD license. USING: kernel combinators fry namespaces quotations hashtables sequences assocs arrays inference effects math math.ranges -generalizations macros continuations locals ; +generalizations macros continuations random locals ; IN: combinators.lib @@ -147,3 +147,5 @@ MACRO: predicates ( seq -- quot/f ) dup [ 1quotation [ drop ] prepend ] map >r [ [ dup ] prepend ] map r> zip [ drop f ] suffix [ cond ] curry ; + +: %chance ( quot integer -- ) 100 random > swap when ; inline From dbddc8d590bdf90fc4ef5fd3e5ce5dfdee028587 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sat, 16 Aug 2008 15:59:32 -0500 Subject: [PATCH 05/16] add types to TUPLE: email --- basis/smtp/smtp.factor | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/basis/smtp/smtp.factor b/basis/smtp/smtp.factor index 472fabf8b2..62fd9caab1 100755 --- a/basis/smtp/smtp.factor +++ b/basis/smtp/smtp.factor @@ -1,10 +1,10 @@ ! Copyright (C) 2007, 2008 Elie CHAFTARI, Dirk Vleugels, ! Slava Pestov. ! See http://factorcode.org/license.txt for BSD license. -USING: namespaces io io.timeouts kernel logging io.sockets +USING: arrays namespaces io io.timeouts kernel logging io.sockets sequences combinators sequences.lib splitting assocs strings math.parser random system calendar io.encodings.ascii summary -calendar.format accessors sets ; +calendar.format accessors sets hashtables ; IN: smtp SYMBOL: smtp-domain @@ -132,7 +132,14 @@ ERROR: invalid-header-string string ; : write-headers ( assoc -- ) [ write-header ] assoc-each ; -TUPLE: email from to subject headers body ; +TUPLE: email + { from string } + { to array } + { subject string } + { headers hashtable } + { body string } ; + +: ( -- email ) email new ; M: email clone call-next-method [ clone ] change-headers ; @@ -177,10 +184,6 @@ M: email clone now timestamp>rfc822 "Date" set-header message-id "Message-Id" set-header ; -: ( -- email ) - email new - H{ } clone >>headers ; - : send-email ( email -- ) prepare (send) ; From 0eea37c13dabb02a18bd56804a3d05d9dbfedc8c Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sat, 16 Aug 2008 16:05:00 -0500 Subject: [PATCH 06/16] add quad combinator inline the do-while combinator --- extra/combinators/lib/lib.factor | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/extra/combinators/lib/lib.factor b/extra/combinators/lib/lib.factor index bbf6644b41..3b92844b3f 100755 --- a/extra/combinators/lib/lib.factor +++ b/extra/combinators/lib/lib.factor @@ -31,6 +31,8 @@ IN: combinators.lib ! Generalized versions of core combinators ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +: quad ( x p q r s -- ) >r >r >r keep r> keep r> keep r> call ; inline + : 4slip ( quot a b c d -- a b c d ) 4 nslip ; inline : 4keep ( w x y z quot -- w x y z ) 4 nkeep ; inline @@ -137,7 +139,7 @@ MACRO: multikeep ( word out-indexes -- ... ) [ drop ] rot compose attempt-all ; inline : do-while ( pred body tail -- ) - >r tuck 2slip r> while ; + >r tuck 2slip r> while ; inline : generate ( generator predicate -- obj ) [ dup ] swap [ dup [ nip ] unless not ] 3compose From 5e247325b27f266d0de711d157dd726071046243 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sat, 16 Aug 2008 17:20:18 -0500 Subject: [PATCH 07/16] refactor smtp to not clone the email add email>headers word instead --- basis/smtp/smtp-tests.factor | 19 ++++----- basis/smtp/smtp.factor | 79 ++++++++++++++++-------------------- 2 files changed, 41 insertions(+), 57 deletions(-) diff --git a/basis/smtp/smtp-tests.factor b/basis/smtp/smtp-tests.factor index 5d350d80c4..7cc0e7efbb 100755 --- a/basis/smtp/smtp-tests.factor +++ b/basis/smtp/smtp-tests.factor @@ -3,12 +3,6 @@ smtp.server kernel sequences namespaces logging accessors assocs sorting ; IN: smtp.tests -[ t ] [ - - dup clone "a" "b" set-header drop - headers>> assoc-empty? -] unit-test - { 0 0 } [ [ ] with-smtp-connection ] must-infer-as [ "hello\nworld" validate-address ] must-fail @@ -60,12 +54,13 @@ IN: smtp.tests "Ed " } >>to "Doug " >>from - prepare - dup headers>> >alist sort-keys [ - drop { "Date" "Message-Id" } member? not - ] assoc-filter - over to>> - rot from>> + [ + email>headers sort-keys [ + drop { "Date" "Message-Id" } member? not + ] assoc-filter + ] + [ to>> [ extract-email ] map ] + [ from>> extract-email ] tri ] unit-test [ ] [ [ 4321 mock-smtp-server ] "SMTP server" spawn drop ] unit-test diff --git a/basis/smtp/smtp.factor b/basis/smtp/smtp.factor index 62fd9caab1..7dbb105142 100755 --- a/basis/smtp/smtp.factor +++ b/basis/smtp/smtp.factor @@ -23,6 +23,14 @@ LOG: log-smtp-connection NOTICE ( addrspec -- ) call ] with-client ; inline +TUPLE: email + { from string } + { to array } + { subject string } + { body string } ; + +: ( -- email ) email new ; + : crlf ( -- ) "\r\n" write ; : command ( string -- ) write crlf flush ; @@ -123,43 +131,12 @@ ERROR: invalid-header-string string ; [ invalid-header-string ] unless ; : write-header ( key value -- ) - swap - validate-header write - ": " write - validate-header write - crlf ; + [ validate-header write ] + [ ": " write validate-header write ] bi* crlf ; : write-headers ( assoc -- ) [ write-header ] assoc-each ; -TUPLE: email - { from string } - { to array } - { subject string } - { headers hashtable } - { body string } ; - -: ( -- email ) email new ; - -M: email clone - call-next-method [ clone ] change-headers ; - -: (send) ( email -- ) - [ - helo get-ok - dup from>> mail-from get-ok - dup to>> [ rcpt-to get-ok ] each - data get-ok - dup headers>> write-headers - crlf - body>> send-body get-ok - quit get-ok - ] with-smtp-connection ; - -: extract-email ( recepient -- email ) - #! This could be much smarter. - " " last-split1 swap or "<" ?head drop ">" ?tail drop ; - : message-id ( -- string ) [ "<" % @@ -171,21 +148,33 @@ M: email clone ">" % ] "" make ; -: set-header ( email value key -- email ) - pick headers>> set-at ; +: extract-email ( recepient -- email ) + #! This could be much smarter. + " " last-split1 swap or "<" ?head drop ">" ?tail drop ; -: prepare ( email -- email ) - clone - dup from>> "From" set-header - [ extract-email ] change-from - dup to>> ", " join "To" set-header - [ [ extract-email ] map ] change-to - dup subject>> "Subject" set-header - now timestamp>rfc822 "Date" set-header - message-id "Message-Id" set-header ; +: email>headers ( email -- hashtable ) + [ + [ from>> "From" set ] + [ to>> ", " join "To" set ] + [ subject>> "Subject" set ] tri + now timestamp>rfc822 "Date" set + message-id "Message-Id" set + ] { } make-assoc ; + +: (send-email) ( headers email -- ) + [ + helo get-ok + dup from>> extract-email mail-from get-ok + dup to>> [ extract-email rcpt-to get-ok ] each + data get-ok + swap write-headers + crlf + body>> send-body get-ok + quit get-ok + ] with-smtp-connection ; : send-email ( email -- ) - prepare (send) ; + [ email>headers ] keep (send-email) ; ! Dirk's old AUTH CRAM-MD5 code. I don't know anything about ! CRAM MD5, and the old code didn't work properly either, so here From 4bbf2de24966885b2851301fdb62193b263a1981 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sat, 16 Aug 2008 17:28:38 -0500 Subject: [PATCH 08/16] add support for cc and bcc to smtp --- basis/smtp/smtp.factor | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/basis/smtp/smtp.factor b/basis/smtp/smtp.factor index 7dbb105142..828d892e41 100755 --- a/basis/smtp/smtp.factor +++ b/basis/smtp/smtp.factor @@ -26,6 +26,8 @@ LOG: log-smtp-connection NOTICE ( addrspec -- ) TUPLE: email { from string } { to array } + { cc array } + { bcc array } { subject string } { body string } ; @@ -154,9 +156,12 @@ ERROR: invalid-header-string string ; : email>headers ( email -- hashtable ) [ - [ from>> "From" set ] - [ to>> ", " join "To" set ] - [ subject>> "Subject" set ] tri + { + [ from>> "From" set ] + [ to>> ", " join "To" set ] + [ cc>> ", " join "Cc" set ] + [ subject>> "Subject" set ] + } cleave now timestamp>rfc822 "Date" set message-id "Message-Id" set ] { } make-assoc ; @@ -166,6 +171,8 @@ ERROR: invalid-header-string string ; helo get-ok dup from>> extract-email mail-from get-ok dup to>> [ extract-email rcpt-to get-ok ] each + dup cc>> [ extract-email rcpt-to get-ok ] each + dup bcc>> [ extract-email rcpt-to get-ok ] each data get-ok swap write-headers crlf From a3e25491c0aec89f03b455e242d65f78a634f8ae Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sun, 17 Aug 2008 09:37:04 -0500 Subject: [PATCH 09/16] dno't send Cc in header if it's not set --- basis/smtp/smtp.factor | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/basis/smtp/smtp.factor b/basis/smtp/smtp.factor index 828d892e41..63a37acf36 100755 --- a/basis/smtp/smtp.factor +++ b/basis/smtp/smtp.factor @@ -159,7 +159,7 @@ ERROR: invalid-header-string string ; { [ from>> "From" set ] [ to>> ", " join "To" set ] - [ cc>> ", " join "Cc" set ] + [ cc>> ", " join [ "Cc" set ] unless-empty ] [ subject>> "Subject" set ] } cleave now timestamp>rfc822 "Date" set From 8f0073960155e9cd1b07871ce56728eaf044ec9f Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sun, 17 Aug 2008 10:38:34 -0500 Subject: [PATCH 10/16] remove old accessors, cleanup --- extra/html/parser/analyzer/analyzer.factor | 6 +- extra/html/parser/parser-tests.factor | 29 +++-- extra/html/parser/parser.factor | 93 ++++++-------- extra/html/parser/printer/printer.factor | 140 ++++++++------------- extra/html/parser/utils/utils.factor | 10 +- 5 files changed, 114 insertions(+), 164 deletions(-) diff --git a/extra/html/parser/analyzer/analyzer.factor b/extra/html/parser/analyzer/analyzer.factor index f167feba06..29ccc345d3 100755 --- a/extra/html/parser/analyzer/analyzer.factor +++ b/extra/html/parser/analyzer/analyzer.factor @@ -56,8 +56,7 @@ TUPLE: link attributes clickable ; : trim-text ( vector -- vector' ) [ dup name>> text = [ - [ text>> [ blank? ] trim ] keep - [ set-tag-text ] keep + [ [ blank? ] trim ] change-text ] when ] map ; @@ -173,8 +172,7 @@ TUPLE: link attributes clickable ; [ { { [ dup name>> "form" = ] - [ "form action: " write attributes>> "action" swap at print - ] } + [ "form action: " write attributes>> "action" swap at print ] } { [ dup name>> "input" = ] [ input. ] } [ drop ] } cond diff --git a/extra/html/parser/parser-tests.factor b/extra/html/parser/parser-tests.factor index 0e98c1b998..9757f70a67 100644 --- a/extra/html/parser/parser-tests.factor +++ b/extra/html/parser/parser-tests.factor @@ -2,19 +2,19 @@ USING: html.parser kernel tools.test ; IN: html.parser.tests [ - V{ T{ tag f "html" H{ } f f f } } + V{ T{ tag f "html" H{ } f f } } ] [ "" parse-html ] unit-test [ - V{ T{ tag f "html" H{ } f f t } } + V{ T{ tag f "html" H{ } f t } } ] [ "" parse-html ] unit-test [ - V{ T{ tag f "a" H{ { "href" "http://factorcode.org/" } } f f f } } + V{ T{ tag f "a" H{ { "href" "http://factorcode.org/" } } f f } } ] [ "" parse-html ] unit-test [ - V{ T{ tag f "a" H{ { "href" "http://factorcode.org/" } } f f f } } + V{ T{ tag f "a" H{ { "href" "http://factorcode.org/" } } f f } } ] [ "" parse-html ] unit-test [ @@ -26,7 +26,6 @@ V{ H{ { "baz" "\"quux\"" } { "foo" "bar's" } } f f - f } } ] [ "" parse-html ] unit-test @@ -39,25 +38,25 @@ V{ { "foo" "bar" } { "href" "http://factorcode.org/" } { "baz" "quux" } - } f f f } + } f f } } ] [ "" parse-html ] unit-test [ V{ - T{ tag f "html" H{ } f f f } - T{ tag f "head" H{ } f f f } - T{ tag f "head" H{ } f f t } - T{ tag f "html" H{ } f f t } + T{ tag f "html" H{ } f f } + T{ tag f "head" H{ } f f } + T{ tag f "head" H{ } f t } + T{ tag f "html" H{ } f t } } ] [ "Spagna ( name attributes closing? -- tag ) tag new @@ -28,56 +24,55 @@ SYMBOL: tagstack swap >>attributes swap >>name ; -: make-tag ( str attribs -- tag ) +: make-tag ( string attribs -- tag ) >r [ closing-tag? ] keep "/" trim1 r> rot ; -: make-text-tag ( str -- tag ) - T{ tag f text } clone [ set-tag-text ] keep ; +: make-text-tag ( string -- tag ) + tag new + text >>name + swap >>text ; -: make-comment-tag ( str -- tag ) - T{ tag f comment } clone [ set-tag-text ] keep ; +: make-comment-tag ( string -- tag ) + tag new + comment >>name + swap >>text ; -: make-dtd-tag ( str -- tag ) - T{ tag f dtd } clone [ set-tag-text ] keep ; +: make-dtd-tag ( string -- tag ) + tag new + dtd >>name + swap >>text ; -: read-whitespace ( -- str ) +: read-whitespace ( -- string ) [ get-char blank? not ] take-until ; -: read-whitespace* ( -- ) - read-whitespace drop ; +: read-whitespace* ( -- ) read-whitespace drop ; -: read-token ( -- str ) +: read-token ( -- string ) read-whitespace* [ get-char blank? ] take-until ; -: read-single-quote ( -- str ) +: read-single-quote ( -- string ) [ get-char CHAR: ' = ] take-until ; -: read-double-quote ( -- str ) +: read-double-quote ( -- string ) [ get-char CHAR: " = ] take-until ; -: read-quote ( -- str ) - get-char next* CHAR: ' = [ - read-single-quote - ] [ - read-double-quote - ] if next* ; +: read-quote ( -- string ) + get-char next* CHAR: ' = + [ read-single-quote ] [ read-double-quote ] if next* ; -: read-key ( -- str ) +: read-key ( -- string ) read-whitespace* - [ get-char CHAR: = = get-char blank? or ] take-until ; + [ get-char [ CHAR: = = ] [ blank? ] bi or ] take-until ; : read-= ( -- ) read-whitespace* [ get-char CHAR: = = ] take-until drop next* ; -: read-value ( -- str ) +: read-value ( -- string ) read-whitespace* - get-char quote? [ - read-quote - ] [ - read-token - ] if [ blank? ] trim ; + get-char quote? [ read-quote ] [ read-token ] if + [ blank? ] trim ; : read-comment ( -- ) "-->" take-string* make-comment-tag push-tag ; @@ -97,14 +92,14 @@ SYMBOL: tagstack [ get-char CHAR: > = get-char CHAR: < = or ] take-until get-char CHAR: < = [ next* ] unless ; -: read-< ( -- str ) +: read-< ( -- string ) next* get-char CHAR: ! = [ read-bang f ] [ read-tag ] if ; -: read-until-< ( -- str ) +: read-until-< ( -- string ) [ get-char CHAR: < = ] take-until ; : parse-text ( -- ) @@ -131,11 +126,9 @@ SYMBOL: tagstack ] string-parse ; : parse-tag ( -- ) - read-< dup empty? [ - drop - ] [ + read-< [ (parse-tag) make-tag push-tag - ] if ; + ] unless-empty ; : (parse-html) ( -- ) get-next [ @@ -145,13 +138,7 @@ SYMBOL: tagstack ] when ; : tag-parse ( quot -- vector ) - [ - V{ } clone tagstack set - string-parse - ] with-scope ; + V{ } clone tagstack [ string-parse ] with-variable ; : parse-html ( string -- vector ) - [ - (parse-html) - tagstack get - ] tag-parse ; + [ (parse-html) tagstack get ] tag-parse ; diff --git a/extra/html/parser/printer/printer.factor b/extra/html/parser/printer/printer.factor index 27cb21a927..4419eec70e 100644 --- a/extra/html/parser/printer/printer.factor +++ b/extra/html/parser/printer/printer.factor @@ -1,123 +1,89 @@ -USING: assocs html.parser html.parser.utils combinators +USING: accessors assocs html.parser html.parser.utils combinators continuations hashtables hashtables.private io kernel math namespaces prettyprint quotations sequences splitting strings ; IN: html.parser.printer -SYMBOL: no-section -SYMBOL: html -SYMBOL: head -SYMBOL: body -TUPLE: state section ; +SYMBOL: printer -! TUPLE: text bold? underline? strikethrough? ; +TUPLE: html-printer ; +TUPLE: text-printer < html-printer ; +TUPLE: src-printer < html-printer ; +TUPLE: html-prettyprinter < html-printer ; -TUPLE: text-printer ; -TUPLE: ui-printer ; -TUPLE: src-printer ; -TUPLE: html-prettyprinter ; -UNION: printer text-printer ui-printer src-printer html-prettyprinter ; -HOOK: print-tag printer ( tag -- ) -HOOK: print-text-tag printer ( tag -- ) -HOOK: print-comment-tag printer ( tag -- ) -HOOK: print-dtd-tag printer ( tag -- ) -HOOK: print-opening-named-tag printer ( tag -- ) -HOOK: print-closing-named-tag printer ( tag -- ) +HOOK: print-text-tag html-printer ( tag -- ) +HOOK: print-comment-tag html-printer ( tag -- ) +HOOK: print-dtd-tag html-printer ( tag -- ) +HOOK: print-opening-tag html-printer ( tag -- ) +HOOK: print-closing-tag html-printer ( tag -- ) -: print-tags ( vector -- ) - [ print-tag ] each ; +ERROR: unknown-tag-error tag ; + +: print-tag ( tag -- ) + { + { [ dup name>> text = ] [ print-text-tag ] } + { [ dup name>> comment = ] [ print-comment-tag ] } + { [ dup name>> dtd = ] [ print-dtd-tag ] } + { [ dup [ name>> string? ] [ closing?>> ] bi and ] + [ print-closing-tag ] } + { [ dup name>> string? ] + [ print-opening-tag ] } + [ unknown-tag-error ] + } cond ; + +: print-tags ( vector -- ) [ print-tag ] each ; : html-text. ( vector -- ) - [ - T{ text-printer } printer set - print-tags - ] with-scope ; + T{ text-printer } html-printer [ print-tags ] with-variable ; : html-src. ( vector -- ) - [ - T{ src-printer } printer set - print-tags - ] with-scope ; + T{ src-printer } html-printer [ print-tags ] with-variable ; -M: printer print-text-tag ( tag -- ) - tag-text write ; +M: html-printer print-text-tag ( tag -- ) text>> write ; -M: printer print-comment-tag ( tag -- ) - "" write ; +M: html-printer print-comment-tag ( tag -- ) + "" write ; -M: printer print-dtd-tag ( tag -- ) - "" write ; - -M: printer print-opening-named-tag ( tag -- ) - dup tag-name { - { "html" [ drop ] } - { "head" [ drop ] } - { "body" [ drop ] } - { "title" [ "Title: " write tag-text print ] } - } case ; - -M: printer print-closing-named-tag ( tag -- ) - drop ; +M: html-printer print-dtd-tag ( tag -- ) + "> write ">" write ; : print-attributes ( hashtable -- ) - [ - swap bl write "=" write ?quote write - ] assoc-each ; + [ [ bl write "=" write ] [ ?quote write ] bi* ] assoc-each ; -M: src-printer print-opening-named-tag ( tag -- ) +M: src-printer print-opening-tag ( tag -- ) "<" write - [ tag-name write ] - [ tag-attributes dup assoc-empty? [ drop ] [ print-attributes ] if ] bi + [ name>> write ] + [ attributes>> dup assoc-empty? [ drop ] [ print-attributes ] if ] bi ">" write ; -M: src-printer print-closing-named-tag ( tag -- ) +M: src-printer print-closing-tag ( tag -- ) "> write ">" write ; SYMBOL: tab-width SYMBOL: #indentations +SYMBOL: tagstack + +: prettyprint-html ( vector -- ) + [ + T{ html-prettyprinter } printer set + V{ } clone tagstack set + 2 tab-width set + 0 #indentations set + print-tags + ] with-scope ; : print-tabs ( -- ) tab-width get #indentations get * CHAR: \s write ; -M: html-prettyprinter print-opening-named-tag ( tag -- ) +M: html-prettyprinter print-opening-tag ( tag -- ) print-tabs "<" write - tag-name write + name>> write ">\n" write ; -M: html-prettyprinter print-closing-named-tag ( tag -- ) +M: html-prettyprinter print-closing-tag ( tag -- ) "> write ">" write ; - -ERROR: unknown-tag-error tag ; - -M: printer print-tag ( tag -- ) - { - { [ dup tag-name text = ] [ print-text-tag ] } - { [ dup tag-name comment = ] [ print-comment-tag ] } - { [ dup tag-name dtd = ] [ print-dtd-tag ] } - { [ dup tag-name string? over tag-closing? and ] - [ print-closing-named-tag ] } - { [ dup tag-name string? ] - [ print-opening-named-tag ] } - [ unknown-tag-error ] - } cond ; - -! SYMBOL: tablestack -! : with-html-printer ( vector quot -- ) - ! [ V{ } clone tablestack set ] with-scope ; - -! { { 1 2 } { 3 4 } } -! H{ { table-gap { 10 10 } } } [ - ! [ [ [ [ . ] with-cell ] each ] with-row ] each -! ] tabular-output - -! : html-pp ( vector -- ) - ! [ 0 #indentations set 2 tab-width set ] with-scope ; diff --git a/extra/html/parser/utils/utils.factor b/extra/html/parser/utils/utils.factor index c3372d750a..04b3687f7d 100644 --- a/extra/html/parser/utils/utils.factor +++ b/extra/html/parser/utils/utils.factor @@ -4,8 +4,7 @@ namespaces prettyprint quotations sequences splitting state-parser strings sequences.lib ; IN: html.parser.utils -: string-parse-end? ( -- ? ) - get-next not ; +: string-parse-end? ( -- ? ) get-next not ; : take-string* ( match -- string ) dup length @@ -16,17 +15,18 @@ IN: html.parser.utils [ ?head drop ] [ ?tail drop ] bi ; : single-quote ( str -- newstr ) - >r "'" r> "'" 3append ; + "'" swap "'" 3append ; : double-quote ( str -- newstr ) - >r "\"" r> "\"" 3append ; + "\"" swap "\"" 3append ; : quote ( str -- newstr ) CHAR: ' over member? [ double-quote ] [ single-quote ] if ; : quoted? ( str -- ? ) - [ [ first ] [ peek ] bi [ = ] keep "'\"" member? and ] [ f ] if-seq ; + [ f ] + [ [ first ] [ peek ] bi [ = ] keep "'\"" member? and ] if-empty ; : ?quote ( str -- newstr ) dup quoted? [ quote ] unless ; From 507f814d48d1eecfbc2749ff1e7b00fd26a66a41 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sun, 17 Aug 2008 10:50:26 -0500 Subject: [PATCH 11/16] remove old accessors before major refactoring --- extra/taxes/tags.txt | 2 +- extra/taxes/taxes.factor | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/extra/taxes/tags.txt b/extra/taxes/tags.txt index 8b13789179..2964ef21b1 100644 --- a/extra/taxes/tags.txt +++ b/extra/taxes/tags.txt @@ -1 +1 @@ - +taxes diff --git a/extra/taxes/taxes.factor b/extra/taxes/taxes.factor index 5522dd9bcb..5e2a395c40 100644 --- a/extra/taxes/taxes.factor +++ b/extra/taxes/taxes.factor @@ -1,5 +1,7 @@ -USING: arrays assocs kernel math math.intervals namespaces -sequences combinators.lib money math.order ; +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors arrays assocs kernel math math.intervals +namespaces sequences combinators.lib money math.order ; IN: taxes : monthly ( x -- y ) 12 / ; @@ -14,22 +16,21 @@ C: w4 : allowance ( -- x ) 3500 ; inline -: calculate-w4-allowances ( w4 -- x ) - w4-allowances allowance * ; +: calculate-w4-allowances ( w4 -- x ) allowances>> allowance * ; ! Withhold: FICA, Medicare, Federal (FICA is social security) : fica-tax-rate ( -- x ) DECIMAL: .062 ; inline ! Base rate -- income over this rate is not taxed -TUPLE: fica-base-unknown ; +ERROR: fica-base-unknown ; : fica-base-rate ( year -- x ) H{ { 2008 102000 } { 2007 97500 } - } at* [ T{ fica-base-unknown } throw ] unless ; + } at* [ fica-base-unknown ] unless ; : fica-tax ( salary w4 -- x ) - w4-year fica-base-rate min fica-tax-rate * ; + year>> fica-base-rate min fica-tax-rate * ; ! Employer tax only, not withheld : futa-tax-rate ( -- x ) DECIMAL: .062 ; inline @@ -64,8 +65,7 @@ TUPLE: tax-table single married ; 0 -rot [ tax-bracket ] each drop ; : marriage-table ( w4 tax-table -- triples ) - swap w4-married? - [ tax-table-married ] [ tax-table-single ] if ; + swap married?>> [ married>> ] [ single>> ] if ; : federal-tax ( salary w4 tax-table -- n ) [ adjust-allowances ] 2keep marriage-table tax ; From c56a769fef3555a739011e5de4488818bf0d8a1a Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sun, 17 Aug 2008 12:01:04 -0500 Subject: [PATCH 12/16] add if-at, when-at, and unless-at it's like if-empty --- extra/assocs/lib/lib.factor | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/extra/assocs/lib/lib.factor b/extra/assocs/lib/lib.factor index 5036a13d78..5f4620fd64 100755 --- a/extra/assocs/lib/lib.factor +++ b/extra/assocs/lib/lib.factor @@ -37,3 +37,10 @@ IN: assocs.lib H{ } clone [ swap [ change-at ] 2curry assoc-each ] keep ; inline + +: if-at ( obj assoc quot1 quot2 -- ) + [ dupd at* [ not -rot ? ] keep ] 2dip if ; inline + +: when-at ( obj assoc quot -- ) [ ] if-at ; inline + +: unless-at ( obj assoc quot -- ) [ ] swap if-at ; inline From ab56eb0e48b9e1f82b035ab43ff6bf491815eb98 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sun, 17 Aug 2008 21:27:35 -0500 Subject: [PATCH 13/16] add ?at, simplify if-at --- extra/assocs/lib/lib.factor | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/extra/assocs/lib/lib.factor b/extra/assocs/lib/lib.factor index 5f4620fd64..2a8634987f 100755 --- a/extra/assocs/lib/lib.factor +++ b/extra/assocs/lib/lib.factor @@ -38,8 +38,11 @@ IN: assocs.lib swap [ change-at ] 2curry assoc-each ] keep ; inline +: ?at ( obj1 assoc -- obj1/obj2 ) + dupd at* [ nip ] [ drop ] if ; + : if-at ( obj assoc quot1 quot2 -- ) - [ dupd at* [ not -rot ? ] keep ] 2dip if ; inline + [ ?at dup ] 2dip if ; inline : when-at ( obj assoc quot -- ) [ ] if-at ; inline From 16342a8a403bf0d44a090546cd1d6f479f9bb371 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sun, 17 Aug 2008 21:54:10 -0500 Subject: [PATCH 14/16] fix ?at, if-at and add unit tests --- extra/assocs/lib/lib-tests.factor | 15 ++++++++++++++- extra/assocs/lib/lib.factor | 6 +++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/extra/assocs/lib/lib-tests.factor b/extra/assocs/lib/lib-tests.factor index 0bf8270088..c7e1aa4fbf 100644 --- a/extra/assocs/lib/lib-tests.factor +++ b/extra/assocs/lib/lib-tests.factor @@ -1,4 +1,17 @@ +USING: kernel tools.test sequences vectors assocs.lib ; IN: assocs.lib.tests -USING: assocs.lib tools.test vectors ; { 1 1 } [ [ ?push ] histogram ] must-infer-as + +! substitute +[ { 2 } ] [ { 1 } H{ { 1 2 } } [ ?at drop ] curry map ] unit-test +[ { 3 } ] [ { 3 } H{ { 1 2 } } [ ?at drop ] curry map ] unit-test + +[ 2 ] [ 1 H{ { 1 2 } } [ ] [ ] if-at ] unit-test +[ 3 ] [ 3 H{ { 1 2 } } [ ] [ ] if-at ] unit-test + +[ "hi" ] [ 1 H{ { 1 2 } } [ drop "hi" ] when-at ] unit-test +[ 3 ] [ 3 H{ { 1 2 } } [ drop "hi" ] when-at ] unit-test +[ 2 ] [ 1 H{ { 1 2 } } [ drop "hi" ] unless-at ] unit-test +[ "hi" ] [ 3 H{ { 1 2 } } [ drop "hi" ] unless-at ] unit-test + diff --git a/extra/assocs/lib/lib.factor b/extra/assocs/lib/lib.factor index 2a8634987f..ed9b4bf0c4 100755 --- a/extra/assocs/lib/lib.factor +++ b/extra/assocs/lib/lib.factor @@ -38,11 +38,11 @@ IN: assocs.lib swap [ change-at ] 2curry assoc-each ] keep ; inline -: ?at ( obj1 assoc -- obj1/obj2 ) - dupd at* [ nip ] [ drop ] if ; +: ?at ( obj assoc -- value/obj ? ) + dupd at* [ [ nip ] [ drop ] if ] keep ; : if-at ( obj assoc quot1 quot2 -- ) - [ ?at dup ] 2dip if ; inline + [ ?at ] 2dip if ; inline : when-at ( obj assoc quot -- ) [ ] if-at ; inline From 9f73644575758388272d98063f0411c278a78658 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Mon, 18 Aug 2008 11:24:18 -0500 Subject: [PATCH 15/16] put the regexp library into git it still needs these fixes: empty \Q\E, lookahead, group captures, unicode character classes, reluctant matching if possible --- extra/regexp2/backend/backend.factor | 23 ++ extra/regexp2/classes/classes.factor | 49 +++ extra/regexp2/dfa/dfa.factor | 70 ++++ extra/regexp2/nfa/nfa.factor | 126 ++++++ extra/regexp2/parser/parser-tests.factor | 33 ++ extra/regexp2/parser/parser.factor | 362 ++++++++++++++++++ extra/regexp2/regexp2-tests.factor | 240 ++++++++++++ extra/regexp2/regexp2.factor | 35 ++ .../transition-tables.factor | 44 +++ extra/regexp2/traversal/traversal.factor | 88 +++++ extra/regexp2/utils/utils.factor | 69 ++++ 11 files changed, 1139 insertions(+) create mode 100644 extra/regexp2/backend/backend.factor create mode 100644 extra/regexp2/classes/classes.factor create mode 100644 extra/regexp2/dfa/dfa.factor create mode 100644 extra/regexp2/nfa/nfa.factor create mode 100644 extra/regexp2/parser/parser-tests.factor create mode 100644 extra/regexp2/parser/parser.factor create mode 100644 extra/regexp2/regexp2-tests.factor create mode 100644 extra/regexp2/regexp2.factor create mode 100644 extra/regexp2/transition-tables/transition-tables.factor create mode 100644 extra/regexp2/traversal/traversal.factor create mode 100644 extra/regexp2/utils/utils.factor diff --git a/extra/regexp2/backend/backend.factor b/extra/regexp2/backend/backend.factor new file mode 100644 index 0000000000..5f59c25bc3 --- /dev/null +++ b/extra/regexp2/backend/backend.factor @@ -0,0 +1,23 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors hashtables kernel math state-tables vars vectors ; +IN: regexp2.backend + +TUPLE: regexp + raw + { stack vector } + parse-tree + nfa-table + dfa-table + minimized-table + { state integer } + { new-states vector } + { visited-states hashtable } ; + +: reset-regexp ( regexp -- regexp ) + 0 >>state + V{ } clone >>stack + V{ } clone >>new-states + H{ } clone >>visited-states ; + +SYMBOL: current-regexp diff --git a/extra/regexp2/classes/classes.factor b/extra/regexp2/classes/classes.factor new file mode 100644 index 0000000000..0862f9cb63 --- /dev/null +++ b/extra/regexp2/classes/classes.factor @@ -0,0 +1,49 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors kernel math math.order symbols regexp2.parser +words regexp2.utils unicode.categories combinators.short-circuit ; +IN: regexp2.classes + +GENERIC: class-member? ( obj class -- ? ) + +M: word class-member? ( obj class -- ? ) 2drop f ; +M: integer class-member? ( obj class -- ? ) 2drop f ; + +M: character-class-range class-member? ( obj class -- ? ) + [ from>> ] [ to>> ] bi between? ; + +M: any-char class-member? ( obj class -- ? ) + 2drop t ; + +M: letter-class class-member? ( obj class -- ? ) + drop letter? ; + +M: LETTER-class class-member? ( obj class -- ? ) + drop LETTER? ; + +M: ascii-class class-member? ( obj class -- ? ) + drop ascii? ; + +M: digit-class class-member? ( obj class -- ? ) + drop digit? ; + +M: alpha-class class-member? ( obj class -- ? ) + drop alpha? ; + +M: punctuation-class class-member? ( obj class -- ? ) + drop punct? ; + +M: java-printable-class class-member? ( obj class -- ? ) + drop java-printable? ; + +M: non-newline-blank-class class-member? ( obj class -- ? ) + drop { [ blank? ] [ CHAR: \n = not ] } 1&& ; + +M: control-character-class class-member? ( obj class -- ? ) + drop control-char? ; + +M: hex-digit-class class-member? ( obj class -- ? ) + drop hex-digit? ; + +M: java-blank-class class-member? ( obj class -- ? ) + drop java-blank? ; diff --git a/extra/regexp2/dfa/dfa.factor b/extra/regexp2/dfa/dfa.factor new file mode 100644 index 0000000000..0dcf6c4ab5 --- /dev/null +++ b/extra/regexp2/dfa/dfa.factor @@ -0,0 +1,70 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors arrays assocs combinators fry kernel locals +math math.order regexp2.nfa regexp2.transition-tables sequences +sets sorting vectors regexp2.utils sequences.lib ; +USING: io prettyprint threads ; +IN: regexp2.dfa + +: find-delta ( states transition regexp -- new-states ) + nfa-table>> transitions>> + rot [ swap at at ] with with map sift concat prune ; + +: (find-epsilon-closure) ( states regexp -- new-states ) + eps swap find-delta ; + +: find-epsilon-closure ( states regexp -- new-states ) + '[ dup , (find-epsilon-closure) union ] [ length ] while-changes + natural-sort ; + +: find-closure ( states transition regexp -- new-states ) + [ find-delta ] 2keep nip find-epsilon-closure ; + +: find-start-state ( regexp -- state ) + [ nfa-table>> start-state>> 1vector ] keep find-epsilon-closure ; + +: find-transitions ( seq1 regexp -- seq2 ) + nfa-table>> transitions>> + [ at keys ] curry map concat eps swap remove ; + +: add-todo-state ( state regexp -- ) + 2dup visited-states>> key? [ + 2drop + ] [ + [ visited-states>> conjoin ] + [ new-states>> push ] 2bi + ] if ; + +: new-transitions ( regexp -- ) + dup new-states>> [ + drop + ] [ + dupd pop dup pick find-transitions rot + [ + [ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep + >r swapd transition boa r> dfa-table>> add-transition + ] curry with each + new-transitions + ] if-empty ; + +: states ( hashtable -- array ) + [ keys ] + [ values [ values concat ] map concat append ] bi ; + +: set-final-states ( regexp -- ) + dup + [ nfa-table>> final-states>> keys ] + [ dfa-table>> transitions>> states ] bi + [ intersect empty? not ] with filter + + swap dfa-table>> final-states>> + [ conjoin ] curry each ; + +: set-initial-state ( regexp -- ) + dup + [ dfa-table>> ] [ find-start-state ] bi + [ >>start-state drop ] keep + 1vector >>new-states drop ; + +: construct-dfa ( regexp -- ) + [ set-initial-state ] [ new-transitions ] [ set-final-states ] tri ; diff --git a/extra/regexp2/nfa/nfa.factor b/extra/regexp2/nfa/nfa.factor new file mode 100644 index 0000000000..f87a2a7b52 --- /dev/null +++ b/extra/regexp2/nfa/nfa.factor @@ -0,0 +1,126 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors arrays assocs grouping kernel regexp2.backend +locals math namespaces regexp2.parser sequences state-tables fry +quotations math.order math.ranges vectors unicode.categories +regexp2.utils regexp2.transition-tables words sequences.lib ; +IN: regexp2.nfa + +SYMBOL: negation-mode +: negated? ( -- ? ) negation-mode get 0 or odd? ; + +SINGLETON: eps + +: next-state ( regexp -- state ) + [ state>> ] [ [ 1+ ] change-state drop ] bi ; + +: set-start-state ( regexp -- ) + dup stack>> [ + drop + ] [ + [ nfa-table>> ] [ pop first ] bi* >>start-state drop + ] if-empty ; + +GENERIC: nfa-node ( node -- ) + +:: add-simple-entry ( obj class -- ) + [let* | regexp [ current-regexp get ] + s0 [ regexp next-state ] + s1 [ regexp next-state ] + stack [ regexp stack>> ] + table [ regexp nfa-table>> ] | + negated? [ + s0 f obj class boa table add-transition + s0 s1 table add-transition + ] [ + s0 s1 obj class boa table add-transition + ] if + s0 s1 2array stack push + t s1 table final-states>> set-at ] ; + +:: concatenate-nodes ( -- ) + [let* | regexp [ current-regexp get ] + stack [ regexp stack>> ] + table [ regexp nfa-table>> ] + s2 [ stack peek first ] + s3 [ stack pop second ] + s0 [ stack peek first ] + s1 [ stack pop second ] | + s1 s2 eps table add-transition + s1 table final-states>> delete-at + s0 s3 2array stack push ] ; + +:: alternate-nodes ( -- ) + [let* | regexp [ current-regexp get ] + stack [ regexp stack>> ] + table [ regexp nfa-table>> ] + s2 [ stack peek first ] + s3 [ stack pop second ] + s0 [ stack peek first ] + s1 [ stack pop second ] + s4 [ regexp next-state ] + s5 [ regexp next-state ] | + s4 s0 eps table add-transition + s4 s2 eps table add-transition + s1 s5 eps table add-transition + s3 s5 eps table add-transition + s1 table final-states>> delete-at + s3 table final-states>> delete-at + t s5 table final-states>> set-at + s4 s5 2array stack push ] ; + +M: kleene-star nfa-node ( node -- ) + term>> nfa-node + [let* | regexp [ current-regexp get ] + stack [ regexp stack>> ] + s0 [ stack peek first ] + s1 [ stack pop second ] + s2 [ regexp next-state ] + s3 [ regexp next-state ] + table [ regexp nfa-table>> ] | + s1 table final-states>> delete-at + t s3 table final-states>> set-at + s1 s0 eps table add-transition + s2 s0 eps table add-transition + s2 s3 eps table add-transition + s1 s3 eps table add-transition + s2 s3 2array stack push ] ; + +M: concatenation nfa-node ( node -- ) + seq>> + [ [ nfa-node ] each ] + [ length 1- [ concatenate-nodes ] times ] bi ; + +M: alternation nfa-node ( node -- ) + seq>> + [ [ nfa-node ] each ] + [ length 1- [ alternate-nodes ] times ] bi ; + +M: constant nfa-node ( node -- ) + char>> literal-transition add-simple-entry ; + +M: epsilon nfa-node ( node -- ) + drop eps literal-transition add-simple-entry ; + +M: word nfa-node ( node -- ) + class-transition add-simple-entry ; + +M: character-class-range nfa-node ( node -- ) + class-transition add-simple-entry ; + +M: capture-group nfa-node ( node -- ) + term>> nfa-node ; + +M: negation nfa-node ( node -- ) + negation-mode inc + term>> nfa-node + negation-mode dec ; + +: construct-nfa ( regexp -- ) + [ + reset-regexp + negation-mode off + [ current-regexp set ] + [ parse-tree>> nfa-node ] + [ set-start-state ] tri + ] with-scope ; diff --git a/extra/regexp2/parser/parser-tests.factor b/extra/regexp2/parser/parser-tests.factor new file mode 100644 index 0000000000..9dc7dc7909 --- /dev/null +++ b/extra/regexp2/parser/parser-tests.factor @@ -0,0 +1,33 @@ +USING: kernel tools.test regexp2.backend regexp2 ; +IN: regexp2.parser + +: test-regexp ( string -- ) + default-regexp parse-regexp ; + +: test-regexp2 ( string -- regexp ) + default-regexp dup parse-regexp ; + +[ "(" ] [ unmatched-parentheses? ] must-fail-with + +[ ] [ "a|b" test-regexp ] unit-test +[ ] [ "a.b" test-regexp ] unit-test +[ ] [ "a|b|c" test-regexp ] unit-test +[ ] [ "abc|b" test-regexp ] unit-test +[ ] [ "a|bcd" test-regexp ] unit-test +[ ] [ "a|(b)" test-regexp ] unit-test +[ ] [ "(a)|b" test-regexp ] unit-test +[ ] [ "(a|b)" test-regexp ] unit-test +[ ] [ "((a)|(b))" test-regexp ] unit-test + +[ ] [ "(?:a)" test-regexp ] unit-test +[ ] [ "(?i:a)" test-regexp ] unit-test +[ ] [ "(?-i:a)" test-regexp ] unit-test +[ "(?z:a)" test-regexp ] [ bad-option? ] must-fail-with +[ "(?-z:a)" test-regexp ] [ bad-option? ] must-fail-with + +[ ] [ "(?=a)" test-regexp ] unit-test + +[ ] [ "[abc]" test-regexp ] unit-test +[ ] [ "[a-c]" test-regexp ] unit-test +[ ] [ "[^a-c]" test-regexp ] unit-test +[ "[^]" test-regexp ] must-fail diff --git a/extra/regexp2/parser/parser.factor b/extra/regexp2/parser/parser.factor new file mode 100644 index 0000000000..fc1029db58 --- /dev/null +++ b/extra/regexp2/parser/parser.factor @@ -0,0 +1,362 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors arrays assocs combinators io io.streams.string +kernel math math.parser multi-methods namespaces qualified +quotations sequences sequences.lib splitting symbols vectors +dlists math.order combinators.lib unicode.categories +sequences.lib regexp2.backend regexp2.utils ; +IN: regexp2.parser + +FROM: math.ranges => [a,b] ; + +MIXIN: node +TUPLE: concatenation seq ; INSTANCE: concatenation node +TUPLE: alternation seq ; INSTANCE: alternation node +TUPLE: kleene-star term ; INSTANCE: kleene-star node +TUPLE: question term ; INSTANCE: question node +TUPLE: negation term ; INSTANCE: negation node +TUPLE: constant char ; INSTANCE: constant node +TUPLE: range from to ; INSTANCE: range node +TUPLE: lookahead term ; INSTANCE: lookahead node +TUPLE: lookbehind term ; INSTANCE: lookbehind node +TUPLE: capture-group term ; INSTANCE: capture-group node +TUPLE: non-capture-group term ; INSTANCE: non-capture-group node +TUPLE: independent-group term ; INSTANCE: independent-group node +TUPLE: character-class-range from to ; INSTANCE: character-class-range node +SINGLETON: epsilon INSTANCE: epsilon node +SINGLETON: any-char INSTANCE: any-char node +SINGLETON: front-anchor INSTANCE: front-anchor node +SINGLETON: back-anchor INSTANCE: back-anchor node + +TUPLE: option-on option ; INSTANCE: option-on node +TUPLE: option-off option ; INSTANCE: option-off node +SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case ; +MIXIN: regexp-option +INSTANCE: unix-lines regexp-option +INSTANCE: dotall regexp-option +INSTANCE: multiline regexp-option +INSTANCE: comments regexp-option +INSTANCE: case-insensitive regexp-option +INSTANCE: unicode-case regexp-option + +SINGLETONS: letter-class LETTER-class Letter-class digit-class +alpha-class non-newline-blank-class +ascii-class punctuation-class java-printable-class blank-class +control-character-class hex-digit-class java-blank-class c-identifier-class ; + +SINGLETONS: beginning-of-group end-of-group +beginning-of-character-class end-of-character-class +left-parenthesis pipe caret dash ; + +: ( obj -- constant ) constant boa ; +: ( obj -- negation ) negation boa ; +: ( seq -- concatenation ) >vector concatenation boa ; +: ( seq -- alternation ) >vector alternation boa ; +: ( obj -- capture-group ) capture-group boa ; +: ( obj -- kleene-star ) kleene-star boa ; + +: first|concatenation ( seq -- first/concatenation ) + dup length 1 = [ first ] [ ] if ; + +: first|alternation ( seq -- first/alternation ) + dup length 1 = [ first ] [ ] if ; + +ERROR: unmatched-parentheses ; + +: make-positive-lookahead ( string -- ) + lookahead boa push-stack ; + +: make-negative-lookahead ( string -- ) + lookahead boa push-stack ; + +: make-independent-group ( string -- ) + #! no backtracking + independent-group boa push-stack ; + +: make-positive-lookbehind ( string -- ) + lookbehind boa push-stack ; + +: make-negative-lookbehind ( string -- ) + lookbehind boa push-stack ; + +DEFER: nested-parse-regexp +: make-non-capturing-group ( string -- ) + non-capture-group boa push-stack ; + +ERROR: bad-option ch ; + +: option ( ch -- singleton ) + { + { CHAR: i [ case-insensitive ] } + { CHAR: d [ unix-lines ] } + { CHAR: m [ multiline ] } + { CHAR: s [ dotall ] } + { CHAR: u [ unicode-case ] } + { CHAR: x [ comments ] } + [ bad-option ] + } case ; + +: option-on ( ch -- ) option \ option-on boa push-stack ; +: option-off ( ch -- ) option \ option-off boa push-stack ; +: toggle-option ( ch ? -- ) [ option-on ] [ option-off ] if ; +: (parse-options) ( string ? -- ) [ toggle-option ] curry each ; + +: parse-options ( string -- ) + "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ; + +DEFER: (parse-regexp) +: parse-special-group-options ( options -- ) + beginning-of-group push-stack + parse-options (parse-regexp) pop-stack make-non-capturing-group ; + +ERROR: bad-special-group string ; + +: (parse-special-group) ( -- ) + read1 { + { [ dup CHAR: : = ] + [ drop nested-parse-regexp pop-stack make-non-capturing-group ] } + { [ dup CHAR: = = ] + [ drop nested-parse-regexp pop-stack make-positive-lookahead ] } + { [ dup CHAR: = = ] + [ drop nested-parse-regexp pop-stack make-negative-lookahead ] } + { [ dup CHAR: > = ] + [ drop nested-parse-regexp pop-stack make-independent-group ] } + { [ dup CHAR: < = peek1 CHAR: = = and ] + [ drop read1 drop nested-parse-regexp pop-stack make-positive-lookbehind ] } + { [ dup CHAR: < = peek1 CHAR: ! = and ] + [ drop read1 drop nested-parse-regexp pop-stack make-negative-lookbehind ] } + [ + ":" read-until [ bad-special-group ] unless + swap prefix parse-special-group-options + ] + } cond ; + +: handle-left-parenthesis ( -- ) + peek1 CHAR: ? = + [ read1 drop (parse-special-group) ] + [ nested-parse-regexp ] if ; + +: handle-dot ( -- ) any-char push-stack ; +: handle-pipe ( -- ) pipe push-stack ; +: handle-star ( -- ) stack pop push-stack ; +: handle-question ( -- ) + stack pop epsilon 2array push-stack ; +: handle-plus ( -- ) + stack pop dup 2array push-stack ; + +ERROR: unmatched-brace ; +: parse-repetition ( -- start finish ? ) + "}" read-until [ unmatched-brace ] unless + [ "," split1 [ string>number ] bi@ ] + [ CHAR: , swap index >boolean ] bi ; + +: replicate/concatenate ( n obj -- obj' ) + over zero? [ 2drop epsilon ] + [ first|concatenation ] if ; + +: exactly-n ( n -- ) + stack pop replicate/concatenate push-stack ; + +: at-least-n ( n -- ) + stack pop + [ replicate/concatenate ] keep + 2array push-stack ; + +: at-most-n ( n -- ) + 1+ + stack pop + [ replicate/concatenate ] curry map push-stack ; + +: from-m-to-n ( m n -- ) + [a,b] + stack pop + [ replicate/concatenate ] curry map + push-stack ; + +ERROR: invalid-range a b ; + +: handle-left-brace ( -- ) + parse-repetition + >r 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ r> + [ + 2dup and [ from-m-to-n ] + [ [ nip at-most-n ] [ at-least-n ] if* ] if + ] [ drop 0 max exactly-n ] if ; + +: handle-front-anchor ( -- ) front-anchor push-stack ; +: handle-back-anchor ( -- ) back-anchor push-stack ; + +ERROR: bad-character-class obj ; +ERROR: expected-posix-class ; + +: parse-posix-class ( -- obj ) + read1 CHAR: { = [ expected-posix-class ] unless + "}" read-until [ bad-character-class ] unless + { + { "Lower" [ letter-class ] } + { "Upper" [ LETTER-class ] } + { "ASCII" [ ascii-class ] } + { "Alpha" [ Letter-class ] } + { "Digit" [ digit-class ] } + { "Alnum" [ alpha-class ] } + { "Punct" [ punctuation-class ] } + { "Graph" [ java-printable-class ] } + { "Print" [ java-printable-class ] } + { "Blank" [ non-newline-blank-class ] } + { "Cntrl" [ control-character-class ] } + { "XDigit" [ hex-digit-class ] } + { "Space" [ java-blank-class ] } + ! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss + [ bad-character-class ] + } case ; + +: parse-octal ( -- n ) 3 read oct> check-octal ; +: parse-short-hex ( -- n ) 2 read hex> check-hex ; +: parse-long-hex ( -- n ) 6 read hex> check-hex ; +: parse-control-character ( -- n ) read1 ; + +ERROR: bad-escaped-literals seq ; +: parse-escaped-literals ( -- obj ) + "\\E" read-until [ bad-escaped-literals ] unless + read1 drop + [ epsilon ] [ + [ ] V{ } map-as + first|concatenation + ] if-empty ; + +: parse-escaped ( -- obj ) + read1 + { + { CHAR: \ [ CHAR: \ ] } + { CHAR: . [ CHAR: . ] } + { CHAR: t [ CHAR: \t ] } + { CHAR: n [ CHAR: \n ] } + { CHAR: r [ CHAR: \r ] } + { CHAR: f [ HEX: c ] } + { CHAR: a [ HEX: 7 ] } + { CHAR: e [ HEX: 1b ] } + + { CHAR: d [ digit-class ] } + { CHAR: D [ digit-class ] } + { CHAR: s [ java-blank-class ] } + { CHAR: S [ java-blank-class ] } + { CHAR: w [ c-identifier-class ] } + { CHAR: W [ c-identifier-class ] } + + { CHAR: p [ parse-posix-class ] } + { CHAR: P [ parse-posix-class ] } + { CHAR: x [ parse-short-hex ] } + { CHAR: u [ parse-long-hex ] } + { CHAR: 0 [ parse-octal ] } + { CHAR: c [ parse-control-character ] } + + { CHAR: Q [ parse-escaped-literals ] } + } case ; + +: handle-escape ( -- ) parse-escaped push-stack ; + +: handle-dash ( vector -- vector' ) + H{ { dash CHAR: - } } substitute ; + +: character-class>alternation ( seq -- alternation ) + [ dup number? [ ] when ] map first|alternation ; + +: handle-caret ( vector -- vector' ) + dup [ length 2 >= ] [ first caret eq? ] bi and [ + rest-slice character-class>alternation + ] [ + character-class>alternation + ] if ; + +: make-character-class ( -- character-class ) + [ beginning-of-character-class swap cut-stack ] change-whole-stack + handle-dash handle-caret ; + +: apply-dash ( -- ) + stack [ pop3 nip character-class-range boa ] keep push ; + +: apply-dash? ( -- ? ) + stack dup length 3 >= + [ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ; + +ERROR: empty-negated-character-class ; +DEFER: handle-left-bracket +: (parse-character-class) ( -- ) + read1 [ empty-negated-character-class ] unless* { + { CHAR: [ [ handle-left-bracket t ] } + { CHAR: ] [ make-character-class push-stack f ] } + { CHAR: - [ dash push-stack t ] } + { CHAR: \ [ parse-escaped push-stack t ] } + [ push-stack apply-dash? [ apply-dash ] when t ] + } case + [ (parse-character-class) ] when ; + +: parse-character-class-second ( -- ) + read1 { + { CHAR: [ [ CHAR: [ push-stack ] } + { CHAR: ] [ CHAR: ] push-stack ] } + { CHAR: - [ CHAR: - push-stack ] } + [ push1 ] + } case ; + +: parse-character-class-first ( -- ) + read1 { + { CHAR: ^ [ caret push-stack parse-character-class-second ] } + { CHAR: [ [ CHAR: [ push-stack ] } + { CHAR: ] [ CHAR: ] push-stack ] } + { CHAR: - [ CHAR: - push-stack ] } + [ push1 ] + } case ; + +: handle-left-bracket ( -- ) + beginning-of-character-class push-stack + parse-character-class-first (parse-character-class) ; + +ERROR: empty-regexp ; +: finish-regexp-parse ( stack -- obj ) + dup length { + { 0 [ empty-regexp ] } + { 1 [ first ] } + [ + drop { pipe } split + [ first|concatenation ] map first|alternation + ] + } case ; + +: handle-right-parenthesis ( -- ) + stack beginning-of-group over last-index cut rest + [ current-regexp get swap >>stack drop ] + [ finish-regexp-parse push-stack ] bi* ; + +: nested-parse-regexp ( -- ) + beginning-of-group push-stack (parse-regexp) ; + +: ((parse-regexp)) ( token -- ) + { + { CHAR: . [ handle-dot ] } + { CHAR: ( [ handle-left-parenthesis ] } + { CHAR: ) [ handle-right-parenthesis ] } + { CHAR: | [ handle-pipe ] } + { CHAR: ? [ handle-question ] } + { CHAR: * [ handle-star ] } + { CHAR: + [ handle-plus ] } + { CHAR: { [ handle-left-brace ] } + { CHAR: [ [ handle-left-bracket ] } + { CHAR: ^ [ handle-front-anchor ] } + { CHAR: $ [ handle-back-anchor ] } + { CHAR: \ [ handle-escape ] } + [ push-stack ] + } case ; + +: (parse-regexp) ( -- ) + read1 [ ((parse-regexp)) (parse-regexp) ] when* ; + +: parse-regexp ( regexp -- ) + dup current-regexp [ + raw>> [ + [ (parse-regexp) ] with-input-stream + ] unless-empty + current-regexp get + stack finish-regexp-parse + >>parse-tree drop + ] with-variable ; diff --git a/extra/regexp2/regexp2-tests.factor b/extra/regexp2/regexp2-tests.factor new file mode 100644 index 0000000000..2b34fe6e77 --- /dev/null +++ b/extra/regexp2/regexp2-tests.factor @@ -0,0 +1,240 @@ +USING: regexp2 tools.test kernel regexp2.traversal ; +IN: regexp2-tests + +[ f ] [ "b" "a*" matches? ] unit-test +[ t ] [ "" "a*" matches? ] unit-test +[ t ] [ "a" "a*" matches? ] unit-test +[ t ] [ "aaaaaaa" "a*" matches? ] unit-test +[ f ] [ "ab" "a*" matches? ] unit-test + +[ t ] [ "abc" "abc" matches? ] unit-test +[ t ] [ "a" "a|b|c" matches? ] unit-test +[ t ] [ "b" "a|b|c" matches? ] unit-test +[ t ] [ "c" "a|b|c" matches? ] unit-test +[ f ] [ "c" "d|e|f" matches? ] unit-test + +[ f ] [ "aa" "a|b|c" matches? ] unit-test +[ f ] [ "bb" "a|b|c" matches? ] unit-test +[ f ] [ "cc" "a|b|c" matches? ] unit-test +[ f ] [ "cc" "d|e|f" matches? ] unit-test + +[ f ] [ "" "a+" matches? ] unit-test +[ t ] [ "a" "a+" matches? ] unit-test +[ t ] [ "aa" "a+" matches? ] unit-test + +[ t ] [ "" "a?" matches? ] unit-test +[ t ] [ "a" "a?" matches? ] unit-test +[ f ] [ "aa" "a?" matches? ] unit-test + +[ f ] [ "" "." matches? ] unit-test +[ t ] [ "a" "." matches? ] unit-test +[ t ] [ "." "." matches? ] unit-test +! [ f ] [ "\n" "." matches? ] unit-test + +[ f ] [ "" ".+" matches? ] unit-test +[ t ] [ "a" ".+" matches? ] unit-test +[ t ] [ "ab" ".+" matches? ] unit-test + + +[ t ] [ "" "a|b*|c+|d?" matches? ] unit-test +[ t ] [ "a" "a|b*|c+|d?" matches? ] unit-test +[ t ] [ "c" "a|b*|c+|d?" matches? ] unit-test +[ t ] [ "cc" "a|b*|c+|d?" matches? ] unit-test +[ f ] [ "ccd" "a|b*|c+|d?" matches? ] unit-test +[ t ] [ "d" "a|b*|c+|d?" matches? ] unit-test + +[ t ] [ "foo" "foo|bar" matches? ] unit-test +[ t ] [ "bar" "foo|bar" matches? ] unit-test +[ f ] [ "foobar" "foo|bar" matches? ] unit-test + +[ f ] [ "" "(a)" matches? ] unit-test +[ t ] [ "a" "(a)" matches? ] unit-test +[ f ] [ "aa" "(a)" matches? ] unit-test +[ t ] [ "aa" "(a*)" matches? ] unit-test + +[ f ] [ "aababaaabbac" "(a|b)+" matches? ] unit-test +[ t ] [ "ababaaabba" "(a|b)+" matches? ] unit-test + +[ f ] [ "" "a{1}" matches? ] unit-test +[ t ] [ "a" "a{1}" matches? ] unit-test +[ f ] [ "aa" "a{1}" matches? ] unit-test + +[ f ] [ "a" "a{2,}" matches? ] unit-test +[ t ] [ "aaa" "a{2,}" matches? ] unit-test +[ t ] [ "aaaa" "a{2,}" matches? ] unit-test +[ t ] [ "aaaaa" "a{2,}" matches? ] unit-test + +[ t ] [ "" "a{,2}" matches? ] unit-test +[ t ] [ "a" "a{,2}" matches? ] unit-test +[ t ] [ "aa" "a{,2}" matches? ] unit-test +[ f ] [ "aaa" "a{,2}" matches? ] unit-test +[ f ] [ "aaaa" "a{,2}" matches? ] unit-test +[ f ] [ "aaaaa" "a{,2}" matches? ] unit-test + +[ f ] [ "" "a{1,3}" matches? ] unit-test +[ t ] [ "a" "a{1,3}" matches? ] unit-test +[ t ] [ "aa" "a{1,3}" matches? ] unit-test +[ t ] [ "aaa" "a{1,3}" matches? ] unit-test +[ f ] [ "aaaa" "a{1,3}" matches? ] unit-test + +[ f ] [ "" "[a]" matches? ] unit-test +[ t ] [ "a" "[a]" matches? ] unit-test +[ t ] [ "a" "[abc]" matches? ] unit-test +[ f ] [ "b" "[a]" matches? ] unit-test +[ f ] [ "d" "[abc]" matches? ] unit-test +[ t ] [ "ab" "[abc]{1,2}" matches? ] unit-test +[ f ] [ "abc" "[abc]{1,2}" matches? ] unit-test + +[ f ] [ "" "[^a]" matches? ] unit-test +[ f ] [ "a" "[^a]" matches? ] unit-test +[ f ] [ "a" "[^abc]" matches? ] unit-test +[ t ] [ "b" "[^a]" matches? ] unit-test +[ t ] [ "d" "[^abc]" matches? ] unit-test +[ f ] [ "ab" "[^abc]{1,2}" matches? ] unit-test +[ f ] [ "abc" "[^abc]{1,2}" matches? ] unit-test + +[ t ] [ "]" "[]]" matches? ] unit-test +[ f ] [ "]" "[^]]" matches? ] unit-test +[ t ] [ "a" "[^]]" matches? ] unit-test + +[ "^" "[^]" matches? ] must-fail +[ t ] [ "^" "[]^]" matches? ] unit-test +[ t ] [ "]" "[]^]" matches? ] unit-test + +[ t ] [ "[" "[[]" matches? ] unit-test +[ f ] [ "^" "[^^]" matches? ] unit-test +[ t ] [ "a" "[^^]" matches? ] unit-test + +[ t ] [ "-" "[-]" matches? ] unit-test +[ f ] [ "a" "[-]" matches? ] unit-test +[ f ] [ "-" "[^-]" matches? ] unit-test +[ t ] [ "a" "[^-]" matches? ] unit-test + +[ t ] [ "-" "[-a]" matches? ] unit-test +[ t ] [ "a" "[-a]" matches? ] unit-test +[ t ] [ "-" "[a-]" matches? ] unit-test +[ t ] [ "a" "[a-]" matches? ] unit-test +[ f ] [ "b" "[a-]" matches? ] unit-test +[ f ] [ "-" "[^-]" matches? ] unit-test +[ t ] [ "a" "[^-]" matches? ] unit-test + +[ f ] [ "-" "[a-c]" matches? ] unit-test +[ t ] [ "-" "[^a-c]" matches? ] unit-test +[ t ] [ "b" "[a-c]" matches? ] unit-test +[ f ] [ "b" "[^a-c]" matches? ] unit-test + +[ t ] [ "-" "[a-c-]" matches? ] unit-test +[ f ] [ "-" "[^a-c-]" matches? ] unit-test + +[ t ] [ "\\" "[\\\\]" matches? ] unit-test +[ f ] [ "a" "[\\\\]" matches? ] unit-test +[ f ] [ "\\" "[^\\\\]" matches? ] unit-test +[ t ] [ "a" "[^\\\\]" matches? ] unit-test + +[ t ] [ "0" "[\\d]" matches? ] unit-test +[ f ] [ "a" "[\\d]" matches? ] unit-test +[ f ] [ "0" "[^\\d]" matches? ] unit-test +[ t ] [ "a" "[^\\d]" matches? ] unit-test + +[ t ] [ "a" "[a-z]{1,}|[A-Z]{2,4}|b*|c|(f|g)*" matches? ] unit-test +[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}|b*|c|(f|g)*" matches? ] unit-test +[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}" matches? ] unit-test + +[ t ] [ "1000" "\\d{4,6}" matches? ] unit-test +[ t ] [ "1000" "[0-9]{4,6}" matches? ] unit-test + +[ t ] [ "abc" "\\p{Lower}{3}" matches? ] unit-test +[ f ] [ "ABC" "\\p{Lower}{3}" matches? ] unit-test +[ t ] [ "ABC" "\\p{Upper}{3}" matches? ] unit-test +[ f ] [ "abc" "\\p{Upper}{3}" matches? ] unit-test +! +[ f ] [ "abc" "[\\p{Upper}]{3}" matches? ] unit-test +[ t ] [ "ABC" "[\\p{Upper}]{3}" matches? ] unit-test + +[ t ] [ "" "\\Q\\E" matches? ] unit-test +[ f ] [ "a" "\\Q\\E" matches? ] unit-test +[ t ] [ "|*+" "\\Q|*+\\E" matches? ] unit-test +[ f ] [ "abc" "\\Q|*+\\E" matches? ] unit-test +[ t ] [ "s" "\\Qs\\E" matches? ] unit-test + +[ t ] [ "S" "\\0123" matches? ] unit-test +[ t ] [ "SXY" "\\0123XY" matches? ] unit-test +[ t ] [ "x" "\\x78" matches? ] unit-test +[ f ] [ "y" "\\x78" matches? ] unit-test +[ t ] [ "x" "\\u000078" matches? ] unit-test +[ f ] [ "y" "\\u000078" matches? ] unit-test + +[ t ] [ "ab" "a+b" matches? ] unit-test +[ f ] [ "b" "a+b" matches? ] unit-test +[ t ] [ "aab" "a+b" matches? ] unit-test +[ f ] [ "abb" "a+b" matches? ] unit-test + +[ t ] [ "abbbb" "ab*" matches? ] unit-test +[ t ] [ "a" "ab*" matches? ] unit-test +[ f ] [ "abab" "ab*" matches? ] unit-test + +[ f ] [ "x" "\\." matches? ] unit-test +[ t ] [ "." "\\." matches? ] unit-test + +[ t ] [ "aaaab" "a+ab" matches? ] unit-test +[ f ] [ "aaaxb" "a+ab" matches? ] unit-test +[ t ] [ "aaacb" "a+cb" matches? ] unit-test +[ f ] [ "aaaab" "a++ab" matches? ] unit-test +[ t ] [ "aaacb" "a++cb" matches? ] unit-test + +[ 3 ] [ "aaacb" "a*" match-head ] unit-test +[ 1 ] [ "aaacb" "a+?" match-head ] unit-test +[ 2 ] [ "aaacb" "aa?" match-head ] unit-test +[ 1 ] [ "aaacb" "aa??" match-head ] unit-test +[ 3 ] [ "aacb" "aa?c" match-head ] unit-test +[ 3 ] [ "aacb" "aa??c" match-head ] unit-test + +! [ t ] [ "aaa" "AAA" t matches? ] unit-test +! [ f ] [ "aax" "AAA" t matches? ] unit-test +! [ t ] [ "aaa" "A*" t matches? ] unit-test +! [ f ] [ "aaba" "A*" t matches? ] unit-test +! [ t ] [ "b" "[AB]" t matches? ] unit-test +! [ f ] [ "c" "[AB]" t matches? ] unit-test +! [ t ] [ "c" "[A-Z]" t matches? ] unit-test +! [ f ] [ "3" "[A-Z]" t matches? ] unit-test + +[ ] [ + "(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))" + drop +] unit-test + +[ t ] [ "fxxbar" "(?!foo).{3}bar" matches? ] unit-test +[ f ] [ "foobar" "(?!foo).{3}bar" matches? ] unit-test + +! [ 3 ] [ "foobar" "foo(?=bar)" match-head ] unit-test +! [ f ] [ "foobxr" "foo(?=bar)" match-head ] unit-test + +! [ f ] [ "foobxr" "foo\\z" match-head ] unit-test +! [ 3 ] [ "foo" "foo\\z" match-head ] unit-test + +! [ 3 ] [ "foo bar" "foo\\b" match-head ] unit-test +! [ f ] [ "fooxbar" "foo\\b" matches? ] unit-test +! [ t ] [ "foo" "foo\\b" matches? ] unit-test +! [ t ] [ "foo bar" "foo\\b bar" matches? ] unit-test +! [ f ] [ "fooxbar" "foo\\bxbar" matches? ] unit-test +! [ f ] [ "foo" "foo\\bbar" matches? ] unit-test + +! [ f ] [ "foo bar" "foo\\B" matches? ] unit-test +! [ 3 ] [ "fooxbar" "foo\\B" match-head ] unit-test +! [ t ] [ "foo" "foo\\B" matches? ] unit-test +! [ f ] [ "foo bar" "foo\\B bar" matches? ] unit-test +! [ t ] [ "fooxbar" "foo\\Bxbar" matches? ] unit-test +! [ f ] [ "foo" "foo\\Bbar" matches? ] unit-test + +! [ t ] [ "s@f" "[a-z.-]@[a-z]" matches? ] unit-test +! [ f ] [ "a" "[a-z.-]@[a-z]" matches? ] unit-test +! [ t ] [ ".o" "\\.[a-z]" matches? ] unit-test + +! Bug in parsing word +! [ t ] [ "a" R' a' matches? ] unit-test + +! ((A)(B(C))) +! 1. ((A)(B(C))) +! 2. (A) +! 3. (B(C)) +! 4. (C) diff --git a/extra/regexp2/regexp2.factor b/extra/regexp2/regexp2.factor new file mode 100644 index 0000000000..0f15b3c1ec --- /dev/null +++ b/extra/regexp2/regexp2.factor @@ -0,0 +1,35 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors combinators kernel regexp2.backend regexp2.utils +regexp2.parser regexp2.nfa regexp2.dfa regexp2.traversal state-tables +regexp2.transition-tables ; +IN: regexp2 + +: default-regexp ( string -- regexp ) + regexp new + swap >>raw + >>nfa-table + >>dfa-table + >>minimized-table + reset-regexp ; + +: ( string -- regexp ) + default-regexp + { + [ parse-regexp ] + [ construct-nfa ] + [ construct-dfa ] + [ ] + } cleave ; + +: R! CHAR: ! ; parsing +: R" CHAR: " ; parsing +: R# CHAR: # ; parsing +: R' CHAR: ' ; parsing +: R( CHAR: ) ; parsing +: R/ CHAR: / ; parsing +: R@ CHAR: @ ; parsing +: R[ CHAR: ] ; parsing +: R` CHAR: ` ; parsing +: R{ CHAR: } ; parsing +: R| CHAR: | ; parsing diff --git a/extra/regexp2/transition-tables/transition-tables.factor b/extra/regexp2/transition-tables/transition-tables.factor new file mode 100644 index 0000000000..0547846655 --- /dev/null +++ b/extra/regexp2/transition-tables/transition-tables.factor @@ -0,0 +1,44 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors arrays assocs fry hashtables kernel sequences +vectors ; +IN: regexp2.transition-tables + +: insert-at ( value key hash -- ) + 2dup at* [ + 2nip push + ] [ + drop >r >r dup vector? [ 1vector ] unless r> r> set-at + ] if ; + +: ?insert-at ( value key hash/f -- hash ) + [ H{ } clone ] unless* [ insert-at ] keep ; + +TUPLE: transition from to obj ; +TUPLE: literal-transition < transition ; +TUPLE: class-transition < transition ; +TUPLE: default-transition < transition ; + +TUPLE: literal obj ; +TUPLE: class obj ; +TUPLE: default ; +: ( from to obj -- transition ) literal-transition boa ; +: ( from to obj -- transition ) class-transition boa ; +: ( from to -- transition ) t default-transition boa ; + +TUPLE: transition-table transitions + literals classes defaults + start-state final-states ; + +: ( -- transition-table ) + transition-table new + H{ } clone >>transitions + H{ } clone >>final-states ; + +: set-transition ( transition hash -- ) + >r [ to>> ] [ obj>> ] [ from>> ] tri r> + 2dup at* [ 2nip insert-at ] + [ drop >r >r H{ } clone [ insert-at ] keep r> r> set-at ] if ; + +: add-transition ( transition transition-table -- ) + transitions>> set-transition ; diff --git a/extra/regexp2/traversal/traversal.factor b/extra/regexp2/traversal/traversal.factor new file mode 100644 index 0000000000..2fbdc49a2a --- /dev/null +++ b/extra/regexp2/traversal/traversal.factor @@ -0,0 +1,88 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors assocs combinators combinators.lib kernel +math math.ranges quotations sequences regexp2.parser +regexp2.classes combinators.short-circuit assocs.lib +sequences.lib ; +IN: regexp2.traversal + +TUPLE: dfa-traverser + dfa-table + last-state current-state + text + start-index current-index + matches ; + +: ( text regexp -- match ) + dfa-table>> + dfa-traverser new + swap [ start-state>> >>current-state ] keep + >>dfa-table + swap >>text + 0 >>start-index + 0 >>current-index + V{ } clone >>matches ; + +: final-state? ( dfa-traverser -- ? ) + [ current-state>> ] [ dfa-table>> final-states>> ] bi + key? ; + +: text-finished? ( dfa-traverser -- ? ) + [ current-index>> ] [ text>> length ] bi >= ; + +: save-final-state ( dfa-straverser -- ) + [ current-index>> ] [ matches>> ] bi push ; + +: match-done? ( dfa-traverser -- ? ) + dup final-state? [ + dup save-final-state + ] when text-finished? ; + +: increment-state ( dfa-traverser state -- dfa-traverser ) + >r [ 1+ ] change-current-index + dup current-state>> >>last-state r> + first >>current-state ; + +: match-failed ( dfa-traverser -- dfa-traverser ) + V{ } clone >>matches ; + +: match-literal ( transition from-state table -- to-state/f ) + transitions>> [ at ] [ 2drop f ] if-at ; + +: assoc-with ( param assoc quot -- assoc curry ) + swapd [ [ -rot ] dip call ] 2curry ; inline + +: match-class ( transition from-state table -- to-state/f ) + transitions>> at* [ + [ drop class-member? ] assoc-with assoc-find [ nip ] [ drop ] if + ] [ drop ] if ; + +: match-default ( transition from-state table -- to-state/f ) + [ nip ] dip transitions>> + [ t swap [ drop f ] unless-at ] [ drop f ] if-at ; + +: match-transition ( obj from-state dfa -- to-state/f ) + { [ match-literal ] [ match-class ] [ match-default ] } 3|| ; + +: setup-match ( match -- obj state dfa-table ) + { current-index>> text>> current-state>> dfa-table>> } get-slots + [ nth ] 2dip ; + +: do-match ( dfa-traverser -- dfa-traverser ) + dup match-done? [ + dup setup-match match-transition + [ increment-state do-match ] when* + ] unless ; + +: return-match ( dfa-traverser -- interval/f ) + dup matches>> + [ drop f ] + [ [ start-index>> ] [ peek ] bi* 1 ] if-empty ; + +: match ( string regexp -- pair ) + do-match return-match ; + +: matches? ( string regexp -- ? ) + dupd match [ [ length ] [ range-length 1- ] bi* = ] [ drop f ] if* ; + +: match-head ( string regexp -- end ) match length>> 1- ; diff --git a/extra/regexp2/utils/utils.factor b/extra/regexp2/utils/utils.factor new file mode 100644 index 0000000000..0167e73005 --- /dev/null +++ b/extra/regexp2/utils/utils.factor @@ -0,0 +1,69 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors arrays assocs combinators.lib io kernel +math math.order namespaces regexp2.backend sequences +sequences.lib unicode.categories math.ranges fry +combinators.short-circuit ; +IN: regexp2.utils + +: (while-changes) ( obj quot pred pred-ret -- obj ) + ! quot: ( obj -- obj' ) + ! pred: ( obj -- <=> ) + >r >r dup slip r> pick over call r> dupd = + [ 3drop ] [ (while-changes) ] if ; inline + +: while-changes ( obj quot pred -- obj' ) + pick over call (while-changes) ; inline + +: last-state ( regexp -- range ) stack>> peek first2 [a,b] ; +: push1 ( obj -- ) input-stream get stream>> push ; +: peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ; +: pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ; + +: stack ( -- obj ) current-regexp get stack>> ; +: change-whole-stack ( quot -- ) + current-regexp get + [ stack>> swap call ] keep (>>stack) ; inline +: push-stack ( obj -- ) stack push ; +: pop-stack ( -- obj ) stack pop ; +: cut-out ( vector n -- vector' vector ) cut rest ; +ERROR: cut-stack-error ; +: cut-stack ( obj vector -- vector' vector ) + tuck last-index [ cut-stack-error ] unless* cut-out swap ; + +ERROR: bad-octal number ; +ERROR: bad-hex number ; +: check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ; +: check-hex ( hex -- hex ) dup number? [ bad-hex ] unless ; + +: ascii? ( n -- ? ) 0 HEX: 7f between? ; +: octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ; +: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ; + +: hex-digit? ( n -- ? ) + [ + [ decimal-digit? ] + [ CHAR: a CHAR: f between? ] + [ CHAR: A CHAR: F between? ] + ] 1|| ; + +: control-char? ( n -- ? ) + [ + [ 0 HEX: 1f between? ] + [ HEX: 7f = ] + ] 1|| ; + +: punct? ( n -- ? ) + "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ; + +: c-identifier-char? ( ch -- ? ) + [ [ alpha? ] [ CHAR: _ = ] ] 1|| ; + +: java-blank? ( n -- ? ) + { + CHAR: \s CHAR: \t CHAR: \n + HEX: b HEX: 7 CHAR: \r + } member? ; + +: java-printable? ( n -- ? ) + [ [ alpha? ] [ punct? ] ] 1|| ; From 4a1a87521c25c6f7b3f6af7b2387b4299e73214b Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Mon, 18 Aug 2008 11:29:30 -0500 Subject: [PATCH 16/16] add some metadata --- extra/regexp2/authors.txt | 1 + extra/regexp2/summary.txt | 1 + extra/regexp2/tags.txt | 2 ++ 3 files changed, 4 insertions(+) create mode 100644 extra/regexp2/authors.txt create mode 100644 extra/regexp2/summary.txt create mode 100644 extra/regexp2/tags.txt diff --git a/extra/regexp2/authors.txt b/extra/regexp2/authors.txt new file mode 100644 index 0000000000..7c1b2f2279 --- /dev/null +++ b/extra/regexp2/authors.txt @@ -0,0 +1 @@ +Doug Coleman diff --git a/extra/regexp2/summary.txt b/extra/regexp2/summary.txt new file mode 100644 index 0000000000..aa1e1c27a9 --- /dev/null +++ b/extra/regexp2/summary.txt @@ -0,0 +1 @@ +Regular expressions diff --git a/extra/regexp2/tags.txt b/extra/regexp2/tags.txt new file mode 100644 index 0000000000..65bc471f6b --- /dev/null +++ b/extra/regexp2/tags.txt @@ -0,0 +1,2 @@ +parsing +text