make tokenize-line configurable, fix bug in take-quoted-string

2009-04-01 15:51:39 -05:00 · 2009-04-01 15:51:39 -05:00 · 6af6de1aac
parent d64e07af8b
commit 6af6de1aac
2 changed files with 26 additions and 3 deletions
--- a/extra/html/parser/state/state-tests.factor
+++ b/extra/html/parser/state/state-tests.factor
@ -53,13 +53,18 @@ IN: html.parser.state.tests
 [ "cd" ]
 [ "abcd" <state-parser> [ "ab" take-sequence drop ] [ "cd" take-sequence ] bi ] unit-test

-
 [ f ]
 [
    "\"abc\" asdf" <state-parser>
    [ CHAR: \ CHAR: " take-quoted-string drop ] [ "asdf" take-sequence ] bi
 ] unit-test

+[ "abc\\\"def" ] ! the escaped quote does not end the string; the escape char stays in the result
+[
+    "\"abc\\\"def\" asdf" <state-parser>
+    CHAR: \ CHAR: " take-quoted-string
+] unit-test
+
 [ "asdf" ]
 [
    "\"abc\" asdf" <state-parser>
@ -82,3 +87,6 @@ IN: html.parser.state.tests

 [ "c" ]
 [ "c" <state-parser> take-token ] unit-test
+
+[ { "a" "b" "c" "abcd e \\\"f g" } ] ! the quoted token keeps its inner blanks and escaped quote
+[ "a b c  \"abcd e \\\"f g\"" CHAR: \ CHAR: " tokenize-line ] unit-test
--- a/extra/html/parser/state/state.factor
+++ b/extra/html/parser/state/state.factor
@ -1,7 +1,8 @@
 ! Copyright (C) 2005, 2009 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
 USING: namespaces math kernel sequences accessors fry circular
-unicode.case unicode.categories locals combinators.short-circuit ;
+unicode.case unicode.categories locals combinators.short-circuit
+make combinators ;

 IN: html.parser.state

@ -87,7 +88,7 @@ TUPLE: state-parser sequence n ;
    state-parser advance
    [
        {
-            [ { [ previous quote-char = ] [ current quote-char = ] } 1&& ]
+            [ { [ previous escape-char = ] [ current quote-char = ] } 1&& ]
            [ current quote-char = not ]
        } 1||
    ] take-while :> string
@ -99,3 +100,17 @@ TUPLE: state-parser sequence n ;

 : take-token ( state-parser -- string )
    skip-whitespace [ current { [ blank? ] [ f = ] } 1|| ] take-until ;
+
+:: (tokenize-line) ( state-parser escape-char quote-char -- )
+    ! Emits each remaining token with `,` (caller supplies the `make` scope); stops when current is f.
+    state-parser skip-whitespace :> parser
+    parser current :> ch
+    {
+        { [ ch quote-char = ] [ parser escape-char quote-char take-quoted-string , t ] }
+        { [ ch not ] [ f ] }
+        [ parser take-token , t ]
+    } cond
+    [ parser escape-char quote-char (tokenize-line) ] when ;
+
+:: tokenize-line ( line escape-char quote-char -- seq ) ! seq: array of the tokens found in line
+    [ line <state-parser> escape-char quote-char (tokenize-line) ] { } make ;