make tokenize-line configurable, fix bug in take-quoted-string

db4
Doug Coleman 2009-04-01 15:51:39 -05:00
parent d64e07af8b
commit 6af6de1aac
2 changed files with 26 additions and 3 deletions

View File

@ -53,13 +53,18 @@ IN: html.parser.state.tests
! Consecutive takes: after "ab" has been taken from "abcd", "cd" can be taken next.
[ "cd" ]
[ "abcd" <state-parser> [ "ab" take-sequence drop ] [ "cd" take-sequence ] bi ] unit-test
! Right after the quoted "abc" has been taken, trying to take "asdf" yields f
! (presumably because the parser is left on the space before asdf -- TODO confirm).
[ f ]
[
"\"abc\" asdf" <state-parser>
[ CHAR: \ CHAR: " take-quoted-string drop ] [ "asdf" take-sequence ] bi
] unit-test
! A quote preceded by the escape character does not terminate the string;
! the expected result keeps the escape character and drops only the outer quotes.
[ "abc\\\"def" ]
[
"\"abc\\\"def\" asdf" <state-parser>
CHAR: \ CHAR: " take-quoted-string
] unit-test
[ "asdf" ]
[
"\"abc\" asdf" <state-parser>
@ -82,3 +87,6 @@ IN: html.parser.state.tests
! take-token on a one-character input returns that character as a string.
[ "c" ]
[ "c" <state-parser> take-token ] unit-test
! tokenize-line with \ as escape char and " as quote char: bare words are split on
! spaces, while the quoted section becomes a single token that keeps its inner
! spaces and the escaped quote but not the surrounding quotes.
[ { "a" "b" "c" "abcd e \\\"f g" } ]
[ "a b c \"abcd e \\\"f g\"" CHAR: \ CHAR: " tokenize-line ] unit-test

View File

@ -1,7 +1,8 @@
! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: namespaces math kernel sequences accessors fry circular
unicode.case unicode.categories locals combinators.short-circuit ;
unicode.case unicode.categories locals combinators.short-circuit
make combinators ;
IN: html.parser.state
@ -87,7 +88,7 @@ TUPLE: state-parser sequence n ;
state-parser advance
[
{
[ { [ previous quote-char = ] [ current quote-char = ] } 1&& ]
[ { [ previous escape-char = ] [ current quote-char = ] } 1&& ]
[ current quote-char = not ]
} 1||
] take-while :> string
@ -99,3 +100,17 @@ TUPLE: state-parser sequence n ;
: take-token ( state-parser -- string )
    ! Runs skip-whitespace first, then takes input until the current
    ! element is blank or current yields f.
    skip-whitespace
    [ current { [ blank? ] [ not ] } 1|| ] take-until ;
:: (tokenize-line) ( state-parser escape-char quote-char -- )
    ! Adds one token to the sequence being built by the enclosing make,
    ! then recurses on the same parser; stops when current yields f.
    state-parser skip-whitespace
    dup current {
        { quote-char [
            ! Current element is the quote char: take a whole quoted string.
            dup escape-char quote-char take-quoted-string ,
            escape-char quote-char (tokenize-line)
        ] }
        ! current returned f: nothing left to do, discard the parser.
        { f [ drop ] }
        [
            ! Default branch: case leaves the examined element on the stack, so drop it.
            drop
            dup take-token ,
            escape-char quote-char (tokenize-line)
        ]
    } case ;
:: tokenize-line ( line escape-char quote-char -- seq )
    ! Wraps line in a state-parser and collects every token emitted by
    ! (tokenize-line) into an array.
    [ line <state-parser> escape-char quote-char (tokenize-line) ] { } make ;