Merge branch 'master' of git://factorcode.org/git/factor

2009-01-21 23:32:59 -06:00 · 2009-01-21 23:32:59 -06:00 · ec8f3172de
parent 09b82321b5 4ace56f31d
commit ec8f3172de
26 changed files with 595 additions and 431 deletions
--- a/basis/http/client/client.factor
+++ b/basis/http/client/client.factor
@ -34,6 +34,8 @@ IN: http.client

 GENERIC: >post-data ( object -- post-data )

+M: f >post-data ;
+
 M: post-data >post-data ;

 M: string >post-data
@ -41,15 +43,13 @@ M: string >post-data
    "application/octet-stream" <post-data>
        swap >>data ;

-M: byte-array >post-data
-    "application/octet-stream" <post-data>
-        swap >>data ;
-
 M: assoc >post-data
    "application/x-www-form-urlencoded" <post-data>
        swap >>params ;

-M: f >post-data ;
+M: object >post-data
+    "application/octet-stream" <post-data>
+        swap >>data ;

 : normalize-post-data ( request -- request )
    dup post-data>> [
@ -63,8 +63,10 @@ M: f >post-data ;
    normalize-post-data ;

 : write-post-data ( request -- request )
-    dup method>> [ "POST" = ] [ "PUT" = ] bi or
-    [ dup post-data>> data>> write ] when ; 
+    dup method>> { "POST" "PUT" } member?  [
+        dup post-data>> data>> dup sequence?
+        [ write ] [ output-stream get stream-copy ] if
+    ] when ; 

 : write-request ( request -- )
    unparse-post-data
--- a/basis/multiline/multiline.factor
+++ b/basis/multiline/multiline.factor
@ -51,4 +51,13 @@ PRIVATE>
 : <"
    "\">" parse-multiline-string parsed ; parsing

+: <'
+    "'>" parse-multiline-string parsed ; parsing
+
+: {'
+    "'}" parse-multiline-string parsed ; parsing
+
+: {"
+    "\"}" parse-multiline-string parsed ; parsing
+
 : /* "*/" parse-multiline-string drop ; parsing
--- a/basis/xml/autoencoding/authors.txt
+++ b/basis/xml/autoencoding/authors.txt
@ -0,0 +1 @@
+Daniel Ehrenberg
--- a/basis/xml/autoencoding/autoencoding.factor
+++ b/basis/xml/autoencoding/autoencoding.factor
@ -0,0 +1,64 @@
+! Copyright (C) 2005, 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: kernel namespaces xml.name io.encodings.utf8 xml.elements
+io.encodings.utf16 xml.tokenize xml.state math ascii sequences
+io.encodings.string io.encodings combinators ;
+IN: xml.autoencoding
+
+: continue-make-tag ( str -- tag )
+    parse-name-starting middle-tag end-tag ;
+
+: start-utf16le ( -- tag )
+    utf16le decode-input-if
+    CHAR: ? expect
+    0 expect check instruct ;
+
+: 10xxxxxx? ( ch -- ? )
+    -6 shift 3 bitand 2 = ;
+          
+: start<name ( ch -- tag )
+    ascii?
+    [ utf8 decode-input-if next make-tag ] [
+        next
+        [ get-next 10xxxxxx? not ] take-until
+        get-char suffix utf8 decode
+        utf8 decode-input-if next
+        continue-make-tag
+    ] if ;
+          
+: start< ( -- tag )
+    get-next {
+        { 0 [ next next start-utf16le ] }
+        { CHAR: ? [ check next next instruct ] } ! XML prolog parsing sets the encoding
+        { CHAR: ! [ check utf8 decode-input next next direct ] }
+        [ check start<name ]
+    } case ;
+
+: skip-utf8-bom ( -- tag )
+    "\u0000bb\u0000bf" expect utf8 decode-input
+    CHAR: < expect check make-tag ;
+
+: decode-expecting ( encoding string -- tag )
+    [ decode-input-if next ] [ expect-string ] bi* check make-tag ;
+
+: start-utf16be ( -- tag )
+    utf16be "<" decode-expecting ;
+
+: skip-utf16le-bom ( -- tag )
+    utf16le "\u0000fe<" decode-expecting ;
+
+: skip-utf16be-bom ( -- tag )
+    utf16be "\u0000ff<" decode-expecting ;
+
+: start-document ( -- tag )
+    get-char {
+        { CHAR: < [ start< ] }
+        { 0 [ start-utf16be ] }
+        { HEX: EF [ skip-utf8-bom ] }
+        { HEX: FF [ skip-utf16le-bom ] }
+        { HEX: FE [ skip-utf16be-bom ] }
+        { f [ "" ] }
+        [ drop utf8 decode-input-if f ]
+        ! Same problem as with <e`>, in the case of XML chunks?
+    } case check ;
+
--- a/basis/xml/data/data.factor
+++ b/basis/xml/data/data.factor
@ -45,7 +45,7 @@ C: <element-decl> element-decl
 TUPLE: attlist-decl < directive name att-defs ;
 C: <attlist-decl> attlist-decl

-TUPLE: entity-decl < directive name def ;
+TUPLE: entity-decl < directive name def pe? ;
 C: <entity-decl> entity-decl

 TUPLE: system-id system-literal ;
--- a/basis/xml/dtd/authors.txt
+++ b/basis/xml/dtd/authors.txt
@ -0,0 +1 @@
+Daniel Ehrenberg
--- a/basis/xml/dtd/dtd.factor
+++ b/basis/xml/dtd/dtd.factor
@ -0,0 +1,61 @@
+! Copyright (C) 2005, 2009 Daniel Ehrenberg, Slava Pestov
+! See http://factorcode.org/license.txt for BSD license.
+USING: xml.tokenize xml.data xml.state kernel sequences ascii
+fry xml.errors combinators hashtables namespaces xml.entities
+strings ;
+IN: xml.dtd
+
+: take-word ( -- string )
+    [ get-char blank? ] take-until ;
+
+: take-decl-contents ( -- first second )
+    pass-blank take-word pass-blank ">" take-string ;
+
+: take-element-decl ( -- element-decl )
+    take-decl-contents <element-decl> ;
+
+: take-attlist-decl ( -- attlist-decl )
+    take-decl-contents <attlist-decl> ;
+
+: take-notation-decl ( -- notation-decl )
+    take-decl-contents <notation-decl> ; 
+
+: take-until-one-of ( seps -- str sep )
+    '[ get-char _ member? ] take-until get-char ;
+
+: take-system-id ( -- system-id )
+    parse-quote <system-id> close ;
+
+: take-public-id ( -- public-id )
+    parse-quote parse-quote <public-id> close ;
+
+UNION: dtd-acceptable
+    directive comment instruction ;
+
+: (take-external-id) ( token -- external-id )
+    pass-blank {
+        { "SYSTEM" [ take-system-id ] }
+        { "PUBLIC" [ take-public-id ] }
+        [ bad-external-id ]
+    } case ;
+
+: take-external-id ( -- external-id )
+    take-word (take-external-id) ;
+
+: only-blanks ( str -- )
+    [ blank? ] all? [ bad-decl ] unless ;
+: take-entity-def ( var -- entity-name entity-def )
+    [
+        take-word pass-blank get-char {
+            { CHAR: ' [ parse-quote ] }
+            { CHAR: " [ parse-quote ] }
+            [ drop take-external-id ]
+        } case
+   ] dip '[ swap _ [ ?set-at ] change ] 2keep ;
+
+: take-entity-decl ( -- entity-decl )
+    pass-blank get-char {
+        { CHAR: % [ next pass-blank pe-table take-entity-def t ] }
+        [ drop extra-entities take-entity-def f ]
+    } case
+    close <entity-decl> ;
--- a/basis/xml/elements/authors.txt
+++ b/basis/xml/elements/authors.txt
@ -0,0 +1 @@
+Daniel Ehrenberg
--- a/basis/xml/elements/elements.factor
+++ b/basis/xml/elements/elements.factor
@ -0,0 +1,165 @@
+! Copyright (C) 2005, 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: kernel namespaces xml.tokenize xml.state xml.name
+xml.data accessors arrays make xml.char-classes fry assocs sequences
+math xml.errors sets combinators io.encodings io.encodings.iana
+unicode.case xml.dtd strings ;
+IN: xml.elements
+
+: parse-attr ( -- )
+    parse-name pass-blank CHAR: = expect pass-blank
+    t parse-quote* 2array , ;
+
+: start-tag ( -- name ? )
+    #! Outputs the name and whether this is a closing tag
+    get-char CHAR: / = dup [ next ] when
+    parse-name swap ;
+
+: (middle-tag) ( -- )
+    pass-blank version=1.0? get-char name-start?
+    [ parse-attr (middle-tag) ] when ;
+
+: assure-no-duplicates ( attrs-alist -- attrs-alist )
+    H{ } clone 2dup '[ swap _ push-at ] assoc-each
+    [ nip length 2 >= ] assoc-filter >alist
+    [ first first2 duplicate-attr ] unless-empty ;
+
+: middle-tag ( -- attrs-alist )
+    ! f make will make a vector if it has any elements
+    [ (middle-tag) ] f make pass-blank
+    assure-no-duplicates ;
+
+: end-tag ( name attrs-alist -- tag )
+    tag-ns pass-blank get-char CHAR: / =
+    [ pop-ns <contained> next CHAR: > expect ]
+    [ depth inc <opener> close ] if ;
+
+: take-comment ( -- comment )
+    "--" expect-string
+    "--" take-string
+    <comment>
+    CHAR: > expect ;
+
+: assure-no-extra ( seq -- )
+    [ first ] map {
+        T{ name f "" "version" f }
+        T{ name f "" "encoding" f }
+        T{ name f "" "standalone" f }
+    } diff
+    [ extra-attrs ] unless-empty ; 
+
+: good-version ( version -- version )
+    dup { "1.0" "1.1" } member? [ bad-version ] unless ;
+
+: prolog-version ( alist -- version )
+    T{ name f "" "version" f } swap at
+    [ good-version ] [ versionless-prolog ] if* ;
+
+: prolog-encoding ( alist -- encoding )
+    T{ name f "" "encoding" f } swap at "UTF-8" or ;
+
+: yes/no>bool ( string -- t/f )
+    {
+        { "yes" [ t ] }
+        { "no" [ f ] }
+        [ not-yes/no ]
+    } case ;
+
+: prolog-standalone ( alist -- version )
+    T{ name f "" "standalone" f } swap at
+    [ yes/no>bool ] [ f ] if* ;
+
+: prolog-attrs ( alist -- prolog )
+    [ prolog-version ]
+    [ prolog-encoding ]
+    [ prolog-standalone ]
+    tri <prolog> ;
+
+SYMBOL: string-input?
+: decode-input-if ( encoding -- )
+    string-input? get [ drop ] [ decode-input ] if ;
+
+: parse-prolog ( -- prolog )
+    pass-blank middle-tag "?>" expect-string
+    dup assure-no-extra prolog-attrs
+    dup encoding>> dup "UTF-16" =
+    [ drop ] [ name>encoding [ decode-input-if ] when* ] if
+    dup prolog-data set ;
+
+: instruct ( -- instruction )
+    take-name {
+        { [ dup "xml" = ] [ drop parse-prolog ] }
+        { [ dup >lower "xml" = ] [ capitalized-prolog ] }
+        { [ dup valid-name? not ] [ bad-name ] }
+        [ "?>" take-string append <instruction> ]
+    } cond ;
+
+: take-cdata ( -- string )
+    depth get zero? [ bad-cdata ] when
+    "[CDATA[" expect-string "]]>" take-string ;
+
+DEFER: make-tag ! Is this unavoidable?
+
+: expand-pe ( -- ) ; ! Make this run the contents of the pe within a DOCTYPE
+
+: (take-internal-subset) ( -- )
+    pass-blank get-char {
+        { CHAR: ] [ next ] }
+        { CHAR: % [ expand-pe ] }
+        { CHAR: < [
+            next make-tag dup dtd-acceptable?
+            [ bad-doctype ] unless , (take-internal-subset)
+        ] }
+        [ 1string bad-doctype ]
+    } case ;
+
+: take-internal-subset ( -- seq )
+    [
+        H{ } pe-table set
+        t in-dtd? set
+        (take-internal-subset)
+    ] { } make ;
+
+: nontrivial-doctype ( -- external-id internal-subset )
+    pass-blank get-char CHAR: [ = [
+        next take-internal-subset f swap close
+    ] [
+        " >" take-until-one-of {
+            { CHAR: \s [ (take-external-id) ] }
+            { CHAR: > [ only-blanks f ] }
+        } case f
+    ] if ;
+
+: take-doctype-decl ( -- doctype-decl )
+    pass-blank " >" take-until-one-of {
+        { CHAR: \s [ nontrivial-doctype ] }
+        { CHAR: > [ f f ] }
+    } case <doctype-decl> ;
+
+
+: take-directive ( -- directive )
+    take-name {
+        { "ELEMENT" [ take-element-decl ] }
+        { "ATTLIST" [ take-attlist-decl ] }
+        { "DOCTYPE" [ take-doctype-decl ] }
+        { "ENTITY" [ take-entity-decl ] }
+        { "NOTATION" [ take-notation-decl ] }
+        [ bad-directive ]
+    } case ;
+
+: direct ( -- object )
+    get-char {
+        { CHAR: - [ take-comment ] }
+        { CHAR: [ [ take-cdata ] }
+        [ drop take-directive ]
+    } case ;
+
+: make-tag ( -- tag )
+    {
+        { [ get-char dup CHAR: ! = ] [ drop next direct ] }
+        { [ CHAR: ? = ] [ next instruct ] }
+        [
+            start-tag [ dup add-ns pop-ns <closer> depth dec close ]
+            [ middle-tag end-tag ] if
+        ]
+    } cond ;
--- a/basis/xml/errors/errors-tests.factor
+++ b/basis/xml/errors/errors-tests.factor
@ -34,3 +34,5 @@ T{ duplicate-attr f 1 21 T{ name { space "" } { main "this" } } V{ "a" "b" } } "
 T{ bad-cdata f 1 3 } "<![CDATA[]]><x/>" xml-error-test
 T{ bad-cdata f 1 7 } "<x/><![CDATA[]]>" xml-error-test
 T{ pre/post-content f "&" t } "&32;<x/>" xml-error-test
+T{ bad-doctype f 1 17 "a" } "<!DOCTYPE foo [ a ]><x/>" xml-error-test
+T{ bad-doctype f 1 22 T{ opener { name T{ name f "" "foo" "" } } { attrs T{ attrs } } } } "<!DOCTYPE foo [ <foo> ]><x/>" xml-error-test
--- a/basis/xml/errors/errors.factor
+++ b/basis/xml/errors/errors.factor
@ -2,7 +2,7 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: xml.data xml.writer kernel generic io prettyprint math 
 debugger sequences xml.state accessors summary
-namespaces io.streams.string xml.backend ;
+namespaces io.streams.string xml.backend xml.writer.private ;
 IN: xml.errors

 TUPLE: parsing-error line column ;
@ -332,6 +332,12 @@ M: not-enough-characters summary ( obj -- str )
        "Not enough characters" print
    ] with-string-writer ;

+TUPLE: bad-doctype < parsing-error contents ;
+: bad-doctype ( contents -- * )
+    \ bad-doctype parsing-error swap >>contents throw ;
+M: bad-doctype summary
+    call-next-method "\nDTD contains invalid object" append ;
+
 UNION: xml-parse-error
    multitags notags extra-attrs nonexist-ns bad-decl
    not-yes/no unclosed mismatched expected no-entity
--- a/basis/xml/name/name.factor
+++ b/basis/xml/name/name.factor
@ -0,0 +1,76 @@
+! Copyright (C) 2005, 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: kernel namespaces accessors xml.tokenize xml.data assocs
+xml.errors xml.char-classes combinators.short-circuit splitting
+fry xml.state sequences ;
+IN: xml.name
+
+! XML namespace processing: ns = namespace
+
+! A stack of hashtables
+SYMBOL: ns-stack
+
+: attrs>ns ( attrs-alist -- hash )
+    ! this should check to make sure URIs are valid
+    [
+        [
+            swap dup space>> "xmlns" =
+            [ main>> set ]
+            [
+                T{ name f "" "xmlns" f } names-match?
+                [ "" set ] [ drop ] if
+            ] if
+        ] assoc-each
+    ] { } make-assoc f like ;
+
+: add-ns ( name -- )
+    dup space>> dup ns-stack get assoc-stack
+    [ nip ] [ nonexist-ns ] if* >>url drop ;
+
+: push-ns ( hash -- )
+    ns-stack get push ;
+
+: pop-ns ( -- )
+    ns-stack get pop* ;
+
+: init-ns-stack ( -- )
+    V{ H{
+        { "xml" "http://www.w3.org/XML/1998/namespace" }
+        { "xmlns" "http://www.w3.org/2000/xmlns" }
+        { "" "" }
+    } } clone
+    ns-stack set ;
+
+: tag-ns ( name attrs-alist -- name attrs )
+    dup attrs>ns push-ns
+    [ dup add-ns ] dip dup [ drop add-ns ] assoc-each <attrs> ;
+
+: valid-name? ( str -- ? )
+    [ f ] [
+        version=1.0? swap {
+            [ first name-start? ]
+            [ rest-slice [ name-char? ] with all? ]
+        } 2&&
+    ] if-empty ;
+
+: prefixed-name ( str -- name/f )
+    ":" split dup length 2 = [
+        [ [ valid-name? ] all? ]
+        [ first2 f <name> ] bi and
+    ] [ drop f ] if ;
+
+: interpret-name ( str -- name )
+    dup prefixed-name [ ] [
+        dup valid-name?
+        [ <simple-name> ] [ bad-name ] if
+    ] ?if ;
+
+: take-name ( -- string )
+    version=1.0? '[ _ get-char name-char? not ] take-until ;
+
+: parse-name ( -- name )
+    take-name interpret-name ;
+
+: parse-name-starting ( string -- name )
+    take-name append interpret-name ;
+
--- a/basis/xml/state/authors.txt
+++ b/basis/xml/state/authors.txt
@ -0,0 +1 @@
+Daniel Ehrenberg
--- a/basis/xml/state/state.factor
+++ b/basis/xml/state/state.factor
@ -1,6 +1,6 @@
 ! Copyright (C) 2005, 2009 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors kernel namespaces ;
+USING: accessors kernel namespaces io ;
 IN: xml.state

 TUPLE: spot char line column next check ;
--- a/basis/xml/tests/state-parser-tests.factor
+++ b/basis/xml/tests/state-parser-tests.factor
@ -2,7 +2,7 @@ USING: tools.test xml.tokenize xml.state io.streams.string kernel io strings asc
 IN: xml.test.state

 : string-parse ( str quot -- )
-    [ <string-reader> ] dip state-parse ;
+    [ <string-reader> ] dip with-state ;

 : take-rest ( -- string )
    [ f ] take-until ;
--- a/basis/xml/tests/test.factor
+++ b/basis/xml/tests/test.factor
@ -3,11 +3,13 @@
 IN: xml.tests
 USING: kernel xml tools.test io namespaces make sequences
 xml.errors xml.entities.html parser strings xml.data io.files
-xml.writer xml.utilities continuations assocs
+xml.utilities continuations assocs
 sequences.deep accessors io.streams.string ;

 ! This is insufficient
 \ read-xml must-infer
+[ [ drop ] sax ] must-infer
+\ string>xml must-infer

 SYMBOL: xml-file
 [ ] [ "resource:basis/xml/tests/test.xml"
@ -29,8 +31,6 @@ SYMBOL: xml-file
 ] unit-test
 [ V{ "fa&g" } ] [ xml-file get "x" get-id children>> ] unit-test
 [ "that" ] [ xml-file get "this" swap at ] unit-test
-[ "<?xml version=\"1.0\" encoding=\"UTF-8\"?><a b=\"c\"/>" ]
-    [ "<a b='c'/>" string>xml xml>string ] unit-test
 [ "abcd" ] [
    "<main>a<sub>bc</sub>d<nothing/></main>" string>xml
    [ [ dup string? [ % ] [ drop ] if ] deep-each ] "" make
@ -47,10 +47,6 @@ SYMBOL: xml-file
    at swap "z" [ tuck ] dip swap set-at
    T{ name f "blah" "z" f } swap at ] unit-test
 [ "foo" ] [ "<boo><![CDATA[foo]]></boo>" string>xml children>string ] unit-test
-[ "<?xml version=\"1.0\" encoding=\"UTF-8\"?><foo>bar baz</foo>" ]
-[ "<foo>bar</foo>" string>xml [ " baz" append ] map xml>string ] unit-test
-[ "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<foo>\n  bar\n</foo>" ]
-[ "<foo>         bar            </foo>" string>xml pprint-xml>string ] unit-test
 [ "<!-- B+, B, or B--->" string>xml ] must-fail
 [ ] [ "<?xml version='1.0'?><!-- declarations for <head> & <body> --><foo/>" string>xml drop ] unit-test
 [ T{ element-decl f "br" "EMPTY" } ] [ "<!ELEMENT br EMPTY>" string>xml-chunk first ] unit-test
@ -61,8 +57,5 @@ SYMBOL: xml-file
 [ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo >" string>xml-chunk first ] unit-test
 [ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM 'blah.dtd'>" string>xml-chunk first ] unit-test
 [ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo   SYSTEM \"blah.dtd\"   >" string>xml-chunk first ] unit-test
-[ t ] [ "<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd' >" dup string>xml-chunk [ write-xml-chunk ] with-string-writer = ] unit-test
-[ "foo" ] [ "<!ENTITY bar 'foo'><x>&bar;</x>" string>xml children>string ] unit-test
-[ V{ "hello" } ] [ "hello" string>xml-chunk ] unit-test
 [ 958 ] [ [ "&xi;" string>xml-chunk ] with-html-entities first first ] unit-test
 [ "x" "<" ] [ "<x value='&lt;'/>" string>xml [ name>> main>> ] [ "value" swap at ] bi ] unit-test
--- a/basis/xml/tokenize/tokenize.factor
+++ b/basis/xml/tokenize/tokenize.factor
@ -1,17 +1,15 @@
-! Copyright (C) 2005, 2006 Daniel Ehrenberg
+! Copyright (C) 2005, 2009 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays ascii assocs combinators locals
-combinators.short-circuit fry io.encodings io.encodings.iana
-io.encodings.string io.encodings.utf16 io.encodings.utf8 kernel make
-math math.parser namespaces sequences sets splitting xml.state
-strings xml.char-classes xml.data xml.entities xml.errors hashtables
-circular io sbufs ;
+USING: namespaces xml.state kernel sequences accessors
+xml.char-classes xml.errors math io sbufs fry strings ascii
+circular xml.entities assocs make splitting math.parser
+locals combinators arrays ;
 IN: xml.tokenize

-! Originally from state-parser
-
 SYMBOL: prolog-data

+SYMBOL: depth
+
 : version=1.0? ( -- ? )
    prolog-data get [ version>> "1.0" = ] [ t ] if* ;

@ -41,6 +39,14 @@ SYMBOL: prolog-data
    #! Increment spot.
    get-char [ unexpected-end ] unless (next) record ;

+: init-parser ( -- )
+    0 1 0 f f <spot> spot set
+    read1 set-next next ;
+
+: with-state ( stream quot -- )
+    ! with-input-stream implicitly creates a new scope which we use
+    swap [ init-parser call ] with-input-stream ; inline
+
 : skip-until ( quot: ( -- ? ) -- )
    get-char [
        [ call ] keep swap [ drop ] [
@ -82,89 +88,6 @@ SYMBOL: prolog-data
    dup [ get-char next ] replicate 2dup =
    [ 2drop ] [ expected ] if ;

-: init-parser ( -- )
-    0 1 0 f f <spot> spot set
-    read1 set-next next ;
-
-: state-parse ( stream quot -- )
-    ! with-input-stream implicitly creates a new scope which we use
-    swap [ init-parser call ] with-input-stream ; inline
-
-! XML namespace processing: ns = namespace
-
-! A stack of hashtables
-SYMBOL: ns-stack
-
-SYMBOL: depth
-
-: attrs>ns ( attrs-alist -- hash )
-    ! this should check to make sure URIs are valid
-    [
-        [
-            swap dup space>> "xmlns" =
-            [ main>> set ]
-            [
-                T{ name f "" "xmlns" f } names-match?
-                [ "" set ] [ drop ] if
-            ] if
-        ] assoc-each
-    ] { } make-assoc f like ;
-
-: add-ns ( name -- )
-    dup space>> dup ns-stack get assoc-stack
-    [ nip ] [ nonexist-ns ] if* >>url drop ;
-
-: push-ns ( hash -- )
-    ns-stack get push ;
-
-: pop-ns ( -- )
-    ns-stack get pop* ;
-
-: init-ns-stack ( -- )
-    V{ H{
-        { "xml" "http://www.w3.org/XML/1998/namespace" }
-        { "xmlns" "http://www.w3.org/2000/xmlns" }
-        { "" "" }
-    } } clone
-    ns-stack set ;
-
-: tag-ns ( name attrs-alist -- name attrs )
-    dup attrs>ns push-ns
-    [ dup add-ns ] dip dup [ drop add-ns ] assoc-each <attrs> ;
-
-! Parsing names
-
-: valid-name? ( str -- ? )
-    [ f ] [
-        version=1.0? swap {
-            [ first name-start? ]
-            [ rest-slice [ name-char? ] with all? ]
-        } 2&&
-    ] if-empty ;
-
-: prefixed-name ( str -- name/f )
-    ":" split dup length 2 = [
-        [ [ valid-name? ] all? ]
-        [ first2 f <name> ] bi and
-    ] [ drop f ] if ;
-
-: interpret-name ( str -- name )
-    dup prefixed-name [ ] [
-        dup valid-name?
-        [ <simple-name> ] [ bad-name ] if
-    ] ?if ;
-
-: take-name ( -- string )
-    version=1.0? '[ _ get-char name-char? not ] take-until ;
-
-: parse-name ( -- name )
-    take-name interpret-name ;
-
-: parse-name-starting ( string -- name )
-    take-name append interpret-name ;
-
-!   -- Parsing strings
-
 : parse-named-entity ( string -- )
    dup entities at [ , ] [
        dup extra-entities get at
@ -211,12 +134,8 @@ SYMBOL: in-dtd?
        char CHAR: < =
    ] parse-char ;

-! Parsing tags
-
-: start-tag ( -- name ? )
-    #! Outputs the name and whether this is a closing tag
-    get-char CHAR: / = dup [ next ] when
-    parse-name swap ;
+: close ( -- )
+    pass-blank CHAR: > expect ;

 : normalize-quote ( str -- str )
    [ dup "\t\r\n" member? [ drop CHAR: \s ] when ] map ;
@ -235,262 +154,3 @@ SYMBOL: in-dtd?
 : parse-quote ( -- seq )
   f parse-quote* ;

-: parse-attr ( -- )
-    parse-name pass-blank CHAR: = expect pass-blank
-    t parse-quote* 2array , ;
-
-: (middle-tag) ( -- )
-    pass-blank version=1.0? get-char name-start?
-    [ parse-attr (middle-tag) ] when ;
-
-: assure-no-duplicates ( attrs-alist -- attrs-alist )
-    H{ } clone 2dup '[ swap _ push-at ] assoc-each
-    [ nip length 2 >= ] assoc-filter >alist
-    [ first first2 duplicate-attr ] unless-empty ;
-
-: middle-tag ( -- attrs-alist )
-    ! f make will make a vector if it has any elements
-    [ (middle-tag) ] f make pass-blank
-    assure-no-duplicates ;
-
-: close ( -- )
-    pass-blank CHAR: > expect ;
-
-: end-tag ( name attrs-alist -- tag )
-    tag-ns pass-blank get-char CHAR: / =
-    [ pop-ns <contained> next CHAR: > expect ]
-    [ depth inc <opener> close ] if ;
-
-: take-comment ( -- comment )
-    "--" expect-string
-    "--" take-string
-    <comment>
-    CHAR: > expect ;
-
-: take-cdata ( -- string )
-    depth get zero? [ bad-cdata ] when
-    "[CDATA[" expect-string "]]>" take-string ;
-
-: take-word ( -- string )
-    [ get-char blank? ] take-until ;
-
-: take-decl-contents ( -- first second )
-    pass-blank take-word pass-blank ">" take-string ;
-
-: take-element-decl ( -- element-decl )
-    take-decl-contents <element-decl> ;
-
-: take-attlist-decl ( -- attlist-decl )
-    take-decl-contents <attlist-decl> ;
-
-: take-notation-decl ( -- notation-decl )
-    take-decl-contents <notation-decl> ; 
-
-: take-until-one-of ( seps -- str sep )
-    '[ get-char _ member? ] take-until get-char ;
-
-: take-system-id ( -- system-id )
-    parse-quote <system-id> close ;
-
-: take-public-id ( -- public-id )
-    parse-quote parse-quote <public-id> close ;
-
-DEFER: direct
-
-: (take-internal-subset) ( -- )
-    pass-blank get-char {
-        { CHAR: ] [ next ] }
-        [ drop "<!" expect-string direct , (take-internal-subset) ]
-    } case ;
-
-: take-internal-subset ( -- seq )
-    [
-        H{ } pe-table set
-        t in-dtd? set
-        (take-internal-subset)
-    ] { } make ;
-
-: (take-external-id) ( token -- external-id )
-    pass-blank {
-        { "SYSTEM" [ take-system-id ] }
-        { "PUBLIC" [ take-public-id ] }
-        [ bad-external-id ]
-    } case ;
-
-: take-external-id ( -- external-id )
-    take-word (take-external-id) ;
-
-: only-blanks ( str -- )
-    [ blank? ] all? [ bad-decl ] unless ;
-
-: nontrivial-doctype ( -- external-id internal-subset )
-    pass-blank get-char CHAR: [ = [
-        next take-internal-subset f swap close
-    ] [
-        " >" take-until-one-of {
-            { CHAR: \s [ (take-external-id) ] }
-            { CHAR: > [ only-blanks f ] }
-        } case f
-    ] if ;
-
-: take-doctype-decl ( -- doctype-decl )
-    pass-blank " >" take-until-one-of {
-        { CHAR: \s [ nontrivial-doctype ] }
-        { CHAR: > [ f f ] }
-    } case <doctype-decl> ;
-
-: take-entity-def ( var -- entity-name entity-def )
-    [
-        take-word pass-blank get-char {
-            { CHAR: ' [ parse-quote ] }
-            { CHAR: " [ parse-quote ] }
-            [ drop take-external-id ]
-        } case swap
-   ] dip [ [ ?set-at ] change ] 2keep swap ;
-
-: take-entity-decl ( -- entity-decl )
-    pass-blank get-char {
-        { CHAR: % [ next pass-blank pe-table take-entity-def ] }
-        [ drop extra-entities take-entity-def ]
-    } case
-    close <entity-decl> ;
-
-: take-directive ( -- directive )
-    take-name {
-        { "ELEMENT" [ take-element-decl ] }
-        { "ATTLIST" [ take-attlist-decl ] }
-        { "DOCTYPE" [ take-doctype-decl ] }
-        { "ENTITY" [ take-entity-decl ] }
-        { "NOTATION" [ take-notation-decl ] }
-        [ bad-directive ]
-    } case ;
-
-: direct ( -- object )
-    get-char {
-        { CHAR: - [ take-comment ] }
-        { CHAR: [ [ take-cdata ] }
-        [ drop take-directive ]
-    } case ;
-
-: assure-no-extra ( seq -- )
-    [ first ] map {
-        T{ name f "" "version" f }
-        T{ name f "" "encoding" f }
-        T{ name f "" "standalone" f }
-    } diff
-    [ extra-attrs ] unless-empty ; 
-
-: good-version ( version -- version )
-    dup { "1.0" "1.1" } member? [ bad-version ] unless ;
-
-: prolog-version ( alist -- version )
-    T{ name f "" "version" f } swap at
-    [ good-version ] [ versionless-prolog ] if* ;
-
-: prolog-encoding ( alist -- encoding )
-    T{ name f "" "encoding" f } swap at "UTF-8" or ;
-
-: yes/no>bool ( string -- t/f )
-    {
-        { "yes" [ t ] }
-        { "no" [ f ] }
-        [ not-yes/no ]
-    } case ;
-
-: prolog-standalone ( alist -- version )
-    T{ name f "" "standalone" f } swap at
-    [ yes/no>bool ] [ f ] if* ;
-
-: prolog-attrs ( alist -- prolog )
-    [ prolog-version ]
-    [ prolog-encoding ]
-    [ prolog-standalone ]
-    tri <prolog> ;
-
-SYMBOL: string-input?
-: decode-input-if ( encoding -- )
-    string-input? get [ drop ] [ decode-input ] if ;
-
-: parse-prolog ( -- prolog )
-    pass-blank middle-tag "?>" expect-string
-    dup assure-no-extra prolog-attrs
-    dup encoding>> dup "UTF-16" =
-    [ drop ] [ name>encoding [ decode-input-if ] when* ] if
-    dup prolog-data set ;
-
-: instruct ( -- instruction )
-    take-name {
-        { [ dup "xml" = ] [ drop parse-prolog ] }
-        { [ dup >lower "xml" = ] [ capitalized-prolog ] }
-        { [ dup valid-name? not ] [ bad-name ] }
-        [ "?>" take-string append <instruction> ]
-    } cond ;
-
-: make-tag ( -- tag )
-    {
-        { [ get-char dup CHAR: ! = ] [ drop next direct ] }
-        { [ CHAR: ? = ] [ next instruct ] }
-        [
-            start-tag [ dup add-ns pop-ns <closer> depth dec close ]
-            [ middle-tag end-tag ] if
-        ]
-    } cond ;
-
-! Autodetecting encodings
-
-: continue-make-tag ( str -- tag )
-    parse-name-starting middle-tag end-tag ;
-
-: start-utf16le ( -- tag )
-    utf16le decode-input-if
-    CHAR: ? expect
-    0 expect check instruct ;
-
-: 10xxxxxx? ( ch -- ? )
-    -6 shift 3 bitand 2 = ;
-          
-: start<name ( ch -- tag )
-    ascii?
-    [ utf8 decode-input-if next make-tag ] [
-        next
-        [ get-next 10xxxxxx? not ] take-until
-        get-char suffix utf8 decode
-        utf8 decode-input-if next
-        continue-make-tag
-    ] if ;
-          
-: start< ( -- tag )
-    get-next {
-        { 0 [ next next start-utf16le ] }
-        { CHAR: ? [ check next next instruct ] } ! XML prolog parsing sets the encoding
-        { CHAR: ! [ check utf8 decode-input next next direct ] }
-        [ check start<name ]
-    } case ;
-
-: skip-utf8-bom ( -- tag )
-    "\u0000bb\u0000bf" expect utf8 decode-input
-    CHAR: < expect check make-tag ;
-
-: decode-expecting ( encoding string -- tag )
-    [ decode-input-if next ] [ expect-string ] bi* check make-tag ;
-
-: start-utf16be ( -- tag )
-    utf16be "<" decode-expecting ;
-
-: skip-utf16le-bom ( -- tag )
-    utf16le "\u0000fe<" decode-expecting ;
-
-: skip-utf16be-bom ( -- tag )
-    utf16be "\u0000ff<" decode-expecting ;
-
-: start-document ( -- tag )
-    get-char {
-        { CHAR: < [ start< ] }
-        { 0 [ start-utf16be ] }
-        { HEX: EF [ skip-utf8-bom ] }
-        { HEX: FF [ skip-utf16le-bom ] }
-        { HEX: FE [ skip-utf16be-bom ] }
-        { f [ "" ] }
-        [ drop utf8 decode-input-if f ]
-        ! Same problem as with <e`>, in the case of XML chunks?
-    } case check ;
--- a/basis/xml/writer/writer-docs.factor
+++ b/basis/xml/writer/writer-docs.factor
@ -11,7 +11,6 @@ ARTICLE: "xml.writer" "Writing XML"
    "These words are used to print XML normally"
    { $subsection xml>string }
    { $subsection write-xml }
-    { $subsection print-xml }
    "These words are used to prettyprint XML"
    { $subsection pprint-xml>string }
    { $subsection pprint-xml>string-but }
@ -38,11 +37,6 @@ HELP: write-xml
 { $description "prints the contents of an XML document to " { $link output-stream } "." }
 { $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;

-HELP: print-xml
-{ $values { "xml" "an XML document" } }
-{ $description "prints the contents of an XML document to " { $link output-stream } ", followed by a newline" }
-{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;
-
 HELP: pprint-xml
 { $values { "xml" "an XML document" } }
 { $description "prints the contents of an XML document to " { $link output-stream } " in a prettyprinted form." }
@ -58,5 +52,5 @@ HELP: pprint-xml>string-but
 { $description "Prettyprints an XML document, returning the result as a string and leaving the whitespace of the tags with names in sensitive-tags intact." }
 { $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;

-{ xml>string print-xml write-xml pprint-xml pprint-xml>string pprint-xml>string-but pprint-xml-but } related-words
+{ xml>string write-xml pprint-xml pprint-xml>string pprint-xml>string-but pprint-xml-but } related-words

--- a/basis/xml/writer/writer-tests.factor
+++ b/basis/xml/writer/writer-tests.factor
@ -1,5 +1,62 @@
+! Copyright (C) 2005, 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: xml.data xml.writer tools.test fry xml kernel multiline
+xml.writer.private io.streams.string xml.utilities sequences ;
 IN: xml.writer.tests
-USING: xml.data xml.writer tools.test ;
+
+\ write-xml must-infer
+\ xml>string must-infer
+\ pprint-xml must-infer
+\ pprint-xml-but must-infer

 [ "foo" ] [ T{ name { main "foo" } } name>string ] unit-test
+[ "foo" ] [ T{ name { space "" } { main "foo" } } name>string ] unit-test
 [ "ns:foo" ] [ T{ name { space "ns" } { main "foo" } } name>string ] unit-test
+
+: reprints-as ( to from -- )
+     [ '[ _ ] ] [ '[ _ string>xml xml>string ] ] bi* unit-test ;
+
+: pprint-reprints-as ( to from -- )
+     [ '[ _ ] ] [ '[ _ string>xml pprint-xml>string ] ] bi* unit-test ;
+
+: reprints-same ( string -- ) dup reprints-as ;
+
+"<?xml version=\"1.0\" encoding=\"UTF-8\"?><x/>" reprints-same
+
+{" <?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE foo [<!ENTITY foo "bar">]>
+<x>bar</x> "}
+{" <?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE foo [<!ENTITY foo 'bar'>]>
+<x>&foo;</x> "} reprints-as
+
+{" <?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE foo [
+  <!ENTITY foo "bar">
+  <!ELEMENT br EMPTY>
+  <!ATTLIST list type    (bullets|ordered|glossary)  "ordered">
+  <!NOTATION foo bar>
+  <?baz bing bang bong?>
+  <!--wtf-->
+]>
+<x>
+  bar
+</x>"}
+{" <?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE foo [ <!ENTITY foo 'bar'> <!ELEMENT br EMPTY>
+<!ATTLIST list
+          type    (bullets|ordered|glossary)  "ordered">
+<!NOTATION 	foo bar> <?baz bing bang bong?>
+      		<!--wtf-->
+]>
+<x>&foo;</x>"} pprint-reprints-as
+
+[ t ] [ "<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd' >" dup string>xml-chunk xml-chunk>string = ] unit-test
+[ "foo" ] [ "<!ENTITY bar 'foo'><x>&bar;</x>" string>xml children>string ] unit-test
+[ V{ "hello" } ] [ "hello" string>xml-chunk ] unit-test
+[ "<?xml version=\"1.0\" encoding=\"UTF-8\"?><a b=\"c\"/>" ]
+    [ "<a b='c'/>" string>xml xml>string ] unit-test
+[ "<?xml version=\"1.0\" encoding=\"UTF-8\"?><foo>bar baz</foo>" ]
+[ "<foo>bar</foo>" string>xml [ " baz" append ] map xml>string ] unit-test
+[ "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<foo>\n  bar\n</foo>" ]
+[ "<foo>         bar            </foo>" string>xml pprint-xml>string ] unit-test
--- a/basis/xml/writer/writer.factor
+++ b/basis/xml/writer/writer.factor
@ -1,4 +1,4 @@
-! Copyright (C) 2005, 2006 Daniel Ehrenberg
+! Copyright (C) 2005, 2009 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
 USING: hashtables kernel math namespaces sequences strings
 assocs combinators io io.streams.string accessors
@ -11,6 +11,8 @@ SYMBOL: indentation
 SYMBOL: indenter
 "  " indenter set-global

+<PRIVATE
+
 : sensitive? ( tag -- ? )
    sensitive-tags get swap '[ _ names-match? ] contains? ;

@ -40,9 +42,13 @@ SYMBOL: indenter
 : name>string ( name -- string )
    [ main>> ] [ space>> ] bi [ ":" rot 3append ] unless-empty ;

+PRIVATE>
+
 : print-name ( name -- )
    name>string write ;

+<PRIVATE
+
 : print-attrs ( assoc -- )
    [
        " " write
@ -52,11 +58,18 @@ SYMBOL: indenter
        "\"" write
    ] assoc-each ;

+PRIVATE>
+
 GENERIC: write-xml-chunk ( object -- )

+<PRIVATE
+
 M: string write-xml-chunk
-    escape-string dup empty? not xml-pprint? get and
-    [ nl 80 indent-string indented-break ] when write ;
+    escape-string xml-pprint? get [
+        dup [ blank? ] all?
+        [ drop "" ]
+        [ nl 80 indent-string indented-break ] if
+    ] when write ;

 : write-tag ( tag -- )
    ?indent CHAR: < write1
@ -100,11 +113,20 @@ M: attlist-decl write-xml-chunk
    [ att-defs>> write ">" write ]
    bi ;

+M: notation-decl write-xml-chunk
+    "<!NOTATION " write
+    [ name>> write " " write ]
+    [ id>> write ">" write ]
+    bi ;
+
 M: entity-decl write-xml-chunk
    "<!ENTITY " write
-    [ name>> write " " write ]
-    [ def>> write-xml-chunk ">" write ]
-    bi ;
+    [ pe?>> [ " % " write ] when ]
+    [ name>> write " \"" write ] [
+        def>> f xml-pprint?
+        [ write-xml-chunk ] with-variable
+        "\">" write
+    ] tri ;

 M: system-id write-xml-chunk
    "SYSTEM '" write system-literal>> write "'" write ;
@ -114,17 +136,21 @@ M: public-id write-xml-chunk
    [ pubid-literal>> write "' '" write ]
    [ system-literal>> write "'" write ] bi ;

+: write-internal-subset ( seq -- )
+    [
+        "[" write indent
+        [ ?indent write-xml-chunk ] each
+        unindent ?indent "]" write
+    ] when* ;
+
 M: doctype-decl write-xml-chunk
-    "<!DOCTYPE " write
+    ?indent "<!DOCTYPE " write
    [ name>> write " " write ]
    [ external-id>> [ write-xml-chunk " " write ] when* ]
-    [
-        internal-subset>>
-        [ "[" write [ write-xml-chunk ] each "]" write ] when* ">" write
-    ] tri ;
+    [ internal-subset>> write-internal-subset ">" write ] tri ;

 M: directive write-xml-chunk
-    "<!" write text>> write CHAR: > write1 ;
+    "<!" write text>> write CHAR: > write1 nl ;

 M: instruction write-xml-chunk
    "<?" write text>> write "?>" write ;
@ -138,6 +164,8 @@ M: sequence write-xml-chunk
    standalone>> [ "\" standalone=\"yes" write ] when
    "\"?>" write ;

+PRIVATE>
+
 : write-xml ( xml -- )
    {
        [ prolog>> write-prolog ]
@ -149,31 +177,25 @@ M: sequence write-xml-chunk
 M: xml write-xml-chunk
    body>> write-xml-chunk ;

-: print-xml ( xml -- )
-    write-xml nl ;
-
 : xml>string ( xml -- string )
    [ write-xml ] with-string-writer ;

 : xml-chunk>string ( object -- string )
    [ write-xml-chunk ] with-string-writer ;

-: with-xml-pprint ( sensitive-tags quot -- )
+: pprint-xml-but ( xml sensitive-tags -- )
    [
-        swap [ assure-name ] map sensitive-tags set
+        [ assure-name ] map sensitive-tags set
        0 indentation set
        xml-pprint? on
-        call
-    ] with-scope ; inline
-
-: pprint-xml-but ( xml sensitive-tags -- )
-    [ print-xml ] with-xml-pprint ;
+        write-xml
+    ] with-scope ;

 : pprint-xml ( xml -- )
    f pprint-xml-but ;

 : pprint-xml>string-but ( xml sensitive-tags -- string )
-    [ xml>string ] with-xml-pprint ;
+    [ pprint-xml-but ] with-string-writer ;

 : pprint-xml>string ( xml -- string )
    f pprint-xml>string-but ;
--- a/basis/xml/xml.factor
+++ b/basis/xml/xml.factor
@ -1,9 +1,9 @@
-! Copyright (C) 2005, 2006 Daniel Ehrenberg
+! Copyright (C) 2005, 2009 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors arrays io io.encodings.binary io.files
 io.streams.string kernel namespaces sequences strings
-xml.backend xml.data xml.errors xml.tokenize ascii xml.entities
-xml.writer xml.state assocs ;
+xml.backend xml.data xml.errors xml.elements ascii xml.entities
+xml.writer xml.state xml.autoencoding assocs xml.tokenize xml.name ;
 IN: xml

 !   -- Overall parser with data tree
@ -132,7 +132,7 @@ TUPLE: pull-xml scope ;
        reset-prolog init-ns-stack
        start-document [ call-under ] when*
        sax-loop
-    ] state-parse ; inline recursive
+    ] with-state ; inline recursive

 : (read-xml) ( -- )
    start-document [ process ] when*
@ -144,7 +144,7 @@ TUPLE: pull-xml scope ;
        done? [ unclosed ] unless
        xml-stack get first second
        prolog-data get swap
-    ] state-parse ;
+    ] with-state ;

 : read-xml ( stream -- xml )
    0 depth
--- a/extra/html/parser/parser.factor
+++ b/extra/html/parser/parser.factor
@ -2,7 +2,7 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors arrays html.parser.utils hashtables io kernel
 namespaces make prettyprint quotations sequences splitting
-state-parser strings unicode.categories unicode.case ;
+html.parser.state strings unicode.categories unicode.case ;
 IN: html.parser

 TUPLE: tag name attributes text closing? ;
@ -59,8 +59,8 @@ SYMBOL: tagstack
    [ get-char CHAR: " = ] take-until ;

 : read-quote ( -- string )
-    get-char next* CHAR: ' =
-    [ read-single-quote ] [ read-double-quote ] if next* ;
+    get-char next CHAR: ' =
+    [ read-single-quote ] [ read-double-quote ] if next ;

 : read-key ( -- string )
    read-whitespace*
@ -68,7 +68,7 @@ SYMBOL: tagstack

 : read-= ( -- )
    read-whitespace*
-    [ get-char CHAR: = = ] take-until drop next* ;
+    [ get-char CHAR: = = ] take-until drop next ;

 : read-value ( -- string )
    read-whitespace*
@ -76,14 +76,14 @@ SYMBOL: tagstack
    [ blank? ] trim ;

 : read-comment ( -- )
-    "-->" take-string* make-comment-tag push-tag ;
+    "-->" take-string make-comment-tag push-tag ;

 : read-dtd ( -- )
-    ">" take-string* make-dtd-tag push-tag ;
+    ">" take-string make-dtd-tag push-tag ;

 : read-bang ( -- )
-    next* get-char CHAR: - = get-next CHAR: - = and [
-        next* next*
+    next get-char CHAR: - = get-next CHAR: - = and [
+        next next
        read-comment
    ] [
        read-dtd
@ -91,10 +91,10 @@ SYMBOL: tagstack

 : read-tag ( -- string )
    [ get-char CHAR: > = get-char CHAR: < = or ] take-until
-    get-char CHAR: < = [ next* ] unless ;
+    get-char CHAR: < = [ next ] unless ;

 : read-< ( -- string )
-    next* get-char CHAR: ! = [
+    next get-char CHAR: ! = [
        read-bang f
    ] [
        read-tag
--- a/extra/html/parser/state/state-tests.factor
+++ b/extra/html/parser/state/state-tests.factor
@ -0,0 +1,13 @@
+USING: tools.test html.parser.state ascii kernel ;
+IN: html.parser.state.tests
+
+: take-rest ( -- string )
+    [ f ] take-until ;
+
+: take-char ( -- string )
+    [ get-char = ] curry take-until ;
+
+[ "hello" ] [ "hello" [ take-rest ] string-parse ] unit-test
+[ "hi" " how are you?" ] [ "hi how are you?" [ [ get-char blank? ] take-until take-rest ] string-parse ] unit-test
+[ "foo" ";bar" ] [ "foo;bar" [ CHAR: ; take-char take-rest ] string-parse ] unit-test
+! [ "foo " " bar" ] [ "foo and bar" [ "and" take-string take-rest ] string-parse ] unit-test
--- a/extra/html/parser/state/state.factor
+++ b/extra/html/parser/state/state.factor
@ -0,0 +1,41 @@
+! Copyright (C) 2005, 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: namespaces math kernel sequences accessors fry circular ;
+IN: html.parser.state
+
+TUPLE: state string i ;
+
+: get-i ( -- i ) state get i>> ;
+
+: get-char ( -- char )
+    state get [ i>> ] [ string>> ] bi ?nth ;
+
+: get-next ( -- char )
+    state get [ i>> 1+ ] [ string>> ] bi ?nth ;
+
+: next ( -- )
+    state get [ 1+ ] change-i drop ;
+
+: string-parse ( string quot -- )
+    [ 0 state boa state ] dip with-variable ;
+
+: short* ( n seq -- n' seq )
+    over [ nip dup length swap ] unless ;
+
+: skip-until ( quot: ( -- ? ) -- )
+    get-char [
+        [ call ] keep swap
+        [ drop ] [ next skip-until ] if
+    ] [ drop ] if ; inline recursive
+
+: take-until ( quot: ( -- ? ) -- )
+    [ get-i ] dip skip-until get-i
+    state get string>> subseq ;
+
+: string-matches? ( string circular -- ? )
+    get-char over push-circular sequence= ;
+
+: take-string ( match -- string )
+    dup length <circular-string>
+    [ 2dup string-matches? ] take-until nip
+    dup length rot length 1- - head next ;
--- a/extra/html/parser/utils/utils-tests.factor
+++ b/extra/html/parser/utils/utils-tests.factor
@ -1,7 +1,7 @@
 USING: assocs combinators continuations hashtables
 hashtables.private io kernel math
 namespaces prettyprint quotations sequences splitting
-state-parser strings tools.test ;
+strings tools.test ;
 USING: html.parser.utils ;
 IN: html.parser.utils.tests

--- a/extra/html/parser/utils/utils.factor
+++ b/extra/html/parser/utils/utils.factor
@ -2,17 +2,12 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: assocs circular combinators continuations hashtables
 hashtables.private io kernel math namespaces prettyprint
-quotations sequences splitting state-parser strings
+quotations sequences splitting html.parser.state strings
 combinators.short-circuit ;
 IN: html.parser.utils

 : string-parse-end? ( -- ? ) get-next not ;

-: take-string* ( match -- string )
-    dup length <circular-string>
-    [ 2dup string-matches? ] take-until nip
-    dup length rot length 1- - head next* ;
-
 : trim1 ( seq ch -- newseq )
    [ [ ?head-slice drop ] [ ?tail-slice drop ] bi ] 2keep drop like ;