From 915b6d3706b82f4126d95b634d2db0bc94786a9f Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Wed, 21 Jan 2009 18:16:51 -0600 Subject: [PATCH 1/4] Reorganizing XML --- basis/xml/autoencoding/authors.txt | 1 + basis/xml/autoencoding/autoencoding.factor | 64 ++++ basis/xml/data/data.factor | 2 +- basis/xml/dtd/authors.txt | 1 + basis/xml/dtd/dtd.factor | 61 ++++ basis/xml/elements/authors.txt | 1 + basis/xml/elements/elements.factor | 165 +++++++++ basis/xml/errors/errors-tests.factor | 2 + basis/xml/errors/errors.factor | 8 +- basis/xml/name/name.factor | 76 +++++ basis/xml/state/authors.txt | 1 + basis/xml/state/state.factor | 2 +- basis/xml/tests/state-parser-tests.factor | 2 +- basis/xml/tests/test.factor | 13 +- basis/xml/tokenize/tokenize.factor | 374 +-------------------- basis/xml/writer/writer-docs.factor | 8 +- basis/xml/writer/writer-tests.factor | 59 +++- basis/xml/writer/writer.factor | 68 ++-- basis/xml/xml.factor | 10 +- 19 files changed, 511 insertions(+), 407 deletions(-) create mode 100644 basis/xml/autoencoding/authors.txt create mode 100644 basis/xml/autoencoding/autoencoding.factor create mode 100644 basis/xml/dtd/authors.txt create mode 100644 basis/xml/dtd/dtd.factor create mode 100644 basis/xml/elements/authors.txt create mode 100644 basis/xml/elements/elements.factor create mode 100644 basis/xml/name/name.factor create mode 100644 basis/xml/state/authors.txt diff --git a/basis/xml/autoencoding/authors.txt b/basis/xml/autoencoding/authors.txt new file mode 100644 index 0000000000..f990dd0ed2 --- /dev/null +++ b/basis/xml/autoencoding/authors.txt @@ -0,0 +1 @@ +Daniel Ehrenberg diff --git a/basis/xml/autoencoding/autoencoding.factor b/basis/xml/autoencoding/autoencoding.factor new file mode 100644 index 0000000000..5d7e460862 --- /dev/null +++ b/basis/xml/autoencoding/autoencoding.factor @@ -0,0 +1,64 @@ +! Copyright (C) 2005, 2009 Daniel Ehrenberg +! See http://factorcode.org/license.txt for BSD license. +USING: kernel namespaces xml.name io.encodings.utf8 xml.elements +io.encodings.utf16 xml.tokenize xml.state math ascii sequences +io.encodings.string io.encodings combinators ; +IN: xml.autoencoding + +: continue-make-tag ( str -- tag ) + parse-name-starting middle-tag end-tag ; + +: start-utf16le ( -- tag ) + utf16le decode-input-if + CHAR: ? expect + 0 expect check instruct ; + +: 10xxxxxx? ( ch -- ? ) + -6 shift 3 bitand 2 = ; + +: start, in the case of XML chunks? + } case check ; + diff --git a/basis/xml/data/data.factor b/basis/xml/data/data.factor index 8e6ff4bf09..68e91743d3 100644 --- a/basis/xml/data/data.factor +++ b/basis/xml/data/data.factor @@ -45,7 +45,7 @@ C: element-decl TUPLE: attlist-decl < directive name att-defs ; C: attlist-decl -TUPLE: entity-decl < directive name def ; +TUPLE: entity-decl < directive name def pe? ; C: entity-decl TUPLE: system-id system-literal ; diff --git a/basis/xml/dtd/authors.txt b/basis/xml/dtd/authors.txt new file mode 100644 index 0000000000..f990dd0ed2 --- /dev/null +++ b/basis/xml/dtd/authors.txt @@ -0,0 +1 @@ +Daniel Ehrenberg diff --git a/basis/xml/dtd/dtd.factor b/basis/xml/dtd/dtd.factor new file mode 100644 index 0000000000..a1b90a60d7 --- /dev/null +++ b/basis/xml/dtd/dtd.factor @@ -0,0 +1,61 @@ +! Copyright (C) 2005, 2009 Daniel Ehrenberg, Slava Pestov +! See http://factorcode.org/license.txt for BSD license. +USING: xml.tokenize xml.data xml.state kernel sequences ascii +fry xml.errors combinators hashtables namespaces xml.entities +strings ; +IN: xml.dtd + +: take-word ( -- string ) + [ get-char blank? ] take-until ; + +: take-decl-contents ( -- first second ) + pass-blank take-word pass-blank ">" take-string ; + +: take-element-decl ( -- element-decl ) + take-decl-contents ; + +: take-attlist-decl ( -- attlist-decl ) + take-decl-contents ; + +: take-notation-decl ( -- notation-decl ) + take-decl-contents ; + +: take-until-one-of ( seps -- str sep ) + '[ get-char _ member? ] take-until get-char ; + +: take-system-id ( -- system-id ) + parse-quote close ; + +: take-public-id ( -- public-id ) + parse-quote parse-quote close ; + +UNION: dtd-acceptable + directive comment instruction ; + +: (take-external-id) ( token -- external-id ) + pass-blank { + { "SYSTEM" [ take-system-id ] } + { "PUBLIC" [ take-public-id ] } + [ bad-external-id ] + } case ; + +: take-external-id ( -- external-id ) + take-word (take-external-id) ; + +: only-blanks ( str -- ) + [ blank? ] all? [ bad-decl ] unless ; +: take-entity-def ( var -- entity-name entity-def ) + [ + take-word pass-blank get-char { + { CHAR: ' [ parse-quote ] } + { CHAR: " [ parse-quote ] } + [ drop take-external-id ] + } case + ] dip '[ swap _ [ ?set-at ] change ] 2keep ; + +: take-entity-decl ( -- entity-decl ) + pass-blank get-char { + { CHAR: % [ next pass-blank pe-table take-entity-def t ] } + [ drop extra-entities take-entity-def f ] + } case + close ; diff --git a/basis/xml/elements/authors.txt b/basis/xml/elements/authors.txt new file mode 100644 index 0000000000..f990dd0ed2 --- /dev/null +++ b/basis/xml/elements/authors.txt @@ -0,0 +1 @@ +Daniel Ehrenberg diff --git a/basis/xml/elements/elements.factor b/basis/xml/elements/elements.factor new file mode 100644 index 0000000000..65b8b66536 --- /dev/null +++ b/basis/xml/elements/elements.factor @@ -0,0 +1,165 @@ +! Copyright (C) 2005, 2009 Daniel Ehrenberg +! See http://factorcode.org/license.txt for BSD license. +USING: kernel namespaces xml.tokenize xml.state xml.name +xml.data accessors arrays make xml.char-classes fry assocs sequences +math xml.errors sets combinators io.encodings io.encodings.iana +unicode.case xml.dtd strings ; +IN: xml.elements + +: parse-attr ( -- ) + parse-name pass-blank CHAR: = expect pass-blank + t parse-quote* 2array , ; + +: start-tag ( -- name ? ) + #! Outputs the name and whether this is a closing tag + get-char CHAR: / = dup [ next ] when + parse-name swap ; + +: (middle-tag) ( -- ) + pass-blank version=1.0? get-char name-start? + [ parse-attr (middle-tag) ] when ; + +: assure-no-duplicates ( attrs-alist -- attrs-alist ) + H{ } clone 2dup '[ swap _ push-at ] assoc-each + [ nip length 2 >= ] assoc-filter >alist + [ first first2 duplicate-attr ] unless-empty ; + +: middle-tag ( -- attrs-alist ) + ! f make will make a vector if it has any elements + [ (middle-tag) ] f make pass-blank + assure-no-duplicates ; + +: end-tag ( name attrs-alist -- tag ) + tag-ns pass-blank get-char CHAR: / = + [ pop-ns next CHAR: > expect ] + [ depth inc close ] if ; + +: take-comment ( -- comment ) + "--" expect-string + "--" take-string + + CHAR: > expect ; + +: assure-no-extra ( seq -- ) + [ first ] map { + T{ name f "" "version" f } + T{ name f "" "encoding" f } + T{ name f "" "standalone" f } + } diff + [ extra-attrs ] unless-empty ; + +: good-version ( version -- version ) + dup { "1.0" "1.1" } member? [ bad-version ] unless ; + +: prolog-version ( alist -- version ) + T{ name f "" "version" f } swap at + [ good-version ] [ versionless-prolog ] if* ; + +: prolog-encoding ( alist -- encoding ) + T{ name f "" "encoding" f } swap at "UTF-8" or ; + +: yes/no>bool ( string -- t/f ) + { + { "yes" [ t ] } + { "no" [ f ] } + [ not-yes/no ] + } case ; + +: prolog-standalone ( alist -- version ) + T{ name f "" "standalone" f } swap at + [ yes/no>bool ] [ f ] if* ; + +: prolog-attrs ( alist -- prolog ) + [ prolog-version ] + [ prolog-encoding ] + [ prolog-standalone ] + tri ; + +SYMBOL: string-input? +: decode-input-if ( encoding -- ) + string-input? get [ drop ] [ decode-input ] if ; + +: parse-prolog ( -- prolog ) + pass-blank middle-tag "?>" expect-string + dup assure-no-extra prolog-attrs + dup encoding>> dup "UTF-16" = + [ drop ] [ name>encoding [ decode-input-if ] when* ] if + dup prolog-data set ; + +: instruct ( -- instruction ) + take-name { + { [ dup "xml" = ] [ drop parse-prolog ] } + { [ dup >lower "xml" = ] [ capitalized-prolog ] } + { [ dup valid-name? not ] [ bad-name ] } + [ "?>" take-string append ] + } cond ; + +: take-cdata ( -- string ) + depth get zero? [ bad-cdata ] when + "[CDATA[" expect-string "]]>" take-string ; + +DEFER: make-tag ! Is this unavoidable? + +: expand-pe ( -- ) ; ! Make this run the contents of the pe within a DOCTYPE + +: (take-internal-subset) ( -- ) + pass-blank get-char { + { CHAR: ] [ next ] } + { CHAR: % [ expand-pe ] } + { CHAR: < [ + next make-tag dup dtd-acceptable? + [ bad-doctype ] unless , (take-internal-subset) + ] } + [ 1string bad-doctype ] + } case ; + +: take-internal-subset ( -- seq ) + [ + H{ } pe-table set + t in-dtd? set + (take-internal-subset) + ] { } make ; + +: nontrivial-doctype ( -- external-id internal-subset ) + pass-blank get-char CHAR: [ = [ + next take-internal-subset f swap close + ] [ + " >" take-until-one-of { + { CHAR: \s [ (take-external-id) ] } + { CHAR: > [ only-blanks f ] } + } case f + ] if ; + +: take-doctype-decl ( -- doctype-decl ) + pass-blank " >" take-until-one-of { + { CHAR: \s [ nontrivial-doctype ] } + { CHAR: > [ f f ] } + } case ; + + +: take-directive ( -- directive ) + take-name { + { "ELEMENT" [ take-element-decl ] } + { "ATTLIST" [ take-attlist-decl ] } + { "DOCTYPE" [ take-doctype-decl ] } + { "ENTITY" [ take-entity-decl ] } + { "NOTATION" [ take-notation-decl ] } + [ bad-directive ] + } case ; + +: direct ( -- object ) + get-char { + { CHAR: - [ take-comment ] } + { CHAR: [ [ take-cdata ] } + [ drop take-directive ] + } case ; + +: make-tag ( -- tag ) + { + { [ get-char dup CHAR: ! = ] [ drop next direct ] } + { [ CHAR: ? = ] [ next instruct ] } + [ + start-tag [ dup add-ns pop-ns depth dec close ] + [ middle-tag end-tag ] if + ] + } cond ; diff --git a/basis/xml/errors/errors-tests.factor b/basis/xml/errors/errors-tests.factor index 1aff55fa74..bf02f4b6ca 100644 --- a/basis/xml/errors/errors-tests.factor +++ b/basis/xml/errors/errors-tests.factor @@ -34,3 +34,5 @@ T{ duplicate-attr f 1 21 T{ name { space "" } { main "this" } } V{ "a" "b" } } " T{ bad-cdata f 1 3 } "" xml-error-test T{ bad-cdata f 1 7 } "" xml-error-test T{ pre/post-content f "&" t } "&32;" xml-error-test +T{ bad-doctype f 1 17 "a" } "" xml-error-test +T{ bad-doctype f 1 22 T{ opener { name T{ name f "" "foo" "" } } { attrs T{ attrs } } } } " ]>" xml-error-test diff --git a/basis/xml/errors/errors.factor b/basis/xml/errors/errors.factor index fe58eac317..ea6eb51415 100644 --- a/basis/xml/errors/errors.factor +++ b/basis/xml/errors/errors.factor @@ -2,7 +2,7 @@ ! See http://factorcode.org/license.txt for BSD license. USING: xml.data xml.writer kernel generic io prettyprint math debugger sequences xml.state accessors summary -namespaces io.streams.string xml.backend ; +namespaces io.streams.string xml.backend xml.writer.private ; IN: xml.errors TUPLE: parsing-error line column ; @@ -332,6 +332,12 @@ M: not-enough-characters summary ( obj -- str ) "Not enough characters" print ] with-string-writer ; +TUPLE: bad-doctype < parsing-error contents ; +: bad-doctype ( contents -- * ) + \ bad-doctype parsing-error swap >>contents throw ; +M: bad-doctype summary + call-next-method "\nDTD contains invalid object" append ; + UNION: xml-parse-error multitags notags extra-attrs nonexist-ns bad-decl not-yes/no unclosed mismatched expected no-entity diff --git a/basis/xml/name/name.factor b/basis/xml/name/name.factor new file mode 100644 index 0000000000..32053b1eb4 --- /dev/null +++ b/basis/xml/name/name.factor @@ -0,0 +1,76 @@ +! Copyright (C) 2005, 2009 Daniel Ehrenberg +! See http://factorcode.org/license.txt for BSD license. +USING: kernel namespaces accessors xml.tokenize xml.data assocs +xml.errors xml.char-classes combinators.short-circuit splitting +fry xml.state sequences ; +IN: xml.name + +! XML namespace processing: ns = namespace + +! A stack of hashtables +SYMBOL: ns-stack + +: attrs>ns ( attrs-alist -- hash ) + ! this should check to make sure URIs are valid + [ + [ + swap dup space>> "xmlns" = + [ main>> set ] + [ + T{ name f "" "xmlns" f } names-match? + [ "" set ] [ drop ] if + ] if + ] assoc-each + ] { } make-assoc f like ; + +: add-ns ( name -- ) + dup space>> dup ns-stack get assoc-stack + [ nip ] [ nonexist-ns ] if* >>url drop ; + +: push-ns ( hash -- ) + ns-stack get push ; + +: pop-ns ( -- ) + ns-stack get pop* ; + +: init-ns-stack ( -- ) + V{ H{ + { "xml" "http://www.w3.org/XML/1998/namespace" } + { "xmlns" "http://www.w3.org/2000/xmlns" } + { "" "" } + } } clone + ns-stack set ; + +: tag-ns ( name attrs-alist -- name attrs ) + dup attrs>ns push-ns + [ dup add-ns ] dip dup [ drop add-ns ] assoc-each ; + +: valid-name? ( str -- ? ) + [ f ] [ + version=1.0? swap { + [ first name-start? ] + [ rest-slice [ name-char? ] with all? ] + } 2&& + ] if-empty ; + +: prefixed-name ( str -- name/f ) + ":" split dup length 2 = [ + [ [ valid-name? ] all? ] + [ first2 f ] bi and + ] [ drop f ] if ; + +: interpret-name ( str -- name ) + dup prefixed-name [ ] [ + dup valid-name? + [ ] [ bad-name ] if + ] ?if ; + +: take-name ( -- string ) + version=1.0? '[ _ get-char name-char? not ] take-until ; + +: parse-name ( -- name ) + take-name interpret-name ; + +: parse-name-starting ( string -- name ) + take-name append interpret-name ; + diff --git a/basis/xml/state/authors.txt b/basis/xml/state/authors.txt new file mode 100644 index 0000000000..f990dd0ed2 --- /dev/null +++ b/basis/xml/state/authors.txt @@ -0,0 +1 @@ +Daniel Ehrenberg diff --git a/basis/xml/state/state.factor b/basis/xml/state/state.factor index 8978041111..80fb6be982 100644 --- a/basis/xml/state/state.factor +++ b/basis/xml/state/state.factor @@ -1,6 +1,6 @@ ! Copyright (C) 2005, 2009 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. -USING: accessors kernel namespaces ; +USING: accessors kernel namespaces io ; IN: xml.state TUPLE: spot char line column next check ; diff --git a/basis/xml/tests/state-parser-tests.factor b/basis/xml/tests/state-parser-tests.factor index 3ac9d8bc91..31d4a03c7b 100644 --- a/basis/xml/tests/state-parser-tests.factor +++ b/basis/xml/tests/state-parser-tests.factor @@ -2,7 +2,7 @@ USING: tools.test xml.tokenize xml.state io.streams.string kernel io strings asc IN: xml.test.state : string-parse ( str quot -- ) - [ ] dip state-parse ; + [ ] dip with-state ; : take-rest ( -- string ) [ f ] take-until ; diff --git a/basis/xml/tests/test.factor b/basis/xml/tests/test.factor index edbb236581..61873d85bf 100644 --- a/basis/xml/tests/test.factor +++ b/basis/xml/tests/test.factor @@ -3,11 +3,13 @@ IN: xml.tests USING: kernel xml tools.test io namespaces make sequences xml.errors xml.entities.html parser strings xml.data io.files -xml.writer xml.utilities continuations assocs +xml.utilities continuations assocs sequences.deep accessors io.streams.string ; ! This is insufficient \ read-xml must-infer +[ [ drop ] sax ] must-infer +\ string>xml must-infer SYMBOL: xml-file [ ] [ "resource:basis/xml/tests/test.xml" @@ -29,8 +31,6 @@ SYMBOL: xml-file ] unit-test [ V{ "fa&g" } ] [ xml-file get "x" get-id children>> ] unit-test [ "that" ] [ xml-file get "this" swap at ] unit-test -[ "" ] - [ "" string>xml xml>string ] unit-test [ "abcd" ] [ "
abcd
" string>xml [ [ dup string? [ % ] [ drop ] if ] deep-each ] "" make @@ -47,10 +47,6 @@ SYMBOL: xml-file at swap "z" [ tuck ] dip swap set-at T{ name f "blah" "z" f } swap at ] unit-test [ "foo" ] [ "" string>xml children>string ] unit-test -[ "bar baz" ] -[ "bar" string>xml [ " baz" append ] map xml>string ] unit-test -[ "\n\n bar\n" ] -[ " bar " string>xml pprint-xml>string ] unit-test [ "" string>xml ] must-fail [ ] [ "" string>xml drop ] unit-test [ T{ element-decl f "br" "EMPTY" } ] [ "" string>xml-chunk first ] unit-test @@ -61,8 +57,5 @@ SYMBOL: xml-file [ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk first ] unit-test [ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk first ] unit-test [ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk first ] unit-test -[ t ] [ "" dup string>xml-chunk [ write-xml-chunk ] with-string-writer = ] unit-test -[ "foo" ] [ "&bar;" string>xml children>string ] unit-test -[ V{ "hello" } ] [ "hello" string>xml-chunk ] unit-test [ 958 ] [ [ "ξ" string>xml-chunk ] with-html-entities first first ] unit-test [ "x" "<" ] [ "" string>xml [ name>> main>> ] [ "value" swap at ] bi ] unit-test diff --git a/basis/xml/tokenize/tokenize.factor b/basis/xml/tokenize/tokenize.factor index 20ff888305..7a26385332 100644 --- a/basis/xml/tokenize/tokenize.factor +++ b/basis/xml/tokenize/tokenize.factor @@ -1,17 +1,15 @@ -! Copyright (C) 2005, 2006 Daniel Ehrenberg +! Copyright (C) 2005, 2009 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. -USING: accessors arrays ascii assocs combinators locals -combinators.short-circuit fry io.encodings io.encodings.iana -io.encodings.string io.encodings.utf16 io.encodings.utf8 kernel make -math math.parser namespaces sequences sets splitting xml.state -strings xml.char-classes xml.data xml.entities xml.errors hashtables -circular io sbufs ; +USING: namespaces xml.state kernel sequences accessors +xml.char-classes xml.errors math io sbufs fry strings ascii +circular xml.entities assocs make splitting math.parser +locals combinators arrays ; IN: xml.tokenize -! Originally from state-parser - SYMBOL: prolog-data +SYMBOL: depth + : version=1.0? ( -- ? ) prolog-data get [ version>> "1.0" = ] [ t ] if* ; @@ -41,6 +39,14 @@ SYMBOL: prolog-data #! Increment spot. get-char [ unexpected-end ] unless (next) record ; +: init-parser ( -- ) + 0 1 0 f f spot set + read1 set-next next ; + +: with-state ( stream quot -- ) + ! with-input-stream implicitly creates a new scope which we use + swap [ init-parser call ] with-input-stream ; inline + : skip-until ( quot: ( -- ? ) -- ) get-char [ [ call ] keep swap [ drop ] [ @@ -82,89 +88,6 @@ SYMBOL: prolog-data dup [ get-char next ] replicate 2dup = [ 2drop ] [ expected ] if ; -: init-parser ( -- ) - 0 1 0 f f spot set - read1 set-next next ; - -: state-parse ( stream quot -- ) - ! with-input-stream implicitly creates a new scope which we use - swap [ init-parser call ] with-input-stream ; inline - -! XML namespace processing: ns = namespace - -! A stack of hashtables -SYMBOL: ns-stack - -SYMBOL: depth - -: attrs>ns ( attrs-alist -- hash ) - ! this should check to make sure URIs are valid - [ - [ - swap dup space>> "xmlns" = - [ main>> set ] - [ - T{ name f "" "xmlns" f } names-match? - [ "" set ] [ drop ] if - ] if - ] assoc-each - ] { } make-assoc f like ; - -: add-ns ( name -- ) - dup space>> dup ns-stack get assoc-stack - [ nip ] [ nonexist-ns ] if* >>url drop ; - -: push-ns ( hash -- ) - ns-stack get push ; - -: pop-ns ( -- ) - ns-stack get pop* ; - -: init-ns-stack ( -- ) - V{ H{ - { "xml" "http://www.w3.org/XML/1998/namespace" } - { "xmlns" "http://www.w3.org/2000/xmlns" } - { "" "" } - } } clone - ns-stack set ; - -: tag-ns ( name attrs-alist -- name attrs ) - dup attrs>ns push-ns - [ dup add-ns ] dip dup [ drop add-ns ] assoc-each ; - -! Parsing names - -: valid-name? ( str -- ? ) - [ f ] [ - version=1.0? swap { - [ first name-start? ] - [ rest-slice [ name-char? ] with all? ] - } 2&& - ] if-empty ; - -: prefixed-name ( str -- name/f ) - ":" split dup length 2 = [ - [ [ valid-name? ] all? ] - [ first2 f ] bi and - ] [ drop f ] if ; - -: interpret-name ( str -- name ) - dup prefixed-name [ ] [ - dup valid-name? - [ ] [ bad-name ] if - ] ?if ; - -: take-name ( -- string ) - version=1.0? '[ _ get-char name-char? not ] take-until ; - -: parse-name ( -- name ) - take-name interpret-name ; - -: parse-name-starting ( string -- name ) - take-name append interpret-name ; - -! -- Parsing strings - : parse-named-entity ( string -- ) dup entities at [ , ] [ dup extra-entities get at @@ -211,12 +134,8 @@ SYMBOL: in-dtd? char CHAR: < = ] parse-char ; -! Parsing tags - -: start-tag ( -- name ? ) - #! Outputs the name and whether this is a closing tag - get-char CHAR: / = dup [ next ] when - parse-name swap ; +: close ( -- ) + pass-blank CHAR: > expect ; : normalize-quote ( str -- str ) [ dup "\t\r\n" member? [ drop CHAR: \s ] when ] map ; @@ -235,262 +154,3 @@ SYMBOL: in-dtd? : parse-quote ( -- seq ) f parse-quote* ; -: parse-attr ( -- ) - parse-name pass-blank CHAR: = expect pass-blank - t parse-quote* 2array , ; - -: (middle-tag) ( -- ) - pass-blank version=1.0? get-char name-start? - [ parse-attr (middle-tag) ] when ; - -: assure-no-duplicates ( attrs-alist -- attrs-alist ) - H{ } clone 2dup '[ swap _ push-at ] assoc-each - [ nip length 2 >= ] assoc-filter >alist - [ first first2 duplicate-attr ] unless-empty ; - -: middle-tag ( -- attrs-alist ) - ! f make will make a vector if it has any elements - [ (middle-tag) ] f make pass-blank - assure-no-duplicates ; - -: close ( -- ) - pass-blank CHAR: > expect ; - -: end-tag ( name attrs-alist -- tag ) - tag-ns pass-blank get-char CHAR: / = - [ pop-ns next CHAR: > expect ] - [ depth inc close ] if ; - -: take-comment ( -- comment ) - "--" expect-string - "--" take-string - - CHAR: > expect ; - -: take-cdata ( -- string ) - depth get zero? [ bad-cdata ] when - "[CDATA[" expect-string "]]>" take-string ; - -: take-word ( -- string ) - [ get-char blank? ] take-until ; - -: take-decl-contents ( -- first second ) - pass-blank take-word pass-blank ">" take-string ; - -: take-element-decl ( -- element-decl ) - take-decl-contents ; - -: take-attlist-decl ( -- attlist-decl ) - take-decl-contents ; - -: take-notation-decl ( -- notation-decl ) - take-decl-contents ; - -: take-until-one-of ( seps -- str sep ) - '[ get-char _ member? ] take-until get-char ; - -: take-system-id ( -- system-id ) - parse-quote close ; - -: take-public-id ( -- public-id ) - parse-quote parse-quote close ; - -DEFER: direct - -: (take-internal-subset) ( -- ) - pass-blank get-char { - { CHAR: ] [ next ] } - [ drop "" take-until-one-of { - { CHAR: \s [ (take-external-id) ] } - { CHAR: > [ only-blanks f ] } - } case f - ] if ; - -: take-doctype-decl ( -- doctype-decl ) - pass-blank " >" take-until-one-of { - { CHAR: \s [ nontrivial-doctype ] } - { CHAR: > [ f f ] } - } case ; - -: take-entity-def ( var -- entity-name entity-def ) - [ - take-word pass-blank get-char { - { CHAR: ' [ parse-quote ] } - { CHAR: " [ parse-quote ] } - [ drop take-external-id ] - } case swap - ] dip [ [ ?set-at ] change ] 2keep swap ; - -: take-entity-decl ( -- entity-decl ) - pass-blank get-char { - { CHAR: % [ next pass-blank pe-table take-entity-def ] } - [ drop extra-entities take-entity-def ] - } case - close ; - -: take-directive ( -- directive ) - take-name { - { "ELEMENT" [ take-element-decl ] } - { "ATTLIST" [ take-attlist-decl ] } - { "DOCTYPE" [ take-doctype-decl ] } - { "ENTITY" [ take-entity-decl ] } - { "NOTATION" [ take-notation-decl ] } - [ bad-directive ] - } case ; - -: direct ( -- object ) - get-char { - { CHAR: - [ take-comment ] } - { CHAR: [ [ take-cdata ] } - [ drop take-directive ] - } case ; - -: assure-no-extra ( seq -- ) - [ first ] map { - T{ name f "" "version" f } - T{ name f "" "encoding" f } - T{ name f "" "standalone" f } - } diff - [ extra-attrs ] unless-empty ; - -: good-version ( version -- version ) - dup { "1.0" "1.1" } member? [ bad-version ] unless ; - -: prolog-version ( alist -- version ) - T{ name f "" "version" f } swap at - [ good-version ] [ versionless-prolog ] if* ; - -: prolog-encoding ( alist -- encoding ) - T{ name f "" "encoding" f } swap at "UTF-8" or ; - -: yes/no>bool ( string -- t/f ) - { - { "yes" [ t ] } - { "no" [ f ] } - [ not-yes/no ] - } case ; - -: prolog-standalone ( alist -- version ) - T{ name f "" "standalone" f } swap at - [ yes/no>bool ] [ f ] if* ; - -: prolog-attrs ( alist -- prolog ) - [ prolog-version ] - [ prolog-encoding ] - [ prolog-standalone ] - tri ; - -SYMBOL: string-input? -: decode-input-if ( encoding -- ) - string-input? get [ drop ] [ decode-input ] if ; - -: parse-prolog ( -- prolog ) - pass-blank middle-tag "?>" expect-string - dup assure-no-extra prolog-attrs - dup encoding>> dup "UTF-16" = - [ drop ] [ name>encoding [ decode-input-if ] when* ] if - dup prolog-data set ; - -: instruct ( -- instruction ) - take-name { - { [ dup "xml" = ] [ drop parse-prolog ] } - { [ dup >lower "xml" = ] [ capitalized-prolog ] } - { [ dup valid-name? not ] [ bad-name ] } - [ "?>" take-string append ] - } cond ; - -: make-tag ( -- tag ) - { - { [ get-char dup CHAR: ! = ] [ drop next direct ] } - { [ CHAR: ? = ] [ next instruct ] } - [ - start-tag [ dup add-ns pop-ns depth dec close ] - [ middle-tag end-tag ] if - ] - } cond ; - -! Autodetecting encodings - -: continue-make-tag ( str -- tag ) - parse-name-starting middle-tag end-tag ; - -: start-utf16le ( -- tag ) - utf16le decode-input-if - CHAR: ? expect - 0 expect check instruct ; - -: 10xxxxxx? ( ch -- ? ) - -6 shift 3 bitand 2 = ; - -: start, in the case of XML chunks? - } case check ; diff --git a/basis/xml/writer/writer-docs.factor b/basis/xml/writer/writer-docs.factor index 6d5a9de1fc..b470403e84 100644 --- a/basis/xml/writer/writer-docs.factor +++ b/basis/xml/writer/writer-docs.factor @@ -11,7 +11,6 @@ ARTICLE: "xml.writer" "Writing XML" "These words are used to print XML normally" { $subsection xml>string } { $subsection write-xml } - { $subsection print-xml } "These words are used to prettyprint XML" { $subsection pprint-xml>string } { $subsection pprint-xml>string-but } @@ -38,11 +37,6 @@ HELP: write-xml { $description "prints the contents of an XML document to " { $link output-stream } "." } { $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ; -HELP: print-xml -{ $values { "xml" "an XML document" } } -{ $description "prints the contents of an XML document to " { $link output-stream } ", followed by a newline" } -{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ; - HELP: pprint-xml { $values { "xml" "an XML document" } } { $description "prints the contents of an XML document to " { $link output-stream } " in a prettyprinted form." } @@ -58,5 +52,5 @@ HELP: pprint-xml>string-but { $description "Prettyprints an XML document, returning the result as a string and leaving the whitespace of the tags with names in sensitive-tags intact." } { $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ; -{ xml>string print-xml write-xml pprint-xml pprint-xml>string pprint-xml>string-but pprint-xml-but } related-words +{ xml>string write-xml pprint-xml pprint-xml>string pprint-xml>string-but pprint-xml-but } related-words diff --git a/basis/xml/writer/writer-tests.factor b/basis/xml/writer/writer-tests.factor index acfe4bfe1e..2b00c90344 100644 --- a/basis/xml/writer/writer-tests.factor +++ b/basis/xml/writer/writer-tests.factor @@ -1,5 +1,62 @@ +! Copyright (C) 2005, 2009 Daniel Ehrenberg +! See http://factorcode.org/license.txt for BSD license. +USING: xml.data xml.writer tools.test fry xml kernel multiline +xml.writer.private io.streams.string xml.utilities sequences ; IN: xml.writer.tests -USING: xml.data xml.writer tools.test ; + +\ write-xml must-infer +\ xml>string must-infer +\ pprint-xml must-infer +\ pprint-xml-but must-infer [ "foo" ] [ T{ name { main "foo" } } name>string ] unit-test +[ "foo" ] [ T{ name { space "" } { main "foo" } } name>string ] unit-test [ "ns:foo" ] [ T{ name { space "ns" } { main "foo" } } name>string ] unit-test + +: reprints-as ( to from -- ) + [ '[ _ ] ] [ '[ _ string>xml xml>string ] ] bi* unit-test ; + +: pprint-reprints-as ( to from -- ) + [ '[ _ ] ] [ '[ _ string>xml pprint-xml>string ] ] bi* unit-test ; + +: reprints-same ( string -- ) dup reprints-as ; + +"" reprints-same + +{" +]> +bar "} +{" +]> +&foo; "} reprints-as + +{" + + + + + + +]> + + bar +"} +{" + + + + +]> +&foo;"} pprint-reprints-as + +[ t ] [ "" dup string>xml-chunk xml-chunk>string = ] unit-test +[ "foo" ] [ "&bar;" string>xml children>string ] unit-test +[ V{ "hello" } ] [ "hello" string>xml-chunk ] unit-test +[ "
" ] + [ "" string>xml xml>string ] unit-test +[ "bar baz" ] +[ "bar" string>xml [ " baz" append ] map xml>string ] unit-test +[ "\n\n bar\n" ] +[ " bar " string>xml pprint-xml>string ] unit-test diff --git a/basis/xml/writer/writer.factor b/basis/xml/writer/writer.factor index 4d715a1634..3a274d7135 100644 --- a/basis/xml/writer/writer.factor +++ b/basis/xml/writer/writer.factor @@ -1,4 +1,4 @@ -! Copyright (C) 2005, 2006 Daniel Ehrenberg +! Copyright (C) 2005, 2009 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. USING: hashtables kernel math namespaces sequences strings assocs combinators io io.streams.string accessors @@ -11,6 +11,8 @@ SYMBOL: indentation SYMBOL: indenter " " indenter set-global +string ( name -- string ) [ main>> ] [ space>> ] bi [ ":" rot 3append ] unless-empty ; +PRIVATE> + : print-name ( name -- ) name>string write ; + + GENERIC: write-xml-chunk ( object -- ) +> write ">" write ] bi ; +M: notation-decl write-xml-chunk + "> write " " write ] + [ id>> write ">" write ] + bi ; + M: entity-decl write-xml-chunk "> write " " write ] - [ def>> write-xml-chunk ">" write ] - bi ; + [ pe?>> [ " % " write ] when ] + [ name>> write " \"" write ] [ + def>> f xml-pprint? + [ write-xml-chunk ] with-variable + "\">" write + ] tri ; M: system-id write-xml-chunk "SYSTEM '" write system-literal>> write "'" write ; @@ -114,17 +136,21 @@ M: public-id write-xml-chunk [ pubid-literal>> write "' '" write ] [ system-literal>> write "'" write ] bi ; +: write-internal-subset ( seq -- ) + [ + "[" write indent + [ ?indent write-xml-chunk ] each + unindent ?indent "]" write + ] when* ; + M: doctype-decl write-xml-chunk - "> write " " write ] [ external-id>> [ write-xml-chunk " " write ] when* ] - [ - internal-subset>> - [ "[" write [ write-xml-chunk ] each "]" write ] when* ">" write - ] tri ; + [ internal-subset>> write-internal-subset ">" write ] tri ; M: directive write-xml-chunk - "> write CHAR: > write1 ; + "> write CHAR: > write1 nl ; M: instruction write-xml-chunk "> write "?>" write ; @@ -138,6 +164,8 @@ M: sequence write-xml-chunk standalone>> [ "\" standalone=\"yes" write ] when "\"?>" write ; +PRIVATE> + : write-xml ( xml -- ) { [ prolog>> write-prolog ] @@ -149,31 +177,25 @@ M: sequence write-xml-chunk M: xml write-xml-chunk body>> write-xml-chunk ; -: print-xml ( xml -- ) - write-xml nl ; - : xml>string ( xml -- string ) [ write-xml ] with-string-writer ; : xml-chunk>string ( object -- string ) [ write-xml-chunk ] with-string-writer ; -: with-xml-pprint ( sensitive-tags quot -- ) +: pprint-xml-but ( xml sensitive-tags -- ) [ - swap [ assure-name ] map sensitive-tags set + [ assure-name ] map sensitive-tags set 0 indentation set xml-pprint? on - call - ] with-scope ; inline - -: pprint-xml-but ( xml sensitive-tags -- ) - [ print-xml ] with-xml-pprint ; + write-xml + ] with-scope ; : pprint-xml ( xml -- ) f pprint-xml-but ; : pprint-xml>string-but ( xml sensitive-tags -- string ) - [ xml>string ] with-xml-pprint ; + [ pprint-xml-but ] with-string-writer ; : pprint-xml>string ( xml -- string ) f pprint-xml>string-but ; diff --git a/basis/xml/xml.factor b/basis/xml/xml.factor index 727393b9a2..636aa288b5 100644 --- a/basis/xml/xml.factor +++ b/basis/xml/xml.factor @@ -1,9 +1,9 @@ -! Copyright (C) 2005, 2006 Daniel Ehrenberg +! Copyright (C) 2005, 2009 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays io io.encodings.binary io.files io.streams.string kernel namespaces sequences strings -xml.backend xml.data xml.errors xml.tokenize ascii xml.entities -xml.writer xml.state assocs ; +xml.backend xml.data xml.errors xml.elements ascii xml.entities +xml.writer xml.state xml.autoencoding assocs xml.tokenize xml.name ; IN: xml ! -- Overall parser with data tree @@ -132,7 +132,7 @@ TUPLE: pull-xml scope ; reset-prolog init-ns-stack start-document [ call-under ] when* sax-loop - ] state-parse ; inline recursive + ] with-state ; inline recursive : (read-xml) ( -- ) start-document [ process ] when* @@ -144,7 +144,7 @@ TUPLE: pull-xml scope ; done? [ unclosed ] unless xml-stack get first second prolog-data get swap - ] state-parse ; + ] with-state ; : read-xml ( stream -- xml ) 0 depth From 465ed2fca8be8d45b25980bd83c94fc5a29dff5e Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Wed, 21 Jan 2009 18:17:06 -0600 Subject: [PATCH 2/4] New delimiter choices for multiline --- basis/multiline/multiline.factor | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/basis/multiline/multiline.factor b/basis/multiline/multiline.factor index a79c25750c..53c2789c50 100644 --- a/basis/multiline/multiline.factor +++ b/basis/multiline/multiline.factor @@ -51,4 +51,13 @@ PRIVATE> : <" "\">" parse-multiline-string parsed ; parsing +: <' + "'>" parse-multiline-string parsed ; parsing + +: {' + "'}" parse-multiline-string parsed ; parsing + +: {" + "\"}" parse-multiline-string parsed ; parsing + : /* "*/" parse-multiline-string drop ; parsing From 984b68d00b75e90a29859201c926f01b831d5a16 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Wed, 21 Jan 2009 21:57:44 -0600 Subject: [PATCH 3/4] Fixing HTML dependancy on state-parser --- extra/html/parser/parser.factor | 20 +++++------ extra/html/parser/state/state-tests.factor | 13 +++++++ extra/html/parser/state/state.factor | 41 ++++++++++++++++++++++ extra/html/parser/utils/utils-tests.factor | 2 +- extra/html/parser/utils/utils.factor | 7 +--- 5 files changed, 66 insertions(+), 17 deletions(-) create mode 100644 extra/html/parser/state/state-tests.factor create mode 100644 extra/html/parser/state/state.factor diff --git a/extra/html/parser/parser.factor b/extra/html/parser/parser.factor index 836693026a..c445b708c5 100644 --- a/extra/html/parser/parser.factor +++ b/extra/html/parser/parser.factor @@ -2,7 +2,7 @@ ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays html.parser.utils hashtables io kernel namespaces make prettyprint quotations sequences splitting -state-parser strings unicode.categories unicode.case ; +html.parser.state strings unicode.categories unicode.case ; IN: html.parser TUPLE: tag name attributes text closing? ; @@ -59,8 +59,8 @@ SYMBOL: tagstack [ get-char CHAR: " = ] take-until ; : read-quote ( -- string ) - get-char next* CHAR: ' = - [ read-single-quote ] [ read-double-quote ] if next* ; + get-char next CHAR: ' = + [ read-single-quote ] [ read-double-quote ] if next ; : read-key ( -- string ) read-whitespace* @@ -68,7 +68,7 @@ SYMBOL: tagstack : read-= ( -- ) read-whitespace* - [ get-char CHAR: = = ] take-until drop next* ; + [ get-char CHAR: = = ] take-until drop next ; : read-value ( -- string ) read-whitespace* @@ -76,14 +76,14 @@ SYMBOL: tagstack [ blank? ] trim ; : read-comment ( -- ) - "-->" take-string* make-comment-tag push-tag ; + "-->" take-string make-comment-tag push-tag ; : read-dtd ( -- ) - ">" take-string* make-dtd-tag push-tag ; + ">" take-string make-dtd-tag push-tag ; : read-bang ( -- ) - next* get-char CHAR: - = get-next CHAR: - = and [ - next* next* + next get-char CHAR: - = get-next CHAR: - = and [ + next next read-comment ] [ read-dtd @@ -91,10 +91,10 @@ SYMBOL: tagstack : read-tag ( -- string ) [ get-char CHAR: > = get-char CHAR: < = or ] take-until - get-char CHAR: < = [ next* ] unless ; + get-char CHAR: < = [ next ] unless ; : read-< ( -- string ) - next* get-char CHAR: ! = [ + next get-char CHAR: ! = [ read-bang f ] [ read-tag diff --git a/extra/html/parser/state/state-tests.factor b/extra/html/parser/state/state-tests.factor new file mode 100644 index 0000000000..a9be38c0b5 --- /dev/null +++ b/extra/html/parser/state/state-tests.factor @@ -0,0 +1,13 @@ +USING: tools.test html.parser.state ascii kernel ; +IN: html.parser.state.tests + +: take-rest ( -- string ) + [ f ] take-until ; + +: take-char ( -- string ) + [ get-char = ] curry take-until ; + +[ "hello" ] [ "hello" [ take-rest ] string-parse ] unit-test +[ "hi" " how are you?" ] [ "hi how are you?" [ [ get-char blank? ] take-until take-rest ] string-parse ] unit-test +[ "foo" ";bar" ] [ "foo;bar" [ CHAR: ; take-char take-rest ] string-parse ] unit-test +! [ "foo " " bar" ] [ "foo and bar" [ "and" take-string take-rest ] string-parse ] unit-test diff --git a/extra/html/parser/state/state.factor b/extra/html/parser/state/state.factor new file mode 100644 index 0000000000..4b1027d338 --- /dev/null +++ b/extra/html/parser/state/state.factor @@ -0,0 +1,41 @@ +! Copyright (C) 2005, 2009 Daniel Ehrenberg +! See http://factorcode.org/license.txt for BSD license. +USING: namespaces math kernel sequences accessors fry circular ; +IN: html.parser.state + +TUPLE: state string i ; + +: get-i ( -- i ) state get i>> ; + +: get-char ( -- char ) + state get [ i>> ] [ string>> ] bi ?nth ; + +: get-next ( -- char ) + state get [ i>> 1+ ] [ string>> ] bi ?nth ; + +: next ( -- ) + state get [ 1+ ] change-i drop ; + +: string-parse ( string quot -- ) + [ 0 state boa state ] dip with-variable ; + +: short* ( n seq -- n' seq ) + over [ nip dup length swap ] unless ; + +: skip-until ( quot: ( -- ? ) -- ) + get-char [ + [ call ] keep swap + [ drop ] [ next skip-until ] if + ] [ drop ] if ; inline recursive + +: take-until ( quot: ( -- ? ) -- ) + [ get-i ] dip skip-until get-i + state get string>> subseq ; + +: string-matches? ( string circular -- ? ) + get-char over push-circular sequence= ; + +: take-string ( match -- string ) + dup length + [ 2dup string-matches? ] take-until nip + dup length rot length 1- - head next ; diff --git a/extra/html/parser/utils/utils-tests.factor b/extra/html/parser/utils/utils-tests.factor index 4b25db16fd..6d8e3bc05f 100644 --- a/extra/html/parser/utils/utils-tests.factor +++ b/extra/html/parser/utils/utils-tests.factor @@ -1,7 +1,7 @@ USING: assocs combinators continuations hashtables hashtables.private io kernel math namespaces prettyprint quotations sequences splitting -state-parser strings tools.test ; +strings tools.test ; USING: html.parser.utils ; IN: html.parser.utils.tests diff --git a/extra/html/parser/utils/utils.factor b/extra/html/parser/utils/utils.factor index c2a9d73af8..c913b9d306 100644 --- a/extra/html/parser/utils/utils.factor +++ b/extra/html/parser/utils/utils.factor @@ -2,17 +2,12 @@ ! See http://factorcode.org/license.txt for BSD license. USING: assocs circular combinators continuations hashtables hashtables.private io kernel math namespaces prettyprint -quotations sequences splitting state-parser strings +quotations sequences splitting html.parser.state strings combinators.short-circuit ; IN: html.parser.utils : string-parse-end? ( -- ? ) get-next not ; -: take-string* ( match -- string ) - dup length - [ 2dup string-matches? ] take-until nip - dup length rot length 1- - head next* ; - : trim1 ( seq ch -- newseq ) [ [ ?head-slice drop ] [ ?tail-slice drop ] bi ] 2keep drop like ; From 376f332eef363bead9518d157c36a9d8500d4063 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Wed, 21 Jan 2009 22:21:40 -0600 Subject: [PATCH 4/4] allow streams in the post-data tuple --- basis/http/client/client.factor | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/basis/http/client/client.factor b/basis/http/client/client.factor index f2c0a862eb..f8106f4c83 100644 --- a/basis/http/client/client.factor +++ b/basis/http/client/client.factor @@ -34,6 +34,8 @@ IN: http.client GENERIC: >post-data ( object -- post-data ) +M: f >post-data ; + M: post-data >post-data ; M: string >post-data @@ -41,15 +43,13 @@ M: string >post-data "application/octet-stream" swap >>data ; -M: byte-array >post-data - "application/octet-stream" - swap >>data ; - M: assoc >post-data "application/x-www-form-urlencoded" swap >>params ; -M: f >post-data ; +M: object >post-data + "application/octet-stream" + swap >>data ; : normalize-post-data ( request -- request ) dup post-data>> [ @@ -63,8 +63,10 @@ M: f >post-data ; normalize-post-data ; : write-post-data ( request -- request ) - dup method>> [ "POST" = ] [ "PUT" = ] bi or - [ dup post-data>> data>> write ] when ; + dup method>> { "POST" "PUT" } member? [ + dup post-data>> data>> dup sequence? + [ write ] [ output-stream get stream-copy ] if + ] when ; : write-request ( request -- ) unparse-post-data