Merge branch 'master' of git://factorcode.org/git/factor

db4
Doug Coleman 2009-01-21 23:32:59 -06:00
commit ec8f3172de
26 changed files with 595 additions and 431 deletions

View File

@ -34,6 +34,8 @@ IN: http.client
GENERIC: >post-data ( object -- post-data )
M: f >post-data ;
M: post-data >post-data ;
M: string >post-data
@ -41,15 +43,13 @@ M: string >post-data
"application/octet-stream" <post-data>
swap >>data ;
M: byte-array >post-data
"application/octet-stream" <post-data>
swap >>data ;
M: assoc >post-data
"application/x-www-form-urlencoded" <post-data>
swap >>params ;
M: f >post-data ;
M: object >post-data
"application/octet-stream" <post-data>
swap >>data ;
: normalize-post-data ( request -- request )
dup post-data>> [
@ -63,8 +63,10 @@ M: f >post-data ;
normalize-post-data ;
: write-post-data ( request -- request )
dup method>> [ "POST" = ] [ "PUT" = ] bi or
[ dup post-data>> data>> write ] when ;
dup method>> { "POST" "PUT" } member? [
dup post-data>> data>> dup sequence?
[ write ] [ output-stream get stream-copy ] if
] when ;
: write-request ( request -- )
unparse-post-data

View File

@ -51,4 +51,13 @@ PRIVATE>
: <"
"\">" parse-multiline-string parsed ; parsing
: <'
"'>" parse-multiline-string parsed ; parsing
: {'
"'}" parse-multiline-string parsed ; parsing
: {"
"\"}" parse-multiline-string parsed ; parsing
: /* "*/" parse-multiline-string drop ; parsing

View File

@ -0,0 +1 @@
Daniel Ehrenberg

View File

@ -0,0 +1,64 @@
! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: kernel namespaces xml.name io.encodings.utf8 xml.elements
io.encodings.utf16 xml.tokenize xml.state math ascii sequences
io.encodings.string io.encodings combinators ;
IN: xml.autoencoding
: continue-make-tag ( str -- tag )
parse-name-starting middle-tag end-tag ;
: start-utf16le ( -- tag )
utf16le decode-input-if
CHAR: ? expect
0 expect check instruct ;
: 10xxxxxx? ( ch -- ? )
-6 shift 3 bitand 2 = ;
: start<name ( ch -- tag )
ascii?
[ utf8 decode-input-if next make-tag ] [
next
[ get-next 10xxxxxx? not ] take-until
get-char suffix utf8 decode
utf8 decode-input-if next
continue-make-tag
] if ;
: start< ( -- tag )
get-next {
{ 0 [ next next start-utf16le ] }
{ CHAR: ? [ check next next instruct ] } ! XML prolog parsing sets the encoding
{ CHAR: ! [ check utf8 decode-input next next direct ] }
[ check start<name ]
} case ;
: skip-utf8-bom ( -- tag )
"\u0000bb\u0000bf" expect utf8 decode-input
CHAR: < expect check make-tag ;
: decode-expecting ( encoding string -- tag )
[ decode-input-if next ] [ expect-string ] bi* check make-tag ;
: start-utf16be ( -- tag )
utf16be "<" decode-expecting ;
: skip-utf16le-bom ( -- tag )
utf16le "\u0000fe<" decode-expecting ;
: skip-utf16be-bom ( -- tag )
utf16be "\u0000ff<" decode-expecting ;
: start-document ( -- tag )
get-char {
{ CHAR: < [ start< ] }
{ 0 [ start-utf16be ] }
{ HEX: EF [ skip-utf8-bom ] }
{ HEX: FF [ skip-utf16le-bom ] }
{ HEX: FE [ skip-utf16be-bom ] }
{ f [ "" ] }
[ drop utf8 decode-input-if f ]
! Same problem as with <e`>, in the case of XML chunks?
} case check ;

View File

@ -45,7 +45,7 @@ C: <element-decl> element-decl
TUPLE: attlist-decl < directive name att-defs ;
C: <attlist-decl> attlist-decl
TUPLE: entity-decl < directive name def ;
TUPLE: entity-decl < directive name def pe? ;
C: <entity-decl> entity-decl
TUPLE: system-id system-literal ;

View File

@ -0,0 +1 @@
Daniel Ehrenberg

61
basis/xml/dtd/dtd.factor Normal file
View File

@ -0,0 +1,61 @@
! Copyright (C) 2005, 2009 Daniel Ehrenberg, Slava Pestov
! See http://factorcode.org/license.txt for BSD license.
USING: xml.tokenize xml.data xml.state kernel sequences ascii
fry xml.errors combinators hashtables namespaces xml.entities
strings ;
IN: xml.dtd
: take-word ( -- string )
[ get-char blank? ] take-until ;
: take-decl-contents ( -- first second )
pass-blank take-word pass-blank ">" take-string ;
: take-element-decl ( -- element-decl )
take-decl-contents <element-decl> ;
: take-attlist-decl ( -- attlist-decl )
take-decl-contents <attlist-decl> ;
: take-notation-decl ( -- notation-decl )
take-decl-contents <notation-decl> ;
: take-until-one-of ( seps -- str sep )
'[ get-char _ member? ] take-until get-char ;
: take-system-id ( -- system-id )
parse-quote <system-id> close ;
: take-public-id ( -- public-id )
parse-quote parse-quote <public-id> close ;
UNION: dtd-acceptable
directive comment instruction ;
: (take-external-id) ( token -- external-id )
pass-blank {
{ "SYSTEM" [ take-system-id ] }
{ "PUBLIC" [ take-public-id ] }
[ bad-external-id ]
} case ;
: take-external-id ( -- external-id )
take-word (take-external-id) ;
: only-blanks ( str -- )
[ blank? ] all? [ bad-decl ] unless ;
: take-entity-def ( var -- entity-name entity-def )
[
take-word pass-blank get-char {
{ CHAR: ' [ parse-quote ] }
{ CHAR: " [ parse-quote ] }
[ drop take-external-id ]
} case
] dip '[ swap _ [ ?set-at ] change ] 2keep ;
: take-entity-decl ( -- entity-decl )
pass-blank get-char {
{ CHAR: % [ next pass-blank pe-table take-entity-def t ] }
[ drop extra-entities take-entity-def f ]
} case
close <entity-decl> ;

View File

@ -0,0 +1 @@
Daniel Ehrenberg

View File

@ -0,0 +1,165 @@
! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: kernel namespaces xml.tokenize xml.state xml.name
xml.data accessors arrays make xml.char-classes fry assocs sequences
math xml.errors sets combinators io.encodings io.encodings.iana
unicode.case xml.dtd strings ;
IN: xml.elements
: parse-attr ( -- )
parse-name pass-blank CHAR: = expect pass-blank
t parse-quote* 2array , ;
: start-tag ( -- name ? )
#! Outputs the name and whether this is a closing tag
get-char CHAR: / = dup [ next ] when
parse-name swap ;
: (middle-tag) ( -- )
pass-blank version=1.0? get-char name-start?
[ parse-attr (middle-tag) ] when ;
: assure-no-duplicates ( attrs-alist -- attrs-alist )
H{ } clone 2dup '[ swap _ push-at ] assoc-each
[ nip length 2 >= ] assoc-filter >alist
[ first first2 duplicate-attr ] unless-empty ;
: middle-tag ( -- attrs-alist )
! f make will make a vector if it has any elements
[ (middle-tag) ] f make pass-blank
assure-no-duplicates ;
: end-tag ( name attrs-alist -- tag )
tag-ns pass-blank get-char CHAR: / =
[ pop-ns <contained> next CHAR: > expect ]
[ depth inc <opener> close ] if ;
: take-comment ( -- comment )
"--" expect-string
"--" take-string
<comment>
CHAR: > expect ;
: assure-no-extra ( seq -- )
[ first ] map {
T{ name f "" "version" f }
T{ name f "" "encoding" f }
T{ name f "" "standalone" f }
} diff
[ extra-attrs ] unless-empty ;
: good-version ( version -- version )
dup { "1.0" "1.1" } member? [ bad-version ] unless ;
: prolog-version ( alist -- version )
T{ name f "" "version" f } swap at
[ good-version ] [ versionless-prolog ] if* ;
: prolog-encoding ( alist -- encoding )
T{ name f "" "encoding" f } swap at "UTF-8" or ;
: yes/no>bool ( string -- t/f )
{
{ "yes" [ t ] }
{ "no" [ f ] }
[ not-yes/no ]
} case ;
: prolog-standalone ( alist -- version )
T{ name f "" "standalone" f } swap at
[ yes/no>bool ] [ f ] if* ;
: prolog-attrs ( alist -- prolog )
[ prolog-version ]
[ prolog-encoding ]
[ prolog-standalone ]
tri <prolog> ;
SYMBOL: string-input?
: decode-input-if ( encoding -- )
string-input? get [ drop ] [ decode-input ] if ;
: parse-prolog ( -- prolog )
pass-blank middle-tag "?>" expect-string
dup assure-no-extra prolog-attrs
dup encoding>> dup "UTF-16" =
[ drop ] [ name>encoding [ decode-input-if ] when* ] if
dup prolog-data set ;
: instruct ( -- instruction )
take-name {
{ [ dup "xml" = ] [ drop parse-prolog ] }
{ [ dup >lower "xml" = ] [ capitalized-prolog ] }
{ [ dup valid-name? not ] [ bad-name ] }
[ "?>" take-string append <instruction> ]
} cond ;
: take-cdata ( -- string )
depth get zero? [ bad-cdata ] when
"[CDATA[" expect-string "]]>" take-string ;
DEFER: make-tag ! Is this unavoidable?
: expand-pe ( -- ) ; ! Make this run the contents of the pe within a DOCTYPE
: (take-internal-subset) ( -- )
pass-blank get-char {
{ CHAR: ] [ next ] }
{ CHAR: % [ expand-pe ] }
{ CHAR: < [
next make-tag dup dtd-acceptable?
[ bad-doctype ] unless , (take-internal-subset)
] }
[ 1string bad-doctype ]
} case ;
: take-internal-subset ( -- seq )
[
H{ } pe-table set
t in-dtd? set
(take-internal-subset)
] { } make ;
: nontrivial-doctype ( -- external-id internal-subset )
pass-blank get-char CHAR: [ = [
next take-internal-subset f swap close
] [
" >" take-until-one-of {
{ CHAR: \s [ (take-external-id) ] }
{ CHAR: > [ only-blanks f ] }
} case f
] if ;
: take-doctype-decl ( -- doctype-decl )
pass-blank " >" take-until-one-of {
{ CHAR: \s [ nontrivial-doctype ] }
{ CHAR: > [ f f ] }
} case <doctype-decl> ;
: take-directive ( -- directive )
take-name {
{ "ELEMENT" [ take-element-decl ] }
{ "ATTLIST" [ take-attlist-decl ] }
{ "DOCTYPE" [ take-doctype-decl ] }
{ "ENTITY" [ take-entity-decl ] }
{ "NOTATION" [ take-notation-decl ] }
[ bad-directive ]
} case ;
: direct ( -- object )
get-char {
{ CHAR: - [ take-comment ] }
{ CHAR: [ [ take-cdata ] }
[ drop take-directive ]
} case ;
: make-tag ( -- tag )
{
{ [ get-char dup CHAR: ! = ] [ drop next direct ] }
{ [ CHAR: ? = ] [ next instruct ] }
[
start-tag [ dup add-ns pop-ns <closer> depth dec close ]
[ middle-tag end-tag ] if
]
} cond ;

View File

@ -34,3 +34,5 @@ T{ duplicate-attr f 1 21 T{ name { space "" } { main "this" } } V{ "a" "b" } } "
T{ bad-cdata f 1 3 } "<![CDATA[]]><x/>" xml-error-test
T{ bad-cdata f 1 7 } "<x/><![CDATA[]]>" xml-error-test
T{ pre/post-content f "&" t } "&32;<x/>" xml-error-test
T{ bad-doctype f 1 17 "a" } "<!DOCTYPE foo [ a ]><x/>" xml-error-test
T{ bad-doctype f 1 22 T{ opener { name T{ name f "" "foo" "" } } { attrs T{ attrs } } } } "<!DOCTYPE foo [ <foo> ]><x/>" xml-error-test

View File

@ -2,7 +2,7 @@
! See http://factorcode.org/license.txt for BSD license.
USING: xml.data xml.writer kernel generic io prettyprint math
debugger sequences xml.state accessors summary
namespaces io.streams.string xml.backend ;
namespaces io.streams.string xml.backend xml.writer.private ;
IN: xml.errors
TUPLE: parsing-error line column ;
@ -332,6 +332,12 @@ M: not-enough-characters summary ( obj -- str )
"Not enough characters" print
] with-string-writer ;
TUPLE: bad-doctype < parsing-error contents ;
: bad-doctype ( contents -- * )
\ bad-doctype parsing-error swap >>contents throw ;
M: bad-doctype summary
call-next-method "\nDTD contains invalid object" append ;
UNION: xml-parse-error
multitags notags extra-attrs nonexist-ns bad-decl
not-yes/no unclosed mismatched expected no-entity

View File

@ -0,0 +1,76 @@
! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: kernel namespaces accessors xml.tokenize xml.data assocs
xml.errors xml.char-classes combinators.short-circuit splitting
fry xml.state sequences ;
IN: xml.name
! XML namespace processing: ns = namespace
! A stack of hashtables
SYMBOL: ns-stack
: attrs>ns ( attrs-alist -- hash )
! this should check to make sure URIs are valid
[
[
swap dup space>> "xmlns" =
[ main>> set ]
[
T{ name f "" "xmlns" f } names-match?
[ "" set ] [ drop ] if
] if
] assoc-each
] { } make-assoc f like ;
: add-ns ( name -- )
dup space>> dup ns-stack get assoc-stack
[ nip ] [ nonexist-ns ] if* >>url drop ;
: push-ns ( hash -- )
ns-stack get push ;
: pop-ns ( -- )
ns-stack get pop* ;
: init-ns-stack ( -- )
V{ H{
{ "xml" "http://www.w3.org/XML/1998/namespace" }
{ "xmlns" "http://www.w3.org/2000/xmlns" }
{ "" "" }
} } clone
ns-stack set ;
: tag-ns ( name attrs-alist -- name attrs )
dup attrs>ns push-ns
[ dup add-ns ] dip dup [ drop add-ns ] assoc-each <attrs> ;
: valid-name? ( str -- ? )
[ f ] [
version=1.0? swap {
[ first name-start? ]
[ rest-slice [ name-char? ] with all? ]
} 2&&
] if-empty ;
: prefixed-name ( str -- name/f )
":" split dup length 2 = [
[ [ valid-name? ] all? ]
[ first2 f <name> ] bi and
] [ drop f ] if ;
: interpret-name ( str -- name )
dup prefixed-name [ ] [
dup valid-name?
[ <simple-name> ] [ bad-name ] if
] ?if ;
: take-name ( -- string )
version=1.0? '[ _ get-char name-char? not ] take-until ;
: parse-name ( -- name )
take-name interpret-name ;
: parse-name-starting ( string -- name )
take-name append interpret-name ;

View File

@ -0,0 +1 @@
Daniel Ehrenberg

View File

@ -1,6 +1,6 @@
! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: accessors kernel namespaces ;
USING: accessors kernel namespaces io ;
IN: xml.state
TUPLE: spot char line column next check ;

View File

@ -2,7 +2,7 @@ USING: tools.test xml.tokenize xml.state io.streams.string kernel io strings asc
IN: xml.test.state
: string-parse ( str quot -- )
[ <string-reader> ] dip state-parse ;
[ <string-reader> ] dip with-state ;
: take-rest ( -- string )
[ f ] take-until ;

View File

@ -3,11 +3,13 @@
IN: xml.tests
USING: kernel xml tools.test io namespaces make sequences
xml.errors xml.entities.html parser strings xml.data io.files
xml.writer xml.utilities continuations assocs
xml.utilities continuations assocs
sequences.deep accessors io.streams.string ;
! This is insufficient
\ read-xml must-infer
[ [ drop ] sax ] must-infer
\ string>xml must-infer
SYMBOL: xml-file
[ ] [ "resource:basis/xml/tests/test.xml"
@ -29,8 +31,6 @@ SYMBOL: xml-file
] unit-test
[ V{ "fa&g" } ] [ xml-file get "x" get-id children>> ] unit-test
[ "that" ] [ xml-file get "this" swap at ] unit-test
[ "<?xml version=\"1.0\" encoding=\"UTF-8\"?><a b=\"c\"/>" ]
[ "<a b='c'/>" string>xml xml>string ] unit-test
[ "abcd" ] [
"<main>a<sub>bc</sub>d<nothing/></main>" string>xml
[ [ dup string? [ % ] [ drop ] if ] deep-each ] "" make
@ -47,10 +47,6 @@ SYMBOL: xml-file
at swap "z" [ tuck ] dip swap set-at
T{ name f "blah" "z" f } swap at ] unit-test
[ "foo" ] [ "<boo><![CDATA[foo]]></boo>" string>xml children>string ] unit-test
[ "<?xml version=\"1.0\" encoding=\"UTF-8\"?><foo>bar baz</foo>" ]
[ "<foo>bar</foo>" string>xml [ " baz" append ] map xml>string ] unit-test
[ "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<foo>\n bar\n</foo>" ]
[ "<foo> bar </foo>" string>xml pprint-xml>string ] unit-test
[ "<!-- B+, B, or B--->" string>xml ] must-fail
[ ] [ "<?xml version='1.0'?><!-- declarations for <head> & <body> --><foo/>" string>xml drop ] unit-test
[ T{ element-decl f "br" "EMPTY" } ] [ "<!ELEMENT br EMPTY>" string>xml-chunk first ] unit-test
@ -61,8 +57,5 @@ SYMBOL: xml-file
[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo >" string>xml-chunk first ] unit-test
[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM 'blah.dtd'>" string>xml-chunk first ] unit-test
[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM \"blah.dtd\" >" string>xml-chunk first ] unit-test
[ t ] [ "<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd' >" dup string>xml-chunk [ write-xml-chunk ] with-string-writer = ] unit-test
[ "foo" ] [ "<!ENTITY bar 'foo'><x>&bar;</x>" string>xml children>string ] unit-test
[ V{ "hello" } ] [ "hello" string>xml-chunk ] unit-test
[ 958 ] [ [ "&xi;" string>xml-chunk ] with-html-entities first first ] unit-test
[ "x" "<" ] [ "<x value='&lt;'/>" string>xml [ name>> main>> ] [ "value" swap at ] bi ] unit-test

View File

@ -1,17 +1,15 @@
! Copyright (C) 2005, 2006 Daniel Ehrenberg
! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays ascii assocs combinators locals
combinators.short-circuit fry io.encodings io.encodings.iana
io.encodings.string io.encodings.utf16 io.encodings.utf8 kernel make
math math.parser namespaces sequences sets splitting xml.state
strings xml.char-classes xml.data xml.entities xml.errors hashtables
circular io sbufs ;
USING: namespaces xml.state kernel sequences accessors
xml.char-classes xml.errors math io sbufs fry strings ascii
circular xml.entities assocs make splitting math.parser
locals combinators arrays ;
IN: xml.tokenize
! Originally from state-parser
SYMBOL: prolog-data
SYMBOL: depth
: version=1.0? ( -- ? )
prolog-data get [ version>> "1.0" = ] [ t ] if* ;
@ -41,6 +39,14 @@ SYMBOL: prolog-data
#! Increment spot.
get-char [ unexpected-end ] unless (next) record ;
: init-parser ( -- )
0 1 0 f f <spot> spot set
read1 set-next next ;
: with-state ( stream quot -- )
! with-input-stream implicitly creates a new scope which we use
swap [ init-parser call ] with-input-stream ; inline
: skip-until ( quot: ( -- ? ) -- )
get-char [
[ call ] keep swap [ drop ] [
@ -82,89 +88,6 @@ SYMBOL: prolog-data
dup [ get-char next ] replicate 2dup =
[ 2drop ] [ expected ] if ;
: init-parser ( -- )
0 1 0 f f <spot> spot set
read1 set-next next ;
: state-parse ( stream quot -- )
! with-input-stream implicitly creates a new scope which we use
swap [ init-parser call ] with-input-stream ; inline
! XML namespace processing: ns = namespace
! A stack of hashtables
SYMBOL: ns-stack
SYMBOL: depth
: attrs>ns ( attrs-alist -- hash )
! this should check to make sure URIs are valid
[
[
swap dup space>> "xmlns" =
[ main>> set ]
[
T{ name f "" "xmlns" f } names-match?
[ "" set ] [ drop ] if
] if
] assoc-each
] { } make-assoc f like ;
: add-ns ( name -- )
dup space>> dup ns-stack get assoc-stack
[ nip ] [ nonexist-ns ] if* >>url drop ;
: push-ns ( hash -- )
ns-stack get push ;
: pop-ns ( -- )
ns-stack get pop* ;
: init-ns-stack ( -- )
V{ H{
{ "xml" "http://www.w3.org/XML/1998/namespace" }
{ "xmlns" "http://www.w3.org/2000/xmlns" }
{ "" "" }
} } clone
ns-stack set ;
: tag-ns ( name attrs-alist -- name attrs )
dup attrs>ns push-ns
[ dup add-ns ] dip dup [ drop add-ns ] assoc-each <attrs> ;
! Parsing names
: valid-name? ( str -- ? )
[ f ] [
version=1.0? swap {
[ first name-start? ]
[ rest-slice [ name-char? ] with all? ]
} 2&&
] if-empty ;
: prefixed-name ( str -- name/f )
":" split dup length 2 = [
[ [ valid-name? ] all? ]
[ first2 f <name> ] bi and
] [ drop f ] if ;
: interpret-name ( str -- name )
dup prefixed-name [ ] [
dup valid-name?
[ <simple-name> ] [ bad-name ] if
] ?if ;
: take-name ( -- string )
version=1.0? '[ _ get-char name-char? not ] take-until ;
: parse-name ( -- name )
take-name interpret-name ;
: parse-name-starting ( string -- name )
take-name append interpret-name ;
! -- Parsing strings
: parse-named-entity ( string -- )
dup entities at [ , ] [
dup extra-entities get at
@ -211,12 +134,8 @@ SYMBOL: in-dtd?
char CHAR: < =
] parse-char ;
! Parsing tags
: start-tag ( -- name ? )
#! Outputs the name and whether this is a closing tag
get-char CHAR: / = dup [ next ] when
parse-name swap ;
: close ( -- )
pass-blank CHAR: > expect ;
: normalize-quote ( str -- str )
[ dup "\t\r\n" member? [ drop CHAR: \s ] when ] map ;
@ -235,262 +154,3 @@ SYMBOL: in-dtd?
: parse-quote ( -- seq )
f parse-quote* ;
: parse-attr ( -- )
parse-name pass-blank CHAR: = expect pass-blank
t parse-quote* 2array , ;
: (middle-tag) ( -- )
pass-blank version=1.0? get-char name-start?
[ parse-attr (middle-tag) ] when ;
: assure-no-duplicates ( attrs-alist -- attrs-alist )
H{ } clone 2dup '[ swap _ push-at ] assoc-each
[ nip length 2 >= ] assoc-filter >alist
[ first first2 duplicate-attr ] unless-empty ;
: middle-tag ( -- attrs-alist )
! f make will make a vector if it has any elements
[ (middle-tag) ] f make pass-blank
assure-no-duplicates ;
: close ( -- )
pass-blank CHAR: > expect ;
: end-tag ( name attrs-alist -- tag )
tag-ns pass-blank get-char CHAR: / =
[ pop-ns <contained> next CHAR: > expect ]
[ depth inc <opener> close ] if ;
: take-comment ( -- comment )
"--" expect-string
"--" take-string
<comment>
CHAR: > expect ;
: take-cdata ( -- string )
depth get zero? [ bad-cdata ] when
"[CDATA[" expect-string "]]>" take-string ;
: take-word ( -- string )
[ get-char blank? ] take-until ;
: take-decl-contents ( -- first second )
pass-blank take-word pass-blank ">" take-string ;
: take-element-decl ( -- element-decl )
take-decl-contents <element-decl> ;
: take-attlist-decl ( -- attlist-decl )
take-decl-contents <attlist-decl> ;
: take-notation-decl ( -- notation-decl )
take-decl-contents <notation-decl> ;
: take-until-one-of ( seps -- str sep )
'[ get-char _ member? ] take-until get-char ;
: take-system-id ( -- system-id )
parse-quote <system-id> close ;
: take-public-id ( -- public-id )
parse-quote parse-quote <public-id> close ;
DEFER: direct
: (take-internal-subset) ( -- )
pass-blank get-char {
{ CHAR: ] [ next ] }
[ drop "<!" expect-string direct , (take-internal-subset) ]
} case ;
: take-internal-subset ( -- seq )
[
H{ } pe-table set
t in-dtd? set
(take-internal-subset)
] { } make ;
: (take-external-id) ( token -- external-id )
pass-blank {
{ "SYSTEM" [ take-system-id ] }
{ "PUBLIC" [ take-public-id ] }
[ bad-external-id ]
} case ;
: take-external-id ( -- external-id )
take-word (take-external-id) ;
: only-blanks ( str -- )
[ blank? ] all? [ bad-decl ] unless ;
: nontrivial-doctype ( -- external-id internal-subset )
pass-blank get-char CHAR: [ = [
next take-internal-subset f swap close
] [
" >" take-until-one-of {
{ CHAR: \s [ (take-external-id) ] }
{ CHAR: > [ only-blanks f ] }
} case f
] if ;
: take-doctype-decl ( -- doctype-decl )
pass-blank " >" take-until-one-of {
{ CHAR: \s [ nontrivial-doctype ] }
{ CHAR: > [ f f ] }
} case <doctype-decl> ;
: take-entity-def ( var -- entity-name entity-def )
[
take-word pass-blank get-char {
{ CHAR: ' [ parse-quote ] }
{ CHAR: " [ parse-quote ] }
[ drop take-external-id ]
} case swap
] dip [ [ ?set-at ] change ] 2keep swap ;
: take-entity-decl ( -- entity-decl )
pass-blank get-char {
{ CHAR: % [ next pass-blank pe-table take-entity-def ] }
[ drop extra-entities take-entity-def ]
} case
close <entity-decl> ;
: take-directive ( -- directive )
take-name {
{ "ELEMENT" [ take-element-decl ] }
{ "ATTLIST" [ take-attlist-decl ] }
{ "DOCTYPE" [ take-doctype-decl ] }
{ "ENTITY" [ take-entity-decl ] }
{ "NOTATION" [ take-notation-decl ] }
[ bad-directive ]
} case ;
: direct ( -- object )
get-char {
{ CHAR: - [ take-comment ] }
{ CHAR: [ [ take-cdata ] }
[ drop take-directive ]
} case ;
: assure-no-extra ( seq -- )
[ first ] map {
T{ name f "" "version" f }
T{ name f "" "encoding" f }
T{ name f "" "standalone" f }
} diff
[ extra-attrs ] unless-empty ;
: good-version ( version -- version )
dup { "1.0" "1.1" } member? [ bad-version ] unless ;
: prolog-version ( alist -- version )
T{ name f "" "version" f } swap at
[ good-version ] [ versionless-prolog ] if* ;
: prolog-encoding ( alist -- encoding )
T{ name f "" "encoding" f } swap at "UTF-8" or ;
: yes/no>bool ( string -- t/f )
{
{ "yes" [ t ] }
{ "no" [ f ] }
[ not-yes/no ]
} case ;
: prolog-standalone ( alist -- version )
T{ name f "" "standalone" f } swap at
[ yes/no>bool ] [ f ] if* ;
: prolog-attrs ( alist -- prolog )
[ prolog-version ]
[ prolog-encoding ]
[ prolog-standalone ]
tri <prolog> ;
SYMBOL: string-input?
: decode-input-if ( encoding -- )
string-input? get [ drop ] [ decode-input ] if ;
: parse-prolog ( -- prolog )
pass-blank middle-tag "?>" expect-string
dup assure-no-extra prolog-attrs
dup encoding>> dup "UTF-16" =
[ drop ] [ name>encoding [ decode-input-if ] when* ] if
dup prolog-data set ;
: instruct ( -- instruction )
take-name {
{ [ dup "xml" = ] [ drop parse-prolog ] }
{ [ dup >lower "xml" = ] [ capitalized-prolog ] }
{ [ dup valid-name? not ] [ bad-name ] }
[ "?>" take-string append <instruction> ]
} cond ;
: make-tag ( -- tag )
{
{ [ get-char dup CHAR: ! = ] [ drop next direct ] }
{ [ CHAR: ? = ] [ next instruct ] }
[
start-tag [ dup add-ns pop-ns <closer> depth dec close ]
[ middle-tag end-tag ] if
]
} cond ;
! Autodetecting encodings
: continue-make-tag ( str -- tag )
parse-name-starting middle-tag end-tag ;
: start-utf16le ( -- tag )
utf16le decode-input-if
CHAR: ? expect
0 expect check instruct ;
: 10xxxxxx? ( ch -- ? )
-6 shift 3 bitand 2 = ;
: start<name ( ch -- tag )
ascii?
[ utf8 decode-input-if next make-tag ] [
next
[ get-next 10xxxxxx? not ] take-until
get-char suffix utf8 decode
utf8 decode-input-if next
continue-make-tag
] if ;
: start< ( -- tag )
get-next {
{ 0 [ next next start-utf16le ] }
{ CHAR: ? [ check next next instruct ] } ! XML prolog parsing sets the encoding
{ CHAR: ! [ check utf8 decode-input next next direct ] }
[ check start<name ]
} case ;
: skip-utf8-bom ( -- tag )
"\u0000bb\u0000bf" expect utf8 decode-input
CHAR: < expect check make-tag ;
: decode-expecting ( encoding string -- tag )
[ decode-input-if next ] [ expect-string ] bi* check make-tag ;
: start-utf16be ( -- tag )
utf16be "<" decode-expecting ;
: skip-utf16le-bom ( -- tag )
utf16le "\u0000fe<" decode-expecting ;
: skip-utf16be-bom ( -- tag )
utf16be "\u0000ff<" decode-expecting ;
: start-document ( -- tag )
get-char {
{ CHAR: < [ start< ] }
{ 0 [ start-utf16be ] }
{ HEX: EF [ skip-utf8-bom ] }
{ HEX: FF [ skip-utf16le-bom ] }
{ HEX: FE [ skip-utf16be-bom ] }
{ f [ "" ] }
[ drop utf8 decode-input-if f ]
! Same problem as with <e`>, in the case of XML chunks?
} case check ;

View File

@ -11,7 +11,6 @@ ARTICLE: "xml.writer" "Writing XML"
"These words are used to print XML normally"
{ $subsection xml>string }
{ $subsection write-xml }
{ $subsection print-xml }
"These words are used to prettyprint XML"
{ $subsection pprint-xml>string }
{ $subsection pprint-xml>string-but }
@ -38,11 +37,6 @@ HELP: write-xml
{ $description "prints the contents of an XML document to " { $link output-stream } "." }
{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;
HELP: print-xml
{ $values { "xml" "an XML document" } }
{ $description "prints the contents of an XML document to " { $link output-stream } ", followed by a newline" }
{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;
HELP: pprint-xml
{ $values { "xml" "an XML document" } }
{ $description "prints the contents of an XML document to " { $link output-stream } " in a prettyprinted form." }
@ -58,5 +52,5 @@ HELP: pprint-xml>string-but
{ $description "Prettyprints an XML document, returning the result as a string and leaving the whitespace of the tags with names in sensitive-tags intact." }
{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;
{ xml>string print-xml write-xml pprint-xml pprint-xml>string pprint-xml>string-but pprint-xml-but } related-words
{ xml>string write-xml pprint-xml pprint-xml>string pprint-xml>string-but pprint-xml-but } related-words

View File

@ -1,5 +1,62 @@
! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: xml.data xml.writer tools.test fry xml kernel multiline
xml.writer.private io.streams.string xml.utilities sequences ;
IN: xml.writer.tests
USING: xml.data xml.writer tools.test ;
\ write-xml must-infer
\ xml>string must-infer
\ pprint-xml must-infer
\ pprint-xml-but must-infer
[ "foo" ] [ T{ name { main "foo" } } name>string ] unit-test
[ "foo" ] [ T{ name { space "" } { main "foo" } } name>string ] unit-test
[ "ns:foo" ] [ T{ name { space "ns" } { main "foo" } } name>string ] unit-test
: reprints-as ( to from -- )
[ '[ _ ] ] [ '[ _ string>xml xml>string ] ] bi* unit-test ;
: pprint-reprints-as ( to from -- )
[ '[ _ ] ] [ '[ _ string>xml pprint-xml>string ] ] bi* unit-test ;
: reprints-same ( string -- ) dup reprints-as ;
"<?xml version=\"1.0\" encoding=\"UTF-8\"?><x/>" reprints-same
{" <?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE foo [<!ENTITY foo "bar">]>
<x>bar</x> "}
{" <?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE foo [<!ENTITY foo 'bar'>]>
<x>&foo;</x> "} reprints-as
{" <?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE foo [
<!ENTITY foo "bar">
<!ELEMENT br EMPTY>
<!ATTLIST list type (bullets|ordered|glossary) "ordered">
<!NOTATION foo bar>
<?baz bing bang bong?>
<!--wtf-->
]>
<x>
bar
</x>"}
{" <?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE foo [ <!ENTITY foo 'bar'> <!ELEMENT br EMPTY>
<!ATTLIST list
type (bullets|ordered|glossary) "ordered">
<!NOTATION foo bar> <?baz bing bang bong?>
<!--wtf-->
]>
<x>&foo;</x>"} pprint-reprints-as
[ t ] [ "<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd' >" dup string>xml-chunk xml-chunk>string = ] unit-test
[ "foo" ] [ "<!ENTITY bar 'foo'><x>&bar;</x>" string>xml children>string ] unit-test
[ V{ "hello" } ] [ "hello" string>xml-chunk ] unit-test
[ "<?xml version=\"1.0\" encoding=\"UTF-8\"?><a b=\"c\"/>" ]
[ "<a b='c'/>" string>xml xml>string ] unit-test
[ "<?xml version=\"1.0\" encoding=\"UTF-8\"?><foo>bar baz</foo>" ]
[ "<foo>bar</foo>" string>xml [ " baz" append ] map xml>string ] unit-test
[ "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<foo>\n bar\n</foo>" ]
[ "<foo> bar </foo>" string>xml pprint-xml>string ] unit-test

View File

@ -1,4 +1,4 @@
! Copyright (C) 2005, 2006 Daniel Ehrenberg
! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: hashtables kernel math namespaces sequences strings
assocs combinators io io.streams.string accessors
@ -11,6 +11,8 @@ SYMBOL: indentation
SYMBOL: indenter
" " indenter set-global
<PRIVATE
: sensitive? ( tag -- ? )
sensitive-tags get swap '[ _ names-match? ] contains? ;
@ -40,9 +42,13 @@ SYMBOL: indenter
: name>string ( name -- string )
[ main>> ] [ space>> ] bi [ ":" rot 3append ] unless-empty ;
PRIVATE>
: print-name ( name -- )
name>string write ;
<PRIVATE
: print-attrs ( assoc -- )
[
" " write
@ -52,11 +58,18 @@ SYMBOL: indenter
"\"" write
] assoc-each ;
PRIVATE>
GENERIC: write-xml-chunk ( object -- )
<PRIVATE
M: string write-xml-chunk
escape-string dup empty? not xml-pprint? get and
[ nl 80 indent-string indented-break ] when write ;
escape-string xml-pprint? get [
dup [ blank? ] all?
[ drop "" ]
[ nl 80 indent-string indented-break ] if
] when write ;
: write-tag ( tag -- )
?indent CHAR: < write1
@ -100,11 +113,20 @@ M: attlist-decl write-xml-chunk
[ att-defs>> write ">" write ]
bi ;
M: notation-decl write-xml-chunk
"<!NOTATION " write
[ name>> write " " write ]
[ id>> write ">" write ]
bi ;
M: entity-decl write-xml-chunk
"<!ENTITY " write
[ name>> write " " write ]
[ def>> write-xml-chunk ">" write ]
bi ;
[ pe?>> [ " % " write ] when ]
[ name>> write " \"" write ] [
def>> f xml-pprint?
[ write-xml-chunk ] with-variable
"\">" write
] tri ;
M: system-id write-xml-chunk
"SYSTEM '" write system-literal>> write "'" write ;
@ -114,17 +136,21 @@ M: public-id write-xml-chunk
[ pubid-literal>> write "' '" write ]
[ system-literal>> write "'" write ] bi ;
: write-internal-subset ( seq -- )
[
"[" write indent
[ ?indent write-xml-chunk ] each
unindent ?indent "]" write
] when* ;
M: doctype-decl write-xml-chunk
"<!DOCTYPE " write
?indent "<!DOCTYPE " write
[ name>> write " " write ]
[ external-id>> [ write-xml-chunk " " write ] when* ]
[
internal-subset>>
[ "[" write [ write-xml-chunk ] each "]" write ] when* ">" write
] tri ;
[ internal-subset>> write-internal-subset ">" write ] tri ;
M: directive write-xml-chunk
"<!" write text>> write CHAR: > write1 ;
"<!" write text>> write CHAR: > write1 nl ;
M: instruction write-xml-chunk
"<?" write text>> write "?>" write ;
@ -138,6 +164,8 @@ M: sequence write-xml-chunk
standalone>> [ "\" standalone=\"yes" write ] when
"\"?>" write ;
PRIVATE>
: write-xml ( xml -- )
{
[ prolog>> write-prolog ]
@ -149,31 +177,25 @@ M: sequence write-xml-chunk
M: xml write-xml-chunk
body>> write-xml-chunk ;
: print-xml ( xml -- )
write-xml nl ;
: xml>string ( xml -- string )
[ write-xml ] with-string-writer ;
: xml-chunk>string ( object -- string )
[ write-xml-chunk ] with-string-writer ;
: with-xml-pprint ( sensitive-tags quot -- )
: pprint-xml-but ( xml sensitive-tags -- )
[
swap [ assure-name ] map sensitive-tags set
[ assure-name ] map sensitive-tags set
0 indentation set
xml-pprint? on
call
] with-scope ; inline
: pprint-xml-but ( xml sensitive-tags -- )
[ print-xml ] with-xml-pprint ;
write-xml
] with-scope ;
: pprint-xml ( xml -- )
f pprint-xml-but ;
: pprint-xml>string-but ( xml sensitive-tags -- string )
[ xml>string ] with-xml-pprint ;
[ pprint-xml-but ] with-string-writer ;
: pprint-xml>string ( xml -- string )
f pprint-xml>string-but ;

View File

@ -1,9 +1,9 @@
! Copyright (C) 2005, 2006 Daniel Ehrenberg
! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays io io.encodings.binary io.files
io.streams.string kernel namespaces sequences strings
xml.backend xml.data xml.errors xml.tokenize ascii xml.entities
xml.writer xml.state assocs ;
xml.backend xml.data xml.errors xml.elements ascii xml.entities
xml.writer xml.state xml.autoencoding assocs xml.tokenize xml.name ;
IN: xml
! -- Overall parser with data tree
@ -132,7 +132,7 @@ TUPLE: pull-xml scope ;
reset-prolog init-ns-stack
start-document [ call-under ] when*
sax-loop
] state-parse ; inline recursive
] with-state ; inline recursive
: (read-xml) ( -- )
start-document [ process ] when*
@ -144,7 +144,7 @@ TUPLE: pull-xml scope ;
done? [ unclosed ] unless
xml-stack get first second
prolog-data get swap
] state-parse ;
] with-state ;
: read-xml ( stream -- xml )
0 depth

View File

@ -2,7 +2,7 @@
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays html.parser.utils hashtables io kernel
namespaces make prettyprint quotations sequences splitting
state-parser strings unicode.categories unicode.case ;
html.parser.state strings unicode.categories unicode.case ;
IN: html.parser
TUPLE: tag name attributes text closing? ;
@ -59,8 +59,8 @@ SYMBOL: tagstack
[ get-char CHAR: " = ] take-until ;
: read-quote ( -- string )
get-char next* CHAR: ' =
[ read-single-quote ] [ read-double-quote ] if next* ;
get-char next CHAR: ' =
[ read-single-quote ] [ read-double-quote ] if next ;
: read-key ( -- string )
read-whitespace*
@ -68,7 +68,7 @@ SYMBOL: tagstack
: read-= ( -- )
read-whitespace*
[ get-char CHAR: = = ] take-until drop next* ;
[ get-char CHAR: = = ] take-until drop next ;
: read-value ( -- string )
read-whitespace*
@ -76,14 +76,14 @@ SYMBOL: tagstack
[ blank? ] trim ;
: read-comment ( -- )
"-->" take-string* make-comment-tag push-tag ;
"-->" take-string make-comment-tag push-tag ;
: read-dtd ( -- )
">" take-string* make-dtd-tag push-tag ;
">" take-string make-dtd-tag push-tag ;
: read-bang ( -- )
next* get-char CHAR: - = get-next CHAR: - = and [
next* next*
next get-char CHAR: - = get-next CHAR: - = and [
next next
read-comment
] [
read-dtd
@ -91,10 +91,10 @@ SYMBOL: tagstack
: read-tag ( -- string )
[ get-char CHAR: > = get-char CHAR: < = or ] take-until
get-char CHAR: < = [ next* ] unless ;
get-char CHAR: < = [ next ] unless ;
: read-< ( -- string )
next* get-char CHAR: ! = [
next get-char CHAR: ! = [
read-bang f
] [
read-tag

View File

@ -0,0 +1,13 @@
USING: tools.test html.parser.state ascii kernel ;
IN: html.parser.state.tests
: take-rest ( -- string )
[ f ] take-until ;
: take-char ( -- string )
[ get-char = ] curry take-until ;
[ "hello" ] [ "hello" [ take-rest ] string-parse ] unit-test
[ "hi" " how are you?" ] [ "hi how are you?" [ [ get-char blank? ] take-until take-rest ] string-parse ] unit-test
[ "foo" ";bar" ] [ "foo;bar" [ CHAR: ; take-char take-rest ] string-parse ] unit-test
! [ "foo " " bar" ] [ "foo and bar" [ "and" take-string take-rest ] string-parse ] unit-test

View File

@ -0,0 +1,41 @@
! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: namespaces math kernel sequences accessors fry circular ;
IN: html.parser.state
TUPLE: state string i ;
: get-i ( -- i ) state get i>> ;
: get-char ( -- char )
state get [ i>> ] [ string>> ] bi ?nth ;
: get-next ( -- char )
state get [ i>> 1+ ] [ string>> ] bi ?nth ;
: next ( -- )
state get [ 1+ ] change-i drop ;
: string-parse ( string quot -- )
[ 0 state boa state ] dip with-variable ;
: short* ( n seq -- n' seq )
over [ nip dup length swap ] unless ;
: skip-until ( quot: ( -- ? ) -- )
get-char [
[ call ] keep swap
[ drop ] [ next skip-until ] if
] [ drop ] if ; inline recursive
: take-until ( quot: ( -- ? ) -- )
[ get-i ] dip skip-until get-i
state get string>> subseq ;
: string-matches? ( string circular -- ? )
get-char over push-circular sequence= ;
: take-string ( match -- string )
dup length <circular-string>
[ 2dup string-matches? ] take-until nip
dup length rot length 1- - head next ;

View File

@ -1,7 +1,7 @@
USING: assocs combinators continuations hashtables
hashtables.private io kernel math
namespaces prettyprint quotations sequences splitting
state-parser strings tools.test ;
strings tools.test ;
USING: html.parser.utils ;
IN: html.parser.utils.tests

View File

@ -2,17 +2,12 @@
! See http://factorcode.org/license.txt for BSD license.
USING: assocs circular combinators continuations hashtables
hashtables.private io kernel math namespaces prettyprint
quotations sequences splitting state-parser strings
quotations sequences splitting html.parser.state strings
combinators.short-circuit ;
IN: html.parser.utils
: string-parse-end? ( -- ? ) get-next not ;
: take-string* ( match -- string )
dup length <circular-string>
[ 2dup string-matches? ] take-until nip
dup length rot length 1- - head next* ;
: trim1 ( seq ch -- newseq )
[ [ ?head-slice drop ] [ ?tail-slice drop ] bi ] 2keep drop like ;