diff --git a/basis/syndication/syndication.factor b/basis/syndication/syndication.factor index a6eaff4492..c82fe4006d 100644 --- a/basis/syndication/syndication.factor +++ b/basis/syndication/syndication.factor @@ -3,7 +3,7 @@ ! See http://factorcode.org/license.txt for BSD license. USING: xml.utilities kernel assocs xml.generator math.order strings sequences xml.data xml.writer - io.streams.string combinators xml xml.entities io.files io + io.streams.string combinators xml xml.entities.html io.files io http.client namespaces make xml.generator hashtables calendar.format accessors continuations urls present ; IN: syndication diff --git a/basis/xml/entities/entities.factor b/basis/xml/entities/entities.factor index 03de0f78d1..a3812c7723 100644 --- a/basis/xml/entities/entities.factor +++ b/basis/xml/entities/entities.factor @@ -1,6 +1,7 @@ ! Copyright (C) 2005, 2006 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. -USING: namespaces make kernel assocs sequences fry ; +USING: namespaces make kernel assocs sequences fry values +io.files io.encodings.binary ; IN: xml.entities : entities-out @@ -36,265 +37,7 @@ IN: xml.entities { "quot" CHAR: " } } ; -: html-entities - #! generated from: - #! http://www.w3.org/TR/REC-html40/sgml/entities.html - H{ - { "nbsp" 160 } - { "iexcl" 161 } - { "cent" 162 } - { "pound" 163 } - { "curren" 164 } - { "yen" 165 } - { "brvbar" 166 } - { "sect" 167 } - { "uml" 168 } - { "copy" 169 } - { "ordf" 170 } - { "laquo" 171 } - { "not" 172 } - { "shy" 173 } - { "reg" 174 } - { "macr" 175 } - { "deg" 176 } - { "plusmn" 177 } - { "sup2" 178 } - { "sup3" 179 } - { "acute" 180 } - { "micro" 181 } - { "para" 182 } - { "middot" 183 } - { "cedil" 184 } - { "sup1" 185 } - { "ordm" 186 } - { "raquo" 187 } - { "frac14" 188 } - { "frac12" 189 } - { "frac34" 190 } - { "iquest" 191 } - { "Agrave" 192 } - { "Aacute" 193 } - { "Acirc" 194 } - { "Atilde" 195 } - { "Auml" 196 } - { "Aring" 197 } - { "AElig" 198 } - { "Ccedil" 199 } - { "Egrave" 200 } - { "Eacute" 201 } - { "Ecirc" 202 } - { "Euml" 203 } - { "Igrave" 204 } - { "Iacute" 205 } - { "Icirc" 206 } - { "Iuml" 207 } - { "ETH" 208 } - { "Ntilde" 209 } - { "Ograve" 210 } - { "Oacute" 211 } - { "Ocirc" 212 } - { "Otilde" 213 } - { "Ouml" 214 } - { "times" 215 } - { "Oslash" 216 } - { "Ugrave" 217 } - { "Uacute" 218 } - { "Ucirc" 219 } - { "Uuml" 220 } - { "Yacute" 221 } - { "THORN" 222 } - { "szlig" 223 } - { "agrave" 224 } - { "aacute" 225 } - { "acirc" 226 } - { "atilde" 227 } - { "auml" 228 } - { "aring" 229 } - { "aelig" 230 } - { "ccedil" 231 } - { "egrave" 232 } - { "eacute" 233 } - { "ecirc" 234 } - { "euml" 235 } - { "igrave" 236 } - { "iacute" 237 } - { "icirc" 238 } - { "iuml" 239 } - { "eth" 240 } - { "ntilde" 241 } - { "ograve" 242 } - { "oacute" 243 } - { "ocirc" 244 } - { "otilde" 245 } - { "ouml" 246 } - { "divide" 247 } - { "oslash" 248 } - { "ugrave" 249 } - { "uacute" 250 } - { "ucirc" 251 } - { "uuml" 252 } - { "yacute" 253 } - { "thorn" 254 } - { "yuml" 255 } - { "fnof" 402 } - { "Alpha" 913 } - { "Beta" 914 } - { "Gamma" 915 } - { "Delta" 916 } - { "Epsilon" 917 } - { "Zeta" 918 } - { "Eta" 919 } - { "Theta" 920 } - { "Iota" 921 } - { "Kappa" 922 } - { "Lambda" 923 } - { "Mu" 924 } - { "Nu" 925 } - { "Xi" 926 } - { "Omicron" 927 } - { "Pi" 928 } - { "Rho" 929 } - { "Sigma" 931 } - { "Tau" 932 } - { "Upsilon" 933 } - { "Phi" 934 } - { "Chi" 935 } - { "Psi" 936 } - { "Omega" 937 } - { "alpha" 945 } - { "beta" 946 } - { "gamma" 947 } - { "delta" 948 } - { "epsilon" 949 } - { "zeta" 950 } - { "eta" 951 } - { "theta" 952 } - { "iota" 953 } - { "kappa" 954 } - { "lambda" 955 } - { "mu" 956 } - { "nu" 957 } - { "xi" 958 } - { "omicron" 959 } - { "pi" 960 } - { "rho" 961 } - { "sigmaf" 962 } - { "sigma" 963 } - { "tau" 964 } - { "upsilon" 965 } - { "phi" 966 } - { "chi" 967 } - { "psi" 968 } - { "omega" 969 } - { "thetasym" 977 } - { "upsih" 978 } - { "piv" 982 } - { "bull" 8226 } - { "hellip" 8230 } - { "prime" 8242 } - { "Prime" 8243 } - { "oline" 8254 } - { "frasl" 8260 } - { "weierp" 8472 } - { "image" 8465 } - { "real" 8476 } - { "trade" 8482 } - { "alefsym" 8501 } - { "larr" 8592 } - { "uarr" 8593 } - { "rarr" 8594 } - { "darr" 8595 } - { "harr" 8596 } - { "crarr" 8629 } - { "lArr" 8656 } - { "uArr" 8657 } - { "rArr" 8658 } - { "dArr" 8659 } - { "hArr" 8660 } - { "forall" 8704 } - { "part" 8706 } - { "exist" 8707 } - { "empty" 8709 } - { "nabla" 8711 } - { "isin" 8712 } - { "notin" 8713 } - { "ni" 8715 } - { "prod" 8719 } - { "sum" 8721 } - { "minus" 8722 } - { "lowast" 8727 } - { "radic" 8730 } - { "prop" 8733 } - { "infin" 8734 } - { "ang" 8736 } - { "and" 8743 } - { "or" 8744 } - { "cap" 8745 } - { "cup" 8746 } - { "int" 8747 } - { "there4" 8756 } - { "sim" 8764 } - { "cong" 8773 } - { "asymp" 8776 } - { "ne" 8800 } - { "equiv" 8801 } - { "le" 8804 } - { "ge" 8805 } - { "sub" 8834 } - { "sup" 8835 } - { "nsub" 8836 } - { "sube" 8838 } - { "supe" 8839 } - { "oplus" 8853 } - { "otimes" 8855 } - { "perp" 8869 } - { "sdot" 8901 } - { "lceil" 8968 } - { "rceil" 8969 } - { "lfloor" 8970 } - { "rfloor" 8971 } - { "lang" 9001 } - { "rang" 9002 } - { "loz" 9674 } - { "spades" 9824 } - { "clubs" 9827 } - { "hearts" 9829 } - { "diams" 9830 } - { "OElig" 338 } - { "oelig" 339 } - { "Scaron" 352 } - { "scaron" 353 } - { "Yuml" 376 } - { "circ" 710 } - { "tilde" 732 } - { "ensp" 8194 } - { "emsp" 8195 } - { "thinsp" 8201 } - { "zwnj" 8204 } - { "zwj" 8205 } - { "lrm" 8206 } - { "rlm" 8207 } - { "ndash" 8211 } - { "mdash" 8212 } - { "lsquo" 8216 } - { "rsquo" 8217 } - { "sbquo" 8218 } - { "ldquo" 8220 } - { "rdquo" 8221 } - { "bdquo" 8222 } - { "dagger" 8224 } - { "Dagger" 8225 } - { "permil" 8240 } - { "lsaquo" 8249 } - { "rsaquo" 8250 } - { "euro" 8364 } - } ; - SYMBOL: extra-entities -f extra-entities set-global : with-entities ( entities quot -- ) [ swap extra-entities set call ] with-scope ; inline - -: with-html-entities ( quot -- ) - html-entities swap with-entities ; inline diff --git a/basis/xml/entities/html/authors.txt b/basis/xml/entities/html/authors.txt new file mode 100644 index 0000000000..29e79639ae --- /dev/null +++ b/basis/xml/entities/html/authors.txt @@ -0,0 +1 @@ +Daniel Ehrenberg \ No newline at end of file diff --git a/basis/xml/entities/html/html-tests.factor b/basis/xml/entities/html/html-tests.factor new file mode 100644 index 0000000000..68b10bebe7 --- /dev/null +++ b/basis/xml/entities/html/html-tests.factor @@ -0,0 +1,4 @@ +! Copyright (C) 2009 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: tools.test xml.entities.html ; +IN: xml.entities.html.tests diff --git a/basis/xml/entities/html/html.factor b/basis/xml/entities/html/html.factor new file mode 100644 index 0000000000..6f2732f1d9 --- /dev/null +++ b/basis/xml/entities/html/html.factor @@ -0,0 +1,22 @@ +! Copyright (C) 2009 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: assocs io.encodings.binary io.files kernel namespaces sequences +values xml xml.entities ; +IN: xml.entities.html + +VALUE: html-entities + +: read-entities-file ( file -- table ) + f swap binary + [ 2drop extra-entities get ] sax ; + +: get-html ( -- table ) + { "lat1" "special" "symbol" } [ + "resource:basis/xml/entities/html/xhtml-" + swap ".ent" 3append read-entities-file + ] map first3 assoc-union assoc-union ; + +get-html to: html-entities + +: with-html-entities ( quot -- ) + html-entities swap with-entities ; inline diff --git a/basis/xml/entities/html/xhtml-lat1.ent b/basis/xml/entities/html/xhtml-lat1.ent new file mode 100644 index 0000000000..ffee223eb1 --- /dev/null +++ b/basis/xml/entities/html/xhtml-lat1.ent @@ -0,0 +1,196 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/basis/xml/entities/html/xhtml-special.ent b/basis/xml/entities/html/xhtml-special.ent new file mode 100644 index 0000000000..ca358b2fec --- /dev/null +++ b/basis/xml/entities/html/xhtml-special.ent @@ -0,0 +1,80 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/basis/xml/entities/html/xhtml-symbol.ent b/basis/xml/entities/html/xhtml-symbol.ent new file mode 100644 index 0000000000..63c2abfa6f --- /dev/null +++ b/basis/xml/entities/html/xhtml-symbol.ent @@ -0,0 +1,237 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/basis/xml/tests/test.factor b/basis/xml/tests/test.factor index a565df6a9d..7a826756b6 100644 --- a/basis/xml/tests/test.factor +++ b/basis/xml/tests/test.factor @@ -2,7 +2,7 @@ ! See http://factorcode.org/license.txt for BSD license. IN: xml.tests USING: kernel xml tools.test io namespaces make sequences -xml.errors xml.entities parser strings xml.data io.files +xml.errors xml.entities.html parser strings xml.data io.files xml.writer xml.utilities state-parser continuations assocs sequences.deep accessors io.streams.string ; @@ -62,3 +62,6 @@ SYMBOL: xml-file [ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk first ] unit-test [ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk first ] unit-test [ t ] [ "" dup string>xml-chunk [ write-xml-chunk ] with-string-writer = ] unit-test +[ "foo" ] [ "&bar;" string>xml children>string ] unit-test +[ V{ "hello" } ] [ "hello" string>xml-chunk ] unit-test +[ 958 ] [ [ "ξ" string>xml-chunk ] with-html-entities first first ] unit-test \ No newline at end of file diff --git a/basis/xml/tokenize/tokenize.factor b/basis/xml/tokenize/tokenize.factor index 70b7150ec1..a2ae9c4d58 100644 --- a/basis/xml/tokenize/tokenize.factor +++ b/basis/xml/tokenize/tokenize.factor @@ -4,7 +4,7 @@ USING: accessors arrays ascii assocs combinators combinators.short-circuit fry io.encodings io.encodings.iana io.encodings.string io.encodings.utf16 io.encodings.utf8 kernel make math math.parser namespaces sequences sets splitting state-parser -strings xml.char-classes xml.data xml.entities xml.errors ; +strings xml.char-classes xml.data xml.entities xml.errors hashtables ; IN: xml.tokenize ! XML namespace processing: ns = namespace @@ -74,20 +74,17 @@ SYMBOL: ns-stack ! -- Parsing strings -: (parse-entity) ( string -- ) +: parse-named-entity ( string -- ) dup entities at [ , ] [ - prolog-data get standalone>> - [ no-entity ] [ - dup extra-entities get at - [ , ] [ no-entity ] ?if - ] if + dup extra-entities get at + [ dup number? [ , ] [ % ] if ] [ no-entity ] ?if ! Make less hackish ] ?if ; : parse-entity ( -- ) next CHAR: ; take-char next "#" ?head [ "x" ?head 16 10 ? base> , - ] [ (parse-entity) ] if ; + ] [ parse-named-entity ] if ; : (parse-char) ( ch -- ) get-char { @@ -100,10 +97,6 @@ SYMBOL: ns-stack : parse-char ( ch -- string ) [ (parse-char) ] "" make ; -: parse-quot ( ch -- string ) - parse-char get-char - [ unclosed-quote ] unless ; - : parse-text ( -- string ) CHAR: < parse-char ; @@ -114,14 +107,18 @@ SYMBOL: ns-stack get-char CHAR: / = dup [ next ] when parse-name swap ; -: parse-attr-value ( -- seq ) - get-char dup "'\"" member? - [ next parse-quot ] [ quoteless-attr ] if ; +: (parse-quote) ( ch -- string ) + parse-char get-char + [ unclosed-quote ] unless ; + +: parse-quote ( -- seq ) + pass-blank get-char dup "'\"" member? + [ next (parse-quote) ] [ quoteless-attr ] if ; : parse-attr ( -- ) - [ parse-name ] with-scope - pass-blank CHAR: = expect pass-blank - [ parse-attr-value ] with-scope + parse-name + pass-blank CHAR: = expect + parse-quote 2array , ; : (middle-tag) ( -- ) @@ -157,7 +154,7 @@ SYMBOL: ns-stack : only-blanks ( str -- ) [ blank? ] all? [ bad-doctype-decl ] unless ; -: take-system-literal ( -- str ) +: take-system-literal ( -- str ) ! replace with parse-quote? pass-blank get-char next { { CHAR: ' [ "'" take-string ] } { CHAR: " [ "\"" take-string ] } @@ -211,15 +208,18 @@ DEFER: direct : take-entity-def ( -- entity-name entity-def ) " " take-string pass-blank get-char { - { CHAR: ' [ take-system-literal ] } - { CHAR: " [ take-system-literal ] } + { CHAR: ' [ parse-quote ] } + { CHAR: " [ parse-quote ] } [ drop take-external-id ] } case ; +: associate-entity ( entity-name entity-def -- ) + swap extra-entities [ ?set-at ] change ; + : take-entity-decl ( -- entity-decl ) pass-blank get-char { { CHAR: % [ next pass-blank take-entity-def ] } - [ drop take-entity-def ] + [ drop take-entity-def 2dup associate-entity ] } case ">" take-string only-blanks ; @@ -257,14 +257,22 @@ DEFER: direct : good-version ( version -- version ) dup { "1.0" "1.1" } member? [ bad-version ] unless ; -: prolog-attrs ( alist -- prolog ) - [ T{ name f "" "version" f } swap at - [ good-version ] [ versionless-prolog ] if* ] keep - [ T{ name f "" "encoding" f } swap at - "UTF-8" or ] keep +: prolog-version ( alist -- version ) + T{ name f "" "version" f } swap at + [ good-version ] [ versionless-prolog ] if* ; + +: prolog-encoding ( alist -- encoding ) + T{ name f "" "encoding" f } swap at "UTF-8" or ; + +: prolog-standalone ( alist -- version ) T{ name f "" "standalone" f } swap at - [ yes/no>bool ] [ f ] if* - ; + [ yes/no>bool ] [ f ] if* ; + +: prolog-attrs ( alist -- prolog ) + [ prolog-version ] + [ prolog-encoding ] + [ prolog-standalone ] + tri ; SYMBOL: string-input? : decode-input-if ( encoding -- ) @@ -288,7 +296,7 @@ SYMBOL: string-input? : make-tag ( -- tag ) { { [ get-char dup CHAR: ! = ] [ drop next direct ] } - { [ CHAR: ? = ] [ next instruct ] } + { [ CHAR: ? = ] [ next instruct ] } [ start-tag [ dup add-ns pop-ns ] [ middle-tag end-tag ] if @@ -331,19 +339,17 @@ SYMBOL: string-input? "\u0000bb\u0000bf" expect utf8 decode-input CHAR: < expect make-tag ; +: decode-expecting ( encoding string -- tag ) + [ decode-input-if next ] [ expect-string ] bi* make-tag ; + : start-utf16be ( -- tag ) - utf16be decode-input-if - next CHAR: < expect make-tag ; + utf16be "<" decode-expecting ; : skip-utf16le-bom ( -- tag ) - utf16le decode-input-if - next HEX: FE expect - CHAR: < expect make-tag ; + utf16le "\u0000fe<" decode-expecting ; : skip-utf16be-bom ( -- tag ) - utf16be decode-input-if - next HEX: FF expect - CHAR: < expect make-tag ; + utf16be "\u0000ff<" decode-expecting ; : start-document ( -- tag ) get-char { @@ -353,8 +359,6 @@ SYMBOL: string-input? { HEX: FF [ skip-utf16le-bom ] } { HEX: FE [ skip-utf16be-bom ] } { f [ "" ] } - [ dup blank? - [ drop pass-blank utf8 decode-input-if CHAR: < expect make-tag ] - [ 1string ] if ! Replace with proper error? - ] + [ drop utf8 decode-input-if f ] + ! Same problem as with , in the case of XML chunks? } case ; diff --git a/basis/xml/xml-docs.factor b/basis/xml/xml-docs.factor index 18bd4d7328..e87c32d375 100644 --- a/basis/xml/xml-docs.factor +++ b/basis/xml/xml-docs.factor @@ -2,7 +2,7 @@ ! See http://factorcode.org/license.txt for BSD license. USING: help.markup help.syntax kernel xml.data xml.errors xml.writer state-parser xml.tokenize xml.utilities xml.entities -strings sequences io ; +strings sequences io xml.entities.html ; IN: xml HELP: string>xml diff --git a/basis/xml/xml.factor b/basis/xml/xml.factor index e29bb82eaf..328a058a58 100644 --- a/basis/xml/xml.factor +++ b/basis/xml/xml.factor @@ -133,12 +133,12 @@ TUPLE: pull-xml scope ; : sax ( stream quot: ( xml-elem -- ) -- ) swap [ reset-prolog init-ns-stack - start-document call-under + start-document [ call-under ] when* sax-loop ] state-parse ; inline recursive : (read-xml) ( -- ) - start-document process + start-document [ process ] when* [ process ] sax-loop ; inline : (read-xml-chunk) ( stream -- prolog seq )