diff --git a/basis/io/streams/duplex/duplex-docs.factor b/basis/io/streams/duplex/duplex-docs.factor index 48afafeec7..5bf33e9002 100644 --- a/basis/io/streams/duplex/duplex-docs.factor +++ b/basis/io/streams/duplex/duplex-docs.factor @@ -20,11 +20,11 @@ HELP: <duplex-stream> HELP: with-stream { $values { "stream" duplex-stream } { "quot" quotation } } -{ $description "Calls the quotation in a new dynamic scope, with both " { $link input-stream } " and " { $link output-stream } " rebound to " { $snippet "stream" } ". The stream is closed if the quotation returns or throws an error." } ; +{ $description "Calls the quotation in a new dynamic scope, with both " { $link input-stream } " and " { $link output-stream } " rebound to " { $snippet "stream" } ", which must be a duplex stream. The stream is closed if the quotation returns or throws an error." } ; HELP: with-stream* { $values { "stream" duplex-stream } { "quot" quotation } } -{ $description "Calls the quotation in a new dynamic scope, with both " { $link input-stream } " and " { $link output-stream } " rebound to " { $snippet "stream" } "." } +{ $description "Calls the quotation in a new dynamic scope, with both " { $link input-stream } " and " { $link output-stream } " rebound to " { $snippet "stream" } ", which must be a duplex stream." } { $notes "This word does not close the stream. Compare with " { $link with-stream } "." } ; HELP: <encoder-duplex> diff --git a/basis/pack/pack-tests.factor b/basis/pack/pack-tests.factor index b1a354cd4e..1be37292a0 100755 --- a/basis/pack/pack-tests.factor +++ b/basis/pack/pack-tests.factor @@ -46,3 +46,10 @@ pack strings tools.test ; [ f ] [ "" [ read-c-string ] with-string-reader ] unit-test [ 5 ] [ "FRAM\0\u000005\0\0\0\0\0\0\0" [ read-c-string drop read-u64 ] with-string-reader ] unit-test +[ 9 ] [ "iic" packed-length ] unit-test +[ "iii" read-packed-le ] must-infer +[ "iii" read-packed-be ] must-infer +[ "iii" read-packed-native ] must-infer +[ "iii" unpack-le ] must-infer +[ "iii" unpack-be ] must-infer +[ "iii" unpack-native ] must-infer diff --git a/basis/pack/pack.factor b/basis/pack/pack.factor index 0e5cb7dbbc..136deb9ff5 100755 --- a/basis/pack/pack.factor +++ b/basis/pack/pack.factor @@ -1,7 +1,9 @@ +! Copyright (C) 2009 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. USING: alien alien.c-types arrays assocs byte-arrays io io.binary io.streams.string kernel math math.parser namespaces make parser prettyprint quotations sequences strings vectors -words macros math.functions math.bitwise ; +words macros math.functions math.bitwise fry ; IN: pack SYMBOL: big-endian @@ -9,6 +11,13 @@ SYMBOL: big-endian : big-endian? ( -- ? ) 1 <int> *char zero? ; +<PRIVATE + +: set-big-endian ( -- ) + big-endian? big-endian set ; inline + +PRIVATE> + : >endian ( obj n -- str ) big-endian get [ >be ] [ >le ] if ; inline @@ -39,6 +48,8 @@ M: string b, ( n string -- ) heap-size b, ; : double, ( n -- ) double>bits 8 b, ; : c-string, ( str -- ) % 0 u8, ; +<PRIVATE + : (>128-ber) ( n -- ) dup 0 > [ [ HEX: 7f bitand HEX: 80 bitor , ] keep -7 shift @@ -47,6 +58,8 @@ M: string b, ( n string -- ) heap-size b, ; drop ] if ; +PRIVATE> + : >128-ber ( n -- str ) [ [ HEX: 7f bitand , ] keep -7 shift @@ -70,7 +83,7 @@ M: string b, ( n string -- ) heap-size b, ; : read-s32 ( -- n ) 4 read-signed ; : read-u32 ( -- n ) 4 read-unsigned ; : read-s64 ( -- n ) 8 read-signed ; -: read-u64 ( -- n ) 8 read-signed ; +: read-u64 ( -- n ) 8 read-unsigned ; : read-s128 ( -- n ) 16 read-signed ; : read-u128 ( -- n ) 16 read-unsigned ; @@ -81,7 +94,7 @@ M: string b, ( n string -- ) heap-size b, ; 8 read endian> bits>double ; : read-c-string ( -- str/f ) - "\0" read-until [ drop f ] unless ; + "\0" read-until swap and ; : read-c-string* ( n -- str/f ) read [ zero? ] trim-right [ f ] when-empty ; @@ -94,7 +107,9 @@ M: string b, ( n string -- ) heap-size b, ; : read-128-ber ( -- n ) 0 (read-128-ber) ; -: pack-table ( -- hash ) +<PRIVATE + +CONSTANT: pack-table H{ { CHAR: c s8, } { CHAR: C u8, } @@ -110,9 +125,9 @@ M: string b, ( n string -- ) heap-size b, ; { CHAR: F float, } { CHAR: d double, } { CHAR: D double, } - } ; + } -: unpack-table ( -- hash ) +CONSTANT: unpack-table H{ { CHAR: c read-s8 } { CHAR: C read-u8 } @@ -128,47 +143,79 @@ M: string b, ( n string -- ) heap-size b, ; { CHAR: F read-float } { CHAR: d read-double } { CHAR: D read-double } - } ; + } -MACRO: (pack) ( seq str -- quot ) - [ - [ - [ - swap , pack-table at , - ] 2each - ] [ ] make 1quotation % - [ B{ } make ] % - ] [ ] make ; +CONSTANT: packed-length-table + H{ + { CHAR: c 1 } + { CHAR: C 1 } + { CHAR: s 2 } + { CHAR: S 2 } + { CHAR: t 3 } + { CHAR: T 3 } + { CHAR: i 4 } + { CHAR: I 4 } + { CHAR: q 8 } + { CHAR: Q 8 } + { CHAR: f 4 } + { CHAR: F 4 } + { CHAR: d 8 } + { CHAR: D 8 } + } +MACRO: pack ( seq str -- quot ) + [ pack-table at 1quotation '[ _ @ ] ] [ ] 2map-as concat + '[ _ B{ } make ] ; + +PRIVATE> + : pack-native ( seq str -- seq ) - [ - big-endian? big-endian set (pack) - ] with-scope ; + [ set-big-endian pack ] with-scope ; inline : pack-be ( seq str -- seq ) - [ big-endian on (pack) ] with-scope ; + [ big-endian on pack ] with-scope ; inline : pack-le ( seq str -- seq ) - [ big-endian off (pack) ] with-scope ; + [ big-endian off pack ] with-scope ; inline +<PRIVATE -MACRO: (unpack) ( str -- quot ) - [ - [ - [ unpack-table at , \ , , ] each - ] [ ] make - 1quotation [ { } make ] append - 1quotation % - \ with-string-reader , - ] [ ] make ; +MACRO: unpack ( str -- quot ) + [ unpack-table at 1quotation '[ @ , ] ] { } map-as concat + '[ [ _ { } make ] with-string-reader ] ; + +PRIVATE> : unpack-native ( seq str -- seq ) - [ - big-endian? big-endian set (unpack) - ] with-scope ; + [ set-big-endian unpack ] with-scope ; inline : unpack-be ( seq str -- seq ) - [ big-endian on (unpack) ] with-scope ; + [ big-endian on unpack ] with-scope ; inline : unpack-le ( seq str -- seq ) - [ big-endian off (unpack) ] with-scope ; + [ big-endian off unpack ] with-scope ; inline + +: packed-length ( str -- n ) + [ packed-length-table at ] sigma ; + +ERROR: packed-read-fail str bytes ; + +<PRIVATE + +: read-packed-bytes ( str -- bytes ) + dup packed-length [ read dup length ] keep = + [ nip ] [ packed-read-fail ] if ; inline + +PRIVATE> + +: read-packed ( str quot -- seq ) + [ read-packed-bytes ] swap bi ; inline + +: read-packed-le ( str -- seq ) + [ unpack-le ] read-packed ; inline + +: read-packed-be ( str -- seq ) + [ unpack-be ] read-packed ; inline + +: read-packed-native ( str -- seq ) + [ unpack-native ] read-packed ; inline diff --git a/basis/roman/roman.factor b/basis/roman/roman.factor index 866ac92872..c9394b07ed 100644 --- a/basis/roman/roman.factor +++ b/basis/roman/roman.factor @@ -39,16 +39,14 @@ ERROR: roman-range-error n ; PRIVATE> : >roman ( n -- str ) - dup roman-range-check [ - (>roman) - ] "" make ; + dup roman-range-check + [ (>roman) ] "" make ; : >ROMAN ( n -- str ) >roman >upper ; : roman> ( str -- n ) - >lower [ roman<= ] monotonic-split [ - (roman>) - ] map sum ; + >lower [ roman<= ] monotonic-split + [ (roman>) ] sigma ; <PRIVATE diff --git a/basis/syndication/syndication.factor b/basis/syndication/syndication.factor index a6eaff4492..c82fe4006d 100644 --- a/basis/syndication/syndication.factor +++ b/basis/syndication/syndication.factor @@ -3,7 +3,7 @@ ! See http://factorcode.org/license.txt for BSD license. USING: xml.utilities kernel assocs xml.generator math.order strings sequences xml.data xml.writer - io.streams.string combinators xml xml.entities io.files io + io.streams.string combinators xml xml.entities.html io.files io http.client namespaces make xml.generator hashtables calendar.format accessors continuations urls present ; IN: syndication diff --git a/basis/unicode/breaks/breaks.factor b/basis/unicode/breaks/breaks.factor index 0524825d43..336d99657e 100644 --- a/basis/unicode/breaks/breaks.factor +++ b/basis/unicode/breaks/breaks.factor @@ -61,7 +61,7 @@ SYMBOL: table : eval-seq ( seq -- seq ) [ dup word? [ execute ] when ] map ; : (set-table) ( class1 class2 val -- ) - -rot table get nth [ swap or ] change-nth ; + [ table get nth ] dip '[ _ or ] change-nth ; : set-table ( classes1 classes2 val -- ) [ [ eval-seq ] bi@ ] dip @@ -199,8 +199,8 @@ to: word-table : walk-down ( str i -- j ) dupd (walk-down) [ 1- (walk-down) ] [ drop f ] if* ; -: word-break? ( table-entry i str -- ? ) - spin { +: word-break? ( str i table-entry -- ? ) + { { t [ 2drop f ] } { f [ 2drop t ] } { check-letter-after @@ -214,10 +214,10 @@ to: word-table } case ; :: word-break-next ( old-class new-char i str -- next-class ? ) - new-char dup format/extended? - [ drop old-class dup { 1 2 3 } member? ] [ - word-break-prop old-class over word-table-nth - i str word-break? + new-char format/extended? + [ old-class dup { 1 2 3 } member? ] [ + new-char word-break-prop old-class over word-table-nth + [ str i ] dip word-break? ] if ; PRIVATE> diff --git a/basis/xml/entities/entities.factor b/basis/xml/entities/entities.factor index 03de0f78d1..a3812c7723 100644 --- a/basis/xml/entities/entities.factor +++ b/basis/xml/entities/entities.factor @@ -1,6 +1,7 @@ ! Copyright (C) 2005, 2006 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. -USING: namespaces make kernel assocs sequences fry ; +USING: namespaces make kernel assocs sequences fry values +io.files io.encodings.binary ; IN: xml.entities : entities-out @@ -36,265 +37,7 @@ IN: xml.entities { "quot" CHAR: " } } ; -: html-entities - #! generated from: - #! http://www.w3.org/TR/REC-html40/sgml/entities.html - H{ - { "nbsp" 160 } - { "iexcl" 161 } - { "cent" 162 } - { "pound" 163 } - { "curren" 164 } - { "yen" 165 } - { "brvbar" 166 } - { "sect" 167 } - { "uml" 168 } - { "copy" 169 } - { "ordf" 170 } - { "laquo" 171 } - { "not" 172 } - { "shy" 173 } - { "reg" 174 } - { "macr" 175 } - { "deg" 176 } - { "plusmn" 177 } - { "sup2" 178 } - { "sup3" 179 } - { "acute" 180 } - { "micro" 181 } - { "para" 182 } - { "middot" 183 } - { "cedil" 184 } - { "sup1" 185 } - { "ordm" 186 } - { "raquo" 187 } - { "frac14" 188 } - { "frac12" 189 } - { "frac34" 190 } - { "iquest" 191 } - { "Agrave" 192 } - { "Aacute" 193 } - { "Acirc" 194 } - { "Atilde" 195 } - { "Auml" 196 } - { "Aring" 197 } - { "AElig" 198 } - { "Ccedil" 199 } - { "Egrave" 200 } - { "Eacute" 201 } - { "Ecirc" 202 } - { "Euml" 203 } - { "Igrave" 204 } - { "Iacute" 205 } - { "Icirc" 206 } - { "Iuml" 207 } - { "ETH" 208 } - { "Ntilde" 209 } - { "Ograve" 210 } - { "Oacute" 211 } - { "Ocirc" 212 } - { "Otilde" 213 } - { "Ouml" 214 } - { "times" 215 } - { "Oslash" 216 } - { "Ugrave" 217 } - { "Uacute" 218 } - { "Ucirc" 219 } - { "Uuml" 220 } - { "Yacute" 221 } - { "THORN" 222 } - { "szlig" 223 } - { "agrave" 224 } - { "aacute" 225 } - { "acirc" 226 } - { "atilde" 227 } - { "auml" 228 } - { "aring" 229 } - { "aelig" 230 } - { "ccedil" 231 } - { "egrave" 232 } - { "eacute" 233 } - { "ecirc" 234 } - { "euml" 235 } - { "igrave" 236 } - { "iacute" 237 } - { "icirc" 238 } - { "iuml" 239 } - { "eth" 240 } - { "ntilde" 241 } - { "ograve" 242 } - { "oacute" 243 } - { "ocirc" 244 } - { "otilde" 245 } - { "ouml" 246 } - { "divide" 247 } - { "oslash" 248 } - { "ugrave" 249 } - { "uacute" 250 } - { "ucirc" 251 } - { "uuml" 252 } - { "yacute" 253 } - { "thorn" 254 } - { "yuml" 255 } - { "fnof" 402 } - { "Alpha" 913 } - { "Beta" 914 } - { "Gamma" 915 } - { "Delta" 916 } - { "Epsilon" 917 } - { "Zeta" 918 } - { "Eta" 919 } - { "Theta" 920 } - { "Iota" 921 } - { "Kappa" 922 } - { "Lambda" 923 } - { "Mu" 924 } - { "Nu" 925 } - { "Xi" 926 } - { "Omicron" 927 } - { "Pi" 928 } - { "Rho" 929 } - { "Sigma" 931 } - { "Tau" 932 } - { "Upsilon" 933 } - { "Phi" 934 } - { "Chi" 935 } - { "Psi" 936 } - { "Omega" 937 } - { "alpha" 945 } - { "beta" 946 } - { "gamma" 947 } - { "delta" 948 } - { "epsilon" 949 } - { "zeta" 950 } - { "eta" 951 } - { "theta" 952 } - { "iota" 953 } - { "kappa" 954 } - { "lambda" 955 } - { "mu" 956 } - { "nu" 957 } - { "xi" 958 } - { "omicron" 959 } - { "pi" 960 } - { "rho" 961 } - { "sigmaf" 962 } - { "sigma" 963 } - { "tau" 964 } - { "upsilon" 965 } - { "phi" 966 } - { "chi" 967 } - { "psi" 968 } - { "omega" 969 } - { "thetasym" 977 } - { "upsih" 978 } - { "piv" 982 } - { "bull" 8226 } - { "hellip" 8230 } - { "prime" 8242 } - { "Prime" 8243 } - { "oline" 8254 } - { "frasl" 8260 } - { "weierp" 8472 } - { "image" 8465 } - { "real" 8476 } - { "trade" 8482 } - { "alefsym" 8501 } - { "larr" 8592 } - { "uarr" 8593 } - { "rarr" 8594 } - { "darr" 8595 } - { "harr" 8596 } - { "crarr" 8629 } - { "lArr" 8656 } - { "uArr" 8657 } - { "rArr" 8658 } - { "dArr" 8659 } - { "hArr" 8660 } - { "forall" 8704 } - { "part" 8706 } - { "exist" 8707 } - { "empty" 8709 } - { "nabla" 8711 } - { "isin" 8712 } - { "notin" 8713 } - { "ni" 8715 } - { "prod" 8719 } - { "sum" 8721 } - { "minus" 8722 } - { "lowast" 8727 } - { "radic" 8730 } - { "prop" 8733 } - { "infin" 8734 } - { "ang" 8736 } - { "and" 8743 } - { "or" 8744 } - { "cap" 8745 } - { "cup" 8746 } - { "int" 8747 } - { "there4" 8756 } - { "sim" 8764 } - { "cong" 8773 } - { "asymp" 8776 } - { "ne" 8800 } - { "equiv" 8801 } - { "le" 8804 } - { "ge" 8805 } - { "sub" 8834 } - { "sup" 8835 } - { "nsub" 8836 } - { "sube" 8838 } - { "supe" 8839 } - { "oplus" 8853 } - { "otimes" 8855 } - { "perp" 8869 } - { "sdot" 8901 } - { "lceil" 8968 } - { "rceil" 8969 } - { "lfloor" 8970 } - { "rfloor" 8971 } - { "lang" 9001 } - { "rang" 9002 } - { "loz" 9674 } - { "spades" 9824 } - { "clubs" 9827 } - { "hearts" 9829 } - { "diams" 9830 } - { "OElig" 338 } - { "oelig" 339 } - { "Scaron" 352 } - { "scaron" 353 } - { "Yuml" 376 } - { "circ" 710 } - { "tilde" 732 } - { "ensp" 8194 } - { "emsp" 8195 } - { "thinsp" 8201 } - { "zwnj" 8204 } - { "zwj" 8205 } - { "lrm" 8206 } - { "rlm" 8207 } - { "ndash" 8211 } - { "mdash" 8212 } - { "lsquo" 8216 } - { "rsquo" 8217 } - { "sbquo" 8218 } - { "ldquo" 8220 } - { "rdquo" 8221 } - { "bdquo" 8222 } - { "dagger" 8224 } - { "Dagger" 8225 } - { "permil" 8240 } - { "lsaquo" 8249 } - { "rsaquo" 8250 } - { "euro" 8364 } - } ; - SYMBOL: extra-entities -f extra-entities set-global : with-entities ( entities quot -- ) [ swap extra-entities set call ] with-scope ; inline - -: with-html-entities ( quot -- ) - html-entities swap with-entities ; inline diff --git a/basis/xml/entities/html/authors.txt b/basis/xml/entities/html/authors.txt new file mode 100644 index 0000000000..29e79639ae --- /dev/null +++ b/basis/xml/entities/html/authors.txt @@ -0,0 +1 @@ +Daniel Ehrenberg \ No newline at end of file diff --git a/basis/xml/entities/html/html-tests.factor b/basis/xml/entities/html/html-tests.factor new file mode 100644 index 0000000000..68b10bebe7 --- /dev/null +++ b/basis/xml/entities/html/html-tests.factor @@ -0,0 +1,4 @@ +! Copyright (C) 2009 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: tools.test xml.entities.html ; +IN: xml.entities.html.tests diff --git a/basis/xml/entities/html/html.factor b/basis/xml/entities/html/html.factor new file mode 100644 index 0000000000..6f2732f1d9 --- /dev/null +++ b/basis/xml/entities/html/html.factor @@ -0,0 +1,22 @@ +! Copyright (C) 2009 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: assocs io.encodings.binary io.files kernel namespaces sequences +values xml xml.entities ; +IN: xml.entities.html + +VALUE: html-entities + +: read-entities-file ( file -- table ) + f swap binary <file-reader> + [ 2drop extra-entities get ] sax ; + +: get-html ( -- table ) + { "lat1" "special" "symbol" } [ + "resource:basis/xml/entities/html/xhtml-" + swap ".ent" 3append read-entities-file + ] map first3 assoc-union assoc-union ; + +get-html to: html-entities + +: with-html-entities ( quot -- ) + html-entities swap with-entities ; inline diff --git a/basis/xml/entities/html/xhtml-lat1.ent b/basis/xml/entities/html/xhtml-lat1.ent new file mode 100644 index 0000000000..ffee223eb1 --- /dev/null +++ b/basis/xml/entities/html/xhtml-lat1.ent @@ -0,0 +1,196 @@ +<!-- Portions (C) International Organization for Standardization 1986 + Permission to copy in any form is granted for use with + conforming SGML systems and applications as defined in + ISO 8879, provided this notice is included in all copies. +--> +<!-- Character entity set. Typical invocation: + <!ENTITY % HTMLlat1 PUBLIC + "-//W3C//ENTITIES Latin 1 for XHTML//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent"> + %HTMLlat1; +--> + +<!ENTITY nbsp " "> <!-- no-break space = non-breaking space, + U+00A0 ISOnum --> +<!ENTITY iexcl "¡"> <!-- inverted exclamation mark, U+00A1 ISOnum --> +<!ENTITY cent "¢"> <!-- cent sign, U+00A2 ISOnum --> +<!ENTITY pound "£"> <!-- pound sign, U+00A3 ISOnum --> +<!ENTITY curren "¤"> <!-- currency sign, U+00A4 ISOnum --> +<!ENTITY yen "¥"> <!-- yen sign = yuan sign, U+00A5 ISOnum --> +<!ENTITY brvbar "¦"> <!-- broken bar = broken vertical bar, + U+00A6 ISOnum --> +<!ENTITY sect "§"> <!-- section sign, U+00A7 ISOnum --> +<!ENTITY uml "¨"> <!-- diaeresis = spacing diaeresis, + U+00A8 ISOdia --> +<!ENTITY copy "©"> <!-- copyright sign, U+00A9 ISOnum --> +<!ENTITY ordf "ª"> <!-- feminine ordinal indicator, U+00AA ISOnum --> +<!ENTITY laquo "«"> <!-- left-pointing double angle quotation mark + = left pointing guillemet, U+00AB ISOnum --> +<!ENTITY not "¬"> <!-- not sign = angled dash, + U+00AC ISOnum --> +<!ENTITY shy "­"> <!-- soft hyphen = discretionary hyphen, + U+00AD ISOnum --> +<!ENTITY reg "®"> <!-- registered sign = registered trade mark sign, + U+00AE ISOnum --> +<!ENTITY macr "¯"> <!-- macron = spacing macron = overline + = APL overbar, U+00AF ISOdia --> +<!ENTITY deg "°"> <!-- degree sign, U+00B0 ISOnum --> +<!ENTITY plusmn "±"> <!-- plus-minus sign = plus-or-minus sign, + U+00B1 ISOnum --> +<!ENTITY sup2 "²"> <!-- superscript two = superscript digit two + = squared, U+00B2 ISOnum --> +<!ENTITY sup3 "³"> <!-- superscript three = superscript digit three + = cubed, U+00B3 ISOnum --> +<!ENTITY acute "´"> <!-- acute accent = spacing acute, + U+00B4 ISOdia --> +<!ENTITY micro "µ"> <!-- micro sign, U+00B5 ISOnum --> +<!ENTITY para "¶"> <!-- pilcrow sign = paragraph sign, + U+00B6 ISOnum --> +<!ENTITY middot "·"> <!-- middle dot = Georgian comma + = Greek middle dot, U+00B7 ISOnum --> +<!ENTITY cedil "¸"> <!-- cedilla = spacing cedilla, U+00B8 ISOdia --> +<!ENTITY sup1 "¹"> <!-- superscript one = superscript digit one, + U+00B9 ISOnum --> +<!ENTITY ordm "º"> <!-- masculine ordinal indicator, + U+00BA ISOnum --> +<!ENTITY raquo "»"> <!-- right-pointing double angle quotation mark + = right pointing guillemet, U+00BB ISOnum --> +<!ENTITY frac14 "¼"> <!-- vulgar fraction one quarter + = fraction one quarter, U+00BC ISOnum --> +<!ENTITY frac12 "½"> <!-- vulgar fraction one half + = fraction one half, U+00BD ISOnum --> +<!ENTITY frac34 "¾"> <!-- vulgar fraction three quarters + = fraction three quarters, U+00BE ISOnum --> +<!ENTITY iquest "¿"> <!-- inverted question mark + = turned question mark, U+00BF ISOnum --> +<!ENTITY Agrave "À"> <!-- latin capital letter A with grave + = latin capital letter A grave, + U+00C0 ISOlat1 --> +<!ENTITY Aacute "Á"> <!-- latin capital letter A with acute, + U+00C1 ISOlat1 --> +<!ENTITY Acirc "Â"> <!-- latin capital letter A with circumflex, + U+00C2 ISOlat1 --> +<!ENTITY Atilde "Ã"> <!-- latin capital letter A with tilde, + U+00C3 ISOlat1 --> +<!ENTITY Auml "Ä"> <!-- latin capital letter A with diaeresis, + U+00C4 ISOlat1 --> +<!ENTITY Aring "Å"> <!-- latin capital letter A with ring above + = latin capital letter A ring, + U+00C5 ISOlat1 --> +<!ENTITY AElig "Æ"> <!-- latin capital letter AE + = latin capital ligature AE, + U+00C6 ISOlat1 --> +<!ENTITY Ccedil "Ç"> <!-- latin capital letter C with cedilla, + U+00C7 ISOlat1 --> +<!ENTITY Egrave "È"> <!-- latin capital letter E with grave, + U+00C8 ISOlat1 --> +<!ENTITY Eacute "É"> <!-- latin capital letter E with acute, + U+00C9 ISOlat1 --> +<!ENTITY Ecirc "Ê"> <!-- latin capital letter E with circumflex, + U+00CA ISOlat1 --> +<!ENTITY Euml "Ë"> <!-- latin capital letter E with diaeresis, + U+00CB ISOlat1 --> +<!ENTITY Igrave "Ì"> <!-- latin capital letter I with grave, + U+00CC ISOlat1 --> +<!ENTITY Iacute "Í"> <!-- latin capital letter I with acute, + U+00CD ISOlat1 --> +<!ENTITY Icirc "Î"> <!-- latin capital letter I with circumflex, + U+00CE ISOlat1 --> +<!ENTITY Iuml "Ï"> <!-- latin capital letter I with diaeresis, + U+00CF ISOlat1 --> +<!ENTITY ETH "Ð"> <!-- latin capital letter ETH, U+00D0 ISOlat1 --> +<!ENTITY Ntilde "Ñ"> <!-- latin capital letter N with tilde, + U+00D1 ISOlat1 --> +<!ENTITY Ograve "Ò"> <!-- latin capital letter O with grave, + U+00D2 ISOlat1 --> +<!ENTITY Oacute "Ó"> <!-- latin capital letter O with acute, + U+00D3 ISOlat1 --> +<!ENTITY Ocirc "Ô"> <!-- latin capital letter O with circumflex, + U+00D4 ISOlat1 --> +<!ENTITY Otilde "Õ"> <!-- latin capital letter O with tilde, + U+00D5 ISOlat1 --> +<!ENTITY Ouml "Ö"> <!-- latin capital letter O with diaeresis, + U+00D6 ISOlat1 --> +<!ENTITY times "×"> <!-- multiplication sign, U+00D7 ISOnum --> +<!ENTITY Oslash "Ø"> <!-- latin capital letter O with stroke + = latin capital letter O slash, + U+00D8 ISOlat1 --> +<!ENTITY Ugrave "Ù"> <!-- latin capital letter U with grave, + U+00D9 ISOlat1 --> +<!ENTITY Uacute "Ú"> <!-- latin capital letter U with acute, + U+00DA ISOlat1 --> +<!ENTITY Ucirc "Û"> <!-- latin capital letter U with circumflex, + U+00DB ISOlat1 --> +<!ENTITY Uuml "Ü"> <!-- latin capital letter U with diaeresis, + U+00DC ISOlat1 --> +<!ENTITY Yacute "Ý"> <!-- latin capital letter Y with acute, + U+00DD ISOlat1 --> +<!ENTITY THORN "Þ"> <!-- latin capital letter THORN, + U+00DE ISOlat1 --> +<!ENTITY szlig "ß"> <!-- latin small letter sharp s = ess-zed, + U+00DF ISOlat1 --> +<!ENTITY agrave "à"> <!-- latin small letter a with grave + = latin small letter a grave, + U+00E0 ISOlat1 --> +<!ENTITY aacute "á"> <!-- latin small letter a with acute, + U+00E1 ISOlat1 --> +<!ENTITY acirc "â"> <!-- latin small letter a with circumflex, + U+00E2 ISOlat1 --> +<!ENTITY atilde "ã"> <!-- latin small letter a with tilde, + U+00E3 ISOlat1 --> +<!ENTITY auml "ä"> <!-- latin small letter a with diaeresis, + U+00E4 ISOlat1 --> +<!ENTITY aring "å"> <!-- latin small letter a with ring above + = latin small letter a ring, + U+00E5 ISOlat1 --> +<!ENTITY aelig "æ"> <!-- latin small letter ae + = latin small ligature ae, U+00E6 ISOlat1 --> +<!ENTITY ccedil "ç"> <!-- latin small letter c with cedilla, + U+00E7 ISOlat1 --> +<!ENTITY egrave "è"> <!-- latin small letter e with grave, + U+00E8 ISOlat1 --> +<!ENTITY eacute "é"> <!-- latin small letter e with acute, + U+00E9 ISOlat1 --> +<!ENTITY ecirc "ê"> <!-- latin small letter e with circumflex, + U+00EA ISOlat1 --> +<!ENTITY euml "ë"> <!-- latin small letter e with diaeresis, + U+00EB ISOlat1 --> +<!ENTITY igrave "ì"> <!-- latin small letter i with grave, + U+00EC ISOlat1 --> +<!ENTITY iacute "í"> <!-- latin small letter i with acute, + U+00ED ISOlat1 --> +<!ENTITY icirc "î"> <!-- latin small letter i with circumflex, + U+00EE ISOlat1 --> +<!ENTITY iuml "ï"> <!-- latin small letter i with diaeresis, + U+00EF ISOlat1 --> +<!ENTITY eth "ð"> <!-- latin small letter eth, U+00F0 ISOlat1 --> +<!ENTITY ntilde "ñ"> <!-- latin small letter n with tilde, + U+00F1 ISOlat1 --> +<!ENTITY ograve "ò"> <!-- latin small letter o with grave, + U+00F2 ISOlat1 --> +<!ENTITY oacute "ó"> <!-- latin small letter o with acute, + U+00F3 ISOlat1 --> +<!ENTITY ocirc "ô"> <!-- latin small letter o with circumflex, + U+00F4 ISOlat1 --> +<!ENTITY otilde "õ"> <!-- latin small letter o with tilde, + U+00F5 ISOlat1 --> +<!ENTITY ouml "ö"> <!-- latin small letter o with diaeresis, + U+00F6 ISOlat1 --> +<!ENTITY divide "÷"> <!-- division sign, U+00F7 ISOnum --> +<!ENTITY oslash "ø"> <!-- latin small letter o with stroke, + = latin small letter o slash, + U+00F8 ISOlat1 --> +<!ENTITY ugrave "ù"> <!-- latin small letter u with grave, + U+00F9 ISOlat1 --> +<!ENTITY uacute "ú"> <!-- latin small letter u with acute, + U+00FA ISOlat1 --> +<!ENTITY ucirc "û"> <!-- latin small letter u with circumflex, + U+00FB ISOlat1 --> +<!ENTITY uuml "ü"> <!-- latin small letter u with diaeresis, + U+00FC ISOlat1 --> +<!ENTITY yacute "ý"> <!-- latin small letter y with acute, + U+00FD ISOlat1 --> +<!ENTITY thorn "þ"> <!-- latin small letter thorn, + U+00FE ISOlat1 --> +<!ENTITY yuml "ÿ"> <!-- latin small letter y with diaeresis, + U+00FF ISOlat1 --> diff --git a/basis/xml/entities/html/xhtml-special.ent b/basis/xml/entities/html/xhtml-special.ent new file mode 100644 index 0000000000..ca358b2fec --- /dev/null +++ b/basis/xml/entities/html/xhtml-special.ent @@ -0,0 +1,80 @@ +<!-- Special characters for XHTML --> + +<!-- Character entity set. Typical invocation: + <!ENTITY % HTMLspecial PUBLIC + "-//W3C//ENTITIES Special for XHTML//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent"> + %HTMLspecial; +--> + +<!-- Portions (C) International Organization for Standardization 1986: + Permission to copy in any form is granted for use with + conforming SGML systems and applications as defined in + ISO 8879, provided this notice is included in all copies. +--> + +<!-- Relevant ISO entity set is given unless names are newly introduced. + New names (i.e., not in ISO 8879 list) do not clash with any + existing ISO 8879 entity names. ISO 10646 character numbers + are given for each character, in hex. values are decimal + conversions of the ISO 10646 values and refer to the document + character set. Names are Unicode names. +--> + +<!-- C0 Controls and Basic Latin --> +<!ENTITY quot """> <!-- quotation mark, U+0022 ISOnum --> +<!ENTITY amp "&#38;"> <!-- ampersand, U+0026 ISOnum --> +<!ENTITY lt "&#60;"> <!-- less-than sign, U+003C ISOnum --> +<!ENTITY gt ">"> <!-- greater-than sign, U+003E ISOnum --> +<!ENTITY apos "'"> <!-- apostrophe = APL quote, U+0027 ISOnum --> + +<!-- Latin Extended-A --> +<!ENTITY OElig "Œ"> <!-- latin capital ligature OE, + U+0152 ISOlat2 --> +<!ENTITY oelig "œ"> <!-- latin small ligature oe, U+0153 ISOlat2 --> +<!-- ligature is a misnomer, this is a separate character in some languages --> +<!ENTITY Scaron "Š"> <!-- latin capital letter S with caron, + U+0160 ISOlat2 --> +<!ENTITY scaron "š"> <!-- latin small letter s with caron, + U+0161 ISOlat2 --> +<!ENTITY Yuml "Ÿ"> <!-- latin capital letter Y with diaeresis, + U+0178 ISOlat2 --> + +<!-- Spacing Modifier Letters --> +<!ENTITY circ "ˆ"> <!-- modifier letter circumflex accent, + U+02C6 ISOpub --> +<!ENTITY tilde "˜"> <!-- small tilde, U+02DC ISOdia --> + +<!-- General Punctuation --> +<!ENTITY ensp " "> <!-- en space, U+2002 ISOpub --> +<!ENTITY emsp " "> <!-- em space, U+2003 ISOpub --> +<!ENTITY thinsp " "> <!-- thin space, U+2009 ISOpub --> +<!ENTITY zwnj "‌"> <!-- zero width non-joiner, + U+200C NEW RFC 2070 --> +<!ENTITY zwj "‍"> <!-- zero width joiner, U+200D NEW RFC 2070 --> +<!ENTITY lrm "‎"> <!-- left-to-right mark, U+200E NEW RFC 2070 --> +<!ENTITY rlm "‏"> <!-- right-to-left mark, U+200F NEW RFC 2070 --> +<!ENTITY ndash "–"> <!-- en dash, U+2013 ISOpub --> +<!ENTITY mdash "—"> <!-- em dash, U+2014 ISOpub --> +<!ENTITY lsquo "‘"> <!-- left single quotation mark, + U+2018 ISOnum --> +<!ENTITY rsquo "’"> <!-- right single quotation mark, + U+2019 ISOnum --> +<!ENTITY sbquo "‚"> <!-- single low-9 quotation mark, U+201A NEW --> +<!ENTITY ldquo "“"> <!-- left double quotation mark, + U+201C ISOnum --> +<!ENTITY rdquo "”"> <!-- right double quotation mark, + U+201D ISOnum --> +<!ENTITY bdquo "„"> <!-- double low-9 quotation mark, U+201E NEW --> +<!ENTITY dagger "†"> <!-- dagger, U+2020 ISOpub --> +<!ENTITY Dagger "‡"> <!-- double dagger, U+2021 ISOpub --> +<!ENTITY permil "‰"> <!-- per mille sign, U+2030 ISOtech --> +<!ENTITY lsaquo "‹"> <!-- single left-pointing angle quotation mark, + U+2039 ISO proposed --> +<!-- lsaquo is proposed but not yet ISO standardized --> +<!ENTITY rsaquo "›"> <!-- single right-pointing angle quotation mark, + U+203A ISO proposed --> +<!-- rsaquo is proposed but not yet ISO standardized --> + +<!-- Currency Symbols --> +<!ENTITY euro "€"> <!-- euro sign, U+20AC NEW --> diff --git a/basis/xml/entities/html/xhtml-symbol.ent b/basis/xml/entities/html/xhtml-symbol.ent new file mode 100644 index 0000000000..63c2abfa6f --- /dev/null +++ b/basis/xml/entities/html/xhtml-symbol.ent @@ -0,0 +1,237 @@ +<!-- Mathematical, Greek and Symbolic characters for XHTML --> + +<!-- Character entity set. Typical invocation: + <!ENTITY % HTMLsymbol PUBLIC + "-//W3C//ENTITIES Symbols for XHTML//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent"> + %HTMLsymbol; +--> + +<!-- Portions (C) International Organization for Standardization 1986: + Permission to copy in any form is granted for use with + conforming SGML systems and applications as defined in + ISO 8879, provided this notice is included in all copies. +--> + +<!-- Relevant ISO entity set is given unless names are newly introduced. + New names (i.e., not in ISO 8879 list) do not clash with any + existing ISO 8879 entity names. ISO 10646 character numbers + are given for each character, in hex. values are decimal + conversions of the ISO 10646 values and refer to the document + character set. Names are Unicode names. +--> + +<!-- Latin Extended-B --> +<!ENTITY fnof "ƒ"> <!-- latin small letter f with hook = function + = florin, U+0192 ISOtech --> + +<!-- Greek --> +<!ENTITY Alpha "Α"> <!-- greek capital letter alpha, U+0391 --> +<!ENTITY Beta "Β"> <!-- greek capital letter beta, U+0392 --> +<!ENTITY Gamma "Γ"> <!-- greek capital letter gamma, + U+0393 ISOgrk3 --> +<!ENTITY Delta "Δ"> <!-- greek capital letter delta, + U+0394 ISOgrk3 --> +<!ENTITY Epsilon "Ε"> <!-- greek capital letter epsilon, U+0395 --> +<!ENTITY Zeta "Ζ"> <!-- greek capital letter zeta, U+0396 --> +<!ENTITY Eta "Η"> <!-- greek capital letter eta, U+0397 --> +<!ENTITY Theta "Θ"> <!-- greek capital letter theta, + U+0398 ISOgrk3 --> +<!ENTITY Iota "Ι"> <!-- greek capital letter iota, U+0399 --> +<!ENTITY Kappa "Κ"> <!-- greek capital letter kappa, U+039A --> +<!ENTITY Lambda "Λ"> <!-- greek capital letter lamda, + U+039B ISOgrk3 --> +<!ENTITY Mu "Μ"> <!-- greek capital letter mu, U+039C --> +<!ENTITY Nu "Ν"> <!-- greek capital letter nu, U+039D --> +<!ENTITY Xi "Ξ"> <!-- greek capital letter xi, U+039E ISOgrk3 --> +<!ENTITY Omicron "Ο"> <!-- greek capital letter omicron, U+039F --> +<!ENTITY Pi "Π"> <!-- greek capital letter pi, U+03A0 ISOgrk3 --> +<!ENTITY Rho "Ρ"> <!-- greek capital letter rho, U+03A1 --> +<!-- there is no Sigmaf, and no U+03A2 character either --> +<!ENTITY Sigma "Σ"> <!-- greek capital letter sigma, + U+03A3 ISOgrk3 --> +<!ENTITY Tau "Τ"> <!-- greek capital letter tau, U+03A4 --> +<!ENTITY Upsilon "Υ"> <!-- greek capital letter upsilon, + U+03A5 ISOgrk3 --> +<!ENTITY Phi "Φ"> <!-- greek capital letter phi, + U+03A6 ISOgrk3 --> +<!ENTITY Chi "Χ"> <!-- greek capital letter chi, U+03A7 --> +<!ENTITY Psi "Ψ"> <!-- greek capital letter psi, + U+03A8 ISOgrk3 --> +<!ENTITY Omega "Ω"> <!-- greek capital letter omega, + U+03A9 ISOgrk3 --> + +<!ENTITY alpha "α"> <!-- greek small letter alpha, + U+03B1 ISOgrk3 --> +<!ENTITY beta "β"> <!-- greek small letter beta, U+03B2 ISOgrk3 --> +<!ENTITY gamma "γ"> <!-- greek small letter gamma, + U+03B3 ISOgrk3 --> +<!ENTITY delta "δ"> <!-- greek small letter delta, + U+03B4 ISOgrk3 --> +<!ENTITY epsilon "ε"> <!-- greek small letter epsilon, + U+03B5 ISOgrk3 --> +<!ENTITY zeta "ζ"> <!-- greek small letter zeta, U+03B6 ISOgrk3 --> +<!ENTITY eta "η"> <!-- greek small letter eta, U+03B7 ISOgrk3 --> +<!ENTITY theta "θ"> <!-- greek small letter theta, + U+03B8 ISOgrk3 --> +<!ENTITY iota "ι"> <!-- greek small letter iota, U+03B9 ISOgrk3 --> +<!ENTITY kappa "κ"> <!-- greek small letter kappa, + U+03BA ISOgrk3 --> +<!ENTITY lambda "λ"> <!-- greek small letter lamda, + U+03BB ISOgrk3 --> +<!ENTITY mu "μ"> <!-- greek small letter mu, U+03BC ISOgrk3 --> +<!ENTITY nu "ν"> <!-- greek small letter nu, U+03BD ISOgrk3 --> +<!ENTITY xi "ξ"> <!-- greek small letter xi, U+03BE ISOgrk3 --> +<!ENTITY omicron "ο"> <!-- greek small letter omicron, U+03BF NEW --> +<!ENTITY pi "π"> <!-- greek small letter pi, U+03C0 ISOgrk3 --> +<!ENTITY rho "ρ"> <!-- greek small letter rho, U+03C1 ISOgrk3 --> +<!ENTITY sigmaf "ς"> <!-- greek small letter final sigma, + U+03C2 ISOgrk3 --> +<!ENTITY sigma "σ"> <!-- greek small letter sigma, + U+03C3 ISOgrk3 --> +<!ENTITY tau "τ"> <!-- greek small letter tau, U+03C4 ISOgrk3 --> +<!ENTITY upsilon "υ"> <!-- greek small letter upsilon, + U+03C5 ISOgrk3 --> +<!ENTITY phi "φ"> <!-- greek small letter phi, U+03C6 ISOgrk3 --> +<!ENTITY chi "χ"> <!-- greek small letter chi, U+03C7 ISOgrk3 --> +<!ENTITY psi "ψ"> <!-- greek small letter psi, U+03C8 ISOgrk3 --> +<!ENTITY omega "ω"> <!-- greek small letter omega, + U+03C9 ISOgrk3 --> +<!ENTITY thetasym "ϑ"> <!-- greek theta symbol, + U+03D1 NEW --> +<!ENTITY upsih "ϒ"> <!-- greek upsilon with hook symbol, + U+03D2 NEW --> +<!ENTITY piv "ϖ"> <!-- greek pi symbol, U+03D6 ISOgrk3 --> + +<!-- General Punctuation --> +<!ENTITY bull "•"> <!-- bullet = black small circle, + U+2022 ISOpub --> +<!-- bullet is NOT the same as bullet operator, U+2219 --> +<!ENTITY hellip "…"> <!-- horizontal ellipsis = three dot leader, + U+2026 ISOpub --> +<!ENTITY prime "′"> <!-- prime = minutes = feet, U+2032 ISOtech --> +<!ENTITY Prime "″"> <!-- double prime = seconds = inches, + U+2033 ISOtech --> +<!ENTITY oline "‾"> <!-- overline = spacing overscore, + U+203E NEW --> +<!ENTITY frasl "⁄"> <!-- fraction slash, U+2044 NEW --> + +<!-- Letterlike Symbols --> +<!ENTITY weierp "℘"> <!-- script capital P = power set + = Weierstrass p, U+2118 ISOamso --> +<!ENTITY image "ℑ"> <!-- black-letter capital I = imaginary part, + U+2111 ISOamso --> +<!ENTITY real "ℜ"> <!-- black-letter capital R = real part symbol, + U+211C ISOamso --> +<!ENTITY trade "™"> <!-- trade mark sign, U+2122 ISOnum --> +<!ENTITY alefsym "ℵ"> <!-- alef symbol = first transfinite cardinal, + U+2135 NEW --> +<!-- alef symbol is NOT the same as hebrew letter alef, + U+05D0 although the same glyph could be used to depict both characters --> + +<!-- Arrows --> +<!ENTITY larr "←"> <!-- leftwards arrow, U+2190 ISOnum --> +<!ENTITY uarr "↑"> <!-- upwards arrow, U+2191 ISOnum--> +<!ENTITY rarr "→"> <!-- rightwards arrow, U+2192 ISOnum --> +<!ENTITY darr "↓"> <!-- downwards arrow, U+2193 ISOnum --> +<!ENTITY harr "↔"> <!-- left right arrow, U+2194 ISOamsa --> +<!ENTITY crarr "↵"> <!-- downwards arrow with corner leftwards + = carriage return, U+21B5 NEW --> +<!ENTITY lArr "⇐"> <!-- leftwards double arrow, U+21D0 ISOtech --> +<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow + but also does not have any other character for that function. So lArr can + be used for 'is implied by' as ISOtech suggests --> +<!ENTITY uArr "⇑"> <!-- upwards double arrow, U+21D1 ISOamsa --> +<!ENTITY rArr "⇒"> <!-- rightwards double arrow, + U+21D2 ISOtech --> +<!-- Unicode does not say this is the 'implies' character but does not have + another character with this function so rArr can be used for 'implies' + as ISOtech suggests --> +<!ENTITY dArr "⇓"> <!-- downwards double arrow, U+21D3 ISOamsa --> +<!ENTITY hArr "⇔"> <!-- left right double arrow, + U+21D4 ISOamsa --> + +<!-- Mathematical Operators --> +<!ENTITY forall "∀"> <!-- for all, U+2200 ISOtech --> +<!ENTITY part "∂"> <!-- partial differential, U+2202 ISOtech --> +<!ENTITY exist "∃"> <!-- there exists, U+2203 ISOtech --> +<!ENTITY empty "∅"> <!-- empty set = null set, U+2205 ISOamso --> +<!ENTITY nabla "∇"> <!-- nabla = backward difference, + U+2207 ISOtech --> +<!ENTITY isin "∈"> <!-- element of, U+2208 ISOtech --> +<!ENTITY notin "∉"> <!-- not an element of, U+2209 ISOtech --> +<!ENTITY ni "∋"> <!-- contains as member, U+220B ISOtech --> +<!ENTITY prod "∏"> <!-- n-ary product = product sign, + U+220F ISOamsb --> +<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though + the same glyph might be used for both --> +<!ENTITY sum "∑"> <!-- n-ary summation, U+2211 ISOamsb --> +<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma' + though the same glyph might be used for both --> +<!ENTITY minus "−"> <!-- minus sign, U+2212 ISOtech --> +<!ENTITY lowast "∗"> <!-- asterisk operator, U+2217 ISOtech --> +<!ENTITY radic "√"> <!-- square root = radical sign, + U+221A ISOtech --> +<!ENTITY prop "∝"> <!-- proportional to, U+221D ISOtech --> +<!ENTITY infin "∞"> <!-- infinity, U+221E ISOtech --> +<!ENTITY ang "∠"> <!-- angle, U+2220 ISOamso --> +<!ENTITY and "∧"> <!-- logical and = wedge, U+2227 ISOtech --> +<!ENTITY or "∨"> <!-- logical or = vee, U+2228 ISOtech --> +<!ENTITY cap "∩"> <!-- intersection = cap, U+2229 ISOtech --> +<!ENTITY cup "∪"> <!-- union = cup, U+222A ISOtech --> +<!ENTITY int "∫"> <!-- integral, U+222B ISOtech --> +<!ENTITY there4 "∴"> <!-- therefore, U+2234 ISOtech --> +<!ENTITY sim "∼"> <!-- tilde operator = varies with = similar to, + U+223C ISOtech --> +<!-- tilde operator is NOT the same character as the tilde, U+007E, + although the same glyph might be used to represent both --> +<!ENTITY cong "≅"> <!-- approximately equal to, U+2245 ISOtech --> +<!ENTITY asymp "≈"> <!-- almost equal to = asymptotic to, + U+2248 ISOamsr --> +<!ENTITY ne "≠"> <!-- not equal to, U+2260 ISOtech --> +<!ENTITY equiv "≡"> <!-- identical to, U+2261 ISOtech --> +<!ENTITY le "≤"> <!-- less-than or equal to, U+2264 ISOtech --> +<!ENTITY ge "≥"> <!-- greater-than or equal to, + U+2265 ISOtech --> +<!ENTITY sub "⊂"> <!-- subset of, U+2282 ISOtech --> +<!ENTITY sup "⊃"> <!-- superset of, U+2283 ISOtech --> +<!ENTITY nsub "⊄"> <!-- not a subset of, U+2284 ISOamsn --> +<!ENTITY sube "⊆"> <!-- subset of or equal to, U+2286 ISOtech --> +<!ENTITY supe "⊇"> <!-- superset of or equal to, + U+2287 ISOtech --> +<!ENTITY oplus "⊕"> <!-- circled plus = direct sum, + U+2295 ISOamsb --> +<!ENTITY otimes "⊗"> <!-- circled times = vector product, + U+2297 ISOamsb --> +<!ENTITY perp "⊥"> <!-- up tack = orthogonal to = perpendicular, + U+22A5 ISOtech --> +<!ENTITY sdot "⋅"> <!-- dot operator, U+22C5 ISOamsb --> +<!-- dot operator is NOT the same character as U+00B7 middle dot --> + +<!-- Miscellaneous Technical --> +<!ENTITY lceil "⌈"> <!-- left ceiling = APL upstile, + U+2308 ISOamsc --> +<!ENTITY rceil "⌉"> <!-- right ceiling, U+2309 ISOamsc --> +<!ENTITY lfloor "⌊"> <!-- left floor = APL downstile, + U+230A ISOamsc --> +<!ENTITY rfloor "⌋"> <!-- right floor, U+230B ISOamsc --> +<!ENTITY lang "〈"> <!-- left-pointing angle bracket = bra, + U+2329 ISOtech --> +<!-- lang is NOT the same character as U+003C 'less than sign' + or U+2039 'single left-pointing angle quotation mark' --> +<!ENTITY rang "〉"> <!-- right-pointing angle bracket = ket, + U+232A ISOtech --> +<!-- rang is NOT the same character as U+003E 'greater than sign' + or U+203A 'single right-pointing angle quotation mark' --> + +<!-- Geometric Shapes --> +<!ENTITY loz "◊"> <!-- lozenge, U+25CA ISOpub --> + +<!-- Miscellaneous Symbols --> +<!ENTITY spades "♠"> <!-- black spade suit, U+2660 ISOpub --> +<!-- black here seems to mean filled as opposed to hollow --> +<!ENTITY clubs "♣"> <!-- black club suit = shamrock, + U+2663 ISOpub --> +<!ENTITY hearts "♥"> <!-- black heart suit = valentine, + U+2665 ISOpub --> +<!ENTITY diams "♦"> <!-- black diamond suit, U+2666 ISOpub --> diff --git a/basis/xml/errors/errors-tests.factor b/basis/xml/errors/errors-tests.factor index e72e465f0d..426ef57736 100644 --- a/basis/xml/errors/errors-tests.factor +++ b/basis/xml/errors/errors-tests.factor @@ -6,22 +6,27 @@ IN: xml.errors.tests '[ _ string>xml ] swap '[ _ = ] must-fail-with ; T{ no-entity f 1 10 "nbsp" } "<x> </x>" xml-error-test -T{ mismatched f 1 8 T{ name f "" "x" "" } T{ name f "" "y" "" } -} "<x></y>" xml-error-test +T{ mismatched f 1 8 T{ name f "" "x" "" } T{ name f "" "y" "" } } + "<x></y>" xml-error-test T{ unclosed f 1 4 V{ T{ name f "" "x" "" } } } "<x>" xml-error-test T{ nonexist-ns f 1 5 "x" } "<x:y/>" xml-error-test T{ unopened f 1 5 } "</x>" xml-error-test -T{ not-yes/no f 1 41 "maybe" } "<?xml version='1.0' standalone='maybe'?><x/>" xml-error-test +T{ not-yes/no f 1 41 "maybe" } + "<?xml version='1.0' standalone='maybe'?><x/>" xml-error-test T{ extra-attrs f 1 32 V{ T{ name f "" "foo" f } } } "<?xml version='1.1' foo='bar'?><x/>" xml-error-test -T{ bad-version f 1 28 "5 million" } "<?xml version='5 million'?><x/>" xml-error-test +T{ bad-version f 1 28 "5 million" } + "<?xml version='5 million'?><x/>" xml-error-test T{ notags f } "" xml-error-test T{ multitags } "<x/><y/>" xml-error-test -T{ bad-prolog f 1 26 T{ prolog f "1.0" "UTF-8" f } -} "<x/><?xml version='1.0'?>" xml-error-test +T{ bad-prolog f 1 26 T{ prolog f "1.0" "UTF-8" f } } + "<x/><?xml version='1.0'?>" xml-error-test T{ capitalized-prolog f 1 6 "XmL" } "<?XmL version='1.0'?><x/>" -xml-error-test + xml-error-test T{ pre/post-content f "x" t } "x<y/>" xml-error-test T{ versionless-prolog f 1 8 } "<?xml?><x/>" xml-error-test -T{ bad-instruction f 1 11 T{ instruction f "xsl" } -} "<x><?xsl?></x>" xml-error-test +T{ bad-instruction f 1 11 T{ instruction f "xsl" } } + "<x><?xsl?></x>" xml-error-test +T{ unclosed-quote f 1 13 } "<x value='/>" xml-error-test +T{ bad-name f 1 3 "-" } "<-/>" xml-error-test +T{ quoteless-attr f 1 10 } "<x value=3/>" xml-error-test \ No newline at end of file diff --git a/basis/xml/errors/errors.factor b/basis/xml/errors/errors.factor index 0c039d526c..9d3d8a6bb0 100644 --- a/basis/xml/errors/errors.factor +++ b/basis/xml/errors/errors.factor @@ -32,17 +32,6 @@ M: no-entity summary ( obj -- str ) "Entity does not exist: &" write thing>> write ";" print ] with-string-writer ; -TUPLE: xml-string-error < parsing-error string ; ! this should not exist - -: xml-string-error ( string -- * ) - \ xml-string-error parsing-error swap >>string throw ; - -M: xml-string-error summary ( obj -- str ) - [ - dup call-next-method write - string>> print - ] with-string-writer ; - TUPLE: mismatched < parsing-error open close ; : mismatched ( open close -- * ) @@ -233,7 +222,34 @@ M: misplaced-directive summary ( obj -- str ) dir>> write-xml-chunk nl ] with-string-writer ; +TUPLE: bad-name < parsing-error name ; + +: bad-name ( name -- * ) + \ bad-name parsing-error swap >>name throw ; + +M: bad-name summary ( obj -- str ) + [ call-next-method ] + [ "Invalid name: " swap name>> "\n" 3append ] + bi append ; + +TUPLE: unclosed-quote < parsing-error ; + +: unclosed-quote ( -- * ) + \ unclosed-quote parsing-error throw ; + +M: unclosed-quote summary + call-next-method + "XML document ends with quote still open\n" append ; + +TUPLE: quoteless-attr < parsing-error ; + +: quoteless-attr ( -- * ) + \ quoteless-attr parsing-error throw ; + +M: quoteless-attr summary + call-next-method "Attribute lacks quotes around value\n" append ; + UNION: xml-parse-error multitags notags extra-attrs nonexist-ns - not-yes/no unclosed mismatched xml-string-error expected no-entity + not-yes/no unclosed mismatched expected no-entity bad-prolog versionless-prolog capitalized-prolog bad-instruction - bad-directive ; + bad-directive bad-name unclosed-quote quoteless-attr ; diff --git a/basis/xml/tests/ascii.xml b/basis/xml/tests/ascii.xml new file mode 100644 index 0000000000..ca1c355c81 --- /dev/null +++ b/basis/xml/tests/ascii.xml @@ -0,0 +1 @@ +<?xml version='1.0' encoding='ASCII'?><x>e</x> \ No newline at end of file diff --git a/basis/xml/tests/encodings.factor b/basis/xml/tests/encodings.factor new file mode 100644 index 0000000000..720b04ca42 --- /dev/null +++ b/basis/xml/tests/encodings.factor @@ -0,0 +1,14 @@ +USING: xml xml.data xml.utilities tools.test accessors kernel ; + +[ "\u000131" ] [ "resource:basis/xml/tests/latin5.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/latin1.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/spaces.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/utf8.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16be.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16le.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16be-bom.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16le-bom.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/prologless.xml" file>xml children>string ] unit-test +[ "e" ] [ "resource:basis/xml/tests/ascii.xml" file>xml children>string ] unit-test +[ "\u0000e9" "x" ] [ "resource:basis/xml/tests/unitag.xml" file>xml [ name>> main>> ] [ children>string ] bi ] unit-test \ No newline at end of file diff --git a/basis/xml/tests/latin1.xml b/basis/xml/tests/latin1.xml new file mode 100644 index 0000000000..f8bc6bcd61 --- /dev/null +++ b/basis/xml/tests/latin1.xml @@ -0,0 +1 @@ +<?xml version='1.0' encoding='ISO-8859-1'?><x>�</x> \ No newline at end of file diff --git a/basis/xml/tests/latin5.xml b/basis/xml/tests/latin5.xml new file mode 100644 index 0000000000..afbcf09fc7 --- /dev/null +++ b/basis/xml/tests/latin5.xml @@ -0,0 +1 @@ +<?xml version='1.0' encoding='ISO-8859-9'?><x>�</x> \ No newline at end of file diff --git a/basis/xml/tests/prologless.xml b/basis/xml/tests/prologless.xml new file mode 100644 index 0000000000..a60ed31cc6 --- /dev/null +++ b/basis/xml/tests/prologless.xml @@ -0,0 +1 @@ +<x>é</x> \ No newline at end of file diff --git a/basis/xml/tests/spaces.xml b/basis/xml/tests/spaces.xml new file mode 100644 index 0000000000..dd194ff218 --- /dev/null +++ b/basis/xml/tests/spaces.xml @@ -0,0 +1,3 @@ + + +<x>é</x> diff --git a/basis/xml/tests/test.factor b/basis/xml/tests/test.factor index f37c3fa7ac..7a826756b6 100644 --- a/basis/xml/tests/test.factor +++ b/basis/xml/tests/test.factor @@ -2,7 +2,7 @@ ! See http://factorcode.org/license.txt for BSD license. IN: xml.tests USING: kernel xml tools.test io namespaces make sequences -xml.errors xml.entities parser strings xml.data io.files +xml.errors xml.entities.html parser strings xml.data io.files xml.writer xml.utilities state-parser continuations assocs sequences.deep accessors io.streams.string ; @@ -53,12 +53,15 @@ SYMBOL: xml-file [ "<foo> bar </foo>" string>xml pprint-xml>string ] unit-test [ "<!-- B+, B, or B--->" string>xml ] must-fail [ ] [ "<?xml version='1.0'?><!-- declarations for <head> & <body> --><foo/>" string>xml drop ] unit-test -[ T{ element-decl f "br" "EMPTY" } ] [ "<!ELEMENT br EMPTY>" string>xml-chunk second ] unit-test -[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "<!ELEMENT p (#PCDATA|emph)*>" string>xml-chunk second ] unit-test -[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "<!ELEMENT %name.para; %content.para;>" string>xml-chunk second ] unit-test -[ T{ element-decl f "container" "ANY" } ] [ "<!ELEMENT container ANY>" string>xml-chunk second ] unit-test -[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo>" string>xml-chunk second ] unit-test -[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo >" string>xml-chunk second ] unit-test -[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM 'blah.dtd'>" string>xml-chunk second ] unit-test -[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM \"blah.dtd\" >" string>xml-chunk second ] unit-test +[ T{ element-decl f "br" "EMPTY" } ] [ "<!ELEMENT br EMPTY>" string>xml-chunk first ] unit-test +[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "<!ELEMENT p (#PCDATA|emph)*>" string>xml-chunk first ] unit-test +[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "<!ELEMENT %name.para; %content.para;>" string>xml-chunk first ] unit-test +[ T{ element-decl f "container" "ANY" } ] [ "<!ELEMENT container ANY>" string>xml-chunk first ] unit-test +[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo>" string>xml-chunk first ] unit-test +[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo >" string>xml-chunk first ] unit-test +[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM 'blah.dtd'>" string>xml-chunk first ] unit-test +[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM \"blah.dtd\" >" string>xml-chunk first ] unit-test [ t ] [ "<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd' >" dup string>xml-chunk [ write-xml-chunk ] with-string-writer = ] unit-test +[ "foo" ] [ "<!ENTITY bar 'foo'><x>&bar;</x>" string>xml children>string ] unit-test +[ V{ "hello" } ] [ "hello" string>xml-chunk ] unit-test +[ 958 ] [ [ "ξ" string>xml-chunk ] with-html-entities first first ] unit-test \ No newline at end of file diff --git a/basis/xml/tests/unitag.xml b/basis/xml/tests/unitag.xml new file mode 100644 index 0000000000..b7ef6ade3c --- /dev/null +++ b/basis/xml/tests/unitag.xml @@ -0,0 +1 @@ +<é>x</é> \ No newline at end of file diff --git a/basis/xml/tests/utf16.xml b/basis/xml/tests/utf16.xml new file mode 100644 index 0000000000..d8775098a1 Binary files /dev/null and b/basis/xml/tests/utf16.xml differ diff --git a/basis/xml/tests/utf16be-bom.xml b/basis/xml/tests/utf16be-bom.xml new file mode 100644 index 0000000000..4a6f3e255c Binary files /dev/null and b/basis/xml/tests/utf16be-bom.xml differ diff --git a/basis/xml/tests/utf16be.xml b/basis/xml/tests/utf16be.xml new file mode 100644 index 0000000000..c97bff7593 Binary files /dev/null and b/basis/xml/tests/utf16be.xml differ diff --git a/basis/xml/tests/utf16le-bom.xml b/basis/xml/tests/utf16le-bom.xml new file mode 100644 index 0000000000..ac7d8b8c70 Binary files /dev/null and b/basis/xml/tests/utf16le-bom.xml differ diff --git a/basis/xml/tests/utf16le.xml b/basis/xml/tests/utf16le.xml new file mode 100644 index 0000000000..5a0c7d9551 Binary files /dev/null and b/basis/xml/tests/utf16le.xml differ diff --git a/basis/xml/tests/utf8-bom.xml b/basis/xml/tests/utf8-bom.xml new file mode 100644 index 0000000000..5486916f73 --- /dev/null +++ b/basis/xml/tests/utf8-bom.xml @@ -0,0 +1 @@ +<?xml version='1.0' encoding='UTF-8'?><x/> \ No newline at end of file diff --git a/basis/xml/tests/utf8.xml b/basis/xml/tests/utf8.xml new file mode 100644 index 0000000000..83b3e2d501 --- /dev/null +++ b/basis/xml/tests/utf8.xml @@ -0,0 +1 @@ +<?xml version='1.0' encoding='UTF-8'?><x>é</x> \ No newline at end of file diff --git a/basis/xml/tokenize/tokenize.factor b/basis/xml/tokenize/tokenize.factor index 0c475c108d..a2ae9c4d58 100644 --- a/basis/xml/tokenize/tokenize.factor +++ b/basis/xml/tokenize/tokenize.factor @@ -1,9 +1,10 @@ ! Copyright (C) 2005, 2006 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. -USING: xml.errors xml.data xml.utilities xml.char-classes sets -xml.entities kernel state-parser kernel namespaces make strings -math math.parser sequences assocs arrays splitting combinators -unicode.case accessors fry ascii ; +USING: accessors arrays ascii assocs combinators +combinators.short-circuit fry io.encodings io.encodings.iana +io.encodings.string io.encodings.utf16 io.encodings.utf8 kernel make +math math.parser namespaces sequences sets splitting state-parser +strings xml.char-classes xml.data xml.entities xml.errors hashtables ; IN: xml.tokenize ! XML namespace processing: ns = namespace @@ -53,34 +54,37 @@ SYMBOL: ns-stack ! version=1.0? is calculated once and passed around for efficiency -: (parse-name) ( -- str ) - version=1.0? dup - get-char name-start? [ - [ dup get-char name-char? not ] take-until nip - ] [ - "Malformed name" xml-string-error - ] if ; +: assure-name ( str version=1.0? -- str ) + over { + [ first name-start? ] + [ rest-slice [ name-char? ] with all? ] + } 2&& [ bad-name ] unless ; + +: (parse-name) ( start -- str ) + version=1.0? + [ [ get-char name-char? not ] curry take-until append ] + [ assure-name ] bi ; + +: parse-name-starting ( start -- name ) + (parse-name) get-char CHAR: : = + [ next "" (parse-name) ] [ "" swap ] if f <name> ; : parse-name ( -- name ) - (parse-name) get-char CHAR: : = - [ next (parse-name) ] [ "" swap ] if f <name> ; + "" parse-name-starting ; ! -- Parsing strings -: (parse-entity) ( string -- ) +: parse-named-entity ( string -- ) dup entities at [ , ] [ - prolog-data get standalone>> - [ no-entity ] [ - dup extra-entities get at - [ , ] [ no-entity ] ?if - ] if + dup extra-entities get at + [ dup number? [ , ] [ % ] if ] [ no-entity ] ?if ! Make less hackish ] ?if ; : parse-entity ( -- ) next CHAR: ; take-char next "#" ?head [ "x" ?head 16 10 ? base> , - ] [ (parse-entity) ] if ; + ] [ parse-named-entity ] if ; : (parse-char) ( ch -- ) get-char { @@ -93,13 +97,9 @@ SYMBOL: ns-stack : parse-char ( ch -- string ) [ (parse-char) ] "" make ; -: parse-quot ( ch -- string ) - parse-char get-char - [ "XML file ends in a quote" xml-string-error ] unless ; - : parse-text ( -- string ) CHAR: < parse-char ; - + ! Parsing tags : start-tag ( -- name ? ) @@ -107,17 +107,18 @@ SYMBOL: ns-stack get-char CHAR: / = dup [ next ] when parse-name swap ; -: parse-attr-value ( -- seq ) - get-char dup "'\"" member? [ - next parse-quot - ] [ - "Attribute lacks quote" xml-string-error - ] if ; +: (parse-quote) ( ch -- string ) + parse-char get-char + [ unclosed-quote ] unless ; + +: parse-quote ( -- seq ) + pass-blank get-char dup "'\"" member? + [ next (parse-quote) ] [ quoteless-attr ] if ; : parse-attr ( -- ) - [ parse-name ] with-scope - pass-blank CHAR: = expect pass-blank - [ parse-attr-value ] with-scope + parse-name + pass-blank CHAR: = expect + parse-quote 2array , ; : (middle-tag) ( -- ) @@ -153,7 +154,7 @@ SYMBOL: ns-stack : only-blanks ( str -- ) [ blank? ] all? [ bad-doctype-decl ] unless ; -: take-system-literal ( -- str ) +: take-system-literal ( -- str ) ! replace with parse-quote? pass-blank get-char next { { CHAR: ' [ "'" take-string ] } { CHAR: " [ "\"" take-string ] } @@ -207,15 +208,18 @@ DEFER: direct : take-entity-def ( -- entity-name entity-def ) " " take-string pass-blank get-char { - { CHAR: ' [ take-system-literal ] } - { CHAR: " [ take-system-literal ] } + { CHAR: ' [ parse-quote ] } + { CHAR: " [ parse-quote ] } [ drop take-external-id ] } case ; +: associate-entity ( entity-name entity-def -- ) + swap extra-entities [ ?set-at ] change ; + : take-entity-decl ( -- entity-decl ) pass-blank get-char { { CHAR: % [ next pass-blank take-entity-def ] } - [ drop take-entity-def ] + [ drop take-entity-def 2dup associate-entity ] } case ">" take-string only-blanks <entity-decl> ; @@ -253,22 +257,36 @@ DEFER: direct : good-version ( version -- version ) dup { "1.0" "1.1" } member? [ bad-version ] unless ; -: prolog-attrs ( alist -- prolog ) - [ T{ name f "" "version" f } swap at - [ good-version ] [ versionless-prolog ] if* ] keep - [ T{ name f "" "encoding" f } swap at - "UTF-8" or ] keep +: prolog-version ( alist -- version ) + T{ name f "" "version" f } swap at + [ good-version ] [ versionless-prolog ] if* ; + +: prolog-encoding ( alist -- encoding ) + T{ name f "" "encoding" f } swap at "UTF-8" or ; + +: prolog-standalone ( alist -- version ) T{ name f "" "standalone" f } swap at - [ yes/no>bool ] [ f ] if* - <prolog> ; + [ yes/no>bool ] [ f ] if* ; + +: prolog-attrs ( alist -- prolog ) + [ prolog-version ] + [ prolog-encoding ] + [ prolog-standalone ] + tri <prolog> ; + +SYMBOL: string-input? +: decode-input-if ( encoding -- ) + string-input? get [ drop ] [ decode-input ] if ; : parse-prolog ( -- prolog ) pass-blank middle-tag "?>" expect-string dup assure-no-extra prolog-attrs + dup encoding>> dup "UTF-16" = + [ drop ] [ name>encoding [ decode-input-if ] when* ] if dup prolog-data set ; : instruct ( -- instruction ) - (parse-name) dup "xml" = + "" (parse-name) dup "xml" = [ drop parse-prolog ] [ dup >lower "xml" = [ capitalized-prolog ] @@ -278,10 +296,69 @@ DEFER: direct : make-tag ( -- tag ) { { [ get-char dup CHAR: ! = ] [ drop next direct ] } - { [ CHAR: ? = ] [ next instruct ] } + { [ CHAR: ? = ] [ next instruct ] } [ start-tag [ dup add-ns pop-ns <closer> ] [ middle-tag end-tag ] if CHAR: > expect ] } cond ; + +! Autodetecting encodings + +: continue-make-tag ( str -- tag ) + parse-name-starting middle-tag end-tag CHAR: > expect ; + +: start-utf16le ( -- tag ) + utf16le decode-input-if + CHAR: ? expect + 0 expect instruct ; + +: 10xxxxxx? ( ch -- ? ) + -6 shift 3 bitand 2 = ; + +: start<name ( ch -- tag ) + ascii? + [ utf8 decode-input-if next make-tag ] [ + next + [ get-next 10xxxxxx? not ] take-until + get-char suffix utf8 decode + utf8 decode-input-if next + continue-make-tag + ] if ; + +: start< ( -- tag ) + get-next { + { 0 [ next next start-utf16le ] } + { CHAR: ? [ next next instruct ] } ! XML prolog parsing sets the encoding + { CHAR: ! [ utf8 decode-input next next direct ] } + [ start<name ] + } case ; + +: skip-utf8-bom ( -- tag ) + "\u0000bb\u0000bf" expect utf8 decode-input + CHAR: < expect make-tag ; + +: decode-expecting ( encoding string -- tag ) + [ decode-input-if next ] [ expect-string ] bi* make-tag ; + +: start-utf16be ( -- tag ) + utf16be "<" decode-expecting ; + +: skip-utf16le-bom ( -- tag ) + utf16le "\u0000fe<" decode-expecting ; + +: skip-utf16be-bom ( -- tag ) + utf16be "\u0000ff<" decode-expecting ; + +: start-document ( -- tag ) + get-char { + { CHAR: < [ start< ] } + { 0 [ start-utf16be ] } + { HEX: EF [ skip-utf8-bom ] } + { HEX: FF [ skip-utf16le-bom ] } + { HEX: FE [ skip-utf16be-bom ] } + { f [ "" ] } + [ drop utf8 decode-input-if f ] + ! Same problem as with <e`>, in the case of XML chunks? + } case ; diff --git a/basis/xml/xml-docs.factor b/basis/xml/xml-docs.factor index 16a817d628..e87c32d375 100644 --- a/basis/xml/xml-docs.factor +++ b/basis/xml/xml-docs.factor @@ -2,7 +2,7 @@ ! See http://factorcode.org/license.txt for BSD license. USING: help.markup help.syntax kernel xml.data xml.errors xml.writer state-parser xml.tokenize xml.utilities xml.entities -strings sequences io ; +strings sequences io xml.entities.html ; IN: xml HELP: string>xml @@ -295,9 +295,6 @@ HELP: expected HELP: no-entity { $class-description "XML parsing error describing the use of an undefined entity in a case where standalone is marked yes. Delegates to " { $link parsing-error } ". Contains one slot, thing, containing a string representing the entity." } ; -HELP: xml-string-error -{ $class-description "XML parsing error that delegates to " { $link parsing-error } " and represents an other, unspecified error, which is represented by the slot string, containing a string describing the error." } ; - HELP: open-tag { $class-description "represents a tag that does have children, ie is not a contained tag" } { $notes "the constructor used for this class is simply " { $link <tag> } "." } @@ -324,6 +321,15 @@ HELP: state-parse HELP: pre/post-content { $class-description "describes the error where a non-whitespace string is used before or after the main tag in an XML document. Contains two slots: string contains the offending string, and pre? is t if it occured before the main tag and f if it occured after" } ; +HELP: unclosed-quote +{ $class-description "describes the error where a quotation for an attribute value is opened but not closed before the end of the document." } ; + +HELP: bad-name +{ $class-description "describes the error where a name is used, for example in an XML tag or attribute key, which is invalid." } ; + +HELP: quoteless-attr +{ $class-description "describes the error where an attribute of an XML tag is missing quotes around a value." } ; + HELP: entities { $description "a hash table from default XML entity names (like & and <) to the characters they represent. This is automatically included when parsing any XML document." } { $see-also html-entities } ; @@ -444,6 +450,9 @@ ARTICLE: { "xml" "errors" } "XML parsing errors" { $subsection expected } { $subsection no-entity } { $subsection pre/post-content } + { $subsection unclosed-quote } + { $subsection bad-name } + { $subsection quoteless-attr } "Additionally, most of these errors delegate to " { $link parsing-error } " in order to provide more information" $nl "Note that, in parsing an XML document, only the first error is reported." ; @@ -456,7 +465,7 @@ ARTICLE: { "xml" "entities" } "XML entities" { $subsection with-html-entities } ; ARTICLE: "xml" "XML parser" -"The " { $vocab-link "xml" } " vocabulary implements the XML 1.1 standard, converting strings of text into XML and vice versa." +"The " { $vocab-link "xml" } " vocabulary implements the XML 1.0 and 1.1 standards, converting strings of text into XML and vice versa." { $subsection { "xml" "reading" } } { $subsection { "xml" "writing" } } { $subsection { "xml" "classes" } } diff --git a/basis/xml/xml.factor b/basis/xml/xml.factor index 8afcf7a33b..328a058a58 100644 --- a/basis/xml/xml.factor +++ b/basis/xml/xml.factor @@ -1,10 +1,9 @@ ! Copyright (C) 2005, 2006 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. -USING: io io.streams.string io.files kernel math namespaces -prettyprint sequences arrays generic strings vectors -xml.char-classes xml.data xml.errors xml.tokenize xml.writer -xml.utilities state-parser assocs ascii io.encodings.utf8 -accessors xml.backend ; +USING: accessors arrays io io.encodings.binary io.files +io.streams.string kernel namespaces sequences state-parser strings +xml.backend xml.data xml.errors xml.tokenize ascii +xml.writer ; IN: xml ! -- Overall parser with data tree @@ -23,7 +22,7 @@ GENERIC: process ( object -- ) M: object process add-child ; M: prolog process - xml-stack get V{ { f V{ "" } } } = + xml-stack get V{ { f V{ } } } = [ bad-prolog ] unless drop ; M: instruction process @@ -101,6 +100,7 @@ TUPLE: pull-xml scope ; text-now? on ] H{ } make-assoc pull-xml boa ; +! pull-xml needs to call start-document somewhere : pull-event ( pull -- xml-event/f ) scope>> [ @@ -133,11 +133,12 @@ TUPLE: pull-xml scope ; : sax ( stream quot: ( xml-elem -- ) -- ) swap [ reset-prolog init-ns-stack - prolog-data get call-under + start-document [ call-under ] when* sax-loop ] state-parse ; inline recursive : (read-xml) ( -- ) + start-document [ process ] when* [ process ] sax-loop ; inline : (read-xml-chunk) ( stream -- prolog seq ) @@ -159,11 +160,12 @@ TUPLE: pull-xml scope ; <string-reader> read-xml ; : string>xml-chunk ( string -- xml ) - <string-reader> read-xml-chunk ; + t string-input? + [ <string-reader> read-xml-chunk ] with-variable ; : file>xml ( filename -- xml ) ! Autodetect encoding! - utf8 <file-reader> read-xml ; + binary <file-reader> read-xml ; : xml-reprint ( string -- ) string>xml print-xml ; diff --git a/extra/fuel/fuel.factor b/extra/fuel/fuel.factor index 46d6ba12c7..f52a34ff28 100644 --- a/extra/fuel/fuel.factor +++ b/extra/fuel/fuel.factor @@ -141,7 +141,7 @@ PRIVATE> : fuel-get-article ( name -- ) article fuel-eval-set-result ; -MEMO: fuel-get-article-title ( name -- ) +: fuel-get-article-title ( name -- ) articles get at [ article-title ] [ f ] if* fuel-eval-set-result ; : fuel-word-help ( name -- ) (fuel-word-help) fuel-eval-set-result ; diff --git a/misc/fuel/README b/misc/fuel/README index eb280d796c..706191aaa3 100644 --- a/misc/fuel/README +++ b/misc/fuel/README @@ -18,16 +18,11 @@ beast. (load-file "<path/to/factor/installation>/misc/fuel/fu.el") - or - - (add-to-list load-path "<path/to/factor/installation>/fuel") - (require 'fuel) - If all you want is a major mode for editing Factor code with pretty font colors and indentation, without running the factor listener inside Emacs, you can use instead: - (add-to-list load-path "<path/to/factor/installation>/fuel") + (add-to-list 'load-path "<path/to/factor/installation>/fuel") (setq factor-mode-use-fuel nil) (require 'factor-mode) diff --git a/misc/fuel/fuel-eval.el b/misc/fuel/fuel-eval.el index 543d23bd3f..4c34ef17b8 100644 --- a/misc/fuel/fuel-eval.el +++ b/misc/fuel/fuel-eval.el @@ -31,6 +31,7 @@ ((listp sexp) (case (car sexp) (:array (factor--seq 'V{ '} (cdr sexp))) + (:seq (factor--seq '{ '} (cdr sexp))) (:quote (format "\\ %s" (factor `(:factor ,(cadr sexp))))) (:quotation (factor--seq '\[ '\] (cdr sexp))) (:using (factor `(USING: ,@(cdr sexp) :end))) diff --git a/misc/fuel/fuel-font-lock.el b/misc/fuel/fuel-font-lock.el index 99a7c7b8fb..10561ded68 100644 --- a/misc/fuel/fuel-font-lock.el +++ b/misc/fuel/fuel-font-lock.el @@ -72,11 +72,21 @@ ;;; Font lock: +(defun fuel-font-lock--syntactic-face (state) + (cond ((nth 3 state) 'factor-font-lock-string) + ((char-equal (char-after (nth 8 state)) ?\ ) + (save-excursion + (goto-char (nth 8 state)) + (beginning-of-line) + (cond ((looking-at "USING: ") 'factor-font-lock-vocabulary-name) + ((looking-at "\\(TUPLE\\|SYMBOLS\\|VARS\\): ") + 'factor-font-lock-symbol) + (t 'default)))) + (t 'factor-font-lock-comment))) + (defconst fuel-font-lock--font-lock-keywords `((,fuel-syntax--stack-effect-regex . 'factor-font-lock-stack-effect) - (,fuel-syntax--parsing-words-regex . 'factor-font-lock-parsing-word) (,fuel-syntax--brace-words-regex 1 'factor-font-lock-parsing-word) - ("\\(P\\|SBUF\\)\"" 1 'factor-font-lock-parsing-word) (,fuel-syntax--vocab-ref-regexp 2 'factor-font-lock-vocabulary-name) (,fuel-syntax--declaration-words-regex . 'factor-font-lock-declaration) (,fuel-syntax--word-definition-regex 2 'factor-font-lock-word) @@ -89,24 +99,26 @@ (,fuel-syntax--type-definition-regex 2 'factor-font-lock-type-name) (,fuel-syntax--method-definition-regex (1 'factor-font-lock-type-name) (2 'factor-font-lock-word)) - (,fuel-syntax--parent-type-regex 2 'factor-font-lock-type-name) + (,fuel-syntax--tuple-decl-regex 2 'factor-font-lock-type-name) (,fuel-syntax--constructor-regex . 'factor-font-lock-constructor) (,fuel-syntax--setter-regex . 'factor-font-lock-setter-word) (,fuel-syntax--getter-regex . 'factor-font-lock-getter-word) (,fuel-syntax--symbol-definition-regex 2 'factor-font-lock-symbol) - (,fuel-syntax--bad-string-regex . 'factor-font-lock-invalid-syntax))) + (,fuel-syntax--bad-string-regex . 'factor-font-lock-invalid-syntax) + ("\\(P\\|SBUF\\)\"" 1 'factor-font-lock-parsing-word) + (,fuel-syntax--parsing-words-regex . 'factor-font-lock-parsing-word))) (defun fuel-font-lock--font-lock-setup (&optional keywords no-syntax) (set (make-local-variable 'comment-start) "! ") (set (make-local-variable 'parse-sexp-lookup-properties) t) - (set (make-local-variable 'font-lock-comment-face) 'factor-font-lock-comment) - (set (make-local-variable 'font-lock-string-face) 'factor-font-lock-string) (set (make-local-variable 'font-lock-defaults) `(,(or keywords 'fuel-font-lock--font-lock-keywords) nil nil nil nil ,@(if no-syntax nil (list (cons 'font-lock-syntactic-keywords - fuel-syntax--syntactic-keywords)))))) + fuel-syntax--syntactic-keywords) + (cons 'font-lock-syntactic-face-function + 'fuel-font-lock--syntactic-face)))))) ;;; Fontify strings as Factor code: diff --git a/misc/fuel/fuel-help.el b/misc/fuel/fuel-help.el index 705d1469a2..a82de388da 100644 --- a/misc/fuel/fuel-help.el +++ b/misc/fuel/fuel-help.el @@ -137,7 +137,8 @@ (defun fuel-help--get-article (name label) (message "Retrieving article ...") - (let* ((cmd `(:fuel* ((,name fuel-get-article)) "fuel" t)) + (let* ((name (if (listp name) (cons :seq name) name)) + (cmd `(:fuel* ((,name fuel-get-article)) "fuel" t)) (ret (fuel-eval--send/wait cmd)) (res (fuel-eval--retort-result ret))) (if (not res) diff --git a/misc/fuel/fuel-listener.el b/misc/fuel/fuel-listener.el index 66034225f1..aa9f05ab29 100644 --- a/misc/fuel/fuel-listener.el +++ b/misc/fuel/fuel-listener.el @@ -31,7 +31,12 @@ :group 'fuel) (defcustom fuel-listener-factor-binary - (expand-file-name "factor" fuel-factor-root-dir) + (expand-file-name (cond ((eq system-type 'windows-nt) + "factor.exe") + ((eq system-type 'darwin) + "Factor.app/Contents/MacOS/factor") + (t "factor")) + fuel-factor-root-dir) "Full path to the factor executable to use when starting a listener." :type '(file :must-match t) :group 'fuel-listener) @@ -132,8 +137,7 @@ buffer." (defun fuel-listener--setup-completion () (setq fuel-syntax--current-vocab-function 'fuel-listener--current-vocab) - (setq fuel-syntax--usings-function 'fuel-listener--usings) - (set-syntax-table fuel-syntax--syntax-table)) + (setq fuel-syntax--usings-function 'fuel-listener--usings)) ;;; Stack mode support @@ -160,7 +164,6 @@ buffer." (set (make-local-variable 'comint-prompt-regexp) fuel-con--prompt-regex) (set (make-local-variable 'comint-use-prompt-regexp) t) (set (make-local-variable 'comint-prompt-read-only) t) - (set-syntax-table fuel-syntax--syntax-table) (fuel-listener--setup-completion) (fuel-listener--setup-stack-mode)) diff --git a/misc/fuel/fuel-markup.el b/misc/fuel/fuel-markup.el index 696e4ff080..6a374cd5c8 100644 --- a/misc/fuel/fuel-markup.el +++ b/misc/fuel/fuel-markup.el @@ -61,7 +61,7 @@ (defun fuel-markup--insert-button (label link type) (let ((label (format "%s" label)) - (link (format "%s" link))) + (link (if (listp link) link (format "%s" link)))) (insert-text-button label :type 'fuel-markup--button 'markup-link link @@ -70,8 +70,9 @@ 'help-echo (format "%s (%s)" label type)))) (defun fuel-markup--article-title (name) - (fuel-eval--retort-result - (fuel-eval--send/wait `(:fuel* ((,name fuel-get-article-title)) "fuel")))) + (let ((name (if (listp name) (cons :seq name) name))) + (fuel-eval--retort-result + (fuel-eval--send/wait `(:fuel* ((,name fuel-get-article-title)) "fuel"))))) (defun fuel-markup--link-at-point () (let ((button (condition-case nil (forward-button 0) (error nil)))) diff --git a/misc/fuel/fuel-syntax.el b/misc/fuel/fuel-syntax.el index b74b0afc11..4112e3507d 100644 --- a/misc/fuel/fuel-syntax.el +++ b/misc/fuel/fuel-syntax.el @@ -79,7 +79,7 @@ (regexp-opt fuel-syntax--declaration-words 'words)) (defsubst fuel-syntax--second-word-regex (prefixes) - (format "^%s +\\([^ \r\n]+\\)" (regexp-opt prefixes t))) + (format "%s +\\([^ \r\n]+\\)" (regexp-opt prefixes t))) (defconst fuel-syntax--method-definition-regex "^M: +\\([^ ]+\\) +\\([^ ]+\\)") @@ -87,14 +87,22 @@ (defconst fuel-syntax--integer-regex "\\_<-?[0-9]+\\_>") -(defconst fuel-syntax--ratio-regex - "\\_<-?\\([0-9]+\\+\\)?[0-9]+/-?[0-9]+\\_>") +(defconst fuel-syntax--raw-float-regex + "[0-9]*\\.[0-9]*\\([eE][+-]?[0-9]+\\)?") (defconst fuel-syntax--float-regex - "\\_<-?[0-9]+\\.[0-9]*\\([eE][+-]?[0-9]+\\)?\\_>") + (format "\\_<-?%s\\_>" fuel-syntax--raw-float-regex)) + +(defconst fuel-syntax--number-regex + (format "\\([0-9]+\\|%s\\)" fuel-syntax--raw-float-regex)) + +(defconst fuel-syntax--ratio-regex + (format "\\_<[+-]?%s/-?%s\\_>" + fuel-syntax--number-regex + fuel-syntax--number-regex)) (defconst fuel-syntax--bad-string-regex - "\"\\([^\"]\\|\\\\\"\\)*\n") + "\\_<\"[^>]\\([^\"\n]\\|\\\\\"\\)*\n") (defconst fuel-syntax--word-definition-regex (fuel-syntax--second-word-regex @@ -114,8 +122,8 @@ (defconst fuel-syntax--type-definition-regex (fuel-syntax--second-word-regex '("MIXIN:" "TUPLE:" "SINGLETON:" "UNION:"))) -(defconst fuel-syntax--parent-type-regex - "^\\(TUPLE\\|PREDICTE\\): +[^ ]+ +< +\\([^ ]+\\)") +(defconst fuel-syntax--tuple-decl-regex + "^TUPLE: +\\([^ \n]+\\) +< +\\([^ \n]+\\)\\_>") (defconst fuel-syntax--constructor-regex "<[^ >]+>") @@ -125,7 +133,8 @@ (defconst fuel-syntax--symbol-definition-regex (fuel-syntax--second-word-regex '("SYMBOL:" "VAR:"))) -(defconst fuel-syntax--stack-effect-regex " ( .* )") +(defconst fuel-syntax--stack-effect-regex + "\\( ( .* )\\)\\|\\( (( .* ))\\)") (defconst fuel-syntax--using-lines-regex "^USING: +\\([^;]+\\);") @@ -163,26 +172,26 @@ fuel-syntax--declaration-words-regex)) (defconst fuel-syntax--single-liner-regex - (format "^%s" (regexp-opt '("ABOUT:" - "ARTICLE:" - "ALIAS:" - "CONSTANT:" "C:" - "DEFER:" - "FORGET:" - "GENERIC:" "GENERIC#" - "HELP:" "HEX:" "HOOK:" - "IN:" "INSTANCE:" - "MAIN:" "MATH:" "MIXIN:" - "OCT:" - "POSTPONE:" "PRIVATE>" "<PRIVATE" - "QUALIFIED-WITH:" "QUALIFIED:" - "RENAME:" - "SINGLETON:" "SLOT:" "SYMBOL:" - "USE:" - "VAR:")))) + (regexp-opt '("ABOUT:" + "ARTICLE:" + "ALIAS:" + "CONSTANT:" "C:" + "DEFER:" + "FORGET:" + "GENERIC:" "GENERIC#" + "HELP:" "HEX:" "HOOK:" + "IN:" "INSTANCE:" + "MAIN:" "MATH:" "MIXIN:" + "OCT:" + "POSTPONE:" "PRIVATE>" "<PRIVATE" + "QUALIFIED-WITH:" "QUALIFIED:" + "RENAME:" + "SINGLETON:" "SLOT:" "SYMBOL:" + "USE:" + "VAR:"))) (defconst fuel-syntax--begin-of-def-regex - (format "^USING: \\|\\(%s\\)\\|\\(%s .*\\)" + (format "^USING: \\|\\(%s\\)\\|\\(^%s .*\\)" fuel-syntax--definition-start-regex fuel-syntax--single-liner-regex)) @@ -190,7 +199,7 @@ (format "^.*%s" fuel-syntax--definition-end-regex)) (defconst fuel-syntax--end-of-def-regex - (format "\\(%s\\)\\|\\(%s .*\\)" + (format "\\(%s\\)\\|\\(^%s .*\\)" fuel-syntax--end-of-def-line-regex fuel-syntax--single-liner-regex)) @@ -220,13 +229,21 @@ table)) (defconst fuel-syntax--syntactic-keywords - `(;; Comments: + `(;; CHARs: + ("CHAR: \\(.\\)\\( \\|$\\)" (1 "w")) + ;; Comments: ("\\_<\\(#?!\\) .*\\(\n\\|$\\)" (1 "<") (2 ">")) ("\\_<\\(#?!\\)\\(\n\\|$\\)" (1 "<") (2 ">")) - ;; CHARs: - ("CHAR: \\(.\\)\\( \\|$\\)" (1 "w")) ;; Strings - ("\\(\"\\)\\([^\n\r\f\\\"]\\|\\\\\"?\\)*\\(\"\\)" (1 "\"") (3 "\"")) + ("\\_<\\(\"\\)\\([^\n\r\f\"]\\|\\\\\"\\)*\\(\"\\)\\_>" (1 "\"") (3 "\"")) + ("\\_<<\\(\"\\)\\_>" (1 "\"")) + ("\\_<\\(\"\\)>\\_>" (1 "\"")) + ;; Multiline constructs + ("\\_<USING:\\( \\)\\(;\\)" (1 "<b") (2 ">b")) + ("\\_<USING:\\( \\)" (1 "<b")) + ("\\_<TUPLE: +\\w+? +< +\\w+? *\\( \\)" (1 "<b")) + ("\\_<\\(TUPLE\\|SYMBOLS\\|VARS\\): +\\w+? *\\( \\)\\([^<]\\|\\_>\\)" (2 "<b")) + ("\\(\n\\| \\);\\_>" (1 ">b")) ;; Let and lambda: ("\\_<\\(!(\\) .* \\()\\)" (1 "<") (2 ">")) ("\\(\\[\\)\\(let\\|wlet\\|let\\*\\)\\( \\|$\\)" (1 "(]"))