diff --git a/basis/unicode/breaks/breaks.factor b/basis/unicode/breaks/breaks.factor
index 0524825d43..e3e1282676 100644
--- a/basis/unicode/breaks/breaks.factor
+++ b/basis/unicode/breaks/breaks.factor
@@ -61,7 +61,7 @@ SYMBOL: table
: eval-seq ( seq -- seq ) [ dup word? [ execute ] when ] map ;
: (set-table) ( class1 class2 val -- )
- -rot table get nth [ swap or ] change-nth ;
+ [ table get nth ] dip '[ _ or ] change-nth ;
: set-table ( classes1 classes2 val -- )
[ [ eval-seq ] bi@ ] dip
diff --git a/basis/xml/tests/ascii.xml b/basis/xml/tests/ascii.xml
new file mode 100644
index 0000000000..ca1c355c81
--- /dev/null
+++ b/basis/xml/tests/ascii.xml
@@ -0,0 +1 @@
+e
\ No newline at end of file
diff --git a/basis/xml/tests/encodings.factor b/basis/xml/tests/encodings.factor
new file mode 100644
index 0000000000..720b04ca42
--- /dev/null
+++ b/basis/xml/tests/encodings.factor
@@ -0,0 +1,14 @@
+USING: xml xml.data xml.utilities tools.test accessors kernel ;
+
+[ "\u000131" ] [ "resource:basis/xml/tests/latin5.xml" file>xml children>string ] unit-test
+[ "\u0000e9" ] [ "resource:basis/xml/tests/latin1.xml" file>xml children>string ] unit-test
+[ "\u0000e9" ] [ "resource:basis/xml/tests/spaces.xml" file>xml children>string ] unit-test
+[ "\u0000e9" ] [ "resource:basis/xml/tests/utf8.xml" file>xml children>string ] unit-test
+[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16.xml" file>xml children>string ] unit-test
+[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16be.xml" file>xml children>string ] unit-test
+[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16le.xml" file>xml children>string ] unit-test
+[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16be-bom.xml" file>xml children>string ] unit-test
+[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16le-bom.xml" file>xml children>string ] unit-test
+[ "\u0000e9" ] [ "resource:basis/xml/tests/prologless.xml" file>xml children>string ] unit-test
+[ "e" ] [ "resource:basis/xml/tests/ascii.xml" file>xml children>string ] unit-test
+[ "\u0000e9" "x" ] [ "resource:basis/xml/tests/unitag.xml" file>xml [ name>> main>> ] [ children>string ] bi ] unit-test
\ No newline at end of file
diff --git a/basis/xml/tests/latin1.xml b/basis/xml/tests/latin1.xml
new file mode 100644
index 0000000000..f8bc6bcd61
--- /dev/null
+++ b/basis/xml/tests/latin1.xml
@@ -0,0 +1 @@
+é
\ No newline at end of file
diff --git a/basis/xml/tests/latin5.xml b/basis/xml/tests/latin5.xml
new file mode 100644
index 0000000000..afbcf09fc7
--- /dev/null
+++ b/basis/xml/tests/latin5.xml
@@ -0,0 +1 @@
+ý
\ No newline at end of file
diff --git a/basis/xml/tests/prologless.xml b/basis/xml/tests/prologless.xml
new file mode 100644
index 0000000000..a60ed31cc6
--- /dev/null
+++ b/basis/xml/tests/prologless.xml
@@ -0,0 +1 @@
+é
\ No newline at end of file
diff --git a/basis/xml/tests/spaces.xml b/basis/xml/tests/spaces.xml
new file mode 100644
index 0000000000..dd194ff218
--- /dev/null
+++ b/basis/xml/tests/spaces.xml
@@ -0,0 +1,3 @@
+
+
+é
diff --git a/basis/xml/tests/test.factor b/basis/xml/tests/test.factor
index f37c3fa7ac..a565df6a9d 100644
--- a/basis/xml/tests/test.factor
+++ b/basis/xml/tests/test.factor
@@ -53,12 +53,12 @@ SYMBOL: xml-file
[ " bar " string>xml pprint-xml>string ] unit-test
[ "" string>xml ] must-fail
[ ] [ "" string>xml drop ] unit-test
-[ T{ element-decl f "br" "EMPTY" } ] [ "" string>xml-chunk second ] unit-test
-[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "" string>xml-chunk second ] unit-test
-[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "" string>xml-chunk second ] unit-test
-[ T{ element-decl f "container" "ANY" } ] [ "" string>xml-chunk second ] unit-test
-[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk second ] unit-test
-[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk second ] unit-test
-[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk second ] unit-test
-[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk second ] unit-test
+[ T{ element-decl f "br" "EMPTY" } ] [ "" string>xml-chunk first ] unit-test
+[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "" string>xml-chunk first ] unit-test
+[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "" string>xml-chunk first ] unit-test
+[ T{ element-decl f "container" "ANY" } ] [ "" string>xml-chunk first ] unit-test
+[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk first ] unit-test
+[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk first ] unit-test
+[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk first ] unit-test
+[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk first ] unit-test
[ t ] [ "" dup string>xml-chunk [ write-xml-chunk ] with-string-writer = ] unit-test
diff --git a/basis/xml/tests/unitag.xml b/basis/xml/tests/unitag.xml
new file mode 100644
index 0000000000..b7ef6ade3c
--- /dev/null
+++ b/basis/xml/tests/unitag.xml
@@ -0,0 +1 @@
+<é>xé>
\ No newline at end of file
diff --git a/basis/xml/tests/utf16.xml b/basis/xml/tests/utf16.xml
new file mode 100644
index 0000000000..d8775098a1
Binary files /dev/null and b/basis/xml/tests/utf16.xml differ
diff --git a/basis/xml/tests/utf16be-bom.xml b/basis/xml/tests/utf16be-bom.xml
new file mode 100644
index 0000000000..4a6f3e255c
Binary files /dev/null and b/basis/xml/tests/utf16be-bom.xml differ
diff --git a/basis/xml/tests/utf16be.xml b/basis/xml/tests/utf16be.xml
new file mode 100644
index 0000000000..c97bff7593
Binary files /dev/null and b/basis/xml/tests/utf16be.xml differ
diff --git a/basis/xml/tests/utf16le-bom.xml b/basis/xml/tests/utf16le-bom.xml
new file mode 100644
index 0000000000..ac7d8b8c70
Binary files /dev/null and b/basis/xml/tests/utf16le-bom.xml differ
diff --git a/basis/xml/tests/utf16le.xml b/basis/xml/tests/utf16le.xml
new file mode 100644
index 0000000000..5a0c7d9551
Binary files /dev/null and b/basis/xml/tests/utf16le.xml differ
diff --git a/basis/xml/tests/utf8-bom.xml b/basis/xml/tests/utf8-bom.xml
new file mode 100644
index 0000000000..5486916f73
--- /dev/null
+++ b/basis/xml/tests/utf8-bom.xml
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/basis/xml/tests/utf8.xml b/basis/xml/tests/utf8.xml
new file mode 100644
index 0000000000..83b3e2d501
--- /dev/null
+++ b/basis/xml/tests/utf8.xml
@@ -0,0 +1 @@
+é
\ No newline at end of file
diff --git a/basis/xml/tokenize/tokenize.factor b/basis/xml/tokenize/tokenize.factor
index 0c475c108d..f2bb5a2e97 100644
--- a/basis/xml/tokenize/tokenize.factor
+++ b/basis/xml/tokenize/tokenize.factor
@@ -1,9 +1,10 @@
! Copyright (C) 2005, 2006 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
-USING: xml.errors xml.data xml.utilities xml.char-classes sets
-xml.entities kernel state-parser kernel namespaces make strings
-math math.parser sequences assocs arrays splitting combinators
-unicode.case accessors fry ascii ;
+USING: accessors arrays ascii assocs combinators
+combinators.short-circuit fry io.encodings io.encodings.iana
+io.encodings.string io.encodings.utf16 io.encodings.utf8 kernel make
+math math.parser namespaces sequences sets splitting state-parser
+strings xml.char-classes xml.data xml.entities xml.errors ;
IN: xml.tokenize
! XML namespace processing: ns = namespace
@@ -53,17 +54,23 @@ SYMBOL: ns-stack
! version=1.0? is calculated once and passed around for efficiency
-: (parse-name) ( -- str )
- version=1.0? dup
- get-char name-start? [
- [ dup get-char name-char? not ] take-until nip
- ] [
- "Malformed name" xml-string-error
- ] if ;
+: assure-name ( str version=1.0? -- str )
+ over {
+ [ first name-start? ]
+ [ rest-slice [ name-char? ] with all? ]
+ } 2&& [ "Malformed name" xml-string-error ] unless ;
+
+: (parse-name) ( start -- str )
+ version=1.0?
+ [ [ get-char name-char? not ] curry take-until append ]
+ [ assure-name ] bi ;
+
+: parse-name-starting ( start -- name )
+ (parse-name) get-char CHAR: : =
+ [ next "" (parse-name) ] [ "" swap ] if f ;
: parse-name ( -- name )
- (parse-name) get-char CHAR: : =
- [ next (parse-name) ] [ "" swap ] if f ;
+ "" parse-name-starting ;
! -- Parsing strings
@@ -99,7 +106,7 @@ SYMBOL: ns-stack
: parse-text ( -- string )
CHAR: < parse-char ;
-
+
! Parsing tags
: start-tag ( -- name ? )
@@ -262,13 +269,19 @@ DEFER: direct
[ yes/no>bool ] [ f ] if*
;
+SYMBOL: string-input?
+: decode-input-if ( encoding -- )
+ string-input? get [ drop ] [ decode-input ] if ;
+
: parse-prolog ( -- prolog )
pass-blank middle-tag "?>" expect-string
dup assure-no-extra prolog-attrs
+ dup encoding>> dup "UTF-16" =
+ [ drop ] [ name>encoding [ decode-input-if ] when* ] if
dup prolog-data set ;
: instruct ( -- instruction )
- (parse-name) dup "xml" =
+ "" (parse-name) dup "xml" =
[ drop parse-prolog ] [
dup >lower "xml" =
[ capitalized-prolog ]
@@ -285,3 +298,66 @@ DEFER: direct
CHAR: > expect
]
} cond ;
+
+! Autodetecting encodings
+
+: continue-make-tag ( str -- tag )
+ parse-name-starting middle-tag end-tag CHAR: > expect ;
+
+: start-utf16le ( -- tag )
+ utf16le decode-input-if
+ CHAR: ? expect
+ 0 expect instruct ;
+
+: 10xxxxxx? ( ch -- ? )
+ -6 shift 3 bitand 2 = ;
+
+: start> [
@@ -133,11 +133,12 @@ TUPLE: pull-xml scope ;
: sax ( stream quot: ( xml-elem -- ) -- )
swap [
reset-prolog init-ns-stack
- prolog-data get call-under
+ start-document call-under
sax-loop
] state-parse ; inline recursive
: (read-xml) ( -- )
+ start-document process
[ process ] sax-loop ; inline
: (read-xml-chunk) ( stream -- prolog seq )
@@ -159,11 +160,12 @@ TUPLE: pull-xml scope ;
read-xml ;
: string>xml-chunk ( string -- xml )
- read-xml-chunk ;
+ t string-input?
+ [ read-xml-chunk ] with-variable ;
: file>xml ( filename -- xml )
! Autodetect encoding!
- utf8 read-xml ;
+ binary read-xml ;
: xml-reprint ( string -- )
string>xml print-xml ;
diff --git a/misc/fuel/fuel-font-lock.el b/misc/fuel/fuel-font-lock.el
index 99a7c7b8fb..10561ded68 100644
--- a/misc/fuel/fuel-font-lock.el
+++ b/misc/fuel/fuel-font-lock.el
@@ -72,11 +72,21 @@
;;; Font lock:
+(defun fuel-font-lock--syntactic-face (state)
+ (cond ((nth 3 state) 'factor-font-lock-string)
+ ((char-equal (char-after (nth 8 state)) ?\ )
+ (save-excursion
+ (goto-char (nth 8 state))
+ (beginning-of-line)
+ (cond ((looking-at "USING: ") 'factor-font-lock-vocabulary-name)
+ ((looking-at "\\(TUPLE\\|SYMBOLS\\|VARS\\): ")
+ 'factor-font-lock-symbol)
+ (t 'default))))
+ (t 'factor-font-lock-comment)))
+
(defconst fuel-font-lock--font-lock-keywords
`((,fuel-syntax--stack-effect-regex . 'factor-font-lock-stack-effect)
- (,fuel-syntax--parsing-words-regex . 'factor-font-lock-parsing-word)
(,fuel-syntax--brace-words-regex 1 'factor-font-lock-parsing-word)
- ("\\(P\\|SBUF\\)\"" 1 'factor-font-lock-parsing-word)
(,fuel-syntax--vocab-ref-regexp 2 'factor-font-lock-vocabulary-name)
(,fuel-syntax--declaration-words-regex . 'factor-font-lock-declaration)
(,fuel-syntax--word-definition-regex 2 'factor-font-lock-word)
@@ -89,24 +99,26 @@
(,fuel-syntax--type-definition-regex 2 'factor-font-lock-type-name)
(,fuel-syntax--method-definition-regex (1 'factor-font-lock-type-name)
(2 'factor-font-lock-word))
- (,fuel-syntax--parent-type-regex 2 'factor-font-lock-type-name)
+ (,fuel-syntax--tuple-decl-regex 2 'factor-font-lock-type-name)
(,fuel-syntax--constructor-regex . 'factor-font-lock-constructor)
(,fuel-syntax--setter-regex . 'factor-font-lock-setter-word)
(,fuel-syntax--getter-regex . 'factor-font-lock-getter-word)
(,fuel-syntax--symbol-definition-regex 2 'factor-font-lock-symbol)
- (,fuel-syntax--bad-string-regex . 'factor-font-lock-invalid-syntax)))
+ (,fuel-syntax--bad-string-regex . 'factor-font-lock-invalid-syntax)
+ ("\\(P\\|SBUF\\)\"" 1 'factor-font-lock-parsing-word)
+ (,fuel-syntax--parsing-words-regex . 'factor-font-lock-parsing-word)))
(defun fuel-font-lock--font-lock-setup (&optional keywords no-syntax)
(set (make-local-variable 'comment-start) "! ")
(set (make-local-variable 'parse-sexp-lookup-properties) t)
- (set (make-local-variable 'font-lock-comment-face) 'factor-font-lock-comment)
- (set (make-local-variable 'font-lock-string-face) 'factor-font-lock-string)
(set (make-local-variable 'font-lock-defaults)
`(,(or keywords 'fuel-font-lock--font-lock-keywords)
nil nil nil nil
,@(if no-syntax nil
(list (cons 'font-lock-syntactic-keywords
- fuel-syntax--syntactic-keywords))))))
+ fuel-syntax--syntactic-keywords)
+ (cons 'font-lock-syntactic-face-function
+ 'fuel-font-lock--syntactic-face))))))
;;; Fontify strings as Factor code:
diff --git a/misc/fuel/fuel-listener.el b/misc/fuel/fuel-listener.el
index 66034225f1..f1be6a5607 100644
--- a/misc/fuel/fuel-listener.el
+++ b/misc/fuel/fuel-listener.el
@@ -68,7 +68,8 @@ buffer."
(setq fuel-listener--buffer (current-buffer)))))
(defun fuel-listener--start-process ()
- (let ((factor (expand-file-name fuel-listener-factor-binary))
+ (let ((factor (locate-file (expand-file-name fuel-listener-factor-binary)
+ '("") exec-suffixes))
(image (expand-file-name fuel-listener-factor-image))
(comint-redirect-perform-sanity-check nil))
(unless (file-executable-p factor)
@@ -132,8 +133,7 @@ buffer."
(defun fuel-listener--setup-completion ()
(setq fuel-syntax--current-vocab-function 'fuel-listener--current-vocab)
- (setq fuel-syntax--usings-function 'fuel-listener--usings)
- (set-syntax-table fuel-syntax--syntax-table))
+ (setq fuel-syntax--usings-function 'fuel-listener--usings))
;;; Stack mode support
@@ -160,7 +160,6 @@ buffer."
(set (make-local-variable 'comint-prompt-regexp) fuel-con--prompt-regex)
(set (make-local-variable 'comint-use-prompt-regexp) t)
(set (make-local-variable 'comint-prompt-read-only) t)
- (set-syntax-table fuel-syntax--syntax-table)
(fuel-listener--setup-completion)
(fuel-listener--setup-stack-mode))
diff --git a/misc/fuel/fuel-syntax.el b/misc/fuel/fuel-syntax.el
index b74b0afc11..5aaaa33964 100644
--- a/misc/fuel/fuel-syntax.el
+++ b/misc/fuel/fuel-syntax.el
@@ -87,14 +87,22 @@
(defconst fuel-syntax--integer-regex
"\\_<-?[0-9]+\\_>")
-(defconst fuel-syntax--ratio-regex
- "\\_<-?\\([0-9]+\\+\\)?[0-9]+/-?[0-9]+\\_>")
+(defconst fuel-syntax--raw-float-regex
+ "[0-9]*\\.[0-9]*\\([eE][+-]?[0-9]+\\)?")
(defconst fuel-syntax--float-regex
- "\\_<-?[0-9]+\\.[0-9]*\\([eE][+-]?[0-9]+\\)?\\_>")
+ (format "\\_<-?%s\\_>" fuel-syntax--raw-float-regex))
+
+(defconst fuel-syntax--number-regex
+ (format "\\([0-9]+\\|%s\\)" fuel-syntax--raw-float-regex))
+
+(defconst fuel-syntax--ratio-regex
+ (format "\\_<[+-]?%s/-?%s\\_>"
+ fuel-syntax--number-regex
+ fuel-syntax--number-regex))
(defconst fuel-syntax--bad-string-regex
- "\"\\([^\"]\\|\\\\\"\\)*\n")
+ "\\_<\"[^>]\\([^\"\n]\\|\\\\\"\\)*\n")
(defconst fuel-syntax--word-definition-regex
(fuel-syntax--second-word-regex
@@ -114,8 +122,8 @@
(defconst fuel-syntax--type-definition-regex
(fuel-syntax--second-word-regex '("MIXIN:" "TUPLE:" "SINGLETON:" "UNION:")))
-(defconst fuel-syntax--parent-type-regex
- "^\\(TUPLE\\|PREDICTE\\): +[^ ]+ +< +\\([^ ]+\\)")
+(defconst fuel-syntax--tuple-decl-regex
+ "^TUPLE: +\\([^ \n]+\\) +< +\\([^ \n]+\\)\\_>")
(defconst fuel-syntax--constructor-regex "<[^ >]+>")
@@ -125,7 +133,8 @@
(defconst fuel-syntax--symbol-definition-regex
(fuel-syntax--second-word-regex '("SYMBOL:" "VAR:")))
-(defconst fuel-syntax--stack-effect-regex " ( .* )")
+(defconst fuel-syntax--stack-effect-regex
+ "\\( ( .* )\\)\\|\\( (( .* ))\\)")
(defconst fuel-syntax--using-lines-regex "^USING: +\\([^;]+\\);")
@@ -220,13 +229,20 @@
table))
(defconst fuel-syntax--syntactic-keywords
- `(;; Comments:
+ `(;; CHARs:
+ ("CHAR: \\(.\\)\\( \\|$\\)" (1 "w"))
+ ;; Comments:
("\\_<\\(#?!\\) .*\\(\n\\|$\\)" (1 "<") (2 ">"))
("\\_<\\(#?!\\)\\(\n\\|$\\)" (1 "<") (2 ">"))
- ;; CHARs:
- ("CHAR: \\(.\\)\\( \\|$\\)" (1 "w"))
;; Strings
- ("\\(\"\\)\\([^\n\r\f\\\"]\\|\\\\\"?\\)*\\(\"\\)" (1 "\"") (3 "\""))
+ ("\\_<\\(\"\\)\\([^\n\r\f\\\"]\\|\\\\\"\\)*\\(\"\\)\\_>" (1 "\"") (3 "\""))
+ ("\\_<<\\(\"\\)\\_>" (1 "\""))
+ ("\\_<\\(\"\\)>\\_>" (1 "\""))
+ ;; Multiline constructs
+ ("\\_\\)" (2 "" (1 ">b"))
;; Let and lambda:
("\\_<\\(!(\\) .* \\()\\)" (1 "<") (2 ">"))
("\\(\\[\\)\\(let\\|wlet\\|let\\*\\)\\( \\|$\\)" (1 "(]"))