diff --git a/basis/unicode/breaks/breaks.factor b/basis/unicode/breaks/breaks.factor index 0524825d43..e3e1282676 100644 --- a/basis/unicode/breaks/breaks.factor +++ b/basis/unicode/breaks/breaks.factor @@ -61,7 +61,7 @@ SYMBOL: table : eval-seq ( seq -- seq ) [ dup word? [ execute ] when ] map ; : (set-table) ( class1 class2 val -- ) - -rot table get nth [ swap or ] change-nth ; + [ table get nth ] dip '[ _ or ] change-nth ; : set-table ( classes1 classes2 val -- ) [ [ eval-seq ] bi@ ] dip diff --git a/basis/xml/tests/ascii.xml b/basis/xml/tests/ascii.xml new file mode 100644 index 0000000000..ca1c355c81 --- /dev/null +++ b/basis/xml/tests/ascii.xml @@ -0,0 +1 @@ +e \ No newline at end of file diff --git a/basis/xml/tests/encodings.factor b/basis/xml/tests/encodings.factor new file mode 100644 index 0000000000..720b04ca42 --- /dev/null +++ b/basis/xml/tests/encodings.factor @@ -0,0 +1,14 @@ +USING: xml xml.data xml.utilities tools.test accessors kernel ; + +[ "\u000131" ] [ "resource:basis/xml/tests/latin5.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/latin1.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/spaces.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/utf8.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16be.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16le.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16be-bom.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16le-bom.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/prologless.xml" file>xml children>string ] unit-test +[ "e" ] [ "resource:basis/xml/tests/ascii.xml" file>xml children>string ] unit-test +[ "\u0000e9" "x" ] [ "resource:basis/xml/tests/unitag.xml" file>xml [ name>> main>> ] [ children>string ] bi ] unit-test \ No newline at end of file diff --git a/basis/xml/tests/latin1.xml b/basis/xml/tests/latin1.xml new file mode 100644 index 0000000000..f8bc6bcd61 --- /dev/null +++ b/basis/xml/tests/latin1.xml @@ -0,0 +1 @@ +é \ No newline at end of file diff --git a/basis/xml/tests/latin5.xml b/basis/xml/tests/latin5.xml new file mode 100644 index 0000000000..afbcf09fc7 --- /dev/null +++ b/basis/xml/tests/latin5.xml @@ -0,0 +1 @@ +ý \ No newline at end of file diff --git a/basis/xml/tests/prologless.xml b/basis/xml/tests/prologless.xml new file mode 100644 index 0000000000..a60ed31cc6 --- /dev/null +++ b/basis/xml/tests/prologless.xml @@ -0,0 +1 @@ +é \ No newline at end of file diff --git a/basis/xml/tests/spaces.xml b/basis/xml/tests/spaces.xml new file mode 100644 index 0000000000..dd194ff218 --- /dev/null +++ b/basis/xml/tests/spaces.xml @@ -0,0 +1,3 @@ + + +é diff --git a/basis/xml/tests/test.factor b/basis/xml/tests/test.factor index f37c3fa7ac..a565df6a9d 100644 --- a/basis/xml/tests/test.factor +++ b/basis/xml/tests/test.factor @@ -53,12 +53,12 @@ SYMBOL: xml-file [ " bar " string>xml pprint-xml>string ] unit-test [ "" string>xml ] must-fail [ ] [ "" string>xml drop ] unit-test -[ T{ element-decl f "br" "EMPTY" } ] [ "" string>xml-chunk second ] unit-test -[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "" string>xml-chunk second ] unit-test -[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "" string>xml-chunk second ] unit-test -[ T{ element-decl f "container" "ANY" } ] [ "" string>xml-chunk second ] unit-test -[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk second ] unit-test -[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk second ] unit-test -[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk second ] unit-test -[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk second ] unit-test +[ T{ element-decl f "br" "EMPTY" } ] [ "" string>xml-chunk first ] unit-test +[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "" string>xml-chunk first ] unit-test +[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "" string>xml-chunk first ] unit-test +[ T{ element-decl f "container" "ANY" } ] [ "" string>xml-chunk first ] unit-test +[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk first ] unit-test +[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk first ] unit-test +[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk first ] unit-test +[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk first ] unit-test [ t ] [ "" dup string>xml-chunk [ write-xml-chunk ] with-string-writer = ] unit-test diff --git a/basis/xml/tests/unitag.xml b/basis/xml/tests/unitag.xml new file mode 100644 index 0000000000..b7ef6ade3c --- /dev/null +++ b/basis/xml/tests/unitag.xml @@ -0,0 +1 @@ +<é>x \ No newline at end of file diff --git a/basis/xml/tests/utf16.xml b/basis/xml/tests/utf16.xml new file mode 100644 index 0000000000..d8775098a1 Binary files /dev/null and b/basis/xml/tests/utf16.xml differ diff --git a/basis/xml/tests/utf16be-bom.xml b/basis/xml/tests/utf16be-bom.xml new file mode 100644 index 0000000000..4a6f3e255c Binary files /dev/null and b/basis/xml/tests/utf16be-bom.xml differ diff --git a/basis/xml/tests/utf16be.xml b/basis/xml/tests/utf16be.xml new file mode 100644 index 0000000000..c97bff7593 Binary files /dev/null and b/basis/xml/tests/utf16be.xml differ diff --git a/basis/xml/tests/utf16le-bom.xml b/basis/xml/tests/utf16le-bom.xml new file mode 100644 index 0000000000..ac7d8b8c70 Binary files /dev/null and b/basis/xml/tests/utf16le-bom.xml differ diff --git a/basis/xml/tests/utf16le.xml b/basis/xml/tests/utf16le.xml new file mode 100644 index 0000000000..5a0c7d9551 Binary files /dev/null and b/basis/xml/tests/utf16le.xml differ diff --git a/basis/xml/tests/utf8-bom.xml b/basis/xml/tests/utf8-bom.xml new file mode 100644 index 0000000000..5486916f73 --- /dev/null +++ b/basis/xml/tests/utf8-bom.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/basis/xml/tests/utf8.xml b/basis/xml/tests/utf8.xml new file mode 100644 index 0000000000..83b3e2d501 --- /dev/null +++ b/basis/xml/tests/utf8.xml @@ -0,0 +1 @@ +é \ No newline at end of file diff --git a/basis/xml/tokenize/tokenize.factor b/basis/xml/tokenize/tokenize.factor index 0c475c108d..f2bb5a2e97 100644 --- a/basis/xml/tokenize/tokenize.factor +++ b/basis/xml/tokenize/tokenize.factor @@ -1,9 +1,10 @@ ! Copyright (C) 2005, 2006 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. -USING: xml.errors xml.data xml.utilities xml.char-classes sets -xml.entities kernel state-parser kernel namespaces make strings -math math.parser sequences assocs arrays splitting combinators -unicode.case accessors fry ascii ; +USING: accessors arrays ascii assocs combinators +combinators.short-circuit fry io.encodings io.encodings.iana +io.encodings.string io.encodings.utf16 io.encodings.utf8 kernel make +math math.parser namespaces sequences sets splitting state-parser +strings xml.char-classes xml.data xml.entities xml.errors ; IN: xml.tokenize ! XML namespace processing: ns = namespace @@ -53,17 +54,23 @@ SYMBOL: ns-stack ! version=1.0? is calculated once and passed around for efficiency -: (parse-name) ( -- str ) - version=1.0? dup - get-char name-start? [ - [ dup get-char name-char? not ] take-until nip - ] [ - "Malformed name" xml-string-error - ] if ; +: assure-name ( str version=1.0? -- str ) + over { + [ first name-start? ] + [ rest-slice [ name-char? ] with all? ] + } 2&& [ "Malformed name" xml-string-error ] unless ; + +: (parse-name) ( start -- str ) + version=1.0? + [ [ get-char name-char? not ] curry take-until append ] + [ assure-name ] bi ; + +: parse-name-starting ( start -- name ) + (parse-name) get-char CHAR: : = + [ next "" (parse-name) ] [ "" swap ] if f ; : parse-name ( -- name ) - (parse-name) get-char CHAR: : = - [ next (parse-name) ] [ "" swap ] if f ; + "" parse-name-starting ; ! -- Parsing strings @@ -99,7 +106,7 @@ SYMBOL: ns-stack : parse-text ( -- string ) CHAR: < parse-char ; - + ! Parsing tags : start-tag ( -- name ? ) @@ -262,13 +269,19 @@ DEFER: direct [ yes/no>bool ] [ f ] if* ; +SYMBOL: string-input? +: decode-input-if ( encoding -- ) + string-input? get [ drop ] [ decode-input ] if ; + : parse-prolog ( -- prolog ) pass-blank middle-tag "?>" expect-string dup assure-no-extra prolog-attrs + dup encoding>> dup "UTF-16" = + [ drop ] [ name>encoding [ decode-input-if ] when* ] if dup prolog-data set ; : instruct ( -- instruction ) - (parse-name) dup "xml" = + "" (parse-name) dup "xml" = [ drop parse-prolog ] [ dup >lower "xml" = [ capitalized-prolog ] @@ -285,3 +298,66 @@ DEFER: direct CHAR: > expect ] } cond ; + +! Autodetecting encodings + +: continue-make-tag ( str -- tag ) + parse-name-starting middle-tag end-tag CHAR: > expect ; + +: start-utf16le ( -- tag ) + utf16le decode-input-if + CHAR: ? expect + 0 expect instruct ; + +: 10xxxxxx? ( ch -- ? ) + -6 shift 3 bitand 2 = ; + +: start> [ @@ -133,11 +133,12 @@ TUPLE: pull-xml scope ; : sax ( stream quot: ( xml-elem -- ) -- ) swap [ reset-prolog init-ns-stack - prolog-data get call-under + start-document call-under sax-loop ] state-parse ; inline recursive : (read-xml) ( -- ) + start-document process [ process ] sax-loop ; inline : (read-xml-chunk) ( stream -- prolog seq ) @@ -159,11 +160,12 @@ TUPLE: pull-xml scope ; read-xml ; : string>xml-chunk ( string -- xml ) - read-xml-chunk ; + t string-input? + [ read-xml-chunk ] with-variable ; : file>xml ( filename -- xml ) ! Autodetect encoding! - utf8 read-xml ; + binary read-xml ; : xml-reprint ( string -- ) string>xml print-xml ; diff --git a/misc/fuel/fuel-font-lock.el b/misc/fuel/fuel-font-lock.el index 99a7c7b8fb..10561ded68 100644 --- a/misc/fuel/fuel-font-lock.el +++ b/misc/fuel/fuel-font-lock.el @@ -72,11 +72,21 @@ ;;; Font lock: +(defun fuel-font-lock--syntactic-face (state) + (cond ((nth 3 state) 'factor-font-lock-string) + ((char-equal (char-after (nth 8 state)) ?\ ) + (save-excursion + (goto-char (nth 8 state)) + (beginning-of-line) + (cond ((looking-at "USING: ") 'factor-font-lock-vocabulary-name) + ((looking-at "\\(TUPLE\\|SYMBOLS\\|VARS\\): ") + 'factor-font-lock-symbol) + (t 'default)))) + (t 'factor-font-lock-comment))) + (defconst fuel-font-lock--font-lock-keywords `((,fuel-syntax--stack-effect-regex . 'factor-font-lock-stack-effect) - (,fuel-syntax--parsing-words-regex . 'factor-font-lock-parsing-word) (,fuel-syntax--brace-words-regex 1 'factor-font-lock-parsing-word) - ("\\(P\\|SBUF\\)\"" 1 'factor-font-lock-parsing-word) (,fuel-syntax--vocab-ref-regexp 2 'factor-font-lock-vocabulary-name) (,fuel-syntax--declaration-words-regex . 'factor-font-lock-declaration) (,fuel-syntax--word-definition-regex 2 'factor-font-lock-word) @@ -89,24 +99,26 @@ (,fuel-syntax--type-definition-regex 2 'factor-font-lock-type-name) (,fuel-syntax--method-definition-regex (1 'factor-font-lock-type-name) (2 'factor-font-lock-word)) - (,fuel-syntax--parent-type-regex 2 'factor-font-lock-type-name) + (,fuel-syntax--tuple-decl-regex 2 'factor-font-lock-type-name) (,fuel-syntax--constructor-regex . 'factor-font-lock-constructor) (,fuel-syntax--setter-regex . 'factor-font-lock-setter-word) (,fuel-syntax--getter-regex . 'factor-font-lock-getter-word) (,fuel-syntax--symbol-definition-regex 2 'factor-font-lock-symbol) - (,fuel-syntax--bad-string-regex . 'factor-font-lock-invalid-syntax))) + (,fuel-syntax--bad-string-regex . 'factor-font-lock-invalid-syntax) + ("\\(P\\|SBUF\\)\"" 1 'factor-font-lock-parsing-word) + (,fuel-syntax--parsing-words-regex . 'factor-font-lock-parsing-word))) (defun fuel-font-lock--font-lock-setup (&optional keywords no-syntax) (set (make-local-variable 'comment-start) "! ") (set (make-local-variable 'parse-sexp-lookup-properties) t) - (set (make-local-variable 'font-lock-comment-face) 'factor-font-lock-comment) - (set (make-local-variable 'font-lock-string-face) 'factor-font-lock-string) (set (make-local-variable 'font-lock-defaults) `(,(or keywords 'fuel-font-lock--font-lock-keywords) nil nil nil nil ,@(if no-syntax nil (list (cons 'font-lock-syntactic-keywords - fuel-syntax--syntactic-keywords)))))) + fuel-syntax--syntactic-keywords) + (cons 'font-lock-syntactic-face-function + 'fuel-font-lock--syntactic-face)))))) ;;; Fontify strings as Factor code: diff --git a/misc/fuel/fuel-listener.el b/misc/fuel/fuel-listener.el index 66034225f1..f1be6a5607 100644 --- a/misc/fuel/fuel-listener.el +++ b/misc/fuel/fuel-listener.el @@ -68,7 +68,8 @@ buffer." (setq fuel-listener--buffer (current-buffer))))) (defun fuel-listener--start-process () - (let ((factor (expand-file-name fuel-listener-factor-binary)) + (let ((factor (locate-file (expand-file-name fuel-listener-factor-binary) + '("") exec-suffixes)) (image (expand-file-name fuel-listener-factor-image)) (comint-redirect-perform-sanity-check nil)) (unless (file-executable-p factor) @@ -132,8 +133,7 @@ buffer." (defun fuel-listener--setup-completion () (setq fuel-syntax--current-vocab-function 'fuel-listener--current-vocab) - (setq fuel-syntax--usings-function 'fuel-listener--usings) - (set-syntax-table fuel-syntax--syntax-table)) + (setq fuel-syntax--usings-function 'fuel-listener--usings)) ;;; Stack mode support @@ -160,7 +160,6 @@ buffer." (set (make-local-variable 'comint-prompt-regexp) fuel-con--prompt-regex) (set (make-local-variable 'comint-use-prompt-regexp) t) (set (make-local-variable 'comint-prompt-read-only) t) - (set-syntax-table fuel-syntax--syntax-table) (fuel-listener--setup-completion) (fuel-listener--setup-stack-mode)) diff --git a/misc/fuel/fuel-syntax.el b/misc/fuel/fuel-syntax.el index b74b0afc11..5aaaa33964 100644 --- a/misc/fuel/fuel-syntax.el +++ b/misc/fuel/fuel-syntax.el @@ -87,14 +87,22 @@ (defconst fuel-syntax--integer-regex "\\_<-?[0-9]+\\_>") -(defconst fuel-syntax--ratio-regex - "\\_<-?\\([0-9]+\\+\\)?[0-9]+/-?[0-9]+\\_>") +(defconst fuel-syntax--raw-float-regex + "[0-9]*\\.[0-9]*\\([eE][+-]?[0-9]+\\)?") (defconst fuel-syntax--float-regex - "\\_<-?[0-9]+\\.[0-9]*\\([eE][+-]?[0-9]+\\)?\\_>") + (format "\\_<-?%s\\_>" fuel-syntax--raw-float-regex)) + +(defconst fuel-syntax--number-regex + (format "\\([0-9]+\\|%s\\)" fuel-syntax--raw-float-regex)) + +(defconst fuel-syntax--ratio-regex + (format "\\_<[+-]?%s/-?%s\\_>" + fuel-syntax--number-regex + fuel-syntax--number-regex)) (defconst fuel-syntax--bad-string-regex - "\"\\([^\"]\\|\\\\\"\\)*\n") + "\\_<\"[^>]\\([^\"\n]\\|\\\\\"\\)*\n") (defconst fuel-syntax--word-definition-regex (fuel-syntax--second-word-regex @@ -114,8 +122,8 @@ (defconst fuel-syntax--type-definition-regex (fuel-syntax--second-word-regex '("MIXIN:" "TUPLE:" "SINGLETON:" "UNION:"))) -(defconst fuel-syntax--parent-type-regex - "^\\(TUPLE\\|PREDICTE\\): +[^ ]+ +< +\\([^ ]+\\)") +(defconst fuel-syntax--tuple-decl-regex + "^TUPLE: +\\([^ \n]+\\) +< +\\([^ \n]+\\)\\_>") (defconst fuel-syntax--constructor-regex "<[^ >]+>") @@ -125,7 +133,8 @@ (defconst fuel-syntax--symbol-definition-regex (fuel-syntax--second-word-regex '("SYMBOL:" "VAR:"))) -(defconst fuel-syntax--stack-effect-regex " ( .* )") +(defconst fuel-syntax--stack-effect-regex + "\\( ( .* )\\)\\|\\( (( .* ))\\)") (defconst fuel-syntax--using-lines-regex "^USING: +\\([^;]+\\);") @@ -220,13 +229,20 @@ table)) (defconst fuel-syntax--syntactic-keywords - `(;; Comments: + `(;; CHARs: + ("CHAR: \\(.\\)\\( \\|$\\)" (1 "w")) + ;; Comments: ("\\_<\\(#?!\\) .*\\(\n\\|$\\)" (1 "<") (2 ">")) ("\\_<\\(#?!\\)\\(\n\\|$\\)" (1 "<") (2 ">")) - ;; CHARs: - ("CHAR: \\(.\\)\\( \\|$\\)" (1 "w")) ;; Strings - ("\\(\"\\)\\([^\n\r\f\\\"]\\|\\\\\"?\\)*\\(\"\\)" (1 "\"") (3 "\"")) + ("\\_<\\(\"\\)\\([^\n\r\f\\\"]\\|\\\\\"\\)*\\(\"\\)\\_>" (1 "\"") (3 "\"")) + ("\\_<<\\(\"\\)\\_>" (1 "\"")) + ("\\_<\\(\"\\)>\\_>" (1 "\"")) + ;; Multiline constructs + ("\\_\\)" (2 "" (1 ">b")) ;; Let and lambda: ("\\_<\\(!(\\) .* \\()\\)" (1 "<") (2 ">")) ("\\(\\[\\)\\(let\\|wlet\\|let\\*\\)\\( \\|$\\)" (1 "(]"))