From 9a0d318b916a113581760a73a163ad87e5d8801b Mon Sep 17 00:00:00 2001
From: Slava Pestov <slava@factorcode.org>
Date: Sun, 2 Dec 2007 05:25:18 -0500
Subject: [PATCH] Updating extra/xmode

---
 extra/xmode/README.txt                 |  44 ++++++++-
 extra/xmode/loader/loader.factor       |   5 +-
 extra/xmode/marker/marker-tests.factor |  43 +++++++++
 extra/xmode/marker/marker.factor       | 123 ++++++++++++++++---------
 extra/xmode/marker/state/state.factor  |   2 +
 extra/xmode/rules/rules.factor         |  17 ----
 6 files changed, 165 insertions(+), 69 deletions(-)
 mode change 100644 => 100755 extra/xmode/README.txt
 mode change 100644 => 100755 extra/xmode/loader/loader.factor
 mode change 100644 => 100755 extra/xmode/marker/marker-tests.factor
 mode change 100644 => 100755 extra/xmode/marker/marker.factor
 mode change 100644 => 100755 extra/xmode/marker/state/state.factor
 mode change 100644 => 100755 extra/xmode/rules/rules.factor
diff --git a/extra/xmode/README.txt b/extra/xmode/README.txt
old mode 100644
new mode 100755
index 7a9d1580d3..bf73042030
--- a/extra/xmode/README.txt
+++ b/extra/xmode/README.txt
@@ -1,5 +1,41 @@
-This is a Factor port of jEdit's syntax highlighting engine.
+This is a Factor port of the jEdit 4.3 syntax highlighting engine
+(http://www.jedit.org).
 
-It implements a relatively basic, rule-driven recursive parser.
-The parser is incremental, with one line granularity. This is
-still a work in progress.
+jEdit 1.2, released in late 1998, was the first release to support
+syntax highlighting. It featured a small number of hand-coded
+"token markers" -- simple incremental parsers -- all based on the
+original JavaTokenMarker contributed by Tal Davidson.
+
+Around the time of jEdit 1.5 in 1999, Mike Dillon began developing a
+jEdit plugin named "XMode". This plugin implemented a generic,
+rule-driven token marker which read mode descriptions from XML files.
+XMode eventually matured to the point where it could replace the
+formerly hand-coded token markers.
+
+With the release of jEdit 2.4, I merged XMode into the core and
+eliminated the old hand-coded token markers.
+
+XMode suffers from a somewhat archaic design, and was written at a time
+when Java VMs with JIT compilers were relatively uncommon, object
+allocation was expensive, and heap space tight. As a result the parser
+design is less general than it could be.
+
+Furthermore, the parser has a few bugs which some mode files have come
+to depend on:
+
+- If a RULES tag does not define any keywords or rules, then its
+  NO_WORD_SEP attribute is ignored.
+
+  The Factor implementation duplicates this behavior.
+
+- If a RULES tag does not have a NO_WORD_SEP attribute, then
+  it inherits the value of the NO_WORD_SEP attribute from the previous
+  RULES tag.
+
+  The Factor implementation does not duplicate this behavior.
+
+This is still a work in progress. If you find any behavioral differences
+between the Factor implementation and the original jEdit code, please
+report them as bugs. Also, if you wish to contribute a new or improved
+mode file, please contact the jEdit project. Updated mode files in jEdit
+will be periodically imported into the Factor source tree.
diff --git a/extra/xmode/loader/loader.factor b/extra/xmode/loader/loader.factor
old mode 100644
new mode 100755
index a287efdd4b..c6b5cad9d1
--- a/extra/xmode/loader/loader.factor
+++ b/extra/xmode/loader/loader.factor
@@ -35,8 +35,7 @@ IN: xmode.loader
     dup children>string swap position-attrs <matcher> ;
 
 : parse-regexp-matcher ( tag -- matcher )
-    ! XXX
-    dup children>string swap position-attrs <matcher> ;
+    dup children>string <regexp> swap position-attrs <matcher> ;
 
 ! SPAN's children
 <TAGS: parse-begin/end-tag
@@ -146,7 +145,7 @@ TAGS>
         { "SET" string>rule-set-name set-rule-set-name }
         { "IGNORE_CASE" string>boolean set-rule-set-ignore-case? }
         { "HIGHLIGHT_DIGITS" string>boolean set-rule-set-highlight-digits? }
-        { "DIGIT_RE" f set-rule-set-digit-re } ! XXX
+        { "DIGIT_RE" <regexp> set-rule-set-digit-re } ! XXX
         { "ESCAPE" f add-escape-rule }
         { "DEFAULT" string>token set-rule-set-default }
         { "NO_WORD_SEP" f set-rule-set-no-word-sep }
diff --git a/extra/xmode/marker/marker-tests.factor b/extra/xmode/marker/marker-tests.factor
old mode 100644
new mode 100755
index 6c66f958a6..cb7f2960a4
--- a/extra/xmode/marker/marker-tests.factor
+++ b/extra/xmode/marker/marker-tests.factor
@@ -2,6 +2,40 @@ USING: xmode.tokens xmode.catalog
 xmode.marker tools.test kernel ;
 IN: temporary
 
+[ ! C mode: "int" is tagged KEYWORD3; the space and "x" carry no token id
+    {
+        T{ token f "int" KEYWORD3 }
+        T{ token f " " f }
+        T{ token f "x" f }
+    }
+] [ f "int x" "c" load-mode tokenize-line nip ] unit-test
+
+[ ! C mode: an escaped quote inside a string literal does not end the literal
+    {
+        T{ token f "\"" LITERAL1 }
+        T{ token f "hello\\\"" LITERAL1 }
+        T{ token f " " LITERAL1 }
+        T{ token f "world" LITERAL1 }
+        T{ token f "\"" LITERAL1 }
+    }
+] [ f "\"hello\\\" world\"" "c" load-mode tokenize-line nip ] unit-test
+
+[ ! C mode: here the body of the literal is expected as a single LITERAL1 token
+    {
+        T{ token f "\"" LITERAL1 }
+        T{ token f "hello\\\ world" LITERAL1 }
+        T{ token f "\"" LITERAL1 }
+    }
+] [ f "\"hello\\\ world\"" "c" load-mode tokenize-line nip ] unit-test
+
+[ ! Java mode: same expectation as the first C test
+    {
+        T{ token f "int" KEYWORD3 }
+        T{ token f " " f }
+        T{ token f "x" f }
+    }
+] [ f "int x" "java" load-mode tokenize-line nip ] unit-test
+
 [
     {
         T{ token f "//" COMMENT2 }
@@ -66,3 +100,12 @@ IN: temporary
 ] [
      f "<!ELEMENT %hello-world; >" "xml" load-mode tokenize-line nip
 ] unit-test
+
+[ ! shellscript mode: "$" and "FOO" are expected as two separate KEYWORD2 tokens
+    {
+        T{ token f "$" KEYWORD2 }
+        T{ token f "FOO" KEYWORD2 }
+    }
+] [
+    f "$FOO" "shellscript" load-mode tokenize-line nip
+] unit-test
diff --git a/extra/xmode/marker/marker.factor b/extra/xmode/marker/marker.factor
old mode 100644
new mode 100755
index c155f8e11c..cd9eacbb88
--- a/extra/xmode/marker/marker.factor
+++ b/extra/xmode/marker/marker.factor
@@ -24,8 +24,18 @@ assocs combinators combinators.lib strings regexp splitting ;
 : mark-number ( keyword -- id )
     keyword-number? DIGIT and ;
 
+: resolve-delegate ( name -- rules ) ! turn a rule-set name into the rule set it names
+    dup string? [ ! anything that is not a string is passed through untouched
+        "::" split1 [ swap load-mode at ] [ rule-sets get at ] if* ! "a::b": look b up in load-mode of a; otherwise look the name up in rule-sets
+    ] when ;
+
+: rule-set-keyword-maps ( ruleset -- seq ) ! collect the keyword maps of a rule set and its imports
+    dup rule-set-imports
+    [ resolve-delegate rule-set-keyword-maps ] map concat ! keyword maps of every import, gathered recursively
+    swap rule-set-keywords add ; ! this rule set's own keyword map is added after them
+
 : mark-keyword ( keyword -- id )
-    current-keywords at ;
+    current-rule-set rule-set-keyword-maps assoc-stack ;
 
 : add-remaining-token ( -- )
     current-rule-set rule-set-default prev-token, ;
@@ -45,30 +55,6 @@ assocs combinators combinators.lib strings regexp splitting ;
 : current-char ( -- char )
     position get line get nth ;
 
-GENERIC: perform-rule ( rule -- )
-
-: ... ;
-
-M: escape-rule perform-rule ( rule -- ) ... ;
-
-: find-escape-rule ( -- rule )
-    context get dup
-    line-context-in-rule-set rule-set-escape-rule
-    [ ] [ line-context-parent find-escape-rule ] ?if ;
-
-: check-escape-rule ( rule -- )
-    #! Unlike jEdit, we keep checking parents until we find
-    #! an escape rule.
-    dup rule-no-escape? [ drop ] [
-        drop
-        ! find-escape-rule
-        ! current-rule-set rule-set-escape-rule [
-        !     find-escape-rule
-        ! ] [
-        !     
-        ! ] if*
-    ] if ;
-
 GENERIC: match-position ( rule -- n )
 
 M: mark-previous-rule match-position drop last-offset get ;
@@ -83,10 +69,10 @@ M: rule match-position drop position get ;
         [ over matcher-at-word-start?     over last-offset get =    implies ]
     } && 2nip ;
 
-: matches-not-mark-following? ... ;
-
 GENERIC: text-matches? ( position text -- match-count/f )
 
+M: f text-matches? 2drop f ;
+
 M: string text-matches?
     ! XXX ignore case
     >r line get swap tail-slice r>
@@ -103,7 +89,7 @@ M: string text-matches?
 
 : rule-end-matches? ( rule -- match-count/f )
     dup mark-following-rule? [
-        dup rule-end swap can-match-here? 0 and
+        dup rule-start swap can-match-here? 0 and
     ] [
         dup rule-end tuck swap can-match-here? [
             position get swap matcher-text
@@ -114,10 +100,48 @@ M: string text-matches?
         ] if
     ] if ;
 
+DEFER: get-rules
+
+: get-imported-rules ( vector/f char ruleset -- vector/f )
+    rule-set-imports ! only the imports of ruleset are consulted here
+    [ resolve-delegate get-rules ?push-all ] curry* each ; ! per import: resolve it, get its rules for char, append to the accumulator
+
+: get-always-rules ( vector/f ruleset -- vector/f )
+    f swap rule-set-rules at ?push-all ; ! append the rules stored under the key f
+
+: get-char-rules ( vector/f char ruleset -- vector/f )
+    >r ch>upper r> rule-set-rules at ?push-all ; ! the lookup key is the upper-cased char
+
+: get-rules ( char ruleset -- seq ) ! gather every rule that may apply at char
+    f -rot ! start with f as the vector/f accumulator
+    [ get-char-rules ] 2keep ! rules keyed by the upper-cased char
+    [ get-always-rules ] keep ! rules keyed by f
+    get-imported-rules ; ! rules contributed by imported rule sets
+
 GENERIC: handle-rule-start ( match-count rule -- )
 
 GENERIC: handle-rule-end ( match-count rule -- )
 
+: find-escape-rule ( -- rule ) ! the result may be f when no escape rule is found
+    context get dup
+    line-context-in-rule-set rule-set-escape-rule [ ] [ ! use the current rule set's escape rule when it has one
+        line-context-parent line-context-in-rule-set ! otherwise consult the parent context's rule set (one level only)
+        dup [ rule-set-escape-rule ] when ! an f rule set yields f
+    ] ?if ;
+
+: check-escape-rule ( rule -- ? ) ! t when an escape rule matched and was handled, f otherwise
+    rule-no-escape? [ f ] [ ! a rule flagged no-escape never triggers escape handling
+        find-escape-rule dup [
+            dup rule-start-matches? dup [
+                swap handle-rule-start
+                delegate-end-escaped? [ not ] change ! toggle the delegate-end-escaped? flag
+                t
+            ] [
+                2drop f ! the escape rule does not match here
+            ] if
+        ] when ! no escape rule: the f left by find-escape-rule is the result
+    ] if ;
+
 : check-every-rule ( -- ? )
     current-char current-rule-set get-rules
     [ rule-start-matches? ] map-find
@@ -129,11 +153,6 @@ GENERIC: handle-rule-end ( match-count rule -- )
         dup [ swap handle-rule-end ] [ 2drop ] if
     ] when* ;
 
-: handle-escape-rule ( rule -- )
-    ?end-rule
-    ;
-!        ... process escape ... ;
-
 : rule-match-token* ( rule -- id )
     dup rule-match-token {
         { f [ dup rule-body-token ] }
@@ -141,10 +160,13 @@ GENERIC: handle-rule-end ( match-count rule -- )
         [ ]
     } case nip ;
 
-: resolve-delegate ( name -- rules )
-    dup string? [
-        "::" split1 [ swap load-mode at ] [ rule-sets get at ] if*
-    ] when ;
+M: escape-rule handle-rule-start ! ( match-count rule -- )
+    drop ! the rule itself is not needed; match-count stays on the stack
+    ?end-rule
+    process-escape? get [
+        escaped? [ not ] change ! toggle the escaped? flag
+        position [ + ] change ! advance position by match-count (consumes it)
+    ] [ drop ] if ; ! escapes disabled: discard match-count, the only value left (was 2drop, one too many)
 
 M: seq-rule handle-rule-start
     ?end-rule
@@ -174,6 +196,10 @@ M: mark-following-rule handle-rule-start
     f context get set-line-context-end
     context get set-line-context-in-rule ;
 
+M: mark-following-rule handle-rule-end ! ( match-count rule -- )
+    nip rule-match-token* prev-token, ! match-count is discarded; the rule's match token id goes to prev-token,
+    f context get set-line-context-in-rule ; ! clear the in-rule slot of the current context
+
 M: mark-previous-rule handle-rule-start
     ?end-rule
     mark-token
@@ -183,7 +209,7 @@ M: mark-previous-rule handle-rule-start
 : do-escaped
     escaped? get [
         escaped? off
-        ...
+        ! ...
     ] when ;
 
 : check-end-delegate ( -- ? )
@@ -198,14 +224,14 @@ M: mark-previous-rule handle-rule-start
                 ] keep context get line-context-parent line-context-in-rule rule-match-token* next-token,
                 pop-context
                 seen-whitespace-end? on t
-            ] [ 2drop f ] if
+            ] [ drop check-escape-rule ] if
         ] [ f ] if*
     ] [ f ] if* ;
 
 : handle-no-word-break ( -- )
     context get line-context-parent [
         line-context-in-rule dup rule-no-word-break? [
-            rule-match-token prev-token,
+            rule-match-token* prev-token,
             pop-context
         ] [ drop ] if
     ] when* ;
@@ -221,6 +247,10 @@ M: mark-previous-rule handle-rule-start
     
     1 current-rule-set rule-set-default next-token, ;
 
+: rule-set-empty? ( ruleset -- ? ) ! true only when both the rules and the keywords are empty
+    dup rule-set-rules assoc-empty? ! no rules defined ...
+    swap rule-set-keywords assoc-empty? and ; ! ... and no keywords either
+
 : check-word-break ( -- ? )
     current-char dup blank? [
         drop
@@ -232,14 +262,17 @@ M: mark-previous-rule handle-rule-start
         (check-word-break)
 
     ] [
-        dup alpha? [
+        ! Micro-optimization with incorrect semantics; we keep
+        ! it here because jEdit mode files depend on it now...
+        current-rule-set rule-set-empty? [
             drop
         ] [
-            dup current-rule-set dup short. rule-set-no-word-sep* dup . member? [
-                "A: " write write1 nl
+            dup alpha? [
+                drop
             ] [
-                "B: " write write1 nl
-                (check-word-break)
+                current-rule-set rule-set-no-word-sep* member? [
+                    (check-word-break)
+                ] unless
             ] if
         ] if
 
diff --git a/extra/xmode/marker/state/state.factor b/extra/xmode/marker/state/state.factor
old mode 100644
new mode 100755
index 26379501bd..cce7c7567a
--- a/extra/xmode/marker/state/state.factor
+++ b/extra/xmode/marker/state/state.factor
@@ -14,6 +14,7 @@ SYMBOL: whitespace-end
 SYMBOL: seen-whitespace-end?
 
 SYMBOL: escaped?
+SYMBOL: process-escape?
 SYMBOL: delegate-end-escaped?
 SYMBOL: terminated?
 
@@ -61,5 +62,6 @@ SYMBOL: terminated?
     0 position set
     0 last-offset set
     0 whitespace-end set
+    process-escape? on
     [ clone ] [ main-rule-set f <line-context> ] if*
     context set ;
diff --git a/extra/xmode/rules/rules.factor b/extra/xmode/rules/rules.factor
old mode 100644
new mode 100755
index 9b530aae14..7206668edb
--- a/extra/xmode/rules/rules.factor
+++ b/extra/xmode/rules/rules.factor
@@ -45,23 +45,6 @@ MEMO: standard-rule-set ( id -- ruleset )
         over [ >r V{ } like r> over push-all ] [ nip ] if
     ] when* ;
 
-DEFER: get-rules
-
-: get-imported-rules ( vector/f char ruleset -- vector/f )
-    rule-set-imports [ get-rules ?push-all ] curry* each ;
-
-: get-always-rules ( vector/f ruleset -- vector/f )
-    f swap rule-set-rules at ?push-all ;
-
-: get-char-rules ( vector/f char ruleset -- vector/f )
-    >r ch>upper r> rule-set-rules at ?push-all ;
-
-: get-rules ( char ruleset -- seq )
-    f -rot
-    [ get-char-rules ] 2keep
-    [ get-always-rules ] keep
-    get-imported-rules ;
-
 : rule-set-no-word-sep* ( ruleset -- str )
     dup rule-set-keywords keyword-map-no-word-sep*
     swap rule-set-no-word-sep "_" 3append ;