Updating extra/xmode

2007-12-02 05:25:18 -05:00 · 2007-12-02 05:25:18 -05:00 · 9a0d318b91
parent ca0df2cb46
commit 9a0d318b91
6 changed files with 165 additions and 69 deletions
--- a/extra/xmode/README.txt
+++ b/extra/xmode/README.txt
@ -1,5 +1,41 @@
-This is a Factor port of jEdit's syntax highlighting engine.
+This is a Factor port of the jEdit 4.3 syntax highlighting engine
+(http://www.jedit.org).

-It implements a relatively basic, rule-driven recursive parser.
-The parser is incremental, with one line granularity. This is
-still a work in progress.
+jEdit 1.2, released in late 1998, was the first release to support
+syntax highlighting. It featured a small number of hand-coded
+"token markers" -- simple incremental parsers -- all based on the
+original JavaTokenMarker contributed by Tal Davidson.
+
+Around the time of jEdit 1.5 in 1999, Mike Dillon began developing a
+jEdit plugin named "XMode". This plugin implemented a generic,
+rule-driven token marker which read mode descriptions from XML files.
+XMode eventually matured to the point where it could replace the
+formerly hand-coded token markers.
+
+With the release of jEdit 2.4, I merged XMode into the core and
+eliminated the old hand-coded token markers.
+
+XMode suffers from a somewhat archaic design, and was written at a time
+when Java VMs with JIT compilers were relatively uncommon, object
+allocation was expensive, and heap space tight. As a result the parser
+design is less general than it could be.
+
+Furthermore, the parser has a few bugs which some mode files have come
+to depend on:
+
+- If a RULES tag does not define any keywords or rules, then its
+  NO_WORD_SEP attribute is ignored.
+
+  The Factor implementation duplicates this behavior.
+
+- If a RULES tag does not have a NO_WORD_SEP attribute, then
+  it inherits the value of the NO_WORD_SEP attribute from the previous
+  RULES tag.
+
+  The Factor implementation does not duplicate this behavior.
+
+This is still a work in progress. If you find any behavioral differences
+between the Factor implementation and the original jEdit code, please
+report them as bugs. Also, if you wish to contribute a new or improved
+mode file, please contact the jEdit project. Updated mode files in jEdit
+will be periodically imported into the Factor source tree.
--- a/extra/xmode/loader/loader.factor
+++ b/extra/xmode/loader/loader.factor
@ -35,8 +35,7 @@ IN: xmode.loader
    dup children>string swap position-attrs <matcher> ;

 : parse-regexp-matcher ( tag -- matcher )
-    ! XXX
-    dup children>string swap position-attrs <matcher> ;
+    dup children>string <regexp> swap position-attrs <matcher> ;

 ! SPAN's children
 <TAGS: parse-begin/end-tag
@ -146,7 +145,7 @@ TAGS>
        { "SET" string>rule-set-name set-rule-set-name }
        { "IGNORE_CASE" string>boolean set-rule-set-ignore-case? }
        { "HIGHLIGHT_DIGITS" string>boolean set-rule-set-highlight-digits? }
-        { "DIGIT_RE" f set-rule-set-digit-re } ! XXX
+        { "DIGIT_RE" <regexp> set-rule-set-digit-re } ! XXX
        { "ESCAPE" f add-escape-rule }
        { "DEFAULT" string>token set-rule-set-default }
        { "NO_WORD_SEP" f set-rule-set-no-word-sep }
--- a/extra/xmode/marker/marker-tests.factor
+++ b/extra/xmode/marker/marker-tests.factor
@ -2,6 +2,40 @@ USING: xmode.tokens xmode.catalog
 xmode.marker tools.test kernel ;
 IN: temporary

+[
+    {
+        T{ token f "int" KEYWORD3 }
+        T{ token f " " f }
+        T{ token f "x" f }
+    }
+] [ f "int x" "c" load-mode tokenize-line nip ] unit-test
+
+[
+    {
+        T{ token f "\"" LITERAL1 }
+        T{ token f "hello\\\"" LITERAL1 }
+        T{ token f " " LITERAL1 }
+        T{ token f "world" LITERAL1 }
+        T{ token f "\"" LITERAL1 }
+    }
+] [ f "\"hello\\\" world\"" "c" load-mode tokenize-line nip ] unit-test
+
+[
+    {
+        T{ token f "\"" LITERAL1 }
+        T{ token f "hello\\\ world" LITERAL1 }
+        T{ token f "\"" LITERAL1 }
+    }
+] [ f "\"hello\\\ world\"" "c" load-mode tokenize-line nip ] unit-test
+
+[
+    {
+        T{ token f "int" KEYWORD3 }
+        T{ token f " " f }
+        T{ token f "x" f }
+    }
+] [ f "int x" "java" load-mode tokenize-line nip ] unit-test
+
 [
    {
        T{ token f "//" COMMENT2 }
@ -66,3 +100,12 @@ IN: temporary
 ] [
     f "<!ELEMENT %hello-world; >" "xml" load-mode tokenize-line nip
 ] unit-test
+
+[
+    {
+        T{ token f "$" KEYWORD2 }
+        T{ token f "FOO" KEYWORD2 }
+    }
+] [
+    f "$FOO" "shellscript" load-mode tokenize-line nip
+] unit-test
--- a/extra/xmode/marker/marker.factor
+++ b/extra/xmode/marker/marker.factor
@ -24,8 +24,18 @@ assocs combinators combinators.lib strings regexp splitting ;
 : mark-number ( keyword -- id )
    keyword-number? DIGIT and ;

+: resolve-delegate ( name -- rules ) ! anything that is not a string is returned unchanged
+    dup string? [
+        "::" split1 [ swap load-mode at ] [ rule-sets get at ] if* ! "a::b" looks up b in the result of `a load-mode`; a name without "::" is looked up in rule-sets
+    ] when ;
+
+: rule-set-keyword-maps ( ruleset -- seq ) ! keyword maps of all imports (recursively), followed by this rule set's own
+    dup rule-set-imports
+    [ resolve-delegate rule-set-keyword-maps ] map concat ! each import is resolved first, then expanded recursively
+    swap rule-set-keywords add ; ! NOTE(review): assumes `add` appends a single element to the sequence -- confirm
+
 : mark-keyword ( keyword -- id )
-    current-keywords at ;
+    current-rule-set rule-set-keyword-maps assoc-stack ;

 : add-remaining-token ( -- )
    current-rule-set rule-set-default prev-token, ;
@ -45,30 +55,6 @@ assocs combinators combinators.lib strings regexp splitting ;
 : current-char ( -- char )
    position get line get nth ;

-GENERIC: perform-rule ( rule -- )
-
-: ... ;
-
-M: escape-rule perform-rule ( rule -- ) ... ;
-
-: find-escape-rule ( -- rule )
-    context get dup
-    line-context-in-rule-set rule-set-escape-rule
-    [ ] [ line-context-parent find-escape-rule ] ?if ;
-
-: check-escape-rule ( rule -- )
-    #! Unlike jEdit, we keep checking parents until we find
-    #! an escape rule.
-    dup rule-no-escape? [ drop ] [
-        drop
-        ! find-escape-rule
-        ! current-rule-set rule-set-escape-rule [
-        !     find-escape-rule
-        ! ] [
-        !     
-        ! ] if*
-    ] if ;
-
 GENERIC: match-position ( rule -- n )

 M: mark-previous-rule match-position drop last-offset get ;
@ -83,10 +69,10 @@ M: rule match-position drop position get ;
        [ over matcher-at-word-start?     over last-offset get =    implies ]
    } && 2nip ;

-: matches-not-mark-following? ... ;
-
 GENERIC: text-matches? ( position text -- match-count/f )

+M: f text-matches? 2drop f ;
+
 M: string text-matches?
    ! XXX ignore case
    >r line get swap tail-slice r>
@ -103,7 +89,7 @@ M: string text-matches?

 : rule-end-matches? ( rule -- match-count/f )
    dup mark-following-rule? [
-        dup rule-end swap can-match-here? 0 and
+        dup rule-start swap can-match-here? 0 and
    ] [
        dup rule-end tuck swap can-match-here? [
            position get swap matcher-text
@ -114,10 +100,48 @@ M: string text-matches?
        ] if
    ] if ;

+DEFER: get-rules
+
+: get-imported-rules ( vector/f char ruleset -- vector/f ) ! accumulates the rules of every imported rule set
+    rule-set-imports
+    [ resolve-delegate get-rules ?push-all ] curry* each ; ! each import is resolved, then queried via get-rules; NOTE(review): relies on curry* supplying char to every iteration
+
+: get-always-rules ( vector/f ruleset -- vector/f ) ! appends the rules stored under the key f
+    f swap rule-set-rules at ?push-all ;
+
+: get-char-rules ( vector/f char ruleset -- vector/f ) ! appends the rules stored under the upper-cased char
+    >r ch>upper r> rule-set-rules at ?push-all ;
+
+: get-rules ( char ruleset -- seq ) ! rules for char: char-specific ones, then key-f rules, then imported ones
+    f -rot ! f is the initial (empty) accumulator
+    [ get-char-rules ] 2keep ! leaves: acc char ruleset
+    [ nip get-always-rules ] 2keep ! nip hides char so get-always-rules sees ( acc ruleset ); 2keep restores char and ruleset
+    get-imported-rules ;
+
 GENERIC: handle-rule-start ( match-count rule -- )

 GENERIC: handle-rule-end ( match-count rule -- )

+: find-escape-rule ( -- rule ) ! result may be f; only the current context and its direct parent are consulted
+    context get dup
+    line-context-in-rule-set rule-set-escape-rule [ ] [ ! use the current rule set's escape rule if it has one
+        line-context-parent line-context-in-rule-set ! otherwise look at the parent context's rule set
+        dup [ rule-set-escape-rule ] when ! an f rule set is passed through as f
+    ] ?if ;
+
+: check-escape-rule ( rule -- ? ) ! t if an escape rule matched here and was handled, f otherwise
+    rule-no-escape? [ f ] [ ! rules flagged no-escape never consult the escape rule
+        find-escape-rule dup [
+            dup rule-start-matches? dup [
+                swap handle-rule-start ! reorder to ( match-count rule ) as handle-rule-start expects
+                delegate-end-escaped? [ not ] change ! toggle the flag
+                t
+            ] [
+                2drop f ! no match: discard the rule and the f match result
+            ] if
+        ] when ! when no escape rule was found, the f from find-escape-rule is the result
+    ] if ;
+
 : check-every-rule ( -- ? )
    current-char current-rule-set get-rules
    [ rule-start-matches? ] map-find
@ -129,11 +153,6 @@ GENERIC: handle-rule-end ( match-count rule -- )
        dup [ swap handle-rule-end ] [ 2drop ] if
    ] when* ;

-: handle-escape-rule ( rule -- )
-    ?end-rule
-    ;
-!        ... process escape ... ;
-
 : rule-match-token* ( rule -- id )
    dup rule-match-token {
        { f [ dup rule-body-token ] }
@ -141,10 +160,13 @@ GENERIC: handle-rule-end ( match-count rule -- )
        [ ]
    } case nip ;

-: resolve-delegate ( name -- rules )
-    dup string? [
-        "::" split1 [ swap load-mode at ] [ rule-sets get at ] if*
-    ] when ;
+M: escape-rule handle-rule-start ! ( match-count rule -- )
+    drop ! the rule itself is not needed; match-count stays on the stack
+    ?end-rule
+    process-escape? get [
+        escaped? [ not ] change ! toggle the escaped flag
+        position [ + ] change ! advance position by match-count
+    ] [ drop ] if ; ! only match-count is left here, so both branches must consume exactly one item

 M: seq-rule handle-rule-start
    ?end-rule
@ -174,6 +196,10 @@ M: mark-following-rule handle-rule-start
    f context get set-line-context-end
    context get set-line-context-in-rule ;

+M: mark-following-rule handle-rule-end ! ( match-count rule -- ); match-count is ignored
+    nip rule-match-token* prev-token,
+    f context get set-line-context-in-rule ; ! clears the context's in-rule slot
+
 M: mark-previous-rule handle-rule-start
    ?end-rule
    mark-token
@ -183,7 +209,7 @@ M: mark-previous-rule handle-rule-start
 : do-escaped
    escaped? get [
        escaped? off
-        ...
+        ! ...
    ] when ;

 : check-end-delegate ( -- ? )
@ -198,14 +224,14 @@ M: mark-previous-rule handle-rule-start
                ] keep context get line-context-parent line-context-in-rule rule-match-token* next-token,
                pop-context
                seen-whitespace-end? on t
-            ] [ 2drop f ] if
+            ] [ drop check-escape-rule ] if
        ] [ f ] if*
    ] [ f ] if* ;

 : handle-no-word-break ( -- )
    context get line-context-parent [
        line-context-in-rule dup rule-no-word-break? [
-            rule-match-token prev-token,
+            rule-match-token* prev-token,
            pop-context
        ] [ drop ] if
    ] when* ;
@ -221,6 +247,10 @@ M: mark-previous-rule handle-rule-start
    
    1 current-rule-set rule-set-default next-token, ;

+: rule-set-empty? ( ruleset -- ? ) ! true when the rule set has neither rules nor keywords
+    dup rule-set-rules assoc-empty?
+    swap rule-set-keywords assoc-empty? and ;
+
 : check-word-break ( -- ? )
    current-char dup blank? [
        drop
@ -232,14 +262,17 @@ M: mark-previous-rule handle-rule-start
        (check-word-break)

    ] [
-        dup alpha? [
+        ! Micro-optimization with incorrect semantics; we keep
+        ! it here because jEdit mode files depend on it now...
+        current-rule-set rule-set-empty? [
            drop
        ] [
-            dup current-rule-set dup short. rule-set-no-word-sep* dup . member? [
-                "A: " write write1 nl
+            dup alpha? [
+                drop
            ] [
-                "B: " write write1 nl
-                (check-word-break)
+                current-rule-set rule-set-no-word-sep* member? [
+                    (check-word-break)
+                ] unless
            ] if
        ] if

--- a/extra/xmode/marker/state/state.factor
+++ b/extra/xmode/marker/state/state.factor
@ -14,6 +14,7 @@ SYMBOL: whitespace-end
 SYMBOL: seen-whitespace-end?

 SYMBOL: escaped?
+SYMBOL: process-escape?
 SYMBOL: delegate-end-escaped?
 SYMBOL: terminated?

@ -61,5 +62,6 @@ SYMBOL: terminated?
    0 position set
    0 last-offset set
    0 whitespace-end set
+    process-escape? on
    [ clone ] [ main-rule-set f <line-context> ] if*
    context set ;
--- a/extra/xmode/rules/rules.factor
+++ b/extra/xmode/rules/rules.factor
@ -45,23 +45,6 @@ MEMO: standard-rule-set ( id -- ruleset )
        over [ >r V{ } like r> over push-all ] [ nip ] if
    ] when* ;

-DEFER: get-rules
-
-: get-imported-rules ( vector/f char ruleset -- vector/f )
-    rule-set-imports [ get-rules ?push-all ] curry* each ;
-
-: get-always-rules ( vector/f ruleset -- vector/f )
-    f swap rule-set-rules at ?push-all ;
-
-: get-char-rules ( vector/f char ruleset -- vector/f )
-    >r ch>upper r> rule-set-rules at ?push-all ;
-
-: get-rules ( char ruleset -- seq )
-    f -rot
-    [ get-char-rules ] 2keep
-    [ get-always-rules ] keep
-    get-imported-rules ;
-
 : rule-set-no-word-sep* ( ruleset -- str )
    dup rule-set-keywords keyword-map-no-word-sep*
    swap rule-set-no-word-sep "_" 3append ;