Unicode character name syntax

2008-02-15 19:32:14 -06:00 · 2008-02-15 19:32:14 -06:00 · 87dfd962a2
parent c4ac180c5a
commit 87dfd962a2
7 changed files with 63 additions and 42 deletions
--- a/core/parser/parser-docs.factor
+++ b/core/parser/parser-docs.factor
@ -266,19 +266,10 @@ HELP: escape
 { $description "Converts from a single-character escape code and the corresponding character." }
 { $examples { $example "CHAR: n escape CHAR: \\n = ." "t" } } ;

-HELP: next-escape
-{ $values { "m" "an index into " { $snippet "str" } } { "str" string } { "n" "an index into " { $snippet "str" } } { "ch" "a character" } }
-{ $description "Helper word for " { $link parse-string } " which parses an escape sequence starting at the " { $snippet "m" } "th index of " { $snippet "str" } "." }
-{ $errors "Throws a " { $link bad-escape } " if the string contains an invalid escape sequence." } ;
-
-HELP: next-char
-{ $values { "m" "an index into " { $snippet "str" } } { "str" string } { "n" "an index into " { $snippet "str" } } { "ch" "a character" } }
-{ $description "Helper word for " { $link parse-string } " which parses a character starting at the " { $snippet "m" } "th index of " { $snippet "str" } "." } ;
-
 HELP: parse-string
 { $values { "str" "a new " { $link string } } }
 { $description "Parses the line until a quote (\"), interpreting escape codes along the way." }
-{ $errors "Throws an " { $link bad-escape } " if the string contains an invalid escape sequence." }
+{ $errors "Throws an error if the string contains an invalid escape sequence." }
 $parsing-note ;

 HELP: still-parsing?
--- a/core/parser/parser.factor
+++ b/core/parser/parser.factor
@ -119,22 +119,43 @@ M: bad-escape summary drop "Bad escape code" ;
        { CHAR: \" CHAR: \" }
    } at [ bad-escape ] unless* ;

-: next-escape ( m str -- n ch )
-    2dup nth CHAR: u =
-    [ >r 1+ dup 6 + tuck r> subseq hex> ]
-    [ over 1+ -rot nth escape ] if ;
+SYMBOL: name>char-hook

-: next-char ( m str -- n ch )
-    2dup nth CHAR: \\ =
-    [ >r 1+ r> next-escape ] [ over 1+ -rot nth ] if ;
+name>char-hook global [
+    [ "Unicode support not available" throw ] or
+] change-at

-: (parse-string) ( m str -- n )
-    2dup nth CHAR: " =
-    [ drop 1+ ] [ [ next-char , ] keep (parse-string) ] if ;
+: unicode-escape ( str -- ch str' )
+    "{" ?head-slice [
+        CHAR: } over index cut-slice
+        >r >string name>char-hook get call r>
+        1 tail-slice
+    ] [
+        6 cut-slice >r hex> r>
+    ] if ;
+
+: next-escape ( str -- ch str' )
+    "u" ?head-slice [
+        unicode-escape
+    ] [
+        unclip-slice escape swap
+    ] if ;
+
+: (parse-string) ( str -- m )
+    dup [ "\"\\" member? ] find dup [
+        >r cut-slice >r % r> 1 tail-slice r>
+        dup CHAR: " = [
+            drop slice-from
+        ] [
+            drop next-escape >r , r> (parse-string)
+        ] if
+    ] [
+        "Unterminated string" throw
+    ] if ;

 : parse-string ( -- str )
    lexer get [
-        [ (parse-string) ] "" make swap
+        [ swap tail-slice (parse-string) ] "" make swap
    ] change-column ;

 TUPLE: parse-error file line col text ;
--- a/core/syntax/syntax-docs.factor
+++ b/core/syntax/syntax-docs.factor
@ -100,13 +100,9 @@ ARTICLE: "escape" "Character escape codes"
    { { $snippet "\\0" } "a null byte (ASCII 0)" }
    { { $snippet "\\e" } "escape (ASCII 27)" }
    { { $snippet "\\\"" } { $snippet "\"" } }
-}
-"A Unicode character can be specified by its code number by writing " { $snippet "\\u" } " followed by a six-digit hexadecimal number. That is, the following two expressions are equivalent:"
-{ $code
-    "CHAR: \\u000078"
-    "78"
-}
-"While not useful for single characters, this syntax is also permitted inside strings." ;
+    { { $snippet "\\u" { $emphasis "xxxxxx" } } { "The Unicode code point with hexadecimal number " { $snippet { $emphasis "xxxxxx" } } } }
+    { { $snippet "\\u{" { $emphasis "name" } "}" } { "The Unicode code point named " { $snippet { $emphasis "name" } } } }
+} ;

 ARTICLE: "syntax-strings" "Character and string syntax"
 "Factor has no distinct character type, however Unicode character value integers can be read by specifying a literal character, or an escaped representation thereof."
@ -412,8 +408,17 @@ HELP: IN:

 HELP: CHAR:
 { $syntax "CHAR: token" }
-{ $values { "token" "a literal character or escape code" } }
-{ $description "Adds the Unicode code point of the character represented by the token to the parse tree." } ;
+{ $values { "token" "a literal character, escape code, or Unicode character name" } }
+{ $description "Adds a Unicode code point to the parse tree." }
+{ $examples
+    { $code
+        "CHAR: x"
+        "CHAR: \\u000032"
+        "CHAR: \\u{exclamation-mark}"
+        "CHAR: exclamation-mark"
+        "CHAR: ugaritic-letter-samka"
+    }
+} ;

 HELP: "
 { $syntax "\"string...\"" }
--- a/core/syntax/syntax.factor
+++ b/core/syntax/syntax.factor
@ -5,7 +5,8 @@ byte-vectors definitions generic hashtables kernel math
 namespaces parser sequences strings sbufs vectors words
 quotations io assocs splitting tuples generic.standard
 generic.math classes io.files vocabs float-arrays float-vectors
-classes.union classes.mixin classes.predicate compiler.units ;
+classes.union classes.mixin classes.predicate compiler.units
+combinators ;
 IN: bootstrap.syntax

 ! These words are defined as a top-level form, instead of with
@ -56,7 +57,14 @@ IN: bootstrap.syntax
    "f" [ f parsed ] define-syntax
    "t" "syntax" lookup define-symbol

-    "CHAR:" [ 0 scan next-char nip parsed ] define-syntax
+    "CHAR:" [
+        scan {
+            { [ dup length 1 = ] [ first ] }
+            { [ "\\" ?head ] [ next-escape drop ] }
+            { [ t ] [ name>char-hook get call ] }
+        } cond parsed
+    ] define-syntax
+
    "\"" [ parse-string parsed ] define-syntax

    "SBUF\"" [
--- a/extra/unicode/syntax/syntax-tests.factor
+++ b/extra/unicode/syntax/syntax-tests.factor
@ -1,4 +0,0 @@
-USING: unicode.syntax tools.test ;
-
-[ CHAR: ! ] [ UNICHAR: exclamation-mark ] unit-test
-! Write a test for CATEGORY and CATEGORY-NOT
--- a/extra/unicode/syntax/syntax.factor
+++ b/extra/unicode/syntax/syntax.factor
@ -46,7 +46,3 @@ IN: unicode.syntax
 : CATEGORY-NOT:
    CREATE ";" parse-tokens
    categories swap seq-minus define-category ; parsing
-
-: UNICHAR:
-    ! This should be part of CHAR:. Also, name-map at ==> name>char
-    scan name>char [ parsed ] [ "Invalid character" throw ] if* ; parsing
--- a/extra/unicode/unicode.factor
+++ b/extra/unicode/unicode.factor
@ -1,5 +1,9 @@
-USING: unicode.syntax unicode.data unicode.breaks unicode.normalize
-unicode.case unicode.categories ;
+USING: unicode.syntax unicode.data unicode.breaks
+unicode.normalize unicode.case unicode.categories
+parser ;
 IN: unicode

 ! For now: convenience to load all Unicode vocabs
+
+[ name>char [ "Invalid character" throw ] unless* ]
+name>char-hook set-global