diff --git a/core/parser/parser-docs.factor b/core/parser/parser-docs.factor index ae38925c68..ce6a119e32 100755 --- a/core/parser/parser-docs.factor +++ b/core/parser/parser-docs.factor @@ -266,19 +266,10 @@ HELP: escape { $description "Converts from a single-character escape code and the corresponding character." } { $examples { $example "CHAR: n escape CHAR: \\n = ." "t" } } ; -HELP: next-escape -{ $values { "m" "an index into " { $snippet "str" } } { "str" string } { "n" "an index into " { $snippet "str" } } { "ch" "a character" } } -{ $description "Helper word for " { $link parse-string } " which parses an escape sequence starting at the " { $snippet "m" } "th index of " { $snippet "str" } "." } -{ $errors "Throws a " { $link bad-escape } " if the string contains an invalid escape sequence." } ; - -HELP: next-char -{ $values { "m" "an index into " { $snippet "str" } } { "str" string } { "n" "an index into " { $snippet "str" } } { "ch" "a character" } } -{ $description "Helper word for " { $link parse-string } " which parses a character starting at the " { $snippet "m" } "th index of " { $snippet "str" } "." } ; - HELP: parse-string { $values { "str" "a new " { $link string } } } { $description "Parses the line until a quote (\"), interpreting escape codes along the way." } -{ $errors "Throws an " { $link bad-escape } " if the string contains an invalid escape sequence." } +{ $errors "Throws an error if the string contains an invalid escape sequence." } $parsing-note ; HELP: still-parsing? diff --git a/core/parser/parser.factor b/core/parser/parser.factor index 1bd7979a0c..c84c836390 100755 --- a/core/parser/parser.factor +++ b/core/parser/parser.factor @@ -119,22 +119,43 @@ M: bad-escape summary drop "Bad escape code" ; { CHAR: \" CHAR: \" } } at [ bad-escape ] unless* ; -: next-escape ( m str -- n ch ) - 2dup nth CHAR: u = - [ >r 1+ dup 6 + tuck r> subseq hex> ] - [ over 1+ -rot nth escape ] if ; +SYMBOL: name>char-hook -: next-char ( m str -- n ch ) - 2dup nth CHAR: \\ = - [ >r 1+ r> next-escape ] [ over 1+ -rot nth ] if ; +name>char-hook global [ + [ "Unicode support not available" throw ] or +] change-at -: (parse-string) ( m str -- n ) - 2dup nth CHAR: " = - [ drop 1+ ] [ [ next-char , ] keep (parse-string) ] if ; +: unicode-escape ( str -- ch str' ) + "{" ?head-slice [ + CHAR: } over index cut-slice + >r >string name>char-hook get call r> + 1 tail-slice + ] [ + 6 cut-slice >r hex> r> + ] if ; + +: next-escape ( str -- ch str' ) + "u" ?head-slice [ + unicode-escape + ] [ + unclip-slice escape swap + ] if ; + +: (parse-string) ( str -- m ) + dup [ "\"\\" member? ] find dup [ + >r cut-slice >r % r> 1 tail-slice r> + dup CHAR: " = [ + drop slice-from + ] [ + drop next-escape >r , r> (parse-string) + ] if + ] [ + "Unterminated string" throw + ] if ; : parse-string ( -- str ) lexer get [ - [ (parse-string) ] "" make swap + [ swap tail-slice (parse-string) ] "" make swap ] change-column ; TUPLE: parse-error file line col text ; diff --git a/core/syntax/syntax-docs.factor b/core/syntax/syntax-docs.factor index 9ccfd2efcd..95a00f3801 100755 --- a/core/syntax/syntax-docs.factor +++ b/core/syntax/syntax-docs.factor @@ -100,13 +100,9 @@ ARTICLE: "escape" "Character escape codes" { { $snippet "\\0" } "a null byte (ASCII 0)" } { { $snippet "\\e" } "escape (ASCII 27)" } { { $snippet "\\\"" } { $snippet "\"" } } -} -"A Unicode character can be specified by its code number by writing " { $snippet "\\u" } " followed by a six-digit hexadecimal number. That is, the following two expressions are equivalent:" -{ $code - "CHAR: \\u000078" - "78" -} -"While not useful for single characters, this syntax is also permitted inside strings." ; + { { $snippet "\\u" { $emphasis "xxxxxx" } } { "The Unicode code point with hexadecimal number " { $snippet { $emphasis "xxxxxx" } } } } + { { $snippet "\\u{" { $emphasis "name" } "}" } { "The Unicode code point named " { $snippet { $emphasis "name" } } } } +} ; ARTICLE: "syntax-strings" "Character and string syntax" "Factor has no distinct character type, however Unicode character value integers can be read by specifying a literal character, or an escaped representation thereof." @@ -412,8 +408,17 @@ HELP: IN: HELP: CHAR: { $syntax "CHAR: token" } -{ $values { "token" "a literal character or escape code" } } -{ $description "Adds the Unicode code point of the character represented by the token to the parse tree." } ; +{ $values { "token" "a literal character, escape code, or Unicode character name" } } +{ $description "Adds a Unicode code point to the parse tree." } +{ $examples + { $code + "CHAR: x" + "CHAR: \\u000032" + "CHAR: \\u{exclamation-mark}" + "CHAR: exclamation-mark" + "CHAR: ugaritic-letter-samka" + } +} ; HELP: " { $syntax "\"string...\"" } diff --git a/core/syntax/syntax.factor b/core/syntax/syntax.factor index 67799b92ea..601c05d8d9 100755 --- a/core/syntax/syntax.factor +++ b/core/syntax/syntax.factor @@ -5,7 +5,8 @@ byte-vectors definitions generic hashtables kernel math namespaces parser sequences strings sbufs vectors words quotations io assocs splitting tuples generic.standard generic.math classes io.files vocabs float-arrays float-vectors -classes.union classes.mixin classes.predicate compiler.units ; +classes.union classes.mixin classes.predicate compiler.units +combinators ; IN: bootstrap.syntax ! These words are defined as a top-level form, instead of with @@ -56,7 +57,14 @@ IN: bootstrap.syntax "f" [ f parsed ] define-syntax "t" "syntax" lookup define-symbol - "CHAR:" [ 0 scan next-char nip parsed ] define-syntax + "CHAR:" [ + scan { + { [ dup length 1 = ] [ first ] } + { [ "\\" ?head ] [ next-escape drop ] } + { [ t ] [ name>char-hook get call ] } + } cond parsed + ] define-syntax + "\"" [ parse-string parsed ] define-syntax "SBUF\"" [ diff --git a/extra/unicode/syntax/syntax-tests.factor b/extra/unicode/syntax/syntax-tests.factor deleted file mode 100644 index 1579e368c2..0000000000 --- a/extra/unicode/syntax/syntax-tests.factor +++ /dev/null @@ -1,4 +0,0 @@ -USING: unicode.syntax tools.test ; - -[ CHAR: ! ] [ UNICHAR: exclamation-mark ] unit-test -! Write a test for CATEGORY and CATEGORY-NOT diff --git a/extra/unicode/syntax/syntax.factor b/extra/unicode/syntax/syntax.factor old mode 100644 new mode 100755 index 6c75a77c76..bd3fd4ae2a --- a/extra/unicode/syntax/syntax.factor +++ b/extra/unicode/syntax/syntax.factor @@ -46,7 +46,3 @@ IN: unicode.syntax : CATEGORY-NOT: CREATE ";" parse-tokens categories swap seq-minus define-category ; parsing - -: UNICHAR: - ! This should be part of CHAR:. Also, name-map at ==> name>char - scan name>char [ parsed ] [ "Invalid character" throw ] if* ; parsing diff --git a/extra/unicode/unicode.factor b/extra/unicode/unicode.factor index 12673c3bde..a62840c535 100755 --- a/extra/unicode/unicode.factor +++ b/extra/unicode/unicode.factor @@ -1,5 +1,9 @@ -USING: unicode.syntax unicode.data unicode.breaks unicode.normalize -unicode.case unicode.categories ; +USING: unicode.syntax unicode.data unicode.breaks +unicode.normalize unicode.case unicode.categories +parser ; IN: unicode ! For now: convenience to load all Unicode vocabs + +[ name>char [ "Invalid character" throw ] unless* ] +name>char-hook set-global