Unicode character name syntax
parent
c4ac180c5a
commit
87dfd962a2
|
@ -266,19 +266,10 @@ HELP: escape
|
||||||
{ $description "Converts from a single-character escape code and the corresponding character." }
|
{ $description "Converts from a single-character escape code and the corresponding character." }
|
||||||
{ $examples { $example "CHAR: n escape CHAR: \\n = ." "t" } } ;
|
{ $examples { $example "CHAR: n escape CHAR: \\n = ." "t" } } ;
|
||||||
|
|
||||||
HELP: next-escape
|
|
||||||
{ $values { "m" "an index into " { $snippet "str" } } { "str" string } { "n" "an index into " { $snippet "str" } } { "ch" "a character" } }
|
|
||||||
{ $description "Helper word for " { $link parse-string } " which parses an escape sequence starting at the " { $snippet "m" } "th index of " { $snippet "str" } "." }
|
|
||||||
{ $errors "Throws a " { $link bad-escape } " if the string contains an invalid escape sequence." } ;
|
|
||||||
|
|
||||||
HELP: next-char
|
|
||||||
{ $values { "m" "an index into " { $snippet "str" } } { "str" string } { "n" "an index into " { $snippet "str" } } { "ch" "a character" } }
|
|
||||||
{ $description "Helper word for " { $link parse-string } " which parses a character starting at the " { $snippet "m" } "th index of " { $snippet "str" } "." } ;
|
|
||||||
|
|
||||||
HELP: parse-string
|
HELP: parse-string
|
||||||
{ $values { "str" "a new " { $link string } } }
|
{ $values { "str" "a new " { $link string } } }
|
||||||
{ $description "Parses the line until a quote (\"), interpreting escape codes along the way." }
|
{ $description "Parses the line until a quote (\"), interpreting escape codes along the way." }
|
||||||
{ $errors "Throws an " { $link bad-escape } " if the string contains an invalid escape sequence." }
|
{ $errors "Throws an error if the string contains an invalid escape sequence." }
|
||||||
$parsing-note ;
|
$parsing-note ;
|
||||||
|
|
||||||
HELP: still-parsing?
|
HELP: still-parsing?
|
||||||
|
|
|
@ -119,22 +119,43 @@ M: bad-escape summary drop "Bad escape code" ;
|
||||||
{ CHAR: \" CHAR: \" }
|
{ CHAR: \" CHAR: \" }
|
||||||
} at [ bad-escape ] unless* ;
|
} at [ bad-escape ] unless* ;
|
||||||
|
|
||||||
: next-escape ( m str -- n ch )
|
SYMBOL: name>char-hook
|
||||||
2dup nth CHAR: u =
|
|
||||||
[ >r 1+ dup 6 + tuck r> subseq hex> ]
|
|
||||||
[ over 1+ -rot nth escape ] if ;
|
|
||||||
|
|
||||||
: next-char ( m str -- n ch )
|
name>char-hook global [
|
||||||
2dup nth CHAR: \\ =
|
[ "Unicode support not available" throw ] or
|
||||||
[ >r 1+ r> next-escape ] [ over 1+ -rot nth ] if ;
|
] change-at
|
||||||
|
|
||||||
: (parse-string) ( m str -- n )
|
: unicode-escape ( str -- ch str' )
|
||||||
2dup nth CHAR: " =
|
"{" ?head-slice [
|
||||||
[ drop 1+ ] [ [ next-char , ] keep (parse-string) ] if ;
|
CHAR: } over index cut-slice
|
||||||
|
>r >string name>char-hook get call r>
|
||||||
|
1 tail-slice
|
||||||
|
] [
|
||||||
|
6 cut-slice >r hex> r>
|
||||||
|
] if ;
|
||||||
|
|
||||||
|
! Parses one escape code from the front of str and leaves the resulting
! character together with the unconsumed remainder ( ch str' ).
! Callers hand in the text that follows the backslash.
! When the "u" prefix test succeeds the rest goes to unicode-escape;
! otherwise one element is split off the front and translated by escape.
: next-escape ( str -- ch str' )
|
||||||
|
"u" ?head-slice [
|
||||||
|
unicode-escape
|
||||||
|
] [
|
||||||
|
unclip-slice escape swap
|
||||||
|
] if ;
|
||||||
|
|
||||||
|
! Recursive worker for parse-string ( str -- m ).
! Searches str for the next double quote or backslash. The text before the
! match is appended to the sequence being built with % and the matched
! character itself is skipped (1 tail-slice).
!  - On a double quote the recursion ends and slice-from of the remaining
!    slice is left on the stack as m.
!  - On a backslash, next-escape parses the escape, the character is added
!    with , and parsing continues on what is left.
! Throws "Unterminated string" when neither character is found.
: (parse-string) ( str -- m )
|
||||||
|
dup [ "\"\\" member? ] find dup [
|
||||||
|
>r cut-slice >r % r> 1 tail-slice r>
|
||||||
|
dup CHAR: " = [
|
||||||
|
drop slice-from
|
||||||
|
] [
|
||||||
|
drop next-escape >r , r> (parse-string)
|
||||||
|
] if
|
||||||
|
] [
|
||||||
|
"Unterminated string" throw
|
||||||
|
] if ;
|
||||||
|
|
||||||
: parse-string ( -- str )
|
: parse-string ( -- str )
|
||||||
lexer get [
|
lexer get [
|
||||||
[ (parse-string) ] "" make swap
|
[ swap tail-slice (parse-string) ] "" make swap
|
||||||
] change-column ;
|
] change-column ;
|
||||||
|
|
||||||
TUPLE: parse-error file line col text ;
|
TUPLE: parse-error file line col text ;
|
||||||
|
|
|
@ -100,13 +100,9 @@ ARTICLE: "escape" "Character escape codes"
|
||||||
{ { $snippet "\\0" } "a null byte (ASCII 0)" }
|
{ { $snippet "\\0" } "a null byte (ASCII 0)" }
|
||||||
{ { $snippet "\\e" } "escape (ASCII 27)" }
|
{ { $snippet "\\e" } "escape (ASCII 27)" }
|
||||||
{ { $snippet "\\\"" } { $snippet "\"" } }
|
{ { $snippet "\\\"" } { $snippet "\"" } }
|
||||||
}
|
{ { $snippet "\\u" { $emphasis "xxxxxx" } } { "The Unicode code point with hexadecimal number " { $snippet { $emphasis "xxxxxx" } } } }
|
||||||
"A Unicode character can be specified by its code number by writing " { $snippet "\\u" } " followed by a six-digit hexadecimal number. That is, the following two expressions are equivalent:"
|
{ { $snippet "\\u{" { $emphasis "name" } "}" } { "The Unicode code point named " { $snippet { $emphasis "name" } } } }
|
||||||
{ $code
|
} ;
|
||||||
"CHAR: \\u000078"
|
|
||||||
"78"
|
|
||||||
}
|
|
||||||
"While not useful for single characters, this syntax is also permitted inside strings." ;
|
|
||||||
|
|
||||||
ARTICLE: "syntax-strings" "Character and string syntax"
|
ARTICLE: "syntax-strings" "Character and string syntax"
|
||||||
"Factor has no distinct character type, however Unicode character value integers can be read by specifying a literal character, or an escaped representation thereof."
|
"Factor has no distinct character type, however Unicode character value integers can be read by specifying a literal character, or an escaped representation thereof."
|
||||||
|
@ -412,8 +408,17 @@ HELP: IN:
|
||||||
|
|
||||||
HELP: CHAR:
|
HELP: CHAR:
|
||||||
{ $syntax "CHAR: token" }
|
{ $syntax "CHAR: token" }
|
||||||
{ $values { "token" "a literal character or escape code" } }
|
{ $values { "token" "a literal character, escape code, or Unicode character name" } }
|
||||||
{ $description "Adds the Unicode code point of the character represented by the token to the parse tree." } ;
|
{ $description "Adds a Unicode code point to the parse tree." }
|
||||||
|
{ $examples
|
||||||
|
{ $code
|
||||||
|
"CHAR: x"
|
||||||
|
"CHAR: \\u000032"
|
||||||
|
"CHAR: \\u{exclamation-mark}"
|
||||||
|
"CHAR: exclamation-mark"
|
||||||
|
"CHAR: ugaritic-letter-samka"
|
||||||
|
}
|
||||||
|
} ;
|
||||||
|
|
||||||
HELP: "
|
HELP: "
|
||||||
{ $syntax "\"string...\"" }
|
{ $syntax "\"string...\"" }
|
||||||
|
|
|
@ -5,7 +5,8 @@ byte-vectors definitions generic hashtables kernel math
|
||||||
namespaces parser sequences strings sbufs vectors words
|
namespaces parser sequences strings sbufs vectors words
|
||||||
quotations io assocs splitting tuples generic.standard
|
quotations io assocs splitting tuples generic.standard
|
||||||
generic.math classes io.files vocabs float-arrays float-vectors
|
generic.math classes io.files vocabs float-arrays float-vectors
|
||||||
classes.union classes.mixin classes.predicate compiler.units ;
|
classes.union classes.mixin classes.predicate compiler.units
|
||||||
|
combinators ;
|
||||||
IN: bootstrap.syntax
|
IN: bootstrap.syntax
|
||||||
|
|
||||||
! These words are defined as a top-level form, instead of with
|
! These words are defined as a top-level form, instead of with
|
||||||
|
@ -56,7 +57,14 @@ IN: bootstrap.syntax
|
||||||
"f" [ f parsed ] define-syntax
|
"f" [ f parsed ] define-syntax
|
||||||
"t" "syntax" lookup define-symbol
|
"t" "syntax" lookup define-symbol
|
||||||
|
|
||||||
"CHAR:" [ 0 scan next-char nip parsed ] define-syntax
|
! CHAR: reads the next token with scan and adds one value to the parse
! tree with parsed. The cond tries three cases in order:
!  - a token of length 1 stands for itself (first);
!  - a token with a leading backslash has the rest parsed by next-escape,
!    and whatever next-escape leaves unconsumed is dropped;
!  - any other token is passed to the quotation held in name>char-hook.
"CHAR:" [
|
||||||
|
scan {
|
||||||
|
{ [ dup length 1 = ] [ first ] }
|
||||||
|
{ [ "\\" ?head ] [ next-escape drop ] }
|
||||||
|
{ [ t ] [ name>char-hook get call ] }
|
||||||
|
} cond parsed
|
||||||
|
] define-syntax
|
||||||
|
|
||||||
"\"" [ parse-string parsed ] define-syntax
|
"\"" [ parse-string parsed ] define-syntax
|
||||||
|
|
||||||
"SBUF\"" [
|
"SBUF\"" [
|
||||||
|
|
|
@ -1,4 +0,0 @@
|
||||||
USING: unicode.syntax tools.test ;
|
|
||||||
|
|
||||||
[ CHAR: ! ] [ UNICHAR: exclamation-mark ] unit-test
|
|
||||||
! Write a test for CATEGORY and CATEGORY-NOT
|
|
|
@ -46,7 +46,3 @@ IN: unicode.syntax
|
||||||
: CATEGORY-NOT:
|
: CATEGORY-NOT:
|
||||||
CREATE ";" parse-tokens
|
CREATE ";" parse-tokens
|
||||||
categories swap seq-minus define-category ; parsing
|
categories swap seq-minus define-category ; parsing
|
||||||
|
|
||||||
: UNICHAR:
|
|
||||||
! This should be part of CHAR:. Also, name-map at ==> name>char
|
|
||||||
scan name>char [ parsed ] [ "Invalid character" throw ] if* ; parsing
|
|
||||||
|
|
|
@ -1,5 +1,9 @@
|
||||||
USING: unicode.syntax unicode.data unicode.breaks unicode.normalize
|
USING: unicode.syntax unicode.data unicode.breaks
|
||||||
unicode.case unicode.categories ;
|
unicode.normalize unicode.case unicode.categories
|
||||||
|
parser ;
|
||||||
IN: unicode
|
IN: unicode
|
||||||
|
|
||||||
! For now: convenience to load all Unicode vocabs
|
! For now: convenience to load all Unicode vocabs
|
||||||
|
|
||||||
|
! Store a name lookup quotation globally in name>char-hook.
! The quotation calls name>char and throws "Invalid character" when that
! lookup produces f.
[ name>char [ "Invalid character" throw ] unless* ]
|
||||||
|
name>char-hook set-global
|
||||||
|
|
Loading…
Reference in New Issue