Unicode character name syntax

db4
Slava Pestov 2008-02-15 19:32:14 -06:00
parent c4ac180c5a
commit 87dfd962a2
7 changed files with 63 additions and 42 deletions

View File

@ -266,19 +266,10 @@ HELP: escape
{ $description "Converts from a single-character escape code to the corresponding character." } { $description "Converts from a single-character escape code to the corresponding character." }
{ $examples { $example "CHAR: n escape CHAR: \\n = ." "t" } } ; { $examples { $example "CHAR: n escape CHAR: \\n = ." "t" } } ;
HELP: next-escape
{ $values { "m" "an index into " { $snippet "str" } } { "str" string } { "n" "an index into " { $snippet "str" } } { "ch" "a character" } }
{ $description "Helper word for " { $link parse-string } " which parses an escape sequence starting at the " { $snippet "m" } "th index of " { $snippet "str" } "." }
{ $errors "Throws a " { $link bad-escape } " if the string contains an invalid escape sequence." } ;
HELP: next-char
{ $values { "m" "an index into " { $snippet "str" } } { "str" string } { "n" "an index into " { $snippet "str" } } { "ch" "a character" } }
{ $description "Helper word for " { $link parse-string } " which parses a character starting at the " { $snippet "m" } "th index of " { $snippet "str" } "." } ;
HELP: parse-string HELP: parse-string
{ $values { "str" "a new " { $link string } } } { $values { "str" "a new " { $link string } } }
{ $description "Parses the line until a quote (\"), interpreting escape codes along the way." } { $description "Parses the line until a quote (\"), interpreting escape codes along the way." }
{ $errors "Throws a " { $link bad-escape } " if the string contains an invalid escape sequence." } { $errors "Throws an error if the string contains an invalid escape sequence." }
$parsing-note ; $parsing-note ;
HELP: still-parsing? HELP: still-parsing?

View File

@ -119,22 +119,43 @@ M: bad-escape summary drop "Bad escape code" ;
{ CHAR: \" CHAR: \" } { CHAR: \" CHAR: \" }
} at [ bad-escape ] unless* ; } at [ bad-escape ] unless* ;
: next-escape ( m str -- n ch ) SYMBOL: name>char-hook
2dup nth CHAR: u =
[ >r 1+ dup 6 + tuck r> subseq hex> ]
[ over 1+ -rot nth escape ] if ;
: next-char ( m str -- n ch ) name>char-hook global [
2dup nth CHAR: \\ = [ "Unicode support not available" throw ] or
[ >r 1+ r> next-escape ] [ over 1+ -rot nth ] if ; ] change-at
: (parse-string) ( m str -- n ) : unicode-escape ( str -- ch str' )
2dup nth CHAR: " = "{" ?head-slice [
[ drop 1+ ] [ [ next-char , ] keep (parse-string) ] if ; CHAR: } over index cut-slice
>r >string name>char-hook get call r>
1 tail-slice
] [
6 cut-slice >r hex> r>
] if ;
: next-escape ( str -- ch str' )
"u" ?head-slice [
unicode-escape
] [
unclip-slice escape swap
] if ;
: (parse-string) ( str -- m )
dup [ "\"\\" member? ] find dup [
>r cut-slice >r % r> 1 tail-slice r>
dup CHAR: " = [
drop slice-from
] [
drop next-escape >r , r> (parse-string)
] if
] [
"Unterminated string" throw
] if ;
: parse-string ( -- str ) : parse-string ( -- str )
lexer get [ lexer get [
[ (parse-string) ] "" make swap [ swap tail-slice (parse-string) ] "" make swap
] change-column ; ] change-column ;
TUPLE: parse-error file line col text ; TUPLE: parse-error file line col text ;

View File

@ -100,13 +100,9 @@ ARTICLE: "escape" "Character escape codes"
{ { $snippet "\\0" } "a null byte (ASCII 0)" } { { $snippet "\\0" } "a null byte (ASCII 0)" }
{ { $snippet "\\e" } "escape (ASCII 27)" } { { $snippet "\\e" } "escape (ASCII 27)" }
{ { $snippet "\\\"" } { $snippet "\"" } } { { $snippet "\\\"" } { $snippet "\"" } }
} { { $snippet "\\u" { $emphasis "xxxxxx" } } { "The Unicode code point with hexadecimal number " { $snippet { $emphasis "xxxxxx" } } } }
"A Unicode character can be specified by its code number by writing " { $snippet "\\u" } " followed by a six-digit hexadecimal number. That is, the following two expressions are equivalent:" { { $snippet "\\u{" { $emphasis "name" } "}" } { "The Unicode code point named " { $snippet { $emphasis "name" } } } }
{ $code } ;
"CHAR: \\u000078"
"78"
}
"While not useful for single characters, this syntax is also permitted inside strings." ;
ARTICLE: "syntax-strings" "Character and string syntax" ARTICLE: "syntax-strings" "Character and string syntax"
"Factor has no distinct character type, however Unicode character value integers can be read by specifying a literal character, or an escaped representation thereof." "Factor has no distinct character type, however Unicode character value integers can be read by specifying a literal character, or an escaped representation thereof."
@ -412,8 +408,17 @@ HELP: IN:
HELP: CHAR: HELP: CHAR:
{ $syntax "CHAR: token" } { $syntax "CHAR: token" }
{ $values { "token" "a literal character or escape code" } } { $values { "token" "a literal character, escape code, or Unicode character name" } }
{ $description "Adds the Unicode code point of the character represented by the token to the parse tree." } ; { $description "Adds a Unicode code point to the parse tree." }
{ $examples
{ $code
"CHAR: x"
"CHAR: \\u000032"
"CHAR: \\u{exclamation-mark}"
"CHAR: exclamation-mark"
"CHAR: ugaritic-letter-samka"
}
} ;
HELP: " HELP: "
{ $syntax "\"string...\"" } { $syntax "\"string...\"" }

View File

@ -5,7 +5,8 @@ byte-vectors definitions generic hashtables kernel math
namespaces parser sequences strings sbufs vectors words namespaces parser sequences strings sbufs vectors words
quotations io assocs splitting tuples generic.standard quotations io assocs splitting tuples generic.standard
generic.math classes io.files vocabs float-arrays float-vectors generic.math classes io.files vocabs float-arrays float-vectors
classes.union classes.mixin classes.predicate compiler.units ; classes.union classes.mixin classes.predicate compiler.units
combinators ;
IN: bootstrap.syntax IN: bootstrap.syntax
! These words are defined as a top-level form, instead of with ! These words are defined as a top-level form, instead of with
@ -56,7 +57,14 @@ IN: bootstrap.syntax
"f" [ f parsed ] define-syntax "f" [ f parsed ] define-syntax
"t" "syntax" lookup define-symbol "t" "syntax" lookup define-symbol
"CHAR:" [ 0 scan next-char nip parsed ] define-syntax "CHAR:" [
scan {
{ [ dup length 1 = ] [ first ] }
{ [ "\\" ?head ] [ next-escape drop ] }
{ [ t ] [ name>char-hook get call ] }
} cond parsed
] define-syntax
"\"" [ parse-string parsed ] define-syntax "\"" [ parse-string parsed ] define-syntax
"SBUF\"" [ "SBUF\"" [

View File

@ -1,4 +0,0 @@
USING: unicode.syntax tools.test ;
[ CHAR: ! ] [ UNICHAR: exclamation-mark ] unit-test
! Write a test for CATEGORY and CATEGORY-NOT

4
extra/unicode/syntax/syntax.factor Normal file → Executable file
View File

@ -46,7 +46,3 @@ IN: unicode.syntax
: CATEGORY-NOT: : CATEGORY-NOT:
CREATE ";" parse-tokens CREATE ";" parse-tokens
categories swap seq-minus define-category ; parsing categories swap seq-minus define-category ; parsing
: UNICHAR:
! This should be part of CHAR:. Also, name-map at ==> name>char
scan name>char [ parsed ] [ "Invalid character" throw ] if* ; parsing

View File

@ -1,5 +1,9 @@
USING: unicode.syntax unicode.data unicode.breaks unicode.normalize USING: unicode.syntax unicode.data unicode.breaks
unicode.case unicode.categories ; unicode.normalize unicode.case unicode.categories
parser ;
IN: unicode IN: unicode
! For now: convenience to load all Unicode vocabs ! For now: convenience to load all Unicode vocabs
[ name>char [ "Invalid character" throw ] unless* ]
name>char-hook set-global