Re-add 334a96f25c
parent
e29f18a2f4
commit
7c804591b1
|
@ -0,0 +1,14 @@
|
|||
! Copyright (C) 2009 Daniel Ehrenberg
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: help.syntax help.markup ;
|
||||
IN: io.encodings.chinese
|
||||
|
||||
ARTICLE: "io.encodings.chinese" "Chinese text encodings"
|
||||
"The " { $vocab-link "io.encodings.chinese" } " vocabulary implements encodings used for Chinese text besides the standard UTF encodings for Unicode strings."
|
||||
{ $subsection gb18030 } ;
|
||||
|
||||
ABOUT: "io.encodings.chinese"
|
||||
|
||||
HELP: gb18030
|
||||
{ $class-description "The encoding descriptor for GB 18030, a Chinese national standard for text encoding. GB 18030 consists of a unique encoding for each Unicode code point, and for this reason has been described as a UTF. It is backwards compatible with the earlier encodings GB 2312 and GBK." }
|
||||
{ $see-also "encodings-introduction" } ;
|
|
@ -0,0 +1,26 @@
|
|||
! Copyright (C) 2009 Daniel Ehrenberg
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: io.encodings.chinese io.encodings.string strings tools.test arrays ;
|
||||
IN: io.encodings.chinese.tests
|
||||
|
||||
[ "hello" ] [ "hello" gb18030 encode >string ] unit-test
|
||||
[ "hello" ] [ "hello" gb18030 decode ] unit-test
|
||||
[ B{ HEX: A1 HEX: A4 HEX: 81 HEX: 30 HEX: 86 HEX: 30 } ]
|
||||
[ B{ HEX: B7 HEX: B8 } gb18030 encode ] unit-test
|
||||
[ { HEX: B7 HEX: B8 } ]
|
||||
[ B{ HEX: A1 HEX: A4 HEX: 81 HEX: 30 HEX: 86 HEX: 30 } gb18030 decode >array ] unit-test
|
||||
[ { HEX: B7 CHAR: replacement-character } ]
|
||||
[ B{ HEX: A1 HEX: A4 HEX: 81 HEX: 30 HEX: 86 } gb18030 decode >array ] unit-test
|
||||
[ { HEX: B7 CHAR: replacement-character } ]
|
||||
[ B{ HEX: A1 HEX: A4 HEX: 81 HEX: 30 } gb18030 decode >array ] unit-test
|
||||
[ { HEX: B7 CHAR: replacement-character } ]
|
||||
[ B{ HEX: A1 HEX: A4 HEX: 81 } gb18030 decode >array ] unit-test
|
||||
[ { HEX: B7 } ]
|
||||
[ B{ HEX: A1 HEX: A4 } gb18030 decode >array ] unit-test
|
||||
[ { CHAR: replacement-character } ]
|
||||
[ B{ HEX: A1 } gb18030 decode >array ] unit-test
|
||||
[ { HEX: 44D7 HEX: 464B } ]
|
||||
[ B{ HEX: 82 HEX: 33 HEX: A3 HEX: 39 HEX: 82 HEX: 33 HEX: C9 HEX: 31 }
|
||||
gb18030 decode >array ] unit-test
|
||||
[ { HEX: 82 HEX: 33 HEX: A3 HEX: 39 HEX: 82 HEX: 33 HEX: C9 HEX: 31 } ]
|
||||
[ { HEX: 44D7 HEX: 464B } gb18030 encode >array ] unit-test
|
|
@ -0,0 +1,133 @@
|
|||
! Copyright (C) 2009 Daniel Ehrenberg
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: xml xml.data kernel io io.encodings interval-maps splitting fry
|
||||
math.parser sequences combinators assocs locals accessors math
|
||||
arrays values io.encodings.ascii ascii io.files biassocs math.order
|
||||
combinators.short-circuit io.binary ;
|
||||
IN: io.encodings.chinese
|
||||
|
||||
SINGLETON: gb18030
|
||||
|
||||
<PRIVATE
|
||||
|
||||
! GB to mean GB18030 is a terrible abuse of notation
|
||||
|
||||
! Resource file from:
|
||||
! http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml
|
||||
|
||||
TUPLE: range ufirst ulast bfirst blast ;
|
||||
|
||||
: b>byte-array ( string -- byte-array )
|
||||
" " split [ hex> ] B{ } map-as ;
|
||||
|
||||
: add-range ( contained ranges -- )
|
||||
[
|
||||
{
|
||||
[ "uFirst" attr hex> ]
|
||||
[ "uLast" attr hex> ]
|
||||
[ "bFirst" attr b>byte-array ]
|
||||
[ "bLast" attr b>byte-array ]
|
||||
} cleave range boa
|
||||
] dip push ;
|
||||
|
||||
: add-mapping ( contained mapping -- )
|
||||
[
|
||||
[ "b" attr b>byte-array ]
|
||||
[ "u" attr hex> ] bi
|
||||
] dip set-at ;
|
||||
|
||||
: xml>gb-data ( stream -- mapping ranges )
|
||||
[let | mapping [ H{ } clone ] ranges [ V{ } clone ] |
|
||||
[
|
||||
dup contained? [
|
||||
dup name>> main>> {
|
||||
{ "range" [ ranges add-range ] }
|
||||
{ "a" [ mapping add-mapping ] }
|
||||
[ 2drop ]
|
||||
} case
|
||||
] [ drop ] if
|
||||
] each-element mapping ranges
|
||||
] ;
|
||||
|
||||
! Algorithms from:
|
||||
! http://www-128.ibm.com/developerworks/library/u-china.html
|
||||
|
||||
: linear ( bytes -- num )
|
||||
! This hard-codes bMin and bMax
|
||||
reverse first4
|
||||
10 * + 126 * + 10 * + ;
|
||||
|
||||
: unlinear ( num -- bytes )
|
||||
B{ HEX: 81 HEX: 30 HEX: 81 HEX: 30 } linear -
|
||||
10 /mod swap [ HEX: 30 + ] dip
|
||||
126 /mod swap [ HEX: 81 + ] dip
|
||||
10 /mod swap [ HEX: 30 + ] dip
|
||||
HEX: 81 +
|
||||
B{ } 4sequence reverse ;
|
||||
|
||||
: >interval-map-by ( start-quot end-quot value-quot seq -- interval-map )
|
||||
'[ _ [ @ 2array ] _ tri ] { } map>assoc <interval-map> ; inline
|
||||
|
||||
: ranges-u>gb ( ranges -- interval-map )
|
||||
[ ufirst>> ] [ ulast>> ] [ ] >interval-map-by ;
|
||||
|
||||
: ranges-gb>u ( ranges -- interval-map )
|
||||
[ bfirst>> linear ] [ blast>> linear ] [ ] >interval-map-by ;
|
||||
|
||||
VALUE: gb>u
|
||||
VALUE: u>gb
|
||||
VALUE: mapping
|
||||
|
||||
"resource:basis/io/encodings/chinese/gb-18030-2000.xml"
|
||||
ascii <file-reader> xml>gb-data
|
||||
[ ranges-u>gb to: u>gb ] [ ranges-gb>u to: gb>u ] bi
|
||||
>biassoc to: mapping
|
||||
|
||||
: lookup-range ( char -- byte-array )
|
||||
dup u>gb interval-at [
|
||||
[ ufirst>> - ] [ bfirst>> linear ] bi + unlinear
|
||||
] [ encode-error ] if* ;
|
||||
|
||||
M: gb18030 encode-char ( char stream encoding -- )
|
||||
drop [
|
||||
dup mapping at
|
||||
[ ] [ lookup-range ] ?if
|
||||
] dip stream-write ;
|
||||
|
||||
: second-byte? ( ch -- ? ) ! of a double-byte character
|
||||
{ [ HEX: 40 HEX: 7E between? ] [ HEX: 80 HEX: fe between? ] } 1|| ;
|
||||
|
||||
: quad-1/3? ( ch -- ? ) HEX: 81 HEX: fe between? ;
|
||||
|
||||
: quad-2/4? ( ch -- ? ) HEX: 30 HEX: 39 between? ;
|
||||
|
||||
: last-bytes? ( byte-array -- ? )
|
||||
{ [ length 2 = ] [ first quad-1/3? ] [ second quad-2/4? ] } 1&& ;
|
||||
|
||||
: decode-quad ( byte-array -- char )
|
||||
dup mapping value-at [ ] [
|
||||
linear dup gb>u interval-at [
|
||||
[ bfirst>> linear - ] [ ufirst>> ] bi +
|
||||
] [ drop replacement-char ] if*
|
||||
] ?if ;
|
||||
|
||||
: four-byte ( stream byte1 byte2 -- char )
|
||||
rot 2 swap stream-read dup last-bytes?
|
||||
[ first2 B{ } 4sequence decode-quad ]
|
||||
[ 3drop replacement-char ] if ;
|
||||
|
||||
: two-byte ( stream byte -- char )
|
||||
over stream-read1 {
|
||||
{ [ dup not ] [ 3drop replacement-char ] }
|
||||
{ [ dup second-byte? ] [ B{ } 2sequence mapping value-at nip ] }
|
||||
{ [ dup quad-2/4? ] [ four-byte ] }
|
||||
[ 3drop replacement-char ]
|
||||
} cond ;
|
||||
|
||||
M: gb18030 decode-char ( stream encoding -- char )
|
||||
drop dup stream-read1 {
|
||||
{ [ dup not ] [ 2drop f ] }
|
||||
{ [ dup ascii? ] [ nip 1array B{ } like mapping value-at ] }
|
||||
{ [ dup quad-1/3? ] [ two-byte ] }
|
||||
[ 2drop replacement-char ]
|
||||
} cond ;
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue