From 8bbc144ce7bb7eddd0aa5a737595e9b4ad2850e9 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Mon, 11 Feb 2008 17:44:14 -0600 Subject: [PATCH] io/utf8 and 16 were moved to core/io/encodings --- core/io/utf16/authors.txt | 1 - core/io/utf16/summary.txt | 1 - core/io/utf16/utf16-docs.factor | 45 ------------ core/io/utf16/utf16-tests.factor | 15 ---- core/io/utf16/utf16.factor | 116 ------------------------------- core/io/utf8/authors.txt | 1 - core/io/utf8/summary.txt | 1 - core/io/utf8/utf8-docs.factor | 18 ----- core/io/utf8/utf8-tests.factor | 16 ----- core/io/utf8/utf8.factor | 72 ------------------- 10 files changed, 286 deletions(-) delete mode 100644 core/io/utf16/authors.txt delete mode 100644 core/io/utf16/summary.txt delete mode 100644 core/io/utf16/utf16-docs.factor delete mode 100755 core/io/utf16/utf16-tests.factor delete mode 100755 core/io/utf16/utf16.factor delete mode 100644 core/io/utf8/authors.txt delete mode 100644 core/io/utf8/summary.txt delete mode 100644 core/io/utf8/utf8-docs.factor delete mode 100644 core/io/utf8/utf8-tests.factor delete mode 100644 core/io/utf8/utf8.factor diff --git a/core/io/utf16/authors.txt b/core/io/utf16/authors.txt deleted file mode 100644 index f990dd0ed2..0000000000 --- a/core/io/utf16/authors.txt +++ /dev/null @@ -1 +0,0 @@ -Daniel Ehrenberg diff --git a/core/io/utf16/summary.txt b/core/io/utf16/summary.txt deleted file mode 100644 index b2490675aa..0000000000 --- a/core/io/utf16/summary.txt +++ /dev/null @@ -1 +0,0 @@ -UTF16 encoding/decoding diff --git a/core/io/utf16/utf16-docs.factor b/core/io/utf16/utf16-docs.factor deleted file mode 100644 index 6d24f54694..0000000000 --- a/core/io/utf16/utf16-docs.factor +++ /dev/null @@ -1,45 +0,0 @@ -USING: help.markup help.syntax io.encodings strings ; -IN: io.utf16 - -ARTICLE: "io.utf16" "Working with UTF16-encoded data" -"The UTF16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences." -{ $subsection encode-utf16le } -{ $subsection encode-utf16be } -{ $subsection decode-utf16le } -{ $subsection decode-utf16be } -"Support for UTF16 data with a byte order mark:" -{ $subsection encode-utf16 } -{ $subsection decode-utf16 } ; - -ABOUT: "io.utf16" - -HELP: decode-utf16 -{ $values { "seq" "a sequence of bytes" } { "str" string } } -{ $description "Decodes a sequence of bytes representing a Unicode string in UTF16 format. The bytes must begin with a UTF16 byte order mark, which determines if the input is in little or big endian. To decode data without a byte order mark, use " { $link decode-utf16le } " or " { $link decode-utf16be } "." } -{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ; - -HELP: decode-utf16be -{ $values { "seq" "a sequence of bytes" } { "str" string } } -{ $description "Decodes a sequence of bytes representing a Unicode string in big endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." } -{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ; - -HELP: decode-utf16le -{ $values { "seq" "a sequence of bytes" } { "str" string } } -{ $description "Decodes a sequence of bytes representing a Unicode string in little endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." } -{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ; - -{ decode-utf16 decode-utf16le decode-utf16be } related-words - -HELP: encode-utf16be -{ $values { "str" string } { "seq" "a sequence of bytes" } } -{ $description "Encodes a Unicode string as a sequence of bytes in big endian UTF16 format." } ; - -HELP: encode-utf16le -{ $values { "str" string } { "seq" "a sequence of bytes" } } -{ $description "Encodes a Unicode string as a sequence of bytes in little endian UTF16 format." } ; - -HELP: encode-utf16 -{ $values { "str" string } { "seq" "a sequence of bytes" } } -{ $description "Encodes a Unicode string as a sequence of bytes in UTF16 format with a byte order mark." } ; - -{ encode-utf16 encode-utf16be encode-utf16le } related-words diff --git a/core/io/utf16/utf16-tests.factor b/core/io/utf16/utf16-tests.factor deleted file mode 100755 index 9800a9827d..0000000000 --- a/core/io/utf16/utf16-tests.factor +++ /dev/null @@ -1,15 +0,0 @@ -USING: tools.test io.utf16 arrays unicode.syntax ; - -[ { CHAR: x } ] [ { 0 CHAR: x } decode-utf16be >array ] unit-test -[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } decode-utf16be >array ] unit-test -[ { UNICHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } decode-utf16be >array ] unit-test -[ { UNICHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } decode-utf16be >array ] unit-test - -[ B{ 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } encode-utf16be ] unit-test - -[ { CHAR: x } ] [ { CHAR: x 0 } decode-utf16le >array ] unit-test -[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } decode-utf16le >array ] unit-test -[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011111 } decode-utf16le >array ] unit-test -[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } decode-utf16le >array ] unit-test - -[ B{ 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } encode-utf16le ] unit-test diff --git a/core/io/utf16/utf16.factor b/core/io/utf16/utf16.factor deleted file mode 100755 index 19ebc1d43a..0000000000 --- a/core/io/utf16/utf16.factor +++ /dev/null @@ -1,116 +0,0 @@ -! Copyright (C) 2006, 2007 Daniel Ehrenberg. -! See http://factorcode.org/license.txt for BSD license. -USING: math kernel sequences sbufs vectors namespaces io.binary -io.encodings combinators splitting ; -IN: io.utf16 - -SYMBOL: double -SYMBOL: quad1 -SYMBOL: quad2 -SYMBOL: quad3 -SYMBOL: ignore - -: do-ignore ( -- ch state ) 0 ignore ; - -: append-nums ( byte ch -- ch ) - 8 shift bitor ; - -: end-multibyte ( buf byte ch -- buf ch state ) - append-nums decoded ; - -: begin-utf16be ( buf byte -- buf ch state ) - dup -3 shift BIN: 11011 number= [ - dup BIN: 00000100 bitand zero? - [ BIN: 11 bitand quad1 ] - [ drop do-ignore ] if - ] [ double ] if ; - -: handle-quad2be ( byte ch -- ch state ) - swap dup -2 shift BIN: 110111 number= [ - >r 2 shift r> BIN: 11 bitand bitor quad3 - ] [ 2drop do-ignore ] if ; - -: (decode-utf16be) ( buf byte ch state -- buf ch state ) - { - { begin [ drop begin-utf16be ] } - { double [ end-multibyte ] } - { quad1 [ append-nums quad2 ] } - { quad2 [ handle-quad2be ] } - { quad3 [ append-nums HEX: 10000 + decoded ] } - { ignore [ 2drop push-replacement ] } - } case ; - -: decode-utf16be ( seq -- str ) - [ -rot (decode-utf16be) ] decode ; - -: handle-double ( buf byte ch -- buf ch state ) - swap dup -3 shift BIN: 11011 = [ - dup BIN: 100 bitand 0 number= - [ BIN: 11 bitand 8 shift bitor quad2 ] - [ 2drop push-replacement ] if - ] [ end-multibyte ] if ; - -: handle-quad3le ( buf byte ch -- buf ch state ) - swap dup -2 shift BIN: 110111 = [ - BIN: 11 bitand append-nums HEX: 10000 + decoded - ] [ 2drop push-replacement ] if ; - -: (decode-utf16le) ( buf byte ch state -- buf ch state ) - { - { begin [ drop double ] } - { double [ handle-double ] } - { quad1 [ append-nums quad2 ] } - { quad2 [ 10 shift bitor quad3 ] } - { quad3 [ handle-quad3le ] } - } case ; - -: decode-utf16le ( seq -- str ) - [ -rot (decode-utf16le) ] decode ; - -: encode-first - -10 shift - dup -8 shift BIN: 11011000 bitor - swap HEX: FF bitand ; - -: encode-second - BIN: 1111111111 bitand - dup -8 shift BIN: 11011100 bitor - swap BIN: 11111111 bitand ; - -: char>utf16be ( char -- ) - dup HEX: FFFF > [ - HEX: 10000 - - dup encode-first swap , , - encode-second swap , , - ] [ h>b/b , , ] if ; - -: encode-utf16be ( str -- seq ) - [ [ char>utf16be ] each ] B{ } make ; - -: char>utf16le ( char -- ) - dup HEX: FFFF > [ - HEX: 10000 - - dup encode-first , , - encode-second , , - ] [ h>b/b swap , , ] if ; - -: encode-utf16le ( str -- seq ) - [ [ char>utf16le ] each ] B{ } make ; - -: bom-le B{ HEX: ff HEX: fe } ; inline - -: bom-be B{ HEX: fe HEX: ff } ; inline - -: encode-utf16 ( str -- seq ) - encode-utf16le bom-le swap append ; - -: utf16le? ( seq1 -- seq2 ? ) bom-le ?head ; - -: utf16be? ( seq1 -- seq2 ? ) bom-be ?head ; - -: decode-utf16 ( seq -- str ) - { - { [ utf16le? ] [ decode-utf16le ] } - { [ utf16be? ] [ decode-utf16be ] } - { [ t ] [ decode-error ] } - } cond ; diff --git a/core/io/utf8/authors.txt b/core/io/utf8/authors.txt deleted file mode 100644 index f990dd0ed2..0000000000 --- a/core/io/utf8/authors.txt +++ /dev/null @@ -1 +0,0 @@ -Daniel Ehrenberg diff --git a/core/io/utf8/summary.txt b/core/io/utf8/summary.txt deleted file mode 100644 index afd259a56b..0000000000 --- a/core/io/utf8/summary.txt +++ /dev/null @@ -1 +0,0 @@ -UTF8 encoding/decoding diff --git a/core/io/utf8/utf8-docs.factor b/core/io/utf8/utf8-docs.factor deleted file mode 100644 index 28310b5d77..0000000000 --- a/core/io/utf8/utf8-docs.factor +++ /dev/null @@ -1,18 +0,0 @@ -USING: help.markup help.syntax io.encodings strings ; -IN: io.utf8 - -ARTICLE: "io.utf8" "Working with UTF8-encoded data" -"The UTF8 encoding is a variable-width encoding. 7-bit ASCII characters are encoded as single bytes, and other Unicode code points are encoded as 2 to 4 byte sequences." -{ $subsection encode-utf8 } -{ $subsection decode-utf8 } ; - -ABOUT: "io.utf8" - -HELP: decode-utf8 -{ $values { "seq" "a sequence of bytes" } { "str" string } } -{ $description "Decodes a sequence of bytes representing a Unicode string in UTF8 format." } -{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ; - -HELP: encode-utf8 -{ $values { "str" string } { "seq" "a sequence of bytes" } } -{ $description "Encodes a Unicode string as a sequence of bytes in UTF8 format." } ; diff --git a/core/io/utf8/utf8-tests.factor b/core/io/utf8/utf8-tests.factor deleted file mode 100644 index 3576471586..0000000000 --- a/core/io/utf8/utf8-tests.factor +++ /dev/null @@ -1,16 +0,0 @@ -USING: io.utf8 tools.test strings arrays unicode.syntax ; - -[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test - -[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test - -[ "x" ] [ "x" decode-utf8 >string ] unit-test - -[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test - -[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test - -[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test - -[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ] -[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test diff --git a/core/io/utf8/utf8.factor b/core/io/utf8/utf8.factor deleted file mode 100644 index 213afb6eae..0000000000 --- a/core/io/utf8/utf8.factor +++ /dev/null @@ -1,72 +0,0 @@ -! Copyright (C) 2006, 2007 Daniel Ehrenberg. -! See http://factorcode.org/license.txt for BSD license. -USING: math kernel sequences sbufs vectors -namespaces io.encodings combinators ; -IN: io.utf8 - -SYMBOL: double -SYMBOL: triple -SYMBOL: triple2 -SYMBOL: quad -SYMBOL: quad2 -SYMBOL: quad3 - -: starts-2? ( char -- ? ) - -6 shift BIN: 10 number= ; - -: append-nums ( buf bottom top state-out -- buf num state ) - >r over starts-2? - [ 6 shift swap BIN: 111111 bitand bitor r> ] - [ r> 3drop push-replacement ] if ; - -: begin-utf8 ( buf byte -- buf ch state ) - { - { [ dup -7 shift zero? ] [ decoded ] } - { [ dup -5 shift BIN: 110 number= ] [ BIN: 11111 bitand double ] } - { [ dup -4 shift BIN: 1110 number= ] [ BIN: 1111 bitand triple ] } - { [ dup -3 shift BIN: 11110 number= ] [ BIN: 111 bitand quad ] } - { [ t ] [ drop push-replacement ] } - } cond ; - -: end-multibyte ( buf byte ch -- buf ch state ) - f append-nums [ decoded ] unless* ; - -: (decode-utf8) ( buf byte ch state -- buf ch state ) - { - { begin [ drop begin-utf8 ] } - { double [ end-multibyte ] } - { triple [ triple2 append-nums ] } - { triple2 [ end-multibyte ] } - { quad [ quad2 append-nums ] } - { quad2 [ quad3 append-nums ] } - { quad3 [ end-multibyte ] } - } case ; - -: decode-utf8 ( seq -- str ) - [ -rot (decode-utf8) ] decode ; - -: encoded ( char -- ) - BIN: 111111 bitand BIN: 10000000 bitor , ; - -: char>utf8 ( char -- ) - { - { [ dup -7 shift zero? ] [ , ] } - { [ dup -11 shift zero? ] [ - dup -6 shift BIN: 11000000 bitor , - encoded - ] } - { [ dup -16 shift zero? ] [ - dup -12 shift BIN: 11100000 bitor , - dup -6 shift encoded - encoded - ] } - { [ t ] [ - dup -18 shift BIN: 11110000 bitor , - dup -12 shift encoded - dup -6 shift encoded - encoded - ] } - } cond ; - -: encode-utf8 ( str -- seq ) - [ [ char>utf8 ] each ] B{ } make ;