Merge branch 'master' of git://factorcode.org/git/littledan
commit 205d5ca6cb

@@ -0,0 +1 @@
Dummy encoding for binary I/O

@@ -0,0 +1,5 @@
USING: help.syntax help.markup ;
IN: io.encodings.binary

HELP: binary
{ $class-description "This is the encoding descriptor for binary I/O." } ;

@@ -0,0 +1,6 @@
USING: kernel io.encodings ;

TUPLE: binary ;

M: binary init-decoding drop ;
M: binary init-encoding drop ;

@@ -0,0 +1 @@
text

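Editor's note, not part of the commit: a minimal sketch of what the dummy encoding buys you, assuming the <decoding> constructor defined in io.encodings later in this diff. Because both methods simply drop the descriptor, wrapping a stream in the binary encoding hands back the underlying stream unchanged. The word binary-passthrough? is hypothetical, for illustration only.

USING: io.encodings io.encodings.binary kernel ;

! Wrapping with the dummy binary encoding returns the very same
! stream object, since M: binary init-decoding just drops.
: binary-passthrough? ( stream -- ? )
    dup binary <decoding> eq? ;
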
@@ -1,7 +1,7 @@
! Copyright (C) 2006, 2007 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors
namespaces unicode.syntax ;
namespaces unicode.syntax growable strings io ;
IN: io.encodings

TUPLE: encode-error ;

@@ -23,6 +23,42 @@ SYMBOL: begin
: finish-decoding ( buf ch state -- str )
    begin eq? [ decode-error ] unless drop "" like ;

: decode ( seq quot -- str )
    >r [ length <sbuf> 0 begin ] keep r> each
: start-decoding ( seq length -- buf ch state seq )
    <sbuf> 0 begin roll ;

: decode ( seq quot -- string )
    >r dup length start-decoding r>
    [ -rot ] swap compose each
    finish-decoding ; inline

: space ( resizable -- room-left )
    dup underlying swap [ length ] 2apply - ;

: full? ( resizable -- ? ) space zero? ;

: end-read-loop ( buf ch state stream quot -- string/f )
    2drop 2drop >string f like ;

: under ( a b c -- c a b c )
    tuck >r swapd r> ; inline

: decode-read-loop ( buf ch state stream quot -- string/f )
    >r >r pick r> r> rot full? [ end-read-loop ] [
        over stream-read1 [
            -rot tuck >r >r >r -rot r> call r> r> decode-read-loop
        ] [ end-read-loop ] if*
    ] if ; inline

: decode-read ( length stream quot -- string )
    >r swap start-decoding r>
    decode-read-loop ; inline

GENERIC: init-decoding ( stream encoding -- decoded-stream )

: <decoding> ( stream decoding-class -- decoded-stream )
    construct-empty init-decoding ;

GENERIC: init-encoding ( stream encoding -- encoded-stream )

: <encoding> ( stream encoding-class -- encoded-stream )
    construct-empty init-encoding ;

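Editor's note, not part of the commit: the new decode combinator above threads a ( buf ch state ) accumulator through a per-byte step quotation with stack effect ( buf byte ch state -- buf ch state ). A hypothetical pass-through step makes the shape concrete: it copies each byte into the buffer and leaves ch and state alone, so finish-decoding still sees the begin state.

USING: io.encodings kernel sequences ;

! identity-step is not a word in this commit, only an illustration
! of the stack effect every decoder step is expected to have.
: identity-step ( buf byte ch state -- buf ch state )
    >r >r over push r> r> ;

! B{ 104 105 } [ identity-step ] decode   ! => "hi"
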
@@ -0,0 +1 @@
ISO 8859-1 encoding/decoding

@@ -0,0 +1,5 @@
USING: help.syntax help.markup ;
IN: io.encodings.latin1

HELP: latin1
{ $class-description "This class is used for Latin 1 (ISO 8859-1) encoding and decoding" } ;

@@ -0,0 +1,19 @@
USING: io.encodings strings kernel ;
IN: io.encodings.latin1

TUPLE: latin1 stream ;

M: latin1 init-decoding tuck set-latin1-stream ;
M: latin1 init-encoding drop ;

M: latin1 stream-read1
    latin1-stream stream-read1 ;

M: latin1 stream-read
    latin1-stream stream-read >string ;

M: latin1 stream-read-until
    latin1-stream stream-read-until >string ;

M: latin1 stream-readln
    latin1-stream stream-readln >string ;

@@ -0,0 +1 @@
text

Binary file not shown.
@@ -0,0 +1 @@
UTF-16, UTF-16LE, UTF-16BE encoding and decoding

@@ -0,0 +1 @@
Daniel Ehrenberg

@@ -0,0 +1 @@
text

@@ -1,5 +1,5 @@
USING: help.markup help.syntax io.encodings strings ;
IN: io.utf16
IN: io.encodings.utf16

ARTICLE: "io.utf16" "Working with UTF16-encoded data"
"The UTF16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences."

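Editor's note, not part of the commit: in byte terms, the article's claim looks like this, assuming decode-utf16le and decode-utf16be behave as standard UTF-16 decoders and end up in io.encodings.utf16 after the rename. The demo word itself is hypothetical.

USING: io.encodings.utf16 ;

! U+0048 (the letter H) fits in one two-byte unit; only the byte order differs.
: utf16-demo ( -- le be )
    B{ HEX: 48 HEX: 00 } decode-utf16le      ! expected "H"
    B{ HEX: 00 HEX: 48 } decode-utf16be ;    ! expected "H"
! Code points above U+FFFF take a four-byte surrogate pair instead.
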
@@ -30,7 +30,7 @@ SYMBOL: ignore
        >r 2 shift r> BIN: 11 bitand bitor quad3
    ] [ 2drop do-ignore ] if ;

: (decode-utf16be) ( buf byte ch state -- buf ch state )
: decode-utf16be-step ( buf byte ch state -- buf ch state )
    {
        { begin [ drop begin-utf16be ] }
        { double [ end-multibyte ] }

@@ -41,7 +41,7 @@ SYMBOL: ignore
    } case ;

: decode-utf16be ( seq -- str )
    [ -rot (decode-utf16be) ] decode ;
    [ decode-utf16be-step ] decode ;

: handle-double ( buf byte ch -- buf ch state )
    swap dup -3 shift BIN: 11011 = [

@@ -55,7 +55,7 @@ SYMBOL: ignore
        BIN: 11 bitand append-nums HEX: 10000 + decoded
    ] [ 2drop push-replacement ] if ;

: (decode-utf16le) ( buf byte ch state -- buf ch state )
: decode-utf16le-step ( buf byte ch state -- buf ch state )
    {
        { begin [ drop double ] }
        { double [ handle-double ] }

@@ -65,7 +65,7 @@ SYMBOL: ignore
    } case ;

: decode-utf16le ( seq -- str )
    [ -rot (decode-utf16le) ] decode ;
    [ decode-utf16le-step ] decode ;

: encode-first
    -10 shift

@@ -114,3 +114,35 @@ SYMBOL: ignore
    { [ utf16be? ] [ decode-utf16be ] }
    { [ t ] [ decode-error ] }
} cond ;

! UTF16LE streams

TUPLE: utf16le ;
: <utf16le> utf16le construct-delegate ;
! In the future, this should detect and ignore a BOM at the beginning

M: utf16le init-decoding ( stream utf16le -- utf16le-stream )
    tuck set-delegate ;

M: utf16le init-encoding ( stream utf16le -- utf16le-stream )
    tuck set-delegate ;

M: utf16le stream-read1 1 swap stream-read ;

M: utf16le stream-read
    delegate [ decode-utf16le-step ] decode-read ;

M: utf16le stream-read-partial stream-read ;

M: utf16le stream-read-until
    ! Copied from { c-reader stream-read-until }!!!
    [ swap read-until-loop ] "" make
    swap over empty? over not and [ 2drop f f ] when ;

M: utf16le stream-write1
    >r 1string r> stream-write ;

M: utf16le stream-write
    >r encode-utf16le r> delegate stream-write ;

M: utf16le dispose delegate dispose ;

@@ -0,0 +1 @@
UTF-8 encoding and decoding

@@ -0,0 +1 @@
Daniel Ehrenberg

@@ -0,0 +1 @@
text

@@ -1,5 +1,5 @@
USING: help.markup help.syntax io.encodings strings ;
IN: io.utf8
IN: io.encodings.utf8

ARTICLE: "io.utf8" "Working with UTF8-encoded data"
"The UTF8 encoding is a variable-width encoding. 7-bit ASCII characters are encoded as single bytes, and other Unicode code points are encoded as 2 to 4 byte sequences."

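Editor's note, not part of the commit: a worked byte-level example of the variable-width claim, using the encode-utf8 and decode-utf8 words changed below. U+00E9 is 000 1110 1001 in binary; UTF-8 packs it as 110-00011 10-101001, i.e. the bytes C3 A9, while ASCII stays a single byte. The demo word itself is hypothetical.

USING: io.encodings.utf8 ;

: utf8-demo ( -- bytes str )
    { HEX: E9 CHAR: x } encode-utf8              ! expected B{ HEX: C3 HEX: A9 CHAR: x }
    B{ HEX: C3 HEX: A9 CHAR: x } decode-utf8 ;   ! expected a 2-character string: U+00E9 then x
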
@@ -0,0 +1,23 @@
USING: io.encodings.utf8 tools.test sbufs kernel io
sequences strings arrays unicode.syntax ;

: decode-utf8-w/stream ( array -- newarray )
    >sbuf dup reverse-here <utf8> contents >array ;

: encode-utf8-w/stream ( array -- newarray )
    SBUF" " clone tuck <utf8> write >array ;

[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream >array ] unit-test

[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test

[ "x" ] [ "x" decode-utf8-w/stream >string ] unit-test

[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8-w/stream >array ] unit-test

[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8-w/stream >array ] unit-test

[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test

[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test

@@ -1,8 +1,10 @@
! Copyright (C) 2006, 2007 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors
namespaces io.encodings combinators ;
IN: io.utf8
USING: math kernel sequences sbufs vectors growable io continuations
namespaces io.encodings combinators strings io.streams.c ;
IN: io.encodings.utf8

! Decoding UTF-8

SYMBOL: double
SYMBOL: triple

@@ -31,7 +33,7 @@ SYMBOL: quad3
: end-multibyte ( buf byte ch -- buf ch state )
    f append-nums [ decoded ] unless* ;

: (decode-utf8) ( buf byte ch state -- buf ch state )
: decode-utf8-step ( buf byte ch state -- buf ch state )
    {
        { begin [ drop begin-utf8 ] }
        { double [ end-multibyte ] }

@@ -43,7 +45,9 @@ SYMBOL: quad3
    } case ;

: decode-utf8 ( seq -- str )
    [ -rot (decode-utf8) ] decode ;
    [ decode-utf8-step ] decode ;

! Encoding UTF-8

: encoded ( char -- )
    BIN: 111111 bitand BIN: 10000000 bitor , ;

@@ -70,3 +74,35 @@ SYMBOL: quad3

: encode-utf8 ( str -- seq )
    [ [ char>utf8 ] each ] B{ } make ;

! Interface for streams

TUPLE: utf8 ;
: <utf8> utf8 construct-delegate ;
! In the future, this should detect and ignore a BOM at the beginning

M: utf8 init-decoding ( stream utf8 -- utf8-stream )
    tuck set-delegate ;

M: utf8 init-encoding ( stream utf8 -- utf8-stream )
    tuck set-delegate ;

M: utf8 stream-read1 1 swap stream-read ;

M: utf8 stream-read
    delegate [ decode-utf8-step ] decode-read ;

M: utf8 stream-read-partial stream-read ;

M: utf8 stream-read-until
    ! Copied from { c-reader stream-read-until }!!!
    [ swap read-until-loop ] "" make
    swap over empty? over not and [ 2drop f f ] when ;

M: utf8 stream-write1
    >r 1string r> stream-write ;

M: utf8 stream-write
    >r encode-utf8 r> delegate stream-write ;

M: utf8 dispose delegate dispose ;

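Editor's note, not part of the commit: both the utf16le and utf8 tuples wrap an underlying stream via construct-delegate, so the generic stream words decode and encode transparently through the delegate. A hypothetical helper, assuming <file-reader> comes from io.files and contents from io, as they are used elsewhere in the library at this time:

USING: io io.files io.encodings.utf8 ;

! Read a whole file, decoding its bytes as UTF-8 on the way in.
: read-utf8-file ( path -- str )
    <file-reader> <utf8> contents ;
! The utf16le wrapper added earlier in this diff is used the same way.
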
@@ -1,16 +0,0 @@
USING: io.utf8 tools.test strings arrays unicode.syntax ;

[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test

[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test

[ "x" ] [ "x" decode-utf8 >string ] unit-test

[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test

[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test

[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test

[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test

@@ -1,24 +0,0 @@
USING: kernel parser words sequences ;
IN: const

: define-const ( word value -- )
    [ parsed ] curry dupd define
    t "parsing" set-word-prop ;

: CONST:
    CREATE scan-word dup parsing?
    [ execute dup pop ] when define-const ; parsing

: define-enum ( words -- )
    dup length [ define-const ] 2each ;

: ENUM:
    ";" parse-tokens [ create-in ] map define-enum ; parsing

: define-value ( word -- )
    { f } clone [ first ] curry define ;

: VALUE: CREATE define-value ; parsing

: set-value ( value word -- )
    word-def first set-first ;

@@ -1,7 +1,7 @@
! Copyright (C) 2007 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: delegate sequences.private sequences assocs prettyprint.sections
io definitions kernel ;
io definitions kernel continuations ;
IN: delegate.protocols

PROTOCOL: sequence-protocol

@@ -15,7 +15,7 @@ PROTOCOL: assoc-protocol
! everything should work, just slower (with >alist)

PROTOCOL: stream-protocol
stream-read1 stream-read stream-read-until
stream-read1 stream-read stream-read-until dispose
stream-flush stream-write1 stream-write stream-format
stream-nl make-span-stream make-block-stream stream-readln
make-cell-stream stream-write-table ;

|
@ -7,8 +7,11 @@ IN: multiline
|
|||
lexer get dup next-line lexer-line-text ;
|
||||
|
||||
: (parse-here) ( -- )
|
||||
next-line-text dup ";" =
|
||||
[ drop lexer get next-line ] [ % "\n" % (parse-here) ] if ;
|
||||
next-line-text [
|
||||
dup ";" =
|
||||
[ drop lexer get next-line ]
|
||||
[ % "\n" % (parse-here) ] if
|
||||
] [ ";" unexpected-eof ] if* ;
|
||||
|
||||
: parse-here ( -- str )
|
||||
[ (parse-here) ] "" make 1 head*
|
||||
|
@ -19,11 +22,13 @@ IN: multiline
|
|||
parse-here 1quotation define-inline ; parsing
|
||||
|
||||
: (parse-multiline-string) ( start-index end-text -- end-index )
|
||||
lexer get lexer-line-text 2dup start
|
||||
[ rot dupd >r >r swap subseq % r> r> length + ] [
|
||||
rot tail % "\n" % 0
|
||||
lexer get next-line swap (parse-multiline-string)
|
||||
] if* ;
|
||||
lexer get lexer-line-text [
|
||||
2dup start
|
||||
[ rot dupd >r >r swap subseq % r> r> length + ] [
|
||||
rot tail % "\n" % 0
|
||||
lexer get next-line swap (parse-multiline-string)
|
||||
] if*
|
||||
] [ nip unexpected-eof ] if* ;
|
||||
|
||||
: parse-multiline-string ( end-text -- str )
|
||||
[
|
||||
|
|
|
@@ -1,7 +1,7 @@
USING: unicode.categories kernel math combinators splitting
sequences math.parser io.files io assocs arrays namespaces
combinators.lib assocs.lib math.ranges unicode.normalize
unicode.syntax unicode.data compiler.units alien.syntax const ;
unicode.syntax unicode.data compiler.units alien.syntax ;
IN: unicode.breaks

C-ENUM: Any L V T Extend Control CR LF graphemes ;