Merge branch 'master' of git://factorcode.org/git/littledan

db4
Slava Pestov 2008-02-13 17:56:30 -06:00
commit 205d5ca6cb
31 changed files with 201 additions and 64 deletions

View File

@ -0,0 +1 @@
Dummy encoding for binary I/O

View File

@ -0,0 +1,5 @@
USING: help.syntax help.markup ;
IN: io.encodings.binary
HELP: binary
{ $class-description "This is the encoding descriptor for binary I/O." } ;

View File

@ -0,0 +1,6 @@
USING: kernel io.encodings ;
TUPLE: binary ;
M: binary init-decoding drop ;
M: binary init-encoding drop ;

View File

@ -0,0 +1 @@
text

View File

@ -1,7 +1,7 @@
! Copyright (C) 2006, 2007 Daniel Ehrenberg. ! Copyright (C) 2006, 2007 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors USING: math kernel sequences sbufs vectors
namespaces unicode.syntax ; namespaces unicode.syntax growable strings io ;
IN: io.encodings IN: io.encodings
TUPLE: encode-error ; TUPLE: encode-error ;
@ -23,6 +23,42 @@ SYMBOL: begin
: finish-decoding ( buf ch state -- str ) : finish-decoding ( buf ch state -- str )
begin eq? [ decode-error ] unless drop "" like ; begin eq? [ decode-error ] unless drop "" like ;
: decode ( seq quot -- str ) : start-decoding ( seq length -- buf ch state seq )
>r [ length <sbuf> 0 begin ] keep r> each <sbuf> 0 begin roll ;
: decode ( seq quot -- string )
>r dup length start-decoding r>
[ -rot ] swap compose each
finish-decoding ; inline finish-decoding ; inline
: space ( resizable -- room-left )
dup underlying swap [ length ] 2apply - ;
: full? ( resizable -- ? ) space zero? ;
: end-read-loop ( buf ch state stream quot -- string/f )
2drop 2drop >string f like ;
: under ( a b c -- c a b c )
tuck >r swapd r> ; inline
: decode-read-loop ( buf ch state stream quot -- string/f )
>r >r pick r> r> rot full? [ end-read-loop ] [
over stream-read1 [
-rot tuck >r >r >r -rot r> call r> r> decode-read-loop
] [ end-read-loop ] if*
] if ; inline
: decode-read ( length stream quot -- string )
>r swap start-decoding r>
decode-read-loop ; inline
GENERIC: init-decoding ( stream encoding -- decoded-stream )
: <decoding> ( stream decoding-class -- decoded-stream )
construct-empty init-decoding ;
GENERIC: init-encoding ( stream encoding -- encoded-stream )
: <encoding> ( stream encoding-class -- encoded-stream )
construct-empty init-encoding ;

View File

@ -0,0 +1 @@
ISO 8859-1 encoding/decoding

View File

@ -0,0 +1,5 @@
USING: help.syntax help.markup ;
IN: io.encodings.latin1
HELP: latin1
{ $class-description "This class is used for Latin 1 (ISO 8859-1) encoding and decoding" } ;

View File

@ -0,0 +1,19 @@
USING: io.encodings strings kernel ;
IN: io.encodings.latin1
TUPLE: latin1 stream ;
M: latin1 init-decoding tuck set-latin1-stream ;
M: latin1 init-encoding drop ;
M: latin1 stream-read1
latin1-stream stream-read1 ;
M: latin1 stream-read
latin1-stream stream-read >string ;
M: latin1 stream-read-until
latin1-stream stream-read-until >string ;
M: latin1 stream-readln
latin1-stream stream-readln >string ;

View File

@ -0,0 +1 @@
text

Binary file not shown.

View File

@ -0,0 +1 @@
UTF-16, UTF-16LE, UTF-16BE encoding and decoding

View File

@ -0,0 +1 @@
Daniel Ehrenberg

View File

@ -0,0 +1 @@
text

View File

@ -1,5 +1,5 @@
USING: help.markup help.syntax io.encodings strings ; USING: help.markup help.syntax io.encodings strings ;
IN: io.utf16 IN: io.encodings.utf16
ARTICLE: "io.utf16" "Working with UTF16-encoded data" ARTICLE: "io.utf16" "Working with UTF16-encoded data"
"The UTF16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences." "The UTF16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences."

View File

@ -30,7 +30,7 @@ SYMBOL: ignore
>r 2 shift r> BIN: 11 bitand bitor quad3 >r 2 shift r> BIN: 11 bitand bitor quad3
] [ 2drop do-ignore ] if ; ] [ 2drop do-ignore ] if ;
: (decode-utf16be) ( buf byte ch state -- buf ch state ) : decode-utf16be-step ( buf byte ch state -- buf ch state )
{ {
{ begin [ drop begin-utf16be ] } { begin [ drop begin-utf16be ] }
{ double [ end-multibyte ] } { double [ end-multibyte ] }
@ -41,7 +41,7 @@ SYMBOL: ignore
} case ; } case ;
: decode-utf16be ( seq -- str ) : decode-utf16be ( seq -- str )
[ -rot (decode-utf16be) ] decode ; [ decode-utf16be-step ] decode ;
: handle-double ( buf byte ch -- buf ch state ) : handle-double ( buf byte ch -- buf ch state )
swap dup -3 shift BIN: 11011 = [ swap dup -3 shift BIN: 11011 = [
@ -55,7 +55,7 @@ SYMBOL: ignore
BIN: 11 bitand append-nums HEX: 10000 + decoded BIN: 11 bitand append-nums HEX: 10000 + decoded
] [ 2drop push-replacement ] if ; ] [ 2drop push-replacement ] if ;
: (decode-utf16le) ( buf byte ch state -- buf ch state ) : decode-utf16le-step ( buf byte ch state -- buf ch state )
{ {
{ begin [ drop double ] } { begin [ drop double ] }
{ double [ handle-double ] } { double [ handle-double ] }
@ -65,7 +65,7 @@ SYMBOL: ignore
} case ; } case ;
: decode-utf16le ( seq -- str ) : decode-utf16le ( seq -- str )
[ -rot (decode-utf16le) ] decode ; [ decode-utf16le-step ] decode ;
: encode-first : encode-first
-10 shift -10 shift
@ -114,3 +114,35 @@ SYMBOL: ignore
{ [ utf16be? ] [ decode-utf16be ] } { [ utf16be? ] [ decode-utf16be ] }
{ [ t ] [ decode-error ] } { [ t ] [ decode-error ] }
} cond ; } cond ;
! UTF16LE streams
TUPLE: utf16le ;
: <utf16le> utf16le construct-delegate ;
! In the future, this should detect and ignore a BOM at the beginning
M: utf16le init-decoding ( stream utf16le -- utf16le-stream )
tuck set-delegate ;
M: utf16le init-encoding ( stream utf16le -- utf16le-stream )
tuck set-delegate ;
M: utf16le stream-read1 1 swap stream-read ;
M: utf16le stream-read
delegate [ decode-utf16le-step ] decode-read ;
M: utf16le stream-read-partial stream-read ;
M: utf16le stream-read-until
! Copied from { c-reader stream-read-until }!!!
[ swap read-until-loop ] "" make
swap over empty? over not and [ 2drop f f ] when ;
M: utf16le stream-write1
>r 1string r> stream-write ;
M: utf16le stream-write
>r encode-utf16le r> delegate stream-write ;
M: utf16le dispose delegate dispose ;

View File

@ -0,0 +1 @@
UTF-8 encoding and decoding

View File

@ -0,0 +1 @@
Daniel Ehrenberg

View File

@ -0,0 +1 @@
text

View File

@ -1,5 +1,5 @@
USING: help.markup help.syntax io.encodings strings ; USING: help.markup help.syntax io.encodings strings ;
IN: io.utf8 IN: io.encodings.utf8
ARTICLE: "io.utf8" "Working with UTF8-encoded data" ARTICLE: "io.utf8" "Working with UTF8-encoded data"
"The UTF8 encoding is a variable-width encoding. 7-bit ASCII characters are encoded as single bytes, and other Unicode code points are encoded as 2 to 4 byte sequences." "The UTF8 encoding is a variable-width encoding. 7-bit ASCII characters are encoded as single bytes, and other Unicode code points are encoded as 2 to 4 byte sequences."

View File

@ -0,0 +1,23 @@
USING: io.encodings.utf8 tools.test sbufs kernel io
sequences strings arrays unicode.syntax ;
: decode-utf8-w/stream ( array -- newarray )
>sbuf dup reverse-here <utf8> contents >array ;
: encode-utf8-w/stream ( array -- newarray )
SBUF" " clone tuck <utf8> write >array ;
[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream >array ] unit-test
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
[ "x" ] [ "x" decode-utf8-w/stream >string ] unit-test
[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test

View File

@ -1,8 +1,10 @@
! Copyright (C) 2006, 2007 Daniel Ehrenberg. ! Copyright (C) 2006, 2007 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors USING: math kernel sequences sbufs vectors growable io continuations
namespaces io.encodings combinators ; namespaces io.encodings combinators strings io.streams.c ;
IN: io.utf8 IN: io.encodings.utf8
! Decoding UTF-8
SYMBOL: double SYMBOL: double
SYMBOL: triple SYMBOL: triple
@ -31,7 +33,7 @@ SYMBOL: quad3
: end-multibyte ( buf byte ch -- buf ch state ) : end-multibyte ( buf byte ch -- buf ch state )
f append-nums [ decoded ] unless* ; f append-nums [ decoded ] unless* ;
: (decode-utf8) ( buf byte ch state -- buf ch state ) : decode-utf8-step ( buf byte ch state -- buf ch state )
{ {
{ begin [ drop begin-utf8 ] } { begin [ drop begin-utf8 ] }
{ double [ end-multibyte ] } { double [ end-multibyte ] }
@ -43,7 +45,9 @@ SYMBOL: quad3
} case ; } case ;
: decode-utf8 ( seq -- str ) : decode-utf8 ( seq -- str )
[ -rot (decode-utf8) ] decode ; [ decode-utf8-step ] decode ;
! Encoding UTF-8
: encoded ( char -- ) : encoded ( char -- )
BIN: 111111 bitand BIN: 10000000 bitor , ; BIN: 111111 bitand BIN: 10000000 bitor , ;
@ -70,3 +74,35 @@ SYMBOL: quad3
: encode-utf8 ( str -- seq ) : encode-utf8 ( str -- seq )
[ [ char>utf8 ] each ] B{ } make ; [ [ char>utf8 ] each ] B{ } make ;
! Interface for streams
TUPLE: utf8 ;
: <utf8> utf8 construct-delegate ;
! In the future, this should detect and ignore a BOM at the beginning
M: utf8 init-decoding ( stream utf8 -- utf8-stream )
tuck set-delegate ;
M: utf8 init-encoding ( stream utf8 -- utf8-stream )
tuck set-delegate ;
M: utf8 stream-read1 1 swap stream-read ;
M: utf8 stream-read
delegate [ decode-utf8-step ] decode-read ;
M: utf8 stream-read-partial stream-read ;
M: utf8 stream-read-until
! Copied from { c-reader stream-read-until }!!!
[ swap read-until-loop ] "" make
swap over empty? over not and [ 2drop f f ] when ;
M: utf8 stream-write1
>r 1string r> stream-write ;
M: utf8 stream-write
>r encode-utf8 r> delegate stream-write ;
M: utf8 dispose delegate dispose ;

View File

@ -1,16 +0,0 @@
USING: io.utf8 tools.test strings arrays unicode.syntax ;
[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
[ "x" ] [ "x" decode-utf8 >string ] unit-test
[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test
[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test

View File

@ -1,24 +0,0 @@
USING: kernel parser words sequences ;
IN: const
: define-const ( word value -- )
[ parsed ] curry dupd define
t "parsing" set-word-prop ;
: CONST:
CREATE scan-word dup parsing?
[ execute dup pop ] when define-const ; parsing
: define-enum ( words -- )
dup length [ define-const ] 2each ;
: ENUM:
";" parse-tokens [ create-in ] map define-enum ; parsing
: define-value ( word -- )
{ f } clone [ first ] curry define ;
: VALUE: CREATE define-value ; parsing
: set-value ( value word -- )
word-def first set-first ;

View File

@ -1,7 +1,7 @@
! Copyright (C) 2007 Daniel Ehrenberg ! Copyright (C) 2007 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: delegate sequences.private sequences assocs prettyprint.sections USING: delegate sequences.private sequences assocs prettyprint.sections
io definitions kernel ; io definitions kernel continuations ;
IN: delegate.protocols IN: delegate.protocols
PROTOCOL: sequence-protocol PROTOCOL: sequence-protocol
@ -15,7 +15,7 @@ PROTOCOL: assoc-protocol
! everything should work, just slower (with >alist) ! everything should work, just slower (with >alist)
PROTOCOL: stream-protocol PROTOCOL: stream-protocol
stream-read1 stream-read stream-read-until stream-read1 stream-read stream-read-until dispose
stream-flush stream-write1 stream-write stream-format stream-flush stream-write1 stream-write stream-format
stream-nl make-span-stream make-block-stream stream-readln stream-nl make-span-stream make-block-stream stream-readln
make-cell-stream stream-write-table ; make-cell-stream stream-write-table ;

View File

@ -7,8 +7,11 @@ IN: multiline
lexer get dup next-line lexer-line-text ; lexer get dup next-line lexer-line-text ;
: (parse-here) ( -- ) : (parse-here) ( -- )
next-line-text dup ";" = next-line-text [
[ drop lexer get next-line ] [ % "\n" % (parse-here) ] if ; dup ";" =
[ drop lexer get next-line ]
[ % "\n" % (parse-here) ] if
] [ ";" unexpected-eof ] if* ;
: parse-here ( -- str ) : parse-here ( -- str )
[ (parse-here) ] "" make 1 head* [ (parse-here) ] "" make 1 head*
@ -19,11 +22,13 @@ IN: multiline
parse-here 1quotation define-inline ; parsing parse-here 1quotation define-inline ; parsing
: (parse-multiline-string) ( start-index end-text -- end-index ) : (parse-multiline-string) ( start-index end-text -- end-index )
lexer get lexer-line-text 2dup start lexer get lexer-line-text [
[ rot dupd >r >r swap subseq % r> r> length + ] [ 2dup start
rot tail % "\n" % 0 [ rot dupd >r >r swap subseq % r> r> length + ] [
lexer get next-line swap (parse-multiline-string) rot tail % "\n" % 0
] if* ; lexer get next-line swap (parse-multiline-string)
] if*
] [ nip unexpected-eof ] if* ;
: parse-multiline-string ( end-text -- str ) : parse-multiline-string ( end-text -- str )
[ [

View File

@ -1,7 +1,7 @@
USING: unicode.categories kernel math combinators splitting USING: unicode.categories kernel math combinators splitting
sequences math.parser io.files io assocs arrays namespaces sequences math.parser io.files io assocs arrays namespaces
combinators.lib assocs.lib math.ranges unicode.normalize combinators.lib assocs.lib math.ranges unicode.normalize
unicode.syntax unicode.data compiler.units alien.syntax const ; unicode.syntax unicode.data compiler.units alien.syntax ;
IN: unicode.breaks IN: unicode.breaks
C-ENUM: Any L V T Extend Control CR LF graphemes ; C-ENUM: Any L V T Extend Control CR LF graphemes ;