diff --git a/core/io/encodings/binary/binary.factor b/core/io/encodings/binary/binary.factor index b8bcc0f87a..5038628ed9 100644 --- a/core/io/encodings/binary/binary.factor +++ b/core/io/encodings/binary/binary.factor @@ -1,3 +1,8 @@ ! Copyright (C) 2008 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -IN: io.encodings.binary SYMBOL: binary +USING: io.encodings kernel ; +IN: io.encodings.binary + +TUPLE: binary ; +M: binary drop ; +M: binary drop ; diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor index 2f68334bde..b7c71d5527 100755 --- a/core/io/encodings/encodings.factor +++ b/core/io/encodings/encodings.factor @@ -2,62 +2,36 @@ ! See http://factorcode.org/license.txt for BSD license. USING: math kernel sequences sbufs vectors namespaces growable strings io classes continuations combinators -io.styles io.streams.plain io.encodings.binary splitting -io.streams.duplex byte-arrays ; +io.styles io.streams.plain splitting +io.streams.duplex byte-arrays sequences.private ; IN: io.encodings ! The encoding descriptor protocol -GENERIC: decode-step ( buf char encoding -- ) -M: object decode-step drop swap push ; +GENERIC: decode-char ( stream encoding -- char/f ) -GENERIC: init-decoder ( stream encoding -- encoding ) -M: tuple-class init-decoder construct-empty init-decoder ; -M: object init-decoder nip ; +GENERIC: encode-char ( char stream encoding -- ) -GENERIC: stream-write-encoded ( string stream encoding -- byte-array ) -M: object stream-write-encoded drop stream-write ; +GENERIC: ( stream decoding -- newstream ) + +GENERIC: ( stream encoding -- newstream ) + +: replacement-char HEX: fffd ; ! Decoding + construct-empty ; +M: tuple f decoder construct-boa ; -: push-decoded ( buf ch -- buf ch state ) - over push 0 begin ; - -: push-replacement ( buf -- buf ch state ) - ! This is the replacement character - HEX: fffd push-decoded ; - -: space ( resizable -- room-left ) - dup underlying swap [ length ] 2apply - ; - -: full? ( resizable -- ? ) space zero? ; - -: end-read-loop ( buf ch state stream quot -- string/f ) - 2drop 2drop >string f like ; - -: decode-read-loop ( buf stream encoding -- string/f ) - pick full? [ 2drop >string ] [ - over stream-read1 [ - -rot tuck >r >r >r dupd r> decode-step r> r> - decode-read-loop - ] [ 2drop >string f like ] if* - ] if ; - -: decode-read ( length stream encoding -- string ) - rot -rot decode-read-loop ; - -TUPLE: decoder code cr ; -: ( stream encoding -- newstream ) - dup binary eq? [ drop ] [ - dupd init-decoder { set-delegate set-decoder-code } - decoder construct - ] if ; +: >decoder< ( decoder -- stream encoding ) + { decoder-stream decoder-code } get-slots ; : cr+ t swap set-decoder-cr ; inline @@ -82,72 +56,78 @@ TUPLE: decoder code cr ; over decoder-cr [ over cr- "\n" ?head [ - swap stream-read1 [ add ] when* - ] [ nip ] if - ] [ nip ] if ; + over stream-read1 [ add ] when* + ] when + ] when nip ; + +: read-loop ( n stream -- string ) + over 0 [ + [ + >r stream-read1 dup + [ swap r> set-nth-unsafe f ] [ r> 3drop t ] if + ] 2curry find-integer + ] keep swap [ head ] when* ; M: decoder stream-read - tuck { delegate decoder-code } get-slots decode-read fix-read ; + tuck read-loop fix-read ; -M: decoder stream-read-partial stream-read ; - -: decoder-read-until ( stream delim -- ch ) - ! Copied from { c-reader stream-read-until }!!! - over stream-read1 dup [ - dup pick memq? [ 2nip ] [ , decoder-read-until ] if - ] [ - 2nip - ] if ; +: (read-until) ( buf quot -- string/f sep/f ) + ! quot: -- char keep-going? + dup call + [ >r drop "" like r> ] + [ pick push (read-until) ] if ; inline M: decoder stream-read-until - ! Copied from { c-reader stream-read-until }!!! - [ swap decoder-read-until ] "" make - swap over empty? over not and [ 2drop f f ] when ; + SBUF" " clone -rot >decoder< + [ decode-char dup rot memq? ] 3curry (read-until) ; : fix-read1 ( stream char -- char ) over decoder-cr [ over cr- dup CHAR: \n = [ - drop stream-read1 - ] [ nip ] if - ] [ nip ] if ; + drop dup stream-read1 + ] when + ] when nip ; M: decoder stream-read1 - 1 swap stream-read f like [ first ] [ f ] if* ; + dup >decoder< decode-char fix-read1 ; M: decoder stream-readln ( stream -- str ) "\r\n" over stream-read-until handle-readln ; +M: decoder dispose decoder-stream dispose ; + ! Encoding TUPLE: encode-error ; : encode-error ( -- * ) \ encode-error construct-empty throw ; -TUPLE: encoder code ; -: ( stream encoding -- newstream ) - dup binary eq? [ drop ] [ - construct-empty { set-delegate set-encoder-code } - encoder construct - ] if ; +TUPLE: encoder stream code ; +M: tuple-class construct-empty ; +M: tuple encoder construct-boa ; + +: >encoder< ( encoder -- stream encoding ) + { encoder-stream encoder-code } get-slots ; M: encoder stream-write1 - >r 1string r> stream-write ; + >encoder< encode-char ; M: encoder stream-write - { delegate encoder-code } get-slots stream-write-encoded ; + >encoder< [ encode-char ] 2curry each ; -M: encoder dispose delegate dispose ; +M: encoder dispose encoder-stream dispose ; INSTANCE: encoder plain-writer ! Rebinding duplex streams which have not read anything yet : reencode ( stream encoding -- newstream ) - over encoder? [ >r delegate r> ] when ; + over encoder? [ >r encoder-stream r> ] when ; : redecode ( stream encoding -- newstream ) - over decoder? [ >r delegate r> ] when ; + over decoder? [ >r decoder-stream r> ] when ; +PRIVATE> : ( stream-in stream-out encoding -- duplex ) tuck reencode >r redecode r> ; diff --git a/core/io/encodings/utf8/utf8.factor b/core/io/encodings/utf8/utf8.factor index 5887a8375e..02b10c45a5 100644 --- a/core/io/encodings/utf8/utf8.factor +++ b/core/io/encodings/utf8/utf8.factor @@ -6,82 +6,68 @@ IN: io.encodings.utf8 ! Decoding UTF-8 -TUPLE: utf8 ch state ; +TUPLE: utf8 ; -SYMBOL: double -SYMBOL: triple -SYMBOL: triple2 -SYMBOL: quad -SYMBOL: quad2 -SYMBOL: quad3 +r over starts-2? - [ 6 shift swap BIN: 111111 bitand bitor r> ] - [ r> 3drop push-replacement ] if ; +: append-nums ( stream byte -- stream char ) + over stream-read1 dup starts-2? + [ 6 shift swap BIN: 111111 bitand bitor ] + [ 2drop replacement-char ] if ; -: begin-utf8 ( buf byte -- buf ch state ) +: double ( stream byte -- stream char ) + BIN: 11111 bitand append-nums ; + +: triple ( stream byte -- stream char ) + BIN: 1111 bitand append-nums append-nums ; + +: quad ( stream byte -- stream char ) + BIN: 111 bitand append-nums append-nums append-nums ; + +: begin-utf8 ( stream byte -- stream char ) { - { [ dup -7 shift zero? ] [ push-decoded ] } - { [ dup -5 shift BIN: 110 number= ] [ BIN: 11111 bitand double ] } - { [ dup -4 shift BIN: 1110 number= ] [ BIN: 1111 bitand triple ] } - { [ dup -3 shift BIN: 11110 number= ] [ BIN: 111 bitand quad ] } - { [ t ] [ drop push-replacement ] } + { [ dup -7 shift zero? ] [ ] } + { [ dup -5 shift BIN: 110 number= ] [ double ] } + { [ dup -4 shift BIN: 1110 number= ] [ triple ] } + { [ dup -3 shift BIN: 11110 number= ] [ quad ] } + { [ t ] [ drop replacement-char ] } } cond ; -: end-multibyte ( buf byte ch -- buf ch state ) - f append-nums [ push-decoded ] unless* ; +: decode-utf8 ( stream -- char/f ) + dup stream-read1 dup [ begin-utf8 ] when nip ; -: decode-utf8-step ( buf byte ch state -- buf ch state ) - { - { begin [ drop begin-utf8 ] } - { double [ end-multibyte ] } - { triple [ triple2 append-nums ] } - { triple2 [ end-multibyte ] } - { quad [ quad2 append-nums ] } - { quad2 [ quad3 append-nums ] } - { quad3 [ end-multibyte ] } - } case ; - -: unpack-state ( encoding -- ch state ) - { utf8-ch utf8-state } get-slots ; - -: pack-state ( ch state encoding -- ) - { set-utf8-ch set-utf8-state } set-slots ; - -M: utf8 decode-step ( buf char encoding -- ) - [ unpack-state decode-utf8-step ] keep pack-state drop ; - -M: utf8 init-decoder nip begin over set-utf8-state ; +M: utf8 decode-char + drop decode-utf8 ; ! Encoding UTF-8 -: encoded ( char -- ) - BIN: 111111 bitand BIN: 10000000 bitor write1 ; +: encoded ( stream char -- ) + BIN: 111111 bitand BIN: 10000000 bitor swap stream-write1 ; -: char>utf8 ( char -- ) +: char>utf8 ( stream char -- ) { - { [ dup -7 shift zero? ] [ write1 ] } + { [ dup -7 shift zero? ] [ swap stream-write1 ] } { [ dup -11 shift zero? ] [ - dup -6 shift BIN: 11000000 bitor write1 + 2dup -6 shift BIN: 11000000 bitor swap stream-write1 encoded ] } { [ dup -16 shift zero? ] [ - dup -12 shift BIN: 11100000 bitor write1 - dup -6 shift encoded + 2dup -12 shift BIN: 11100000 bitor swap stream-write1 + 2dup -6 shift encoded encoded ] } { [ t ] [ - dup -18 shift BIN: 11110000 bitor write1 - dup -12 shift encoded - dup -6 shift encoded + 2dup -18 shift BIN: 11110000 bitor swap stream-write1 + 2dup -12 shift encoded + 2dup -6 shift encoded encoded ] } } cond ; -M: utf8 stream-write-encoded - ! For efficiency, this should be modified to avoid variable reads - drop [ [ char>utf8 ] each ] with-stream* ; +M: utf8 encode-char + drop swap char>utf8 ; + +PRIVATE> diff --git a/core/io/streams/string/string.factor b/core/io/streams/string/string.factor index 7833e0aa47..33404292a9 100755 --- a/core/io/streams/string/string.factor +++ b/core/io/streams/string/string.factor @@ -1,9 +1,9 @@ ! Copyright (C) 2003, 2007 Slava Pestov. ! See http://factorcode.org/license.txt for BSD license. -IN: io.streams.string USING: io kernel math namespaces sequences sbufs strings generic splitting growable continuations io.streams.plain -io.encodings ; +io.encodings io.encodings.private ; +IN: io.streams.string M: growable dispose drop ; diff --git a/extra/io/encodings/ascii/ascii.factor b/extra/io/encodings/ascii/ascii.factor index bd71b733f1..16d87ef39c 100644 --- a/extra/io/encodings/ascii/ascii.factor +++ b/extra/io/encodings/ascii/ascii.factor @@ -1,18 +1,20 @@ ! Copyright (C) 2008 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: io io.encodings strings kernel math sequences byte-arrays io.encodings ; +USING: io io.encodings kernel math ; IN: io.encodings.ascii -: encode-check< ( string stream max -- ) - [ pick <= [ encode-error ] [ stream-write1 ] if ] 2curry each ; + [ encode-error ] [ stream-write1 ] if ; -: push-if< ( sbuf character max -- ) - over <= [ drop HEX: fffd ] when swap push ; +: decode-if< ( stream encoding max -- character ) + nip swap stream-read1 tuck > [ drop replacement-character ] unless ; +PRIVATE> TUPLE: ascii ; -M: ascii stream-write-encoded ( string stream encoding -- ) - drop 128 encode-check< ; +M: ascii encode-char + 128 encode-if< ; -M: ascii decode-step - drop 128 push-if< ; +M: ascii decode-char + 128 decode-if< ; diff --git a/extra/io/encodings/latin1/latin1.factor b/extra/io/encodings/latin1/latin1.factor index 71e98a1747..2b82318885 100755 --- a/extra/io/encodings/latin1/latin1.factor +++ b/extra/io/encodings/latin1/latin1.factor @@ -1,12 +1,12 @@ ! Copyright (C) 2008 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: io io.encodings strings kernel io.encodings.ascii sequences math ; +USING: io io.encodings kernel io.encodings.ascii.private ; IN: io.encodings.latin1 TUPLE: latin1 ; -M: latin1 stream-write-encoded - drop 256 encode-check< ; +M: latin1 encode-char + 256 encode-if< ; -M: latin1 decode-step - drop swap push ; +M: latin1 decode-char + drop stream-read1 ; diff --git a/extra/io/encodings/utf16/utf16.factor b/extra/io/encodings/utf16/utf16.factor index a501fad0bd..7e82935db7 100755 --- a/extra/io/encodings/utf16/utf16.factor +++ b/extra/io/encodings/utf16/utf16.factor @@ -4,92 +4,71 @@ USING: math kernel sequences sbufs vectors namespaces io.binary io.encodings combinators splitting io byte-arrays ; IN: io.encodings.utf16 -! UTF-16BE decoding - -TUPLE: utf16be ch state ; - -SYMBOL: double -SYMBOL: quad1 -SYMBOL: quad2 -SYMBOL: quad3 -SYMBOL: ignore - -: do-ignore ( -- ch state ) 0 ignore ; - -: append-nums ( byte ch -- ch ) - 8 shift bitor ; - -: end-multibyte ( buf byte ch -- buf ch state ) - append-nums push-decoded ; - -: begin-utf16be ( buf byte -- buf ch state ) - dup -3 shift BIN: 11011 number= [ - dup BIN: 00000100 bitand zero? - [ BIN: 11 bitand quad1 ] - [ drop do-ignore ] if - ] [ double ] if ; - -: handle-quad2be ( byte ch -- ch state ) - swap dup -2 shift BIN: 110111 number= [ - >r 2 shift r> BIN: 11 bitand bitor quad3 - ] [ 2drop do-ignore ] if ; - -: decode-utf16be-step ( buf byte ch state -- buf ch state ) - { - { begin [ drop begin-utf16be ] } - { double [ end-multibyte ] } - { quad1 [ append-nums quad2 ] } - { quad2 [ handle-quad2be ] } - { quad3 [ append-nums HEX: 10000 + push-decoded ] } - { ignore [ 2drop push-replacement ] } - } case ; - -: unpack-state-be ( encoding -- ch state ) - { utf16be-ch utf16be-state } get-slots ; - -: pack-state-be ( ch state encoding -- ) - { set-utf16be-ch set-utf16be-state } set-slots ; - -M: utf16be decode-step - [ unpack-state-be decode-utf16be-step ] keep pack-state-be drop ; - -M: utf16be init-decoder nip begin over set-utf16be-state ; - -! UTF-16LE decoding +TUPLE: utf16be ; TUPLE: utf16le ch state ; -: handle-double ( buf byte ch -- buf ch state ) - swap dup -3 shift BIN: 11011 = [ - dup BIN: 100 bitand 0 number= - [ BIN: 11 bitand 8 shift bitor quad2 ] - [ 2drop push-replacement ] if - ] [ end-multibyte ] if ; +TUPLE: utf16 started? ; -: handle-quad3le ( buf byte ch -- buf ch state ) - swap dup -2 shift BIN: 110111 = [ - BIN: 11 bitand append-nums HEX: 10000 + push-decoded - ] [ 2drop push-replacement ] if ; +r 2 shift r> BIN: 11 bitand bitor + over stream-read1 swap append-nums HEX: 10000 + + ] [ 2drop replacement-char ] if + ] when ; + +: ignore ( stream -- stream char ) + dup stream-read1 drop replacement-char ; + +: begin-utf16be ( stream byte -- stream char ) + dup -3 shift BIN: 11011 number= [ + dup BIN: 00000100 bitand zero? + [ BIN: 11 bitand quad-be ] + [ drop ignore ] if + ] [ double-be ] if ; + +M: decode-char + drop dup stream-read1 dup [ begin-utf16be ] when nip ; + +! UTF-16LE decoding + +: quad-le ( stream ch -- stream char ) + over stream-read1 swap 10 shift bitor + over stream-read1 dup -2 shift BIN: 110111 = [ + BIN: 11 bitand append-nums HEX: 10000 + + ] [ 2drop replacement-char ] if ; + +: double-le ( stream byte1 byte2 -- stream char ) + dup -3 shift BIN: 11011 = [ + dup BIN: 100 bitand 0 number= + [ BIN: 11 bitand 8 shift bitor quad-le ] + [ 2drop replacement-char ] if + ] [ swap append-nums ] if ; : decode-utf16le-step ( buf byte ch state -- buf ch state ) { { begin [ drop double ] } { double [ handle-double ] } - { quad1 [ append-nums quad2 ] } { quad2 [ 10 shift bitor quad3 ] } { quad3 [ handle-quad3le ] } } case ; -: unpack-state-le ( encoding -- ch state ) - { utf16le-ch utf16le-state } get-slots ; +: begin-utf16le ( stream byte -- stream char ) + over stream-read1 [ double-le ] [ drop replacement-char ] if* -: pack-state-le ( ch state encoding -- ) - { set-utf16le-ch set-utf16le-state } set-slots ; - -M: utf16le decode-step - [ unpack-state-le decode-utf16le-step ] keep pack-state-le drop ; - -M: utf16le init-decoder nip begin over set-utf16le-state ; +M: decode-char + drop dup stream-read1 dup [ begin-utf16le ] when nip ; ! UTF-16LE/BE encoding @@ -103,25 +82,25 @@ M: utf16le init-decoder nip begin over set-utf16le-state ; dup -8 shift BIN: 11011100 bitor swap BIN: 11111111 bitand ; -: char>utf16be ( char -- ) +: stream-write2 ( stream char1 char2 -- ) + rot [ stream-write1 ] 2apply ; + +: char>utf16be ( stream char -- ) dup HEX: FFFF > [ HEX: 10000 - - dup encode-first swap write1 write1 - encode-second swap write1 write1 - ] [ h>b/b write1 write1 ] if ; + dup encode-first stream-write2 + encode-second stream-write2 + ] [ h>b/b swap stream-write2 ] if ; -: stream-write-utf16be ( string stream -- ) - [ [ char>utf16be ] each ] with-stream* ; - -M: utf16be stream-write-encoded ( string stream encoding -- ) - drop stream-write-utf16be ; +M: utf16be encode-char ( char stream encoding -- ) + drop char>utf16be ; : char>utf16le ( char -- ) dup HEX: FFFF > [ HEX: 10000 - - dup encode-first write1 write1 - encode-second write1 write1 - ] [ h>b/b swap write1 write1 ] if ; + dup encode-first swap stream-write2 + encode-second swap stream-write2 + ] [ h>b/b stream-write2 ] if ; : stream-write-utf16le ( string stream -- ) [ [ char>utf16le ] each ] with-stream* ; @@ -139,17 +118,15 @@ M: utf16le stream-write-encoded ( string stream encoding -- ) : start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ; -TUPLE: utf16 started? ; - -M: utf16 stream-write-encoded - dup utf16-started? [ drop ] - [ t swap set-utf16-started? bom-le over stream-write ] if - stream-write-utf16le ; - : bom>le/be ( bom -- le/be ) dup bom-le sequence= [ drop utf16le ] [ bom-be sequence= [ utf16be ] [ decode-error ] if ] if ; -M: utf16 init-decoder ( stream encoding -- newencoding ) - 2 rot stream-read bom>le/be construct-empty init-decoder ; +M: utf16 ( stream utf16 -- decoder ) + 2 rot stream-read bom>le/be ; + +M: utf16 ( stream utf16 -- encoder ) + drop bom-le over stream-write utf16le ; + +PRIVATE>