From 6ff319cfcb3f8bda945ccd39c6b6cbd66721258b Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Wed, 13 Feb 2008 19:53:53 -0600 Subject: [PATCH 1/2] Unicode upgrade and fix --- core/io/encodings/encodings.factor | 50 ++++++++++++++++++++++------ core/io/encodings/utf16/utf16.factor | 34 +++++-------------- core/io/encodings/utf8/utf8.factor | 30 +++-------------- extra/unicode/data/data.factor | 10 +++++- 4 files changed, 61 insertions(+), 63 deletions(-) diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor index b27b89642d..1f261dea52 100755 --- a/core/io/encodings/encodings.factor +++ b/core/io/encodings/encodings.factor @@ -26,6 +26,8 @@ SYMBOL: begin : start-decoding ( seq length -- buf ch state seq ) 0 begin roll ; +GENERIC: decode-step ( buf byte ch state encoding -- buf ch state ) + : decode ( seq quot -- string ) >r dup length start-decoding r> [ -rot ] swap compose each @@ -39,26 +41,54 @@ SYMBOL: begin : end-read-loop ( buf ch state stream quot -- string/f ) 2drop 2drop >string f like ; -: under ( a b c -- c a b c ) - tuck >r swapd r> ; inline - -: decode-read-loop ( buf ch state stream quot -- string/f ) +: decode-read-loop ( buf ch state stream encoding -- string/f ) >r >r pick r> r> rot full? [ end-read-loop ] [ over stream-read1 [ - -rot tuck >r >r >r -rot r> call r> r> decode-read-loop + -rot tuck >r >r >r -rot r> decode-step r> r> decode-read-loop ] [ end-read-loop ] if* - ] if ; inline + ] if ; -: decode-read ( length stream quot -- string ) +: decode-read ( length stream encoding -- string ) >r swap start-decoding r> - decode-read-loop ; inline + decode-read-loop ; GENERIC: init-decoding ( stream encoding -- decoded-stream ) : ( stream decoding-class -- decoded-stream ) - construct-empty init-decoding ; + construct-empty init-decoding ; GENERIC: init-encoding ( stream encoding -- encoded-stream ) : ( stream encoding-class -- encoded-stream ) - construct-empty init-encoding ; + construct-empty init-encoding ; + +GENERIC: encode-string ( string encoding -- byte-array ) +M: tuple-class encode-string construct-empty encode-string ; + +MIXIN: encoding-stream + +M: encoding-stream init-decoding ( stream encoding-stream -- encoding-stream ) + tuck set-delegate ; + +M: encoding-stream init-encoding ( stream encoding-stream -- encoding-stream ) + tuck set-delegate ; + +M: encoding-stream stream-read1 1 swap stream-read ; + +M: encoding-stream stream-read + [ delegate ] keep decode-read ; + +M: encoding-stream stream-read-partial stream-read ; + +M: encoding-stream stream-read-until + ! Copied from { c-reader stream-read-until }!!! + [ swap read-until-loop ] "" make + swap over empty? over not and [ 2drop f f ] when ; + +M: encoding-stream stream-write1 + >r 1string r> stream-write ; + +M: encoding-stream stream-write + [ encode-string ] keep delegate stream-write ; + +M: encoding-stream dispose delegate dispose ; diff --git a/core/io/encodings/utf16/utf16.factor b/core/io/encodings/utf16/utf16.factor index 81c982dd55..2095283890 100755 --- a/core/io/encodings/utf16/utf16.factor +++ b/core/io/encodings/utf16/utf16.factor @@ -115,34 +115,16 @@ SYMBOL: ignore { [ t ] [ decode-error ] } } cond ; -! UTF16LE streams - TUPLE: utf16le ; : utf16le construct-delegate ; -! In the future, this should detect and ignore a BOM at the beginning +INSTANCE: encoding-stream utf16le -M: utf16le init-decoding ( stream utf16le -- utf16le-stream ) - tuck set-delegate ; +M: utf16le encode-string drop encode-utf16le ; +M: utf16le decode-step drop decode-utf16le-step ; -M: utf16le init-encoding ( stream utf16le -- utf16le-stream ) - tuck set-delegate ; +TUPLE: utf16be ; +: utf16be construct-delegate ; +INSTANCE: encoding-stream utf16be -M: utf16le stream-read1 1 swap stream-read ; - -M: utf16le stream-read - delegate [ decode-utf16le-step ] decode-read ; - -M: utf16le stream-read-partial stream-read ; - -M: utf16le stream-read-until - ! Copied from { c-reader stream-read-until }!!! - [ swap read-until-loop ] "" make - swap over empty? over not and [ 2drop f f ] when ; - -M: utf16le stream-write1 - >r 1string r> stream-write ; - -M: utf16le stream-write - >r encode-utf16le r> delegate stream-write ; - -M: utf16le dispose delegate dispose ; +M: utf16be encode-string drop encode-utf16be ; +M: utf16le decode-step drop decode-utf16be-step ; diff --git a/core/io/encodings/utf8/utf8.factor b/core/io/encodings/utf8/utf8.factor index de3fd5b67b..e9c23610b5 100644 --- a/core/io/encodings/utf8/utf8.factor +++ b/core/io/encodings/utf8/utf8.factor @@ -79,30 +79,8 @@ SYMBOL: quad3 TUPLE: utf8 ; : utf8 construct-delegate ; +INSTANCE: encoding-stream utf8 + +M: utf8 encode-string drop encode-utf8 ; +M: utf8 decode-step drop decode-utf8-step ; ! In the future, this should detect and ignore a BOM at the beginning - -M: utf8 init-decoding ( stream utf8 -- utf8-stream ) - tuck set-delegate ; - -M: utf8 init-encoding ( stream utf8 -- utf8-stream ) - tuck set-delegate ; - -M: utf8 stream-read1 1 swap stream-read ; - -M: utf8 stream-read - delegate [ decode-utf8-step ] decode-read ; - -M: utf8 stream-read-partial stream-read ; - -M: utf8 stream-read-until - ! Copied from { c-reader stream-read-until }!!! - [ swap read-until-loop ] "" make - swap over empty? over not and [ 2drop f f ] when ; - -M: utf8 stream-write1 - >r 1string r> stream-write ; - -M: utf8 stream-write - >r encode-utf8 r> delegate stream-write ; - -M: utf8 dispose delegate dispose ; diff --git a/extra/unicode/data/data.factor b/extra/unicode/data/data.factor index c579d1fdfd..3af3d927d7 100644 --- a/extra/unicode/data/data.factor +++ b/extra/unicode/data/data.factor @@ -1,8 +1,16 @@ USING: assocs math kernel sequences io.files hashtables quotations splitting arrays math.parser combinators.lib hash2 -byte-arrays words namespaces words compiler.units const ; +byte-arrays words namespaces words compiler.units parser ; IN: unicode.data +<< +: VALUE: + CREATE dup reset-generic { f } clone [ first ] curry define ; parsing + +: set-value ( value word -- ) + word-def first set-first ; +>> + ! Convenience functions : 1+* ( n/f _ -- n+1 ) drop [ 1+ ] [ 0 ] if* ; From 4103c982c33445bd628f2442587f815238b291bc Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Wed, 13 Feb 2008 19:54:35 -0600 Subject: [PATCH 2/2] encodings fix --- core/io/encodings/encodings.factor | 5 +++-- core/io/encodings/utf16/utf16.factor | 4 ++-- core/io/encodings/utf8/utf8.factor | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor index 1f261dea52..27c74fc4bd 100755 --- a/core/io/encodings/encodings.factor +++ b/core/io/encodings/encodings.factor @@ -1,7 +1,8 @@ ! Copyright (C) 2006, 2007 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: math kernel sequences sbufs vectors -namespaces unicode.syntax growable strings io ; +USING: math kernel sequences sbufs vectors io.streams.lines io.streams.plain +namespaces unicode.syntax growable strings io classes io.streams.c +continuations ; IN: io.encodings TUPLE: encode-error ; diff --git a/core/io/encodings/utf16/utf16.factor b/core/io/encodings/utf16/utf16.factor index 2095283890..99e98cd98c 100755 --- a/core/io/encodings/utf16/utf16.factor +++ b/core/io/encodings/utf16/utf16.factor @@ -117,14 +117,14 @@ SYMBOL: ignore TUPLE: utf16le ; : utf16le construct-delegate ; -INSTANCE: encoding-stream utf16le +INSTANCE: utf16le encoding-stream M: utf16le encode-string drop encode-utf16le ; M: utf16le decode-step drop decode-utf16le-step ; TUPLE: utf16be ; : utf16be construct-delegate ; -INSTANCE: encoding-stream utf16be +INSTANCE: utf16be encoding-stream M: utf16be encode-string drop encode-utf16be ; M: utf16le decode-step drop decode-utf16be-step ; diff --git a/core/io/encodings/utf8/utf8.factor b/core/io/encodings/utf8/utf8.factor index e9c23610b5..f681b18142 100644 --- a/core/io/encodings/utf8/utf8.factor +++ b/core/io/encodings/utf8/utf8.factor @@ -79,7 +79,7 @@ SYMBOL: quad3 TUPLE: utf8 ; : utf8 construct-delegate ; -INSTANCE: encoding-stream utf8 +INSTANCE: utf8 encoding-stream M: utf8 encode-string drop encode-utf8 ; M: utf8 decode-step drop decode-utf8-step ;