From 2eda6fc6aaf09c5905674b685ed81b27a8b38f86 Mon Sep 17 00:00:00 2001 From: Slava Pestov Date: Mon, 19 Apr 2010 01:13:21 -0500 Subject: [PATCH] io.encodings: add a fast-path for ascii, utf8 and 8-bit encodings when string only contains ASCII characters --- basis/alien/data/data.factor | 15 ++------- basis/biassocs/biassocs.factor | 4 +-- basis/bootstrap/compiler/compiler.factor | 4 ++- basis/hints/hints.factor | 10 +++--- basis/io/encodings/8-bit/8-bit.factor | 32 +++++++++++--------- basis/io/encodings/ascii/ascii.factor | 31 +++++++++++-------- basis/io/ports/ports.factor | 4 +-- basis/io/streams/byte-array/fast/authors.txt | 1 + basis/io/streams/byte-array/fast/fast.factor | 15 +++++++++ core/io/encodings/encodings.factor | 13 ++++---- core/io/encodings/utf8/utf8.factor | 21 ++++++++----- core/strings/strings.factor | 3 +- 12 files changed, 87 insertions(+), 66 deletions(-) create mode 100644 basis/io/streams/byte-array/fast/authors.txt create mode 100644 basis/io/streams/byte-array/fast/fast.factor diff --git a/basis/alien/data/data.factor b/basis/alien/data/data.factor index a0450d5122..af1ed24663 100644 --- a/basis/alien/data/data.factor +++ b/basis/alien/data/data.factor @@ -1,8 +1,7 @@ ! (c)2009, 2010 Slava Pestov, Joe Groff bsd license -USING: accessors alien alien.c-types alien.arrays alien.strings arrays -byte-arrays cpu.architecture fry io io.encodings.binary -io.files io.streams.memory kernel libc math sequences words -byte-vectors ; +USING: accessors alien alien.c-types alien.arrays alien.strings +arrays byte-arrays cpu.architecture fry io io.encodings.binary +io.files io.streams.memory kernel libc math sequences words ; IN: alien.data GENERIC: require-c-array ( c-type -- ) @@ -63,13 +62,6 @@ M: memory-stream stream-read swap memory>byte-array ] [ [ + ] change-index drop ] 2bi ; -M: byte-vector stream-write - [ dup byte-length tail-slice ] - [ [ [ byte-length ] bi@ + ] keep lengthen ] - [ drop byte-length ] - 2tri - [ >c-ptr swap >c-ptr ] dip memcpy ; - M: value-type c-type-rep drop int-rep ; M: value-type c-type-getter @@ -83,4 +75,3 @@ M: array c-type-boxer-quot unclip [ array-length ] dip [ ] 2curry ; M: array c-type-unboxer-quot drop [ >c-ptr ] ; - diff --git a/basis/biassocs/biassocs.factor b/basis/biassocs/biassocs.factor index 7daa478f54..ab3157d400 100644 --- a/basis/biassocs/biassocs.factor +++ b/basis/biassocs/biassocs.factor @@ -13,9 +13,9 @@ TUPLE: biassoc from to ; M: biassoc assoc-size from>> assoc-size ; -M: biassoc at* from>> at* ; +M: biassoc at* from>> at* ; inline -M: biassoc value-at* to>> at* ; +M: biassoc value-at* to>> at* ; inline : once-at ( value key assoc -- ) 2dup key? [ 3drop ] [ set-at ] if ; diff --git a/basis/bootstrap/compiler/compiler.factor b/basis/bootstrap/compiler/compiler.factor index 90562e9fc7..9cb9c125ab 100644 --- a/basis/bootstrap/compiler/compiler.factor +++ b/basis/bootstrap/compiler/compiler.factor @@ -35,7 +35,7 @@ gc [ optimized? not ] filter compile ; "debug-compiler" get [ - + nl "Compiling..." write flush @@ -117,4 +117,6 @@ gc " done" print flush + "io.streams.byte-array.fast" require + ] unless diff --git a/basis/hints/hints.factor b/basis/hints/hints.factor index 558f7dd8a4..dc16cf8b24 100644 --- a/basis/hints/hints.factor +++ b/basis/hints/hints.factor @@ -2,10 +2,10 @@ ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs byte-arrays byte-vectors classes combinators definitions effects fry generic generic.single -generic.standard hashtables io.binary io.streams.string kernel -kernel.private math math.integers.private math.parser -namespaces parser sbufs sequences splitting splitting.private strings -vectors words ; +generic.standard hashtables io.binary io.encodings +io.streams.string kernel kernel.private math +math.integers.private math.parser namespaces parser sbufs +sequences splitting splitting.private strings vectors words ; IN: hints GENERIC: specializer-predicate ( spec -- quot ) @@ -131,3 +131,5 @@ M\ hashtable at* { { fixnum object } { word object } } "specializer" set-word-pr M\ hashtable set-at { { object fixnum object } { object word object } } "specializer" set-word-prop \ bignum/f { { bignum bignum } { bignum fixnum } { fixnum bignum } { fixnum fixnum } } "specializer" set-word-prop + +\ encode-string { string object object } "specializer" set-word-prop diff --git a/basis/io/encodings/8-bit/8-bit.factor b/basis/io/encodings/8-bit/8-bit.factor index 7f92028c31..db269c319d 100644 --- a/basis/io/encodings/8-bit/8-bit.factor +++ b/basis/io/encodings/8-bit/8-bit.factor @@ -1,10 +1,10 @@ ! Copyright (C) 2008 Daniel Ehrenberg, Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: math.parser arrays io.encodings sequences kernel assocs -hashtables io.encodings.ascii generic parser classes.tuple words -words.symbol io io.files splitting namespaces math -compiler.units accessors classes.singleton classes.mixin -io.encodings.iana fry simple-flat-file lexer ; +USING: arrays assocs biassocs kernel io.encodings math.parser +sequences hashtables io.encodings.ascii generic parser +classes.tuple words words.symbol io io.files splitting +namespaces math compiler.units accessors classes.singleton +classes.mixin io.encodings.iana fry simple-flat-file lexer ; IN: io.encodings.8-bit > value-at [ encode-error ] unless* ; inline -M: 8-bit encode-char biassoc>> encode-8-bit ; +M: 8-bit encode-char + swap [ 8-bit-encode ] dip stream-write1 ; -: decode-8-bit ( stream assoc -- char/f ) - swap stream-read1 - [ swap at [ replacement-char ] unless* ] - [ drop f ] if* ; inline +M: 8-bit encode-string + swap [ '[ _ 8-bit-encode ] B{ } map-as ] dip stream-write ; -M: 8-bit decode-char biassoc>> decode-8-bit ; +M: 8-bit decode-char + swap stream-read1 dup + [ swap biassoc>> at [ replacement-char ] unless* ] + [ 2drop f ] + if ; MIXIN: 8-bit-encoding diff --git a/basis/io/encodings/ascii/ascii.factor b/basis/io/encodings/ascii/ascii.factor index 00d3bc7509..2b5640489f 100644 --- a/basis/io/encodings/ascii/ascii.factor +++ b/basis/io/encodings/ascii/ascii.factor @@ -1,22 +1,27 @@ ! Copyright (C) 2008 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: io io.encodings kernel math io.encodings.private ; +USING: accessors byte-arrays io io.encodings +io.encodings.private kernel math sequences ; IN: io.encodings.ascii - ] 2bi [ >fixnum ] [ drop replacement-char ] if ] - [ 2drop f ] if ; inline -PRIVATE> - SINGLETON: ascii M: ascii encode-char - 128 encode-if< ; inline + drop + over 127 <= [ stream-write1 ] [ encode-error ] if ; inline + +M: ascii encode-string + drop + [ + dup aux>> + [ [ dup 127 <= [ encode-error ] unless ] B{ } map-as ] + [ >byte-array ] + if + ] dip + stream-write ; M: ascii decode-char - 128 decode-if< ; inline + drop + stream-read1 dup [ + dup 127 <= [ >fixnum ] [ drop replacement-char ] if + ] when ; inline diff --git a/basis/io/ports/ports.factor b/basis/io/ports/ports.factor index 0927e7e480..cd0843a70b 100644 --- a/basis/io/ports/ports.factor +++ b/basis/io/ports/ports.factor @@ -114,7 +114,7 @@ M: output-port stream-write1 : write-in-groups ( byte-array port -- ) [ binary-object ] dip - [ buffer>> size>> ] [ '[ _ stream-write ] ] bi + [ buffer>> size>> ] [ '[ _ stream-write ] ] bi each ; M: output-port stream-write @@ -198,5 +198,3 @@ io.encodings.private ; HINTS: decoder-read-until { string input-port utf8 } { string input-port ascii } ; HINTS: decoder-readln { input-port utf8 } { input-port ascii } ; - -HINTS: encoder-write { object output-port utf8 } { object output-port ascii } ; diff --git a/basis/io/streams/byte-array/fast/authors.txt b/basis/io/streams/byte-array/fast/authors.txt new file mode 100644 index 0000000000..1901f27a24 --- /dev/null +++ b/basis/io/streams/byte-array/fast/authors.txt @@ -0,0 +1 @@ +Slava Pestov diff --git a/basis/io/streams/byte-array/fast/fast.factor b/basis/io/streams/byte-array/fast/fast.factor new file mode 100644 index 0000000000..e231335bfd --- /dev/null +++ b/basis/io/streams/byte-array/fast/fast.factor @@ -0,0 +1,15 @@ +! Copyright (C) 2010 Slava Pestov. +! See http://factorcode.org/license.txt for BSD license. +USING: alien byte-vectors io kernel libc math sequences ; +IN: io.streams.byte-array.fast + +! This is split off from io.streams.byte-array because it uses +! memcpy, which is a non-core word that only works after the +! optimizing compiler has been loaded. + +M: byte-vector stream-write + [ dup byte-length tail-slice ] + [ [ [ byte-length ] bi@ + ] keep lengthen ] + [ drop byte-length ] + 2tri + [ >c-ptr swap >c-ptr ] dip memcpy ; diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor index 03e8723d20..1880859db1 100644 --- a/core/io/encodings/encodings.factor +++ b/core/io/encodings/encodings.factor @@ -1,4 +1,4 @@ -! Copyright (C) 2008 Daniel Ehrenberg. +! Copyright (C) 2008, 2010 Daniel Ehrenberg, Slava Pestov. ! See http://factorcode.org/license.txt for BSD license. USING: math kernel sequences sbufs vectors namespaces growable strings io classes continuations destructors combinators @@ -12,6 +12,10 @@ GENERIC: decode-char ( stream encoding -- char/f ) GENERIC: encode-char ( char stream encoding -- ) +GENERIC: encode-string ( string stream encoding -- ) + +M: object encode-string [ encode-char ] 2curry each ; inline + GENERIC: ( stream encoding -- newstream ) CONSTANT: replacement-char HEX: fffd @@ -134,13 +138,8 @@ M: encoder stream-element-type M: encoder stream-write1 >encoder< encode-char ; -GENERIC# encoder-write 2 ( string stream encoding -- ) - -M: string encoder-write - [ encode-char ] 2curry each ; - M: encoder stream-write - >encoder< encoder-write ; + >encoder< encode-string ; M: encoder dispose stream>> dispose ; diff --git a/core/io/encodings/utf8/utf8.factor b/core/io/encodings/utf8/utf8.factor index 2911385c09..c78a86c072 100644 --- a/core/io/encodings/utf8/utf8.factor +++ b/core/io/encodings/utf8/utf8.factor @@ -1,7 +1,8 @@ ! Copyright (C) 2006, 2008 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: math math.order kernel sequences sbufs vectors growable io -continuations namespaces io.encodings combinators strings ; +USING: accessors byte-arrays math math.order kernel sequences +sbufs vectors growable io continuations namespaces io.encodings +combinators strings ; IN: io.encodings.utf8 ! Decoding UTF-8 @@ -45,10 +46,10 @@ M: utf8 decode-char ! Encoding UTF-8 : encoded ( stream char -- ) - BIN: 111111 bitand BIN: 10000000 bitor swap stream-write1 ; + BIN: 111111 bitand BIN: 10000000 bitor swap stream-write1 ; inline -: char>utf8 ( stream char -- ) - { +: char>utf8 ( char stream -- ) + swap { { [ dup -7 shift zero? ] [ swap stream-write1 ] } { [ dup -11 shift zero? ] [ 2dup -6 shift BIN: 11000000 bitor swap stream-write1 @@ -65,10 +66,16 @@ M: utf8 decode-char 2dup -6 shift encoded encoded ] - } cond ; + } cond ; inline M: utf8 encode-char - drop swap char>utf8 ; + drop char>utf8 ; + +M: utf8 encode-string + drop + over aux>> + [ [ char>utf8 ] curry each ] + [ [ >byte-array ] dip stream-write ] if ; PRIVATE> diff --git a/core/strings/strings.factor b/core/strings/strings.factor index 18af08b3f6..50d79a4d8a 100644 --- a/core/strings/strings.factor +++ b/core/strings/strings.factor @@ -1,8 +1,7 @@ ! Copyright (C) 2003, 2008 Slava Pestov. ! See http://factorcode.org/license.txt for BSD license. USING: accessors kernel math.private sequences kernel.private -math sequences.private slots.private byte-arrays -alien.accessors ; +math sequences.private slots.private alien.accessors ; IN: strings