diff --git a/extra/io/encodings/detect/authors.txt b/extra/io/encodings/detect/authors.txt new file mode 100644 index 0000000000..6a1b3e726a --- /dev/null +++ b/extra/io/encodings/detect/authors.txt @@ -0,0 +1 @@ +Joe Groff diff --git a/extra/io/encodings/detect/detect-tests.factor b/extra/io/encodings/detect/detect-tests.factor new file mode 100644 index 0000000000..570f84d1b9 --- /dev/null +++ b/extra/io/encodings/detect/detect-tests.factor @@ -0,0 +1,40 @@ +! (c)2010 Joe Groff bsd license +USING: byte-arrays.hex io.encodings.8-bit.koi8-r io.encodings.detect tools.test ; +IN: io.encodings.detect.tests + +! UTF encodings with BOMs +[ utf16be ] [ HEX{ FEFF 0031 0032 0033 } detect-byte-array ] unit-test +[ utf16le ] [ HEX{ FFFE 3100 3200 3300 } detect-byte-array ] unit-test +[ utf32be ] [ HEX{ 0000FEFF 00000031 00000032 00000033 } detect-byte-array ] unit-test +[ utf32le ] [ HEX{ FFFE0000 31000000 32000000 33000000 } detect-byte-array ] unit-test +[ utf8 ] [ HEX{ EF BB BF 31 32 33 } detect-byte-array ] unit-test + +! XML prolog +[ utf8 ] +[ """""" >byte-array detect-byte-array ] +unit-test + +[ utf8 ] +[ """""" >byte-array detect-byte-array ] +unit-test + +[ latin1 ] +[ """""" >byte-array detect-byte-array ] +unit-test + +[ latin1 ] +[ """byte-array detect-byte-array ] +unit-test + +! Default to utf8 if decoding succeeds and there are no nulls +[ utf8 ] [ HEX{ 31 32 33 } detect-byte-array ] unit-test +[ utf8 ] [ HEX{ 31 32 C2 A0 33 } detect-byte-array ] unit-test +[ latin1 ] [ HEX{ 31 32 A0 33 } detect-byte-array ] unit-test +[ koi8-r ] [ + koi8-r default-8bit-encoding [ + HEX{ 31 32 A0 33 } detect-byte-array + ] with-variable +] unit-test + +[ binary ] [ HEX{ 31 32 33 C2 A0 00 } detect-byte-array ] unit-test +[ binary ] [ HEX{ 31 32 33 C2 A0 00 30 } detect-byte-array ] unit-test diff --git a/extra/io/encodings/detect/detect.factor b/extra/io/encodings/detect/detect.factor new file mode 100644 index 0000000000..a803c54c53 --- /dev/null +++ b/extra/io/encodings/detect/detect.factor @@ -0,0 +1,49 @@ +! (c)2010 Joe Groff bsd license +USING: accessors byte-arrays byte-arrays.hex combinators +continuations fry io io.encodings io.encodings.8-bit.latin1 +io.encodings.ascii io.encodings.binary io.encodings.iana +io.encodings.string io.encodings.utf16 io.encodings.utf32 +io.encodings.utf8 io.files io.streams.string kernel literals +math namespaces sequences strings ; +IN: io.encodings.detect + +SYMBOL: default-8bit-encoding +default-8bit-encoding [ latin1 ] initialize + + over index [ 1 + ] [ dup length ] if* head-slice >string ; + +: prolog-encoding ( string -- iana-encoding ) + '[ + _ "encoding=" over start + 10 + swap [ [ 1 - ] dip nth ] [ index-from ] [ swapd subseq ] 2tri + ] [ drop "UTF-8" ] recover ; + +: detect-xml-prolog ( bytes -- encoding ) + prolog-tag prolog-encoding name>encoding [ ascii ] unless* ; + +: valid-utf8? ( bytes -- ? ) + utf8 decode 1 head-slice* replacement-char swap member? not ; + +PRIVATE> + +: detect-byte-array ( bytes -- encoding ) + { + { [ dup HEX{ 0000FEFF } head? ] [ drop utf32be ] } + { [ dup HEX{ FFFE0000 } head? ] [ drop utf32le ] } + { [ dup HEX{ FEFF } head? ] [ drop utf16be ] } + { [ dup HEX{ FFFE } head? ] [ drop utf16le ] } + { [ dup HEX{ EF BB BF } head? ] [ drop utf8 ] } + { [ dup $[ "byte-array ] head? ] [ detect-xml-prolog ] } + { [ 0 over member? ] [ drop binary ] } + { [ dup valid-utf8? ] [ drop utf8 ] } + [ drop default-8bit-encoding get ] + } cond ; + +: detect-stream ( stream -- sample encoding ) + 256 swap stream-read dup detect-byte-array ; + +: detect-file ( file -- encoding ) + binary [ input-stream get detect-stream nip ] with-file-reader ; diff --git a/extra/io/encodings/detect/summary.txt b/extra/io/encodings/detect/summary.txt new file mode 100644 index 0000000000..23ab1cd153 --- /dev/null +++ b/extra/io/encodings/detect/summary.txt @@ -0,0 +1 @@ +Heuristic auto-detection of text encodings and binary files