From 7c5012da679a1ea1f6de34f2ad20bff36552bbb4 Mon Sep 17 00:00:00 2001 From: John Benediktsson Date: Mon, 26 Feb 2018 10:32:53 -0800 Subject: [PATCH] dbf: beginning to parse dBase database files. --- extra/dbf/authors.txt | 1 + extra/dbf/dbf.factor | 312 ++++++++++++++++++++++++++++++++++++++++++ extra/dbf/summary.txt | 1 + 3 files changed, 314 insertions(+) create mode 100644 extra/dbf/authors.txt create mode 100644 extra/dbf/dbf.factor create mode 100644 extra/dbf/summary.txt diff --git a/extra/dbf/authors.txt b/extra/dbf/authors.txt new file mode 100644 index 0000000000..e091bb8164 --- /dev/null +++ b/extra/dbf/authors.txt @@ -0,0 +1 @@ +John Benediktsson diff --git a/extra/dbf/dbf.factor b/extra/dbf/dbf.factor new file mode 100644 index 0000000000..240267bbac --- /dev/null +++ b/extra/dbf/dbf.factor @@ -0,0 +1,312 @@ +! Copyright (C) 2018 John Benediktsson +! See http://factorcode.org/license.txt for BSD license + +USING: accessors alien.c-types alien.endian alien.strings assocs +calendar calendar.parser classes.struct combinators fry io +io.binary io.encodings.8-bit io.encodings.ascii +io.encodings.binary io.encodings.string io.files +io.streams.byte-array kernel math math.parser namespaces +sequences splitting strings ; + +IN: dbf + +SYMBOL: dbf-encoding + +CONSTANT: dbf-encodings H{ + { 0x00 ascii } + { 0x01 cp437 } + { 0x02 cp850 } + { 0x03 windows-1252 } + { 0x04 mac-roman } + { 0x08 cp865 } + { 0x09 cp437 } + { 0x0A cp850 } + { 0x0B cp437 } + { 0x0D cp437 } + { 0x0E cp850 } + { 0x0F cp437 } + { 0x10 cp850 } + { 0x11 cp437 } + { 0x12 cp850 } + { 0x13 "cp932" } + { 0x14 cp850 } + { 0x15 cp437 } + { 0x16 cp850 } + { 0x17 cp865 } + { 0x18 cp437 } + { 0x19 cp437 } + { 0x1A cp850 } + { 0x1B cp437 } + { 0x1C cp863 } + { 0x1D cp850 } + { 0x1F cp852 } + { 0x22 cp852 } + { 0x23 cp852 } + { 0x24 cp860 } + { 0x25 cp850 } + { 0x26 cp866 } + { 0x37 cp850 } + { 0x40 cp852 } + { 0x4D "cp936" } + { 0x4E "cp949" } + { 0x4F "cp950" } + { 0x50 "cp874" } + { 0x57 windows-1252 } + { 0x58 windows-1252 } + { 0x59 windows-1252 } + { 0x64 cp852 } + { 0x65 cp866 } + { 0x66 cp865 } + { 0x67 cp861 } + { 0x68 f } + { 0x69 f } + { 0x6a "cp737" } + { 0x6b cp857 } + { 0x6c cp863 } + { 0x78 "cp950" } + { 0x79 "cp949" } + { 0x7a "cp936" } + { 0x7b "cp932" } + { 0x7c "cp874" } + { 0x7d windows-1255 } + { 0x7e windows-1256 } + { 0x86 "cp737" } + { 0x87 cp852 } + { 0x88 cp857 } + { 0x96 "mac-cyrillic" } + { 0x97 "mac-latin2" } + { 0x98 "mac-greek" } + { 0xc8 windows-1250 } + { 0xc9 windows-1251 } + { 0xca windows-1254 } + { 0xcb windows-1253 } + { 0xcc windows-1250 } +} + +CONSTANT: dbf-file-types H{ + { 0x02 "FoxBASE" } + { 0x03 "FoxBASE+/dBase III plus, no memo" } + { 0x04 "dBase IV, no memo" } + { 0x05 "dBase V, no memo" } + { 0x07 "Visual Objects 1.x" } + { 0x30 "Visual FoxPro" } + { 0x31 "Visual FoxPro, autoincrement enabled" } + { 0x32 "Visual FoxPro with field type Varchar or Varbinary" } + { 0x43 "dBase IV SQL table files, no memo" } + { 0x63 "dBase IV SQL system files, no memo" } + { 0x7b "dBase IV, with memo" } + { 0x83 "FoxBASE+/dBase III PLUS, with memo" } + { 0x87 "Visual Objects 1.x, with memo" } + { 0x8B "dBase IV with memo" } + { 0x8E "dBase IV with SQL table" } + { 0xCB "dBase IV SQL table files, with memo" } + { 0xE5 "HiPer-Six format with SMT memo file" } + { 0xF5 "FoxPro 2.x (or earlier) with memo" } + { 0xFB "FoxBASE" } +} + +STRUCT: dbf-file-header + { file-type uint8_t } + { year uint8_t } + { month uint8_t } + { day uint8_t } + { #records uint32_t } + { header-length uint16_t } + { record-length uint16_t } + { reserved1 uint16_t } + { incomplete-transaction uint8_t } + { encryption-flag uint8_t } + { free-record-thread uint32_t } + { reserved2 uint32_t } + { reserved3 uint32_t } + { mdx-flag uint8_t } + { language-driver uint8_t } + { reserved4 uint16_t } ; + +: read-file-header ( -- file-header ) + dbf-file-header read-struct ; + +CONSTANT: dbf-field-flags H{ + { 0x01 "System Column (not visible to user)" } + { 0x02 "Column can store null values" } + { 0x04 "Binary column (for CHAR and MEMO only)" } + { 0x06 "(0x02+0x04) When a field is NULL and binary (Integer, Currency, and Character/Memo fields)" } + { 0x0C "Column is autoincrementing" } +} + +STRUCT: dbf-field-header + { name uint8_t[11] } + { type uint8_t } + { address uint32_t } + { length uint8_t } + { #decimals uint8_t } + { reserved1 uint16_t } + { workarea-id uint8_t } + { reserved2 uint8_t } + { reserved3 uint8_t } + { set-fields-flag uint8_t } + { reserved4 uint8_t[7] } + { index-field-flag uint8_t } ; + +: read-field-headers ( -- field-headers ) + [ read1 dup { CHAR: \r CHAR: \n f } member? not ] [ + dbf-field-header heap-size 1 - read swap prefix + dbf-field-header memory>struct + ] produce nip ; + +: check-field-header ( field-header -- field-header ) + dup type>> { + { CHAR: I [ dup length>> 4 assert= ] } + { CHAR: L [ dup length>> 1 assert= ] } + { CHAR: O [ dup length>> 8 assert= ] } + { CHAR: Y [ dup length>> 8 assert= ] } + { CHAR: D [ dup length>> 8 assert= ] } + { CHAR: T [ dup length>> 8 assert= ] } + { CHAR: M [ dup length>> 10 assert= ] } + [ drop ] + } case ; + +: check-record-length ( file-header field-headers -- ) + [ record-length>> ] [ [ length>> ] map-sum ] bi* assert= ; + +DEFER: parse-field + +TUPLE: record deleted? values ; + +: read-records ( field-headers -- records ) + [ read1 dup { 0x1a f } member? not ] + [ + CHAR: * = over [ + [ length>> read ] + [ type>> parse-field ] bi + ] map record boa + ] produce 2nip ; + +TUPLE: dbf file-header field-headers records ; + +: load-dbf ( path -- dbf ) + binary [ + read-file-header + read-field-headers + over header-length>> seek-absolute seek-input + over language-driver>> dbf-encodings at dbf-encoding [ + dup read-records dbf boa + ] with-variable + ] with-file-reader ; + +: seek-record ( n file-header -- ) + [ record-length>> * ] [ header-length>> ] bi + + seek-absolute seek-input ; + +: parse-string ( byte-array -- string ) + [ " \0" member? ] trim-tail dbf-encoding get decode ; + +: parse-date ( byte-array -- date/f ) + dup [ " \0" member? ] all? [ drop f ] [ + binary [ (ymd>timestamp) ] with-byte-reader + ] if ; + +: parse-float ( byte-array -- n ) + [ "\r\n\t *" member? ] trim string>number ; + +: parse-int ( byte-array -- n ) + dup length 4 assert= le> ; + +: parse-short ( byte-array -- n ) + dup length 2 assert= le> ; + +SYMBOL: unknown + +ERROR: illegal-logical value ; + +: parse-logical ( byte-array -- n ) + first { + { [ dup "TtYy" member? ] [ drop t ] } + { [ dup "FfNn" member? ] [ drop f ] } + { [ dup "? " member? ] [ drop unknown ] } + [ illegal-logical ] + } cond ; + +: parse-numeric ( byte-array -- n ) + [ "\r\n\t *" member? ] trim "," "." replace string>number ; + +: parse-double ( byte-array -- n ) + dup length 8 assert= le> bits>double ; + +: parse-currency ( byte-array -- n ) + dup length 8 assert= le> 10000 / ; + +: parse-timestamp ( byte-array -- timestamp ) + [ -4713 1 1 ] dip 4 cut [ le> ] bi@ + [ days time+ ] [ milliseconds time+ ] bi* ; + +ERROR: unsupported-field-type type ; + +: parse-field ( byte-array type -- data ) + { + { CHAR: \0 [ ] } + { CHAR: 2 [ parse-short ] } + { CHAR: 4 [ parse-int ] } + { CHAR: 8 [ parse-double ] } + { CHAR: C [ parse-string ] } + { CHAR: D [ parse-date ] } + { CHAR: F [ parse-float ] } + { CHAR: I [ parse-int ] } + { CHAR: L [ parse-logical ] } + { CHAR: N [ parse-numeric ] } + { CHAR: O [ parse-double ] } + { CHAR: V [ parse-string ] } + { CHAR: Y [ parse-currency ] } + { CHAR: @ [ parse-timestamp ] } + ! { CHAR: + [ parse-autoincrement ] } + ! { CHAR: M [ parse-memo ] } + ! { CHAR: T [ parse-datetime ] } + ! { CHAR: B [ parse-double? ] } ! (only on dbversion in [0x30, 0x31, 0x32]) + ! { CHAR: G [ parse-general ] } + ! { CHAR: P [ parse-picture ] } + ! { CHAR: Q [ parse-varbinary ] } + [ unsupported-field-type ] + } case ; + +: dbase3-memo ( n path -- data ) + binary [ + 512 * seek-absolute seek-input + B{ } [ + 512 read + dup [ B{ 0 0x1a } member? ] find drop + [ head f ] [ t ] if* [ append ] dip + ] loop + ] with-file-reader ; + +LE-STRUCT: db4-memo-header + { reserved uint } ! B{ 0xff 0xff 0x08 0x08 } + { length uint } ; + +: dbase4-memo ( n path -- data ) + binary [ + 512 * seek-absolute seek-input + db4-memo-header read-struct length>> read + ] with-file-reader ; + +BE-STRUCT: vfp-file-header + { nextblock uint } + { reserved1 ushort } + { blocksize ushort } + { reserved2 uchar[504] } ; + +BE-STRUCT: vfp-memo-header + { type uint } + { length uint } ; + +CONSTANT: vfp-memo-types H{ + { 0x0 "picture memo" } + { 0x1 "text memo" } + { 0x2 "object memo" } +} + +: vfp-memo ( n path -- data ) + binary [ + vfp-file-header read-struct blocksize>> * + seek-absolute seek-input + vfp-memo-header read-struct length>> read + ] with-file-reader ; diff --git a/extra/dbf/summary.txt b/extra/dbf/summary.txt new file mode 100644 index 0000000000..372aeff006 --- /dev/null +++ b/extra/dbf/summary.txt @@ -0,0 +1 @@ +Parsers for various dBase database files