From 442bde22e581e01fb493070e66f976fc66892f80 Mon Sep 17 00:00:00 2001 From: Slava Pestov Date: Wed, 9 Jul 2008 19:25:24 -0500 Subject: [PATCH] New extra/tr/ vocab for fast translation of ASCII strings; improves reverse-complement performance by 11%; add soundex vocab which uses tr --- .../reverse-complement.factor | 22 ++++-------- extra/soundex/author.txt | 1 + extra/soundex/soundex-tests.factor | 4 +++ extra/soundex/soundex.factor | 33 +++++++++++++++++ extra/soundex/summary.txt | 1 + extra/tr/authors.txt | 1 + extra/tr/summary.txt | 1 + extra/tr/tr-tests.factor | 7 ++++ extra/tr/tr.factor | 35 +++++++++++++++++++ 9 files changed, 89 insertions(+), 16 deletions(-) create mode 100644 extra/soundex/author.txt create mode 100644 extra/soundex/soundex-tests.factor create mode 100644 extra/soundex/soundex.factor create mode 100644 extra/soundex/summary.txt create mode 100644 extra/tr/authors.txt create mode 100644 extra/tr/summary.txt create mode 100644 extra/tr/tr-tests.factor create mode 100644 extra/tr/tr.factor diff --git a/extra/benchmark/reverse-complement/reverse-complement.factor b/extra/benchmark/reverse-complement/reverse-complement.factor index b7c1db043c..665cbba30d 100755 --- a/extra/benchmark/reverse-complement/reverse-complement.factor +++ b/extra/benchmark/reverse-complement/reverse-complement.factor @@ -1,30 +1,20 @@ USING: io io.files io.streams.duplex kernel sequences sequences.private strings vectors words memoize splitting -grouping hints unicode.case continuations io.encodings.ascii ; +grouping hints tr continuations io.encodings.ascii +unicode.case ; IN: benchmark.reverse-complement -MEMO: trans-map ( -- str ) - 256 >string - "TGCAAKYRMBDHV" "ACGTUMRYKVHDB" - [ pick set-nth ] 2each ; - -: do-trans-map ( str -- ) - [ ch>upper trans-map nth ] change-each ; - -HINTS: do-trans-map string ; +TR: trans-map ch>upper "ACGTUMRYKVHDB" "TGCAAKYRMBDHV" ; : translate-seq ( seq -- str ) - concat dup reverse-here dup do-trans-map ; + concat dup reverse-here dup trans-map-fast ; : show-seq ( seq -- ) translate-seq 60 [ print ] each ; : do-line ( seq line -- seq ) - dup first ">;" memq? [ - over show-seq print dup delete-all - ] [ - over push - ] if ; + dup first ">;" memq? + [ over show-seq print dup delete-all ] [ over push ] if ; HINTS: do-line vector string ; diff --git a/extra/soundex/author.txt b/extra/soundex/author.txt new file mode 100644 index 0000000000..1901f27a24 --- /dev/null +++ b/extra/soundex/author.txt @@ -0,0 +1 @@ +Slava Pestov diff --git a/extra/soundex/soundex-tests.factor b/extra/soundex/soundex-tests.factor new file mode 100644 index 0000000000..df6338c4ec --- /dev/null +++ b/extra/soundex/soundex-tests.factor @@ -0,0 +1,4 @@ +IN: soundex.tests +USING: soundex tools.test ; + +[ "S162" ] [ "supercalifrag" soundex ] unit-test diff --git a/extra/soundex/soundex.factor b/extra/soundex/soundex.factor new file mode 100644 index 0000000000..c82825d814 --- /dev/null +++ b/extra/soundex/soundex.factor @@ -0,0 +1,33 @@ +! Copyright (C) 2008 Slava Pestov. +! See http://factorcode.org/license.txt for BSD license. +USING: sequences sequences.lib grouping assocs kernel ascii +unicode.case tr ; +IN: soundex + +TR: soundex-tr + ch>upper + "AEHIOUWYBFPVCGJKQSXZDTLMNR" + "00000000111122222222334556" ; + +: remove-duplicates ( seq -- seq' ) + #! Remove _consecutive_ duplicates (unlike prune which removes + #! all duplicates). + [ 2 [ = not ] assoc-filter values ] [ first ] bi prefix ; + +: first>upper ( seq -- seq' ) 1 head >upper ; +: trim-first ( seq -- seq' ) dup first [ = ] curry left-trim ; +: remove-zeroes ( seq -- seq' ) CHAR: 0 swap remove ; +: remove-non-alpha ( seq -- seq' ) [ alpha? ] filter ; +: pad-4 ( first seq -- seq' ) "000" 3append 4 head ; + +: soundex ( string -- soundex ) + remove-non-alpha [ f ] [ + [ first>upper ] + [ + soundex-tr + trim-first + remove-duplicates + remove-zeroes + ] bi + pad-4 + ] if-empty ; diff --git a/extra/soundex/summary.txt b/extra/soundex/summary.txt new file mode 100644 index 0000000000..95a271d911 --- /dev/null +++ b/extra/soundex/summary.txt @@ -0,0 +1 @@ +Soundex is a phonetic algorithm for indexing names by sound diff --git a/extra/tr/authors.txt b/extra/tr/authors.txt new file mode 100644 index 0000000000..1901f27a24 --- /dev/null +++ b/extra/tr/authors.txt @@ -0,0 +1 @@ +Slava Pestov diff --git a/extra/tr/summary.txt b/extra/tr/summary.txt new file mode 100644 index 0000000000..8678446951 --- /dev/null +++ b/extra/tr/summary.txt @@ -0,0 +1 @@ +Fast character-to-character translation of ASCII strings diff --git a/extra/tr/tr-tests.factor b/extra/tr/tr-tests.factor new file mode 100644 index 0000000000..1eea69ba07 --- /dev/null +++ b/extra/tr/tr-tests.factor @@ -0,0 +1,7 @@ +IN: tr.tests +USING: tr tools.test unicode.case ; + +TR: tr-test ch>upper "ABC" "XYZ" ; + +[ "XXYY" ] [ "aabb" tr-test ] unit-test +[ "XXYY" ] [ "AABB" tr-test ] unit-test diff --git a/extra/tr/tr.factor b/extra/tr/tr.factor new file mode 100644 index 0000000000..a95d308d36 --- /dev/null +++ b/extra/tr/tr.factor @@ -0,0 +1,35 @@ +! Copyright (C) 2008 Slava Pestov. +! See http://factorcode.org/license.txt for BSD license. +USING: byte-arrays strings sequences sequences.private +fry kernel words parser lexer assocs ; +IN: tr + + + +: TR: + scan parse-definition + unclip-last [ unclip-last ] dip compute-tr + [ [ create-tr ] dip define-tr ] + [ [ "-fast" append create-tr ] dip define-fast-tr ] 2bi ; + parsing