New extra/tr/ vocab for fast translation of ASCII strings; improves reverse-complement performance by 11%; add soundex vocab which uses tr
parent
eba4b990af
commit
442bde22e5
|
@ -1,30 +1,20 @@
|
||||||
USING: io io.files io.streams.duplex kernel sequences
|
USING: io io.files io.streams.duplex kernel sequences
|
||||||
sequences.private strings vectors words memoize splitting
|
sequences.private strings vectors words memoize splitting
|
||||||
grouping hints unicode.case continuations io.encodings.ascii ;
|
grouping hints tr continuations io.encodings.ascii
|
||||||
|
unicode.case ;
|
||||||
IN: benchmark.reverse-complement
|
IN: benchmark.reverse-complement
|
||||||
|
|
||||||
MEMO: trans-map ( -- str )
|
TR: trans-map ch>upper "ACGTUMRYKVHDB" "TGCAAKYRMBDHV" ;
|
||||||
256 >string
|
|
||||||
"TGCAAKYRMBDHV" "ACGTUMRYKVHDB"
|
|
||||||
[ pick set-nth ] 2each ;
|
|
||||||
|
|
||||||
: do-trans-map ( str -- )
|
|
||||||
[ ch>upper trans-map nth ] change-each ;
|
|
||||||
|
|
||||||
HINTS: do-trans-map string ;
|
|
||||||
|
|
||||||
: translate-seq ( seq -- str )
|
: translate-seq ( seq -- str )
|
||||||
concat dup reverse-here dup do-trans-map ;
|
concat dup reverse-here dup trans-map-fast ;
|
||||||
|
|
||||||
: show-seq ( seq -- )
|
: show-seq ( seq -- )
|
||||||
translate-seq 60 <groups> [ print ] each ;
|
translate-seq 60 <groups> [ print ] each ;
|
||||||
|
|
||||||
: do-line ( seq line -- seq )
|
: do-line ( seq line -- seq )
|
||||||
dup first ">;" memq? [
|
dup first ">;" memq?
|
||||||
over show-seq print dup delete-all
|
[ over show-seq print dup delete-all ] [ over push ] if ;
|
||||||
] [
|
|
||||||
over push
|
|
||||||
] if ;
|
|
||||||
|
|
||||||
HINTS: do-line vector string ;
|
HINTS: do-line vector string ;
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
Slava Pestov
|
|
@ -0,0 +1,4 @@
|
||||||
|
IN: soundex.tests
|
||||||
|
USING: soundex tools.test ;
|
||||||
|
|
||||||
|
[ "S162" ] [ "supercalifrag" soundex ] unit-test
|
|
@ -0,0 +1,33 @@
|
||||||
|
! Copyright (C) 2008 Slava Pestov.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: sequences sequences.lib grouping assocs kernel ascii
|
||||||
|
unicode.case tr ;
|
||||||
|
IN: soundex
|
||||||
|
|
||||||
|
TR: soundex-tr
|
||||||
|
ch>upper
|
||||||
|
"AEHIOUWYBFPVCGJKQSXZDTLMNR"
|
||||||
|
"00000000111122222222334556" ;
|
||||||
|
|
||||||
|
: remove-duplicates ( seq -- seq' )
|
||||||
|
#! Remove _consecutive_ duplicates (unlike prune which removes
|
||||||
|
#! all duplicates).
|
||||||
|
[ 2 <clumps> [ = not ] assoc-filter values ] [ first ] bi prefix ;
|
||||||
|
|
||||||
|
: first>upper ( seq -- seq' ) 1 head >upper ;
|
||||||
|
: trim-first ( seq -- seq' ) dup first [ = ] curry left-trim ;
|
||||||
|
: remove-zeroes ( seq -- seq' ) CHAR: 0 swap remove ;
|
||||||
|
: remove-non-alpha ( seq -- seq' ) [ alpha? ] filter ;
|
||||||
|
: pad-4 ( first seq -- seq' ) "000" 3append 4 head ;
|
||||||
|
|
||||||
|
: soundex ( string -- soundex )
|
||||||
|
remove-non-alpha [ f ] [
|
||||||
|
[ first>upper ]
|
||||||
|
[
|
||||||
|
soundex-tr
|
||||||
|
trim-first
|
||||||
|
remove-duplicates
|
||||||
|
remove-zeroes
|
||||||
|
] bi
|
||||||
|
pad-4
|
||||||
|
] if-empty ;
|
|
@ -0,0 +1 @@
|
||||||
|
Soundex is a phonetic algorithm for indexing names by sound
|
|
@ -0,0 +1 @@
|
||||||
|
Slava Pestov
|
|
@ -0,0 +1 @@
|
||||||
|
Fast character-to-character translation of ASCII strings
|
|
@ -0,0 +1,7 @@
|
||||||
|
IN: tr.tests
|
||||||
|
USING: tr tools.test unicode.case ;
|
||||||
|
|
||||||
|
TR: tr-test ch>upper "ABC" "XYZ" ;
|
||||||
|
|
||||||
|
[ "XXYY" ] [ "aabb" tr-test ] unit-test
|
||||||
|
[ "XXYY" ] [ "AABB" tr-test ] unit-test
|
|
@ -0,0 +1,35 @@
|
||||||
|
! Copyright (C) 2008 Slava Pestov.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: byte-arrays strings sequences sequences.private
|
||||||
|
fry kernel words parser lexer assocs ;
|
||||||
|
IN: tr
|
||||||
|
|
||||||
|
<PRIVATE
|
||||||
|
|
||||||
|
: compute-tr ( quot from to -- mapping )
|
||||||
|
zip [ 256 ] 2dip '[ [ @ , at ] keep or ] B{ } map-as ; inline
|
||||||
|
|
||||||
|
: tr-hints ( word -- )
|
||||||
|
{ { byte-array } { string } } "specializer" set-word-prop ;
|
||||||
|
|
||||||
|
: create-tr ( token -- word )
|
||||||
|
create-in dup tr-hints ;
|
||||||
|
|
||||||
|
: define-tr ( word mapping -- )
|
||||||
|
'[ [ , nth ] map ]
|
||||||
|
(( seq -- translated ))
|
||||||
|
define-declared ;
|
||||||
|
|
||||||
|
: define-fast-tr ( word mapping -- )
|
||||||
|
'[ [ , nth-unsafe ] change-each ]
|
||||||
|
(( seq -- ))
|
||||||
|
define-declared ;
|
||||||
|
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
|
: TR:
|
||||||
|
scan parse-definition
|
||||||
|
unclip-last [ unclip-last ] dip compute-tr
|
||||||
|
[ [ create-tr ] dip define-tr ]
|
||||||
|
[ [ "-fast" append create-tr ] dip define-fast-tr ] 2bi ;
|
||||||
|
parsing
|
Loading…
Reference in New Issue