soundex: fix soundex to handle test cases from wikipedia.

locals-and-roots
John Benediktsson 2016-04-16 16:37:44 -07:00
parent 2a5e0e7f4f
commit 3ffa47de6c
2 changed files with 28 additions and 20 deletions

View File

@ -3,3 +3,11 @@ USING: soundex tools.test ;
{ "S162" } [ "supercalifrag" soundex ] unit-test
{ "M000" } [ "M" soundex ] unit-test
{ "R163" } [ "Robert" soundex ] unit-test
{ "R163" } [ "Rupert" soundex ] unit-test
{ "R150" } [ "Rubin" soundex ] unit-test
{ "A261" } [ "Ashcraft" soundex ] unit-test
{ "A261" } [ "Ashcroft" soundex ] unit-test
{ "T522" } [ "Tymczak" soundex ] unit-test
{ "P236" } [ "Pfister" soundex ] unit-test

View File

@ -1,32 +1,32 @@
! Copyright (C) 2008 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license.
USING: sequences grouping assocs kernel ascii tr ;
USING: ascii kernel sequences tr ;
IN: soundex
TR: soundex-tr
ch>upper
TR: soundex-digits
"AEHIOUWYBFPVCGJKQSXZDTLMNR"
"00000000111122222222334556" ;
"AEHIOUWY111122222222334556" ;
: remove-duplicates ( seq -- seq' )
! Remove _consecutive_ duplicates (unlike prune which removes
! all duplicates).
[ 2 <clumps> [ = ] assoc-reject values ] [ first ] bi prefix ;
f swap [ [ = ] keep swap ] reject nip ;
: first>upper ( seq -- seq' ) 1 head >upper ;
: trim-first ( seq -- seq' ) dup first [ = ] curry trim-head ;
: remove-zeroes ( seq -- seq' ) CHAR: 0 swap remove ;
: remove-non-alpha ( seq -- seq' ) [ alpha? ] filter ;
: pad-4 ( first seq -- seq' ) "000" 3append 4 head ;
: pad-4 ( seq -- seq' ) "000" append 4 head ;
: remove-hw ( seq -- seq' )
unclip [ [ "HW" member? ] reject ] [ prefix ] bi* ;
: remove-aeiouy ( seq -- seq' )
unclip [ [ "AEIOUY" member? ] reject ] [ prefix ] bi* ;
: ?replace-first ( seq first -- seq )
over first digit? [ over set-first ] [ drop ] if ;
: soundex ( string -- soundex )
remove-non-alpha [ f ] [
[ first>upper ]
[
soundex-tr
[ "" ] [ trim-first ] if-empty
[ "" ] [ remove-duplicates ] if-empty
remove-zeroes
] bi
pad-4
] if-empty ;
>upper [ LETTER? ] filter [
remove-hw
soundex-digits
remove-duplicates
remove-aeiouy
] keep first ?replace-first pad-4 ;