soundex: fix soundex to handle test cases from wikipedia.
parent
2a5e0e7f4f
commit
3ffa47de6c
|
@ -3,3 +3,11 @@ USING: soundex tools.test ;
|
||||||
|
|
||||||
{ "S162" } [ "supercalifrag" soundex ] unit-test
|
{ "S162" } [ "supercalifrag" soundex ] unit-test
|
||||||
{ "M000" } [ "M" soundex ] unit-test
|
{ "M000" } [ "M" soundex ] unit-test
|
||||||
|
|
||||||
|
{ "R163" } [ "Robert" soundex ] unit-test
|
||||||
|
{ "R163" } [ "Rupert" soundex ] unit-test
|
||||||
|
{ "R150" } [ "Rubin" soundex ] unit-test
|
||||||
|
{ "A261" } [ "Ashcraft" soundex ] unit-test
|
||||||
|
{ "A261" } [ "Ashcroft" soundex ] unit-test
|
||||||
|
{ "T522" } [ "Tymczak" soundex ] unit-test
|
||||||
|
{ "P236" } [ "Pfister" soundex ] unit-test
|
||||||
|
|
|
@ -1,32 +1,32 @@
|
||||||
! Copyright (C) 2008 Slava Pestov.
|
! Copyright (C) 2008 Slava Pestov.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: sequences grouping assocs kernel ascii tr ;
|
USING: ascii kernel sequences tr ;
|
||||||
IN: soundex
|
IN: soundex
|
||||||
|
|
||||||
TR: soundex-tr
|
TR: soundex-digits
|
||||||
ch>upper
|
|
||||||
"AEHIOUWYBFPVCGJKQSXZDTLMNR"
|
"AEHIOUWYBFPVCGJKQSXZDTLMNR"
|
||||||
"00000000111122222222334556" ;
|
"AEHIOUWY111122222222334556" ;
|
||||||
|
|
||||||
: remove-duplicates ( seq -- seq' )
|
: remove-duplicates ( seq -- seq' )
|
||||||
! Remove _consecutive_ duplicates (unlike prune which removes
|
! Remove _consecutive_ duplicates (unlike prune which removes
|
||||||
! all duplicates).
|
! all duplicates).
|
||||||
[ 2 <clumps> [ = ] assoc-reject values ] [ first ] bi prefix ;
|
f swap [ [ = ] keep swap ] reject nip ;
|
||||||
|
|
||||||
: first>upper ( seq -- seq' ) 1 head >upper ;
|
: pad-4 ( seq -- seq' ) "000" append 4 head ;
|
||||||
: trim-first ( seq -- seq' ) dup first [ = ] curry trim-head ;
|
|
||||||
: remove-zeroes ( seq -- seq' ) CHAR: 0 swap remove ;
|
: remove-hw ( seq -- seq' )
|
||||||
: remove-non-alpha ( seq -- seq' ) [ alpha? ] filter ;
|
unclip [ [ "HW" member? ] reject ] [ prefix ] bi* ;
|
||||||
: pad-4 ( first seq -- seq' ) "000" 3append 4 head ;
|
|
||||||
|
: remove-aeiouy ( seq -- seq' )
|
||||||
|
unclip [ [ "AEIOUY" member? ] reject ] [ prefix ] bi* ;
|
||||||
|
|
||||||
|
: ?replace-first ( seq first -- seq )
|
||||||
|
over first digit? [ over set-first ] [ drop ] if ;
|
||||||
|
|
||||||
: soundex ( string -- soundex )
|
: soundex ( string -- soundex )
|
||||||
remove-non-alpha [ f ] [
|
>upper [ LETTER? ] filter [
|
||||||
[ first>upper ]
|
remove-hw
|
||||||
[
|
soundex-digits
|
||||||
soundex-tr
|
remove-duplicates
|
||||||
[ "" ] [ trim-first ] if-empty
|
remove-aeiouy
|
||||||
[ "" ] [ remove-duplicates ] if-empty
|
] keep first ?replace-first pad-4 ;
|
||||||
remove-zeroes
|
|
||||||
] bi
|
|
||||||
pad-4
|
|
||||||
] if-empty ;
|
|
||||||
|
|
Loading…
Reference in New Issue