diff --git a/basis/soundex/soundex-tests.factor b/basis/soundex/soundex-tests.factor index 7d303b4501..ca0fe91999 100644 --- a/basis/soundex/soundex-tests.factor +++ b/basis/soundex/soundex-tests.factor @@ -3,3 +3,11 @@ USING: soundex tools.test ; { "S162" } [ "supercalifrag" soundex ] unit-test { "M000" } [ "M" soundex ] unit-test + +{ "R163" } [ "Robert" soundex ] unit-test +{ "R163" } [ "Rupert" soundex ] unit-test +{ "R150" } [ "Rubin" soundex ] unit-test +{ "A261" } [ "Ashcraft" soundex ] unit-test +{ "A261" } [ "Ashcroft" soundex ] unit-test +{ "T522" } [ "Tymczak" soundex ] unit-test +{ "P236" } [ "Pfister" soundex ] unit-test diff --git a/basis/soundex/soundex.factor b/basis/soundex/soundex.factor index 91110b3b98..28e43a36e1 100644 --- a/basis/soundex/soundex.factor +++ b/basis/soundex/soundex.factor @@ -1,32 +1,32 @@ ! Copyright (C) 2008 Slava Pestov. ! See http://factorcode.org/license.txt for BSD license. -USING: sequences grouping assocs kernel ascii tr ; +USING: ascii kernel sequences tr ; IN: soundex -TR: soundex-tr - ch>upper +TR: soundex-digits "AEHIOUWYBFPVCGJKQSXZDTLMNR" - "00000000111122222222334556" ; + "AEHIOUWY111122222222334556" ; : remove-duplicates ( seq -- seq' ) ! Remove _consecutive_ duplicates (unlike prune which removes ! all duplicates). - [ 2 [ = ] assoc-reject values ] [ first ] bi prefix ; + f swap [ [ = ] keep swap ] reject nip ; -: first>upper ( seq -- seq' ) 1 head >upper ; -: trim-first ( seq -- seq' ) dup first [ = ] curry trim-head ; -: remove-zeroes ( seq -- seq' ) CHAR: 0 swap remove ; -: remove-non-alpha ( seq -- seq' ) [ alpha? ] filter ; -: pad-4 ( first seq -- seq' ) "000" 3append 4 head ; +: pad-4 ( seq -- seq' ) "000" append 4 head ; + +: remove-hw ( seq -- seq' ) + unclip [ [ "HW" member? ] reject ] [ prefix ] bi* ; + +: remove-aeiouy ( seq -- seq' ) + unclip [ [ "AEIOUY" member? ] reject ] [ prefix ] bi* ; + +: ?replace-first ( seq first -- seq ) + over first digit? [ over set-first ] [ drop ] if ; : soundex ( string -- soundex ) - remove-non-alpha [ f ] [ - [ first>upper ] - [ - soundex-tr - [ "" ] [ trim-first ] if-empty - [ "" ] [ remove-duplicates ] if-empty - remove-zeroes - ] bi - pad-4 - ] if-empty ; + >upper [ LETTER? ] filter [ + remove-hw + soundex-digits + remove-duplicates + remove-aeiouy + ] keep first ?replace-first pad-4 ;