soundex: fix soundex to handle test cases from wikipedia.
							parent
							
								
									2a5e0e7f4f
								
							
						
					
					
						commit
						3ffa47de6c
					
				| 
						 | 
				
			
			@ -3,3 +3,11 @@ USING: soundex tools.test ;
 | 
			
		|||
 | 
			
		||||
{ "S162" } [ "supercalifrag" soundex ] unit-test
 | 
			
		||||
{ "M000" } [ "M" soundex ] unit-test
 | 
			
		||||
 | 
			
		||||
{ "R163" } [ "Robert" soundex ] unit-test
 | 
			
		||||
{ "R163" } [ "Rupert" soundex ] unit-test
 | 
			
		||||
{ "R150" } [ "Rubin" soundex ] unit-test
 | 
			
		||||
{ "A261" } [ "Ashcraft" soundex ] unit-test
 | 
			
		||||
{ "A261" } [ "Ashcroft" soundex ] unit-test
 | 
			
		||||
{ "T522" } [ "Tymczak" soundex ] unit-test
 | 
			
		||||
{ "P236" } [ "Pfister" soundex ] unit-test
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,32 +1,32 @@
 | 
			
		|||
! Copyright (C) 2008 Slava Pestov.
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: sequences grouping assocs kernel ascii tr ;
 | 
			
		||||
USING: ascii kernel sequences tr ;
 | 
			
		||||
IN: soundex
 | 
			
		||||
 | 
			
		||||
TR: soundex-tr
 | 
			
		||||
    ch>upper
 | 
			
		||||
TR: soundex-digits
 | 
			
		||||
    "AEHIOUWYBFPVCGJKQSXZDTLMNR"
 | 
			
		||||
    "00000000111122222222334556" ;
 | 
			
		||||
    "AEHIOUWY111122222222334556" ;
 | 
			
		||||
 | 
			
		||||
: remove-duplicates ( seq -- seq' )
 | 
			
		||||
    ! Remove _consecutive_ duplicates (unlike prune which removes
 | 
			
		||||
    ! all duplicates).
 | 
			
		||||
    [ 2 <clumps> [ = ] assoc-reject values ] [ first ] bi prefix ;
 | 
			
		||||
    f swap [ [ = ] keep swap ] reject nip ;
 | 
			
		||||
 | 
			
		||||
: first>upper ( seq -- seq' ) 1 head >upper ;
 | 
			
		||||
: trim-first ( seq -- seq' ) dup first [ = ] curry trim-head ;
 | 
			
		||||
: remove-zeroes ( seq -- seq' ) CHAR: 0 swap remove ;
 | 
			
		||||
: remove-non-alpha ( seq -- seq' ) [ alpha? ] filter ;
 | 
			
		||||
: pad-4 ( first seq -- seq' ) "000" 3append 4 head ;
 | 
			
		||||
: pad-4 ( seq -- seq' ) "000" append 4 head ;
 | 
			
		||||
 | 
			
		||||
: remove-hw ( seq -- seq' )
 | 
			
		||||
    unclip [ [ "HW" member? ] reject ] [ prefix ] bi* ;
 | 
			
		||||
 | 
			
		||||
: remove-aeiouy ( seq -- seq' )
 | 
			
		||||
    unclip [ [ "AEIOUY" member? ] reject ] [ prefix ] bi* ;
 | 
			
		||||
 | 
			
		||||
: ?replace-first ( seq first -- seq )
 | 
			
		||||
    over first digit? [ over set-first ] [ drop ] if ;
 | 
			
		||||
 | 
			
		||||
: soundex ( string -- soundex )
 | 
			
		||||
    remove-non-alpha [ f ] [
 | 
			
		||||
        [ first>upper ]
 | 
			
		||||
        [
 | 
			
		||||
            soundex-tr
 | 
			
		||||
            [ "" ] [ trim-first ] if-empty
 | 
			
		||||
            [ "" ] [ remove-duplicates ] if-empty
 | 
			
		||||
            remove-zeroes
 | 
			
		||||
        ] bi
 | 
			
		||||
        pad-4
 | 
			
		||||
    ] if-empty ;
 | 
			
		||||
    >upper [ LETTER? ] filter [
 | 
			
		||||
        remove-hw
 | 
			
		||||
        soundex-digits
 | 
			
		||||
        remove-duplicates
 | 
			
		||||
        remove-aeiouy
 | 
			
		||||
    ] keep first ?replace-first pad-4 ;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue