| 
									
										
										
										
											2009-06-11 12:02:40 -04:00
										 |  |  | USING: combinators kernel math parser sequences splitting ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | IN: porter-stemmer | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : consonant? ( i str -- ? )
 | 
					
						
							|  |  |  |     2dup nth dup "aeiou" member? [ | 
					
						
							|  |  |  |         3drop f
 | 
					
						
							|  |  |  |     ] [ | 
					
						
							|  |  |  |         CHAR: y = [ | 
					
						
							|  |  |  |             over zero?
 | 
					
						
							| 
									
										
										
										
											2009-08-13 20:21:44 -04:00
										 |  |  |             [ 2drop t ] [ [ 1 - ] dip consonant? not ] if
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |         ] [ | 
					
						
							|  |  |  |             2drop t
 | 
					
						
							|  |  |  |         ] if
 | 
					
						
							|  |  |  |     ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : skip-vowels ( i str -- i str )
 | 
					
						
							|  |  |  |     2dup bounds-check? [ | 
					
						
							| 
									
										
										
										
											2009-08-13 20:21:44 -04:00
										 |  |  |         2dup consonant? [ [ 1 + ] dip skip-vowels ] unless
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |     ] when ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : skip-consonants ( i str -- i str )
 | 
					
						
							|  |  |  |     2dup bounds-check? [ | 
					
						
							| 
									
										
										
										
											2009-08-13 20:21:44 -04:00
										 |  |  |         2dup consonant? [ [ 1 + ] dip skip-consonants ] when
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |     ] when ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : (consonant-seq) ( n i str -- n )
 | 
					
						
							|  |  |  |     skip-vowels | 
					
						
							|  |  |  |     2dup bounds-check? [ | 
					
						
							| 
									
										
										
										
											2009-08-13 20:21:44 -04:00
										 |  |  |         [ 1 + ] [ 1 + ] [ ] tri* skip-consonants [ 1 + ] dip
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |         (consonant-seq) | 
					
						
							|  |  |  |     ] [ | 
					
						
							|  |  |  |         2drop
 | 
					
						
							|  |  |  |     ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : consonant-seq ( str -- n )
 | 
					
						
							| 
									
										
										
										
											2010-01-14 10:10:13 -05:00
										 |  |  |     [ 0 0 ] dip skip-consonants (consonant-seq) ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : stem-vowel? ( str -- ? )
 | 
					
						
							| 
									
										
										
										
											2010-01-14 10:10:13 -05:00
										 |  |  |     [ length iota ] keep [ consonant? ] curry all? not ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : double-consonant? ( i str -- ? )
 | 
					
						
							|  |  |  |     over 1 < [ | 
					
						
							|  |  |  |         2drop f
 | 
					
						
							|  |  |  |     ] [ | 
					
						
							| 
									
										
										
										
											2009-08-13 20:21:44 -04:00
										 |  |  |         2dup nth [ over 1 - over nth ] dip = [ | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |             consonant? | 
					
						
							|  |  |  |         ] [ | 
					
						
							|  |  |  |             2drop f
 | 
					
						
							|  |  |  |         ] if
 | 
					
						
							|  |  |  |     ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : consonant-end? ( n seq -- ? )
 | 
					
						
							|  |  |  |     [ length swap - ] keep consonant? ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-05-25 17:38:33 -04:00
										 |  |  | : last-is? ( str possibilities -- ? ) [ last ] dip member? ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : cvc? ( str -- ? )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { [ dup length 3 < ] [ drop f ] } | 
					
						
							|  |  |  |         { [ 1 over consonant-end? not ] [ drop f ] } | 
					
						
							|  |  |  |         { [ 2 over consonant-end? ] [ drop f ] } | 
					
						
							|  |  |  |         { [ 3 over consonant-end? not ] [ drop f ] } | 
					
						
							| 
									
										
										
										
											2008-04-11 13:56:48 -04:00
										 |  |  |         [ "wxy" last-is? not ] | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |     } cond ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : r ( str oldsuffix newsuffix -- str )
 | 
					
						
							|  |  |  |     pick consonant-seq 0 > [ nip ] [ drop ] if append ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : step1a ( str -- newstr )
 | 
					
						
							| 
									
										
										
										
											2009-05-25 17:38:33 -04:00
										 |  |  |     dup last CHAR: s = [ | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |         { | 
					
						
							|  |  |  |             { [ "sses" ?tail ] [ "ss" append ] } | 
					
						
							|  |  |  |             { [ "ies" ?tail ] [ "i" append ] } | 
					
						
							|  |  |  |             { [ dup "ss" tail? ] [ ] } | 
					
						
							|  |  |  |             { [ "s" ?tail ] [ ] } | 
					
						
							| 
									
										
										
										
											2008-04-11 13:56:48 -04:00
										 |  |  |             [ ] | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |         } cond
 | 
					
						
							|  |  |  |     ] when ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : -eed ( str -- str )
 | 
					
						
							|  |  |  |     dup consonant-seq 0 > "ee" "eed" ? append ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : -ed ( str -- str ? )
 | 
					
						
							|  |  |  |     dup stem-vowel? [ [ "ed" append ] unless ] keep ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : -ing ( str -- str ? )
 | 
					
						
							|  |  |  |     dup stem-vowel? [ [ "ing" append ] unless ] keep ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : -ed/ing ( str -- str )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { [ "at" ?tail ] [ "ate" append ] } | 
					
						
							|  |  |  |         { [ "bl" ?tail ] [ "ble" append ] } | 
					
						
							|  |  |  |         { [ "iz" ?tail ] [ "ize" append ] } | 
					
						
							|  |  |  |         { | 
					
						
							| 
									
										
										
										
											2009-08-13 20:21:44 -04:00
										 |  |  |             [ dup length 1 - over double-consonant? ] | 
					
						
							| 
									
										
										
										
											2008-05-07 02:38:34 -04:00
										 |  |  |             [ dup "lsz" last-is? [ but-last-slice ] unless ] | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |         } | 
					
						
							|  |  |  |         { | 
					
						
							|  |  |  |             [ t ] | 
					
						
							|  |  |  |             [ | 
					
						
							|  |  |  |                 dup consonant-seq 1 = over cvc? and
 | 
					
						
							|  |  |  |                 [ "e" append ] when
 | 
					
						
							|  |  |  |             ] | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } cond ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : step1b ( str -- newstr )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { [ "eed" ?tail ] [ -eed ] } | 
					
						
							|  |  |  |         { | 
					
						
							|  |  |  |             [ | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     { [ "ed" ?tail ] [ -ed ] } | 
					
						
							|  |  |  |                     { [ "ing" ?tail ] [ -ing ] } | 
					
						
							| 
									
										
										
										
											2008-04-11 13:56:48 -04:00
										 |  |  |                     [ f ] | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |                 } cond
 | 
					
						
							|  |  |  |             ] [ -ed/ing ] | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2008-04-11 13:56:48 -04:00
										 |  |  |         [ ] | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |     } cond ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : step1c ( str -- newstr )
 | 
					
						
							| 
									
										
										
										
											2008-05-07 02:38:34 -04:00
										 |  |  |     dup but-last-slice stem-vowel? [ | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |         "y" ?tail [ "i" append ] when
 | 
					
						
							|  |  |  |     ] when ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : step2 ( str -- newstr )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { [ "ational" ?tail ] [ "ational" "ate"  r ] } | 
					
						
							|  |  |  |         { [ "tional"  ?tail ] [ "tional"  "tion" r ] } | 
					
						
							|  |  |  |         { [ "enci"    ?tail ] [ "enci"    "ence" r ] } | 
					
						
							|  |  |  |         { [ "anci"    ?tail ] [ "anci"    "ance" r ] } | 
					
						
							|  |  |  |         { [ "izer"    ?tail ] [ "izer"    "ize"  r ] } | 
					
						
							|  |  |  |         { [ "bli"     ?tail ] [ "bli"     "ble"  r ] } | 
					
						
							|  |  |  |         { [ "alli"    ?tail ] [ "alli"    "al"   r ] } | 
					
						
							|  |  |  |         { [ "entli"   ?tail ] [ "entli"   "ent"  r ] } | 
					
						
							|  |  |  |         { [ "eli"     ?tail ] [ "eli"     "e"    r ] } | 
					
						
							|  |  |  |         { [ "ousli"   ?tail ] [ "ousli"   "ous"  r ] } | 
					
						
							|  |  |  |         { [ "ization" ?tail ] [ "ization" "ize"  r ] } | 
					
						
							|  |  |  |         { [ "ation"   ?tail ] [ "ation"   "ate"  r ] } | 
					
						
							|  |  |  |         { [ "ator"    ?tail ] [ "ator"    "ate"  r ] } | 
					
						
							|  |  |  |         { [ "alism"   ?tail ] [ "alism"   "al"   r ] } | 
					
						
							|  |  |  |         { [ "iveness" ?tail ] [ "iveness" "ive"  r ] } | 
					
						
							|  |  |  |         { [ "fulness" ?tail ] [ "fulness" "ful"  r ] } | 
					
						
							|  |  |  |         { [ "ousness" ?tail ] [ "ousness" "ous"  r ] } | 
					
						
							|  |  |  |         { [ "aliti"   ?tail ] [ "aliti"   "al"   r ] } | 
					
						
							|  |  |  |         { [ "iviti"   ?tail ] [ "iviti"   "ive"  r ] } | 
					
						
							|  |  |  |         { [ "biliti"  ?tail ] [ "biliti"  "ble"  r ] } | 
					
						
							|  |  |  |         { [ "logi"    ?tail ] [ "logi"    "log"  r ] } | 
					
						
							| 
									
										
										
										
											2008-04-11 13:56:48 -04:00
										 |  |  |         [ ] | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |     } cond ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : step3 ( str -- newstr )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { [ "icate" ?tail ] [ "icate" "ic" r ] } | 
					
						
							|  |  |  |         { [ "ative" ?tail ] [ "ative" ""   r ] } | 
					
						
							|  |  |  |         { [ "alize" ?tail ] [ "alize" "al" r ] } | 
					
						
							|  |  |  |         { [ "iciti" ?tail ] [ "iciti" "ic" r ] } | 
					
						
							|  |  |  |         { [ "ical"  ?tail ] [ "ical"  "ic" r ] } | 
					
						
							|  |  |  |         { [ "ful"   ?tail ] [ "ful"   ""   r ] } | 
					
						
							|  |  |  |         { [ "ness"  ?tail ] [ "ness"  ""   r ] } | 
					
						
							| 
									
										
										
										
											2008-04-11 13:56:48 -04:00
										 |  |  |         [ ] | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |     } cond ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : -ion ( str -- newstr )
 | 
					
						
							| 
									
										
										
										
											2008-09-06 18:15:25 -04:00
										 |  |  |     [ | 
					
						
							|  |  |  |         "ion" | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |     ] [ | 
					
						
							|  |  |  |         dup "st" last-is? [ "ion" append ] unless
 | 
					
						
							| 
									
										
										
										
											2008-09-06 18:15:25 -04:00
										 |  |  |     ] if-empty ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : step4 ( str -- newstr )
 | 
					
						
							|  |  |  |     dup { | 
					
						
							|  |  |  |         { [ "al"    ?tail ] [ ] } | 
					
						
							|  |  |  |         { [ "ance"  ?tail ] [ ] } | 
					
						
							|  |  |  |         { [ "ence"  ?tail ] [ ] } | 
					
						
							|  |  |  |         { [ "er"    ?tail ] [ ] } | 
					
						
							|  |  |  |         { [ "ic"    ?tail ] [ ] } | 
					
						
							|  |  |  |         { [ "able"  ?tail ] [ ] } | 
					
						
							|  |  |  |         { [ "ible"  ?tail ] [ ] } | 
					
						
							|  |  |  |         { [ "ant"   ?tail ] [ ] } | 
					
						
							|  |  |  |         { [ "ement" ?tail ] [ ] } | 
					
						
							|  |  |  |         { [ "ment"  ?tail ] [ ] } | 
					
						
							|  |  |  |         { [ "ent"   ?tail ] [ ] } | 
					
						
							|  |  |  |         { [ "ion"   ?tail ] [ -ion ] } | 
					
						
							|  |  |  |         { [ "ou"    ?tail ] [ ] } | 
					
						
							|  |  |  |         { [ "ism"   ?tail ] [ ] } | 
					
						
							|  |  |  |         { [ "ate"   ?tail ] [ ] } | 
					
						
							|  |  |  |         { [ "iti"   ?tail ] [ ] } | 
					
						
							|  |  |  |         { [ "ous"   ?tail ] [ ] } | 
					
						
							|  |  |  |         { [ "ive"   ?tail ] [ ] } | 
					
						
							|  |  |  |         { [ "ize"   ?tail ] [ ] } | 
					
						
							| 
									
										
										
										
											2008-04-11 13:56:48 -04:00
										 |  |  |         [ ] | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |     } cond dup consonant-seq 1 > [ nip ] [ drop ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : remove-e? ( str -- ? )
 | 
					
						
							|  |  |  |     dup consonant-seq dup 1 >
 | 
					
						
							|  |  |  |     [ 2drop t ] | 
					
						
							| 
									
										
										
										
											2008-05-07 02:38:34 -04:00
										 |  |  |     [ 1 = [ but-last-slice cvc? not ] [ drop f ] if ] if ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : remove-e ( str -- newstr )
 | 
					
						
							| 
									
										
										
										
											2009-05-25 17:38:33 -04:00
										 |  |  |     dup last CHAR: e = [ | 
					
						
							| 
									
										
										
										
											2008-05-07 02:38:34 -04:00
										 |  |  |         dup remove-e? [ but-last-slice ] when
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |     ] when ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : ll->l ( str -- newstr )
 | 
					
						
							|  |  |  |     { | 
					
						
							| 
									
										
										
										
											2009-05-25 17:38:33 -04:00
										 |  |  |         { [ dup last CHAR: l = not ] [ ] } | 
					
						
							| 
									
										
										
										
											2009-08-13 20:21:44 -04:00
										 |  |  |         { [ dup length 1 - over double-consonant? not ] [ ] } | 
					
						
							| 
									
										
										
										
											2008-05-07 02:38:34 -04:00
										 |  |  |         { [ dup consonant-seq 1 > ] [ but-last-slice ] } | 
					
						
							| 
									
										
										
										
											2008-04-11 13:56:48 -04:00
										 |  |  |         [ ] | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |     } cond ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : step5 ( str -- newstr ) remove-e ll->l ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : stem ( str -- newstr )
 | 
					
						
							|  |  |  |     dup length 2 <= [ | 
					
						
							|  |  |  |         step1a step1b step1c step2 step3 step4 step5 "" like
 | 
					
						
							|  |  |  |     ] unless ;
 |