220 lines
6.0 KiB
Factor
220 lines
6.0 KiB
Factor
USING: combinators kernel math parser sequences splitting ;
|
|
IN: porter-stemmer
|
|
|
|
: consonant? ( i str -- ? )
|
|
2dup nth dup "aeiou" member? [
|
|
3drop f
|
|
] [
|
|
CHAR: y = [
|
|
over zero?
|
|
[ 2drop t ] [ [ 1 - ] dip consonant? not ] if
|
|
] [
|
|
2drop t
|
|
] if
|
|
] if ;
|
|
|
|
: skip-vowels ( i str -- i str )
|
|
2dup bounds-check? [
|
|
2dup consonant? [ [ 1 + ] dip skip-vowels ] unless
|
|
] when ;
|
|
|
|
: skip-consonants ( i str -- i str )
|
|
2dup bounds-check? [
|
|
2dup consonant? [ [ 1 + ] dip skip-consonants ] when
|
|
] when ;
|
|
|
|
: (consonant-seq) ( n i str -- n )
|
|
skip-vowels
|
|
2dup bounds-check? [
|
|
[ 1 + ] [ 1 + ] [ ] tri* skip-consonants [ 1 + ] dip
|
|
(consonant-seq)
|
|
] [
|
|
2drop
|
|
] if ;
|
|
|
|
: consonant-seq ( str -- n )
|
|
0 0 rot skip-consonants (consonant-seq) ;
|
|
|
|
: stem-vowel? ( str -- ? )
|
|
[ length ] keep [ consonant? ] curry all? not ;
|
|
|
|
: double-consonant? ( i str -- ? )
|
|
over 1 < [
|
|
2drop f
|
|
] [
|
|
2dup nth [ over 1 - over nth ] dip = [
|
|
consonant?
|
|
] [
|
|
2drop f
|
|
] if
|
|
] if ;
|
|
|
|
: consonant-end? ( n seq -- ? )
|
|
[ length swap - ] keep consonant? ;
|
|
|
|
: last-is? ( str possibilities -- ? ) [ last ] dip member? ;
|
|
|
|
: cvc? ( str -- ? )
|
|
{
|
|
{ [ dup length 3 < ] [ drop f ] }
|
|
{ [ 1 over consonant-end? not ] [ drop f ] }
|
|
{ [ 2 over consonant-end? ] [ drop f ] }
|
|
{ [ 3 over consonant-end? not ] [ drop f ] }
|
|
[ "wxy" last-is? not ]
|
|
} cond ;
|
|
|
|
: r ( str oldsuffix newsuffix -- str )
|
|
pick consonant-seq 0 > [ nip ] [ drop ] if append ;
|
|
|
|
: step1a ( str -- newstr )
|
|
dup last CHAR: s = [
|
|
{
|
|
{ [ "sses" ?tail ] [ "ss" append ] }
|
|
{ [ "ies" ?tail ] [ "i" append ] }
|
|
{ [ dup "ss" tail? ] [ ] }
|
|
{ [ "s" ?tail ] [ ] }
|
|
[ ]
|
|
} cond
|
|
] when ;
|
|
|
|
: -eed ( str -- str )
|
|
dup consonant-seq 0 > "ee" "eed" ? append ;
|
|
|
|
: -ed ( str -- str ? )
|
|
dup stem-vowel? [ [ "ed" append ] unless ] keep ;
|
|
|
|
: -ing ( str -- str ? )
|
|
dup stem-vowel? [ [ "ing" append ] unless ] keep ;
|
|
|
|
: -ed/ing ( str -- str )
|
|
{
|
|
{ [ "at" ?tail ] [ "ate" append ] }
|
|
{ [ "bl" ?tail ] [ "ble" append ] }
|
|
{ [ "iz" ?tail ] [ "ize" append ] }
|
|
{
|
|
[ dup length 1 - over double-consonant? ]
|
|
[ dup "lsz" last-is? [ but-last-slice ] unless ]
|
|
}
|
|
{
|
|
[ t ]
|
|
[
|
|
dup consonant-seq 1 = over cvc? and
|
|
[ "e" append ] when
|
|
]
|
|
}
|
|
} cond ;
|
|
|
|
: step1b ( str -- newstr )
|
|
{
|
|
{ [ "eed" ?tail ] [ -eed ] }
|
|
{
|
|
[
|
|
{
|
|
{ [ "ed" ?tail ] [ -ed ] }
|
|
{ [ "ing" ?tail ] [ -ing ] }
|
|
[ f ]
|
|
} cond
|
|
] [ -ed/ing ]
|
|
}
|
|
[ ]
|
|
} cond ;
|
|
|
|
: step1c ( str -- newstr )
|
|
dup but-last-slice stem-vowel? [
|
|
"y" ?tail [ "i" append ] when
|
|
] when ;
|
|
|
|
: step2 ( str -- newstr )
|
|
{
|
|
{ [ "ational" ?tail ] [ "ational" "ate" r ] }
|
|
{ [ "tional" ?tail ] [ "tional" "tion" r ] }
|
|
{ [ "enci" ?tail ] [ "enci" "ence" r ] }
|
|
{ [ "anci" ?tail ] [ "anci" "ance" r ] }
|
|
{ [ "izer" ?tail ] [ "izer" "ize" r ] }
|
|
{ [ "bli" ?tail ] [ "bli" "ble" r ] }
|
|
{ [ "alli" ?tail ] [ "alli" "al" r ] }
|
|
{ [ "entli" ?tail ] [ "entli" "ent" r ] }
|
|
{ [ "eli" ?tail ] [ "eli" "e" r ] }
|
|
{ [ "ousli" ?tail ] [ "ousli" "ous" r ] }
|
|
{ [ "ization" ?tail ] [ "ization" "ize" r ] }
|
|
{ [ "ation" ?tail ] [ "ation" "ate" r ] }
|
|
{ [ "ator" ?tail ] [ "ator" "ate" r ] }
|
|
{ [ "alism" ?tail ] [ "alism" "al" r ] }
|
|
{ [ "iveness" ?tail ] [ "iveness" "ive" r ] }
|
|
{ [ "fulness" ?tail ] [ "fulness" "ful" r ] }
|
|
{ [ "ousness" ?tail ] [ "ousness" "ous" r ] }
|
|
{ [ "aliti" ?tail ] [ "aliti" "al" r ] }
|
|
{ [ "iviti" ?tail ] [ "iviti" "ive" r ] }
|
|
{ [ "biliti" ?tail ] [ "biliti" "ble" r ] }
|
|
{ [ "logi" ?tail ] [ "logi" "log" r ] }
|
|
[ ]
|
|
} cond ;
|
|
|
|
: step3 ( str -- newstr )
|
|
{
|
|
{ [ "icate" ?tail ] [ "icate" "ic" r ] }
|
|
{ [ "ative" ?tail ] [ "ative" "" r ] }
|
|
{ [ "alize" ?tail ] [ "alize" "al" r ] }
|
|
{ [ "iciti" ?tail ] [ "iciti" "ic" r ] }
|
|
{ [ "ical" ?tail ] [ "ical" "ic" r ] }
|
|
{ [ "ful" ?tail ] [ "ful" "" r ] }
|
|
{ [ "ness" ?tail ] [ "ness" "" r ] }
|
|
[ ]
|
|
} cond ;
|
|
|
|
: -ion ( str -- newstr )
|
|
[
|
|
"ion"
|
|
] [
|
|
dup "st" last-is? [ "ion" append ] unless
|
|
] if-empty ;
|
|
|
|
: step4 ( str -- newstr )
|
|
dup {
|
|
{ [ "al" ?tail ] [ ] }
|
|
{ [ "ance" ?tail ] [ ] }
|
|
{ [ "ence" ?tail ] [ ] }
|
|
{ [ "er" ?tail ] [ ] }
|
|
{ [ "ic" ?tail ] [ ] }
|
|
{ [ "able" ?tail ] [ ] }
|
|
{ [ "ible" ?tail ] [ ] }
|
|
{ [ "ant" ?tail ] [ ] }
|
|
{ [ "ement" ?tail ] [ ] }
|
|
{ [ "ment" ?tail ] [ ] }
|
|
{ [ "ent" ?tail ] [ ] }
|
|
{ [ "ion" ?tail ] [ -ion ] }
|
|
{ [ "ou" ?tail ] [ ] }
|
|
{ [ "ism" ?tail ] [ ] }
|
|
{ [ "ate" ?tail ] [ ] }
|
|
{ [ "iti" ?tail ] [ ] }
|
|
{ [ "ous" ?tail ] [ ] }
|
|
{ [ "ive" ?tail ] [ ] }
|
|
{ [ "ize" ?tail ] [ ] }
|
|
[ ]
|
|
} cond dup consonant-seq 1 > [ nip ] [ drop ] if ;
|
|
|
|
: remove-e? ( str -- ? )
|
|
dup consonant-seq dup 1 >
|
|
[ 2drop t ]
|
|
[ 1 = [ but-last-slice cvc? not ] [ drop f ] if ] if ;
|
|
|
|
: remove-e ( str -- newstr )
|
|
dup last CHAR: e = [
|
|
dup remove-e? [ but-last-slice ] when
|
|
] when ;
|
|
|
|
: ll->l ( str -- newstr )
|
|
{
|
|
{ [ dup last CHAR: l = not ] [ ] }
|
|
{ [ dup length 1 - over double-consonant? not ] [ ] }
|
|
{ [ dup consonant-seq 1 > ] [ but-last-slice ] }
|
|
[ ]
|
|
} cond ;
|
|
|
|
: step5 ( str -- newstr ) remove-e ll->l ;
|
|
|
|
: stem ( str -- newstr )
|
|
dup length 2 <= [
|
|
step1a step1b step1c step2 step3 step4 step5 "" like
|
|
] unless ;
|