factor/library/help/search.factor

61 lines
1.8 KiB
Factor
Raw Normal View History

2006-06-11 23:38:39 -04:00
! Copyright (C) 2006 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license.
IN: help
USING: arrays graphs hashtables help io kernel math namespaces
porter-stemmer prettyprint sequences strings ;
! Right now this code is specific to the help. It will be
! generalized to an abstract full text search engine later.
2006-06-11 23:38:39 -04:00
: ignored-word? ( str -- ? )
{ "the" "of" "is" "to" "an" "and" "if" "in" "with" "this" "not" "are" "for" "by" "can" "be" "or" "from" "it" "does" "as" } member? ;
: tokenize ( string -- seq )
[ dup letter? swap LETTER? or not ] split*
[ >lower stem ] map
[
dup ignored-word? over length 1 = or swap empty? or not
] subset ;
: index-text ( score article string -- )
tokenize [ >r 2dup r> nest hash+ ] each 2drop ;
: index-article-title ( article -- )
2006-06-16 23:12:40 -04:00
3 swap dup article-title index-text ;
: index-article-content ( article -- )
1 swap dup [ help ] string-out index-text ;
: index-article ( article -- )
dup index-article-title index-article-content ;
SYMBOL: term-index
: discard-irrelevant ( results -- results )
#! Discard results in the low 33%
dup 0 [ second max ] reduce
swap [ first2 rot / 2array ] map-with
[ second 1/3 > ] subset ;
2006-06-11 23:38:39 -04:00
: count-occurrences ( seq -- hash )
[
2006-06-17 01:03:56 -04:00
dup [ [ drop off ] hash-each ] each
[ [ swap +@ ] hash-each ] each
2006-06-11 23:38:39 -04:00
] make-hash ;
: search-help ( phrase -- assoc )
tokenize [ term-index get hash ] map [ ] subset
2006-06-11 23:38:39 -04:00
count-occurrences hash>alist
[ first2 2array ] map
[ [ second ] 2apply swap - ] sort discard-irrelevant ;
2006-06-11 23:38:39 -04:00
: index-help ( -- )
[ all-articles [ index-article ] each ] make-hash
term-index set-global ;
2006-06-11 23:38:39 -04:00
: search-help. ( phrase -- )
"Search results for ``" write dup write "'':" print
search-help [
first <link> [ article-title ] keep write-object terpri
] each ;