Add: Boyer-Moore string search algorithm

db4
Dmitry Shubin 2010-04-16 03:49:55 +04:00
parent 74075511c2
commit 38ef5919e8
12 changed files with 250 additions and 0 deletions

View File

@ -0,0 +1 @@
Dmitry Shubin

View File

@ -0,0 +1,59 @@
! Copyright (C) 2010 Dmitry Shubin.
! See http://factorcode.org/license.txt for BSD license.
USING: help.markup help.syntax kernel sequences ;
IN: boyer-moore
HELP: <boyer-moore>
{ $values
{ "pat" sequence } { "bm" object }
}
{ $description
"Given a pattern performs pattern preprocessing and returns "
"results as an (opaque) object that is reusable across "
"searches in different sequences via " { $link search-from }
" generic word."
} ;
HELP: search-from
{ $values
{ "seq" sequence }
{ "from" "a non-negative integer" }
{ "obj" object }
{ "i/f" "the index of first match or " { $link f } }
}
{ $description "Performs an attempt to find the first "
"occurence of pattern in " { $snippet "seq" }
" starting from " { $snippet "from" } " using "
"Boyer-Moore search algorithm. Output is the index "
"if the attempt was succeessful and " { $link f }
" otherwise."
} ;
HELP: search
{ $values
{ "seq" sequence }
{ "obj" object }
{ "i/f" "the index of first match or " { $link f } }
}
{ $description "A simpler variant of " { $link search-from }
" that starts searching from the beginning of the sequence."
} ;
ARTICLE: "boyer-moore" "The Boyer-Moore algorithm"
{ $heading "Summary" }
"The " { $vocab-link "boyer-moore" } " vocabulary "
"implements a Boyer-Moore string search algorithm with "
"so-called 'strong good suffix shift rule'. Since agorithm is "
"alphabet-independent it is applicable to searching in any "
"collection that implements " { $links "sequence-protocol" } "."
{ $heading "Complexity" }
"Let " { $snippet "n" } " and " { $snippet "m" } " be lengths "
"of the sequences being searched " { $emphasis "in" } " and "
{ $emphasis "for" } " respectively. Then searching runs in "
{ $snippet "O(n)" } " time in it's worst case using additional "
{ $snippet "O(m)" } " space. Preprocessing phase runs in "
{ $snippet "O(m)" } " time."
;
ABOUT: "boyer-moore"

View File

@ -0,0 +1,10 @@
! Copyright (C) 2010 Dmitry Shubin.
! See http://factorcode.org/license.txt for BSD license.
USING: tools.test boyer-moore ;
IN: boyer-moore.tests
[ 0 ] [ "qwerty" "" search ] unit-test
[ 0 ] [ "" "" search ] unit-test
[ f ] [ "qw" "qwerty" search ] unit-test
[ 3 ] [ "qwerty" "r" search ] unit-test
[ 8 ] [ "qwerasdfqwer" 2 "qwe" search-from ] unit-test

View File

@ -0,0 +1,76 @@
! Copyright (C) 2010 Dmitry Shubin.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs kernel locals math math.order
math.ranges sequences sequences.private z-algorithm ;
IN: boyer-moore
<PRIVATE
:: (normal-suffixes) ( i zs ss -- )
i zs nth-unsafe ss
[ [ i ] unless* ] change-nth-unsafe ; inline
: normal-suffixes ( zs -- ss )
[ length [ f <array> ] [ [1,b) ] bi ] keep pick
[ (normal-suffixes) ] 2curry each ; inline
:: (partial-suffixes) ( len old elt i -- len old/new old )
len elt i 1 + = [ len elt - ] [ old ] if old ; inline
: partial-suffixes ( zs -- ss )
[ length dup ] [ <reversed> ] bi
[ (partial-suffixes) ] map-index 2nip ; inline
: <gs-table> ( seq -- table )
z-values [ partial-suffixes ] [ normal-suffixes ] bi
[ [ nip ] when* ] 2map reverse! ; inline
: insert-bc-shift ( table elt len i -- table )
1 + swap - swap pick 2dup key?
[ 3drop ] [ set-at ] if ; inline
: <bc-table> ( seq -- table )
H{ } clone swap [ length ] keep
[ insert-bc-shift ] with each-index ; inline
TUPLE: boyer-moore pattern bc-table gs-table ;
: gs-shift ( i c bm -- s ) nip gs-table>> nth-unsafe ; inline
: bc-shift ( i c bm -- s ) bc-table>> at dup 1 ? + ; inline
: do-shift ( pos i c bm -- newpos )
[ gs-shift ] [ bc-shift ] bi-curry 2bi max + ; inline
: match? ( i1 s1 i2 s2 -- ? ) [ nth-unsafe ] 2bi@ = ; inline
:: mismatch? ( s1 s2 pos len -- i/f )
len 1 - [ [ pos + s1 ] keep s2 match? not ]
find-last-integer ; inline
:: (search-from) ( seq from bm -- i/f )
bm pattern>> :> pat
pat length :> plen
seq length plen - :> lim
from
[ dup lim <=
[ seq pat pick plen mismatch?
[ 2dup + seq nth-unsafe bm do-shift t ] [ f ] if*
] [ drop f f ] if
] loop ; inline
PRIVATE>
: <boyer-moore> ( pat -- bm )
dup <reversed> [ <bc-table> ] [ <gs-table> ] bi
boyer-moore boa ;
GENERIC: search-from ( seq from obj -- i/f )
M: sequence search-from
dup length zero?
[ 3drop 0 ] [ <boyer-moore> (search-from) ] if ;
M: boyer-moore search-from (search-from) ;
: search ( seq obj -- i/f ) [ 0 ] dip search-from ;

View File

@ -0,0 +1 @@
Boyer-Moore string search algorithm

View File

@ -0,0 +1 @@
algorithms

View File

@ -0,0 +1 @@
Dmitry Shubin

View File

@ -0,0 +1 @@
Z algorithm for pattern preprocessing

View File

@ -0,0 +1 @@
algorithms

View File

@ -0,0 +1,49 @@
! Copyright (C) 2010 Dmitry Shubin.
! See http://factorcode.org/license.txt for BSD license.
USING: arrays help.markup help.syntax sequences ;
IN: z-algorithm
HELP: lcp
{ $values
{ "seq1" sequence } { "seq2" sequence }
{ "n" "a non-negative integer" }
}
{ $description
"Outputs the length of longest common prefix of two sequences."
} ;
HELP: z-values
{ $values
{ "seq" sequence } { "Z" array }
}
{ $description
"Outputs an array of the same length as " { $snippet "seq" }
", containing Z-values for given sequence. See "
{ $link "z-algorithm" } " for details."
} ;
ARTICLE: "z-algorithm" "Z algorithm"
{ $heading "Definition" }
"Given the sequence " { $snippet "S" } " and the index "
{ $snippet "i" } ", let " { $snippet "i" } "-th Z value of "
{ $snippet "S" } " be the length of the longest subsequence of "
{ $snippet "S" } " that starts at " { $snippet "i" }
" and matches the prefix of " { $snippet "S" } "."
{ $heading "Example" }
"Here is an example for string " { $snippet "\"abababaca\"" } ":"
{ $table
{ { $snippet "i:" } "0" "1" "2" "3" "4" "5" "6" "7" "8" }
{ { $snippet "S:" } "a" "b" "a" "b" "a" "b" "a" "c" "a" }
{ { $snippet "Z:" } "9" "0" "5" "0" "3" "0" "1" "0" "1" }
}
{ $heading "Summary" }
"The " { $vocab-link "z-algorithm" }
" vocabulary implements algorithm for finding all Z values for sequence "
{ $snippet "S" }
" in linear time. In contrast to naive approach which takes "
{ $snippet "Θ(n^2)" } " time."
;
ABOUT: "z-algorithm"

View File

@ -0,0 +1,13 @@
! Copyright (C) 2010 Dmitry Shubin.
! See http://factorcode.org/license.txt for BSD license.
USING: tools.test z-algorithm ;
IN: z-algorithm.tests
[ 0 ] [ "qwerty" "" lcp ] unit-test
[ 0 ] [ "qwerty" "asdf" lcp ] unit-test
[ 3 ] [ "qwerty" "qwe" lcp ] unit-test
[ 3 ] [ "qwerty" "qwet" lcp ] unit-test
[ { } ] [ "" z-values ] unit-test
[ { 1 } ] [ "q" z-values ] unit-test
[ { 9 0 5 0 3 0 1 0 1 } ] [ "abababaca" z-values ] unit-test

View File

@ -0,0 +1,37 @@
! Copyright (C) 2010 Dmitry Shubin.
! See http://factorcode.org/license.txt for BSD license.
USING: arrays combinators.smart kernel locals math math.ranges
sequences sequences.private ;
IN: z-algorithm
: lcp ( seq1 seq2 -- n )
[ min-length ] 2keep mismatch [ nip ] when* ;
<PRIVATE
:: out-of-zbox ( seq Z l r k -- seq Z l r )
seq k tail-slice seq lcp :> Zk
Zk Z push seq Z
Zk 0 > [ k Zk k + 1 - ] [ l r ] if ; inline
:: inside-zbox ( seq Z l r k -- seq Z l r )
k l - Z nth :> Zk'
r k - 1 + :> b
seq Z Zk' b <
[ Zk' Z push l r ] ! still inside
[ seq r 1 + seq b [ tail-slice ] 2bi@ lcp :> q
q b + Z push k q r +
] if ; inline
: (z-value) ( seq Z l r k -- seq Z l r )
2dup < [ out-of-zbox ] [ inside-zbox ] if ; inline
:: (z-values) ( seq -- Z )
V{ } clone 0 0 seq length :> ( Z l r len )
len Z push [ seq Z l r 1 len [a,b) [ (z-value) ] each ]
drop-outputs Z ; inline
PRIVATE>
: z-values ( seq -- Z )
dup length 0 > [ (z-values) ] when >array ;