Add: Boyer-Moore string search algorithm
parent
74075511c2
commit
38ef5919e8
|
@ -0,0 +1 @@
|
||||||
|
Dmitry Shubin
|
|
@ -0,0 +1,59 @@
|
||||||
|
! Copyright (C) 2010 Dmitry Shubin.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: help.markup help.syntax kernel sequences ;
|
||||||
|
IN: boyer-moore
|
||||||
|
|
||||||
|
HELP: <boyer-moore>
|
||||||
|
{ $values
|
||||||
|
{ "pat" sequence } { "bm" object }
|
||||||
|
}
|
||||||
|
{ $description
|
||||||
|
"Given a pattern performs pattern preprocessing and returns "
|
||||||
|
"results as an (opaque) object that is reusable across "
|
||||||
|
"searches in different sequences via " { $link search-from }
|
||||||
|
" generic word."
|
||||||
|
} ;
|
||||||
|
|
||||||
|
HELP: search-from
|
||||||
|
{ $values
|
||||||
|
{ "seq" sequence }
|
||||||
|
{ "from" "a non-negative integer" }
|
||||||
|
{ "obj" object }
|
||||||
|
{ "i/f" "the index of first match or " { $link f } }
|
||||||
|
}
|
||||||
|
{ $description "Performs an attempt to find the first "
|
||||||
|
"occurence of pattern in " { $snippet "seq" }
|
||||||
|
" starting from " { $snippet "from" } " using "
|
||||||
|
"Boyer-Moore search algorithm. Output is the index "
|
||||||
|
"if the attempt was succeessful and " { $link f }
|
||||||
|
" otherwise."
|
||||||
|
} ;
|
||||||
|
|
||||||
|
HELP: search
|
||||||
|
{ $values
|
||||||
|
{ "seq" sequence }
|
||||||
|
{ "obj" object }
|
||||||
|
{ "i/f" "the index of first match or " { $link f } }
|
||||||
|
}
|
||||||
|
{ $description "A simpler variant of " { $link search-from }
|
||||||
|
" that starts searching from the beginning of the sequence."
|
||||||
|
} ;
|
||||||
|
|
||||||
|
ARTICLE: "boyer-moore" "The Boyer-Moore algorithm"
|
||||||
|
{ $heading "Summary" }
|
||||||
|
"The " { $vocab-link "boyer-moore" } " vocabulary "
|
||||||
|
"implements a Boyer-Moore string search algorithm with "
|
||||||
|
"so-called 'strong good suffix shift rule'. Since agorithm is "
|
||||||
|
"alphabet-independent it is applicable to searching in any "
|
||||||
|
"collection that implements " { $links "sequence-protocol" } "."
|
||||||
|
|
||||||
|
{ $heading "Complexity" }
|
||||||
|
"Let " { $snippet "n" } " and " { $snippet "m" } " be lengths "
|
||||||
|
"of the sequences being searched " { $emphasis "in" } " and "
|
||||||
|
{ $emphasis "for" } " respectively. Then searching runs in "
|
||||||
|
{ $snippet "O(n)" } " time in it's worst case using additional "
|
||||||
|
{ $snippet "O(m)" } " space. Preprocessing phase runs in "
|
||||||
|
{ $snippet "O(m)" } " time."
|
||||||
|
;
|
||||||
|
|
||||||
|
ABOUT: "boyer-moore"
|
|
@ -0,0 +1,10 @@
|
||||||
|
! Copyright (C) 2010 Dmitry Shubin.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: tools.test boyer-moore ;
|
||||||
|
IN: boyer-moore.tests
|
||||||
|
|
||||||
|
[ 0 ] [ "qwerty" "" search ] unit-test
|
||||||
|
[ 0 ] [ "" "" search ] unit-test
|
||||||
|
[ f ] [ "qw" "qwerty" search ] unit-test
|
||||||
|
[ 3 ] [ "qwerty" "r" search ] unit-test
|
||||||
|
[ 8 ] [ "qwerasdfqwer" 2 "qwe" search-from ] unit-test
|
|
@ -0,0 +1,76 @@
|
||||||
|
! Copyright (C) 2010 Dmitry Shubin.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: accessors arrays assocs kernel locals math math.order
|
||||||
|
math.ranges sequences sequences.private z-algorithm ;
|
||||||
|
IN: boyer-moore
|
||||||
|
|
||||||
|
<PRIVATE
|
||||||
|
|
||||||
|
:: (normal-suffixes) ( i zs ss -- )
|
||||||
|
i zs nth-unsafe ss
|
||||||
|
[ [ i ] unless* ] change-nth-unsafe ; inline
|
||||||
|
|
||||||
|
: normal-suffixes ( zs -- ss )
|
||||||
|
[ length [ f <array> ] [ [1,b) ] bi ] keep pick
|
||||||
|
[ (normal-suffixes) ] 2curry each ; inline
|
||||||
|
|
||||||
|
:: (partial-suffixes) ( len old elt i -- len old/new old )
|
||||||
|
len elt i 1 + = [ len elt - ] [ old ] if old ; inline
|
||||||
|
|
||||||
|
: partial-suffixes ( zs -- ss )
|
||||||
|
[ length dup ] [ <reversed> ] bi
|
||||||
|
[ (partial-suffixes) ] map-index 2nip ; inline
|
||||||
|
|
||||||
|
: <gs-table> ( seq -- table )
|
||||||
|
z-values [ partial-suffixes ] [ normal-suffixes ] bi
|
||||||
|
[ [ nip ] when* ] 2map reverse! ; inline
|
||||||
|
|
||||||
|
: insert-bc-shift ( table elt len i -- table )
|
||||||
|
1 + swap - swap pick 2dup key?
|
||||||
|
[ 3drop ] [ set-at ] if ; inline
|
||||||
|
|
||||||
|
: <bc-table> ( seq -- table )
|
||||||
|
H{ } clone swap [ length ] keep
|
||||||
|
[ insert-bc-shift ] with each-index ; inline
|
||||||
|
|
||||||
|
TUPLE: boyer-moore pattern bc-table gs-table ;
|
||||||
|
|
||||||
|
: gs-shift ( i c bm -- s ) nip gs-table>> nth-unsafe ; inline
|
||||||
|
|
||||||
|
: bc-shift ( i c bm -- s ) bc-table>> at dup 1 ? + ; inline
|
||||||
|
|
||||||
|
: do-shift ( pos i c bm -- newpos )
|
||||||
|
[ gs-shift ] [ bc-shift ] bi-curry 2bi max + ; inline
|
||||||
|
|
||||||
|
: match? ( i1 s1 i2 s2 -- ? ) [ nth-unsafe ] 2bi@ = ; inline
|
||||||
|
|
||||||
|
:: mismatch? ( s1 s2 pos len -- i/f )
|
||||||
|
len 1 - [ [ pos + s1 ] keep s2 match? not ]
|
||||||
|
find-last-integer ; inline
|
||||||
|
|
||||||
|
:: (search-from) ( seq from bm -- i/f )
|
||||||
|
bm pattern>> :> pat
|
||||||
|
pat length :> plen
|
||||||
|
seq length plen - :> lim
|
||||||
|
from
|
||||||
|
[ dup lim <=
|
||||||
|
[ seq pat pick plen mismatch?
|
||||||
|
[ 2dup + seq nth-unsafe bm do-shift t ] [ f ] if*
|
||||||
|
] [ drop f f ] if
|
||||||
|
] loop ; inline
|
||||||
|
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
|
: <boyer-moore> ( pat -- bm )
|
||||||
|
dup <reversed> [ <bc-table> ] [ <gs-table> ] bi
|
||||||
|
boyer-moore boa ;
|
||||||
|
|
||||||
|
GENERIC: search-from ( seq from obj -- i/f )
|
||||||
|
|
||||||
|
M: sequence search-from
|
||||||
|
dup length zero?
|
||||||
|
[ 3drop 0 ] [ <boyer-moore> (search-from) ] if ;
|
||||||
|
|
||||||
|
M: boyer-moore search-from (search-from) ;
|
||||||
|
|
||||||
|
: search ( seq obj -- i/f ) [ 0 ] dip search-from ;
|
|
@ -0,0 +1 @@
|
||||||
|
Boyer-Moore string search algorithm
|
|
@ -0,0 +1 @@
|
||||||
|
algorithms
|
|
@ -0,0 +1 @@
|
||||||
|
Dmitry Shubin
|
|
@ -0,0 +1 @@
|
||||||
|
Z algorithm for pattern preprocessing
|
|
@ -0,0 +1 @@
|
||||||
|
algorithms
|
|
@ -0,0 +1,49 @@
|
||||||
|
! Copyright (C) 2010 Dmitry Shubin.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: arrays help.markup help.syntax sequences ;
|
||||||
|
IN: z-algorithm
|
||||||
|
|
||||||
|
HELP: lcp
|
||||||
|
{ $values
|
||||||
|
{ "seq1" sequence } { "seq2" sequence }
|
||||||
|
{ "n" "a non-negative integer" }
|
||||||
|
}
|
||||||
|
{ $description
|
||||||
|
"Outputs the length of longest common prefix of two sequences."
|
||||||
|
} ;
|
||||||
|
|
||||||
|
HELP: z-values
|
||||||
|
{ $values
|
||||||
|
{ "seq" sequence } { "Z" array }
|
||||||
|
}
|
||||||
|
{ $description
|
||||||
|
"Outputs an array of the same length as " { $snippet "seq" }
|
||||||
|
", containing Z-values for given sequence. See "
|
||||||
|
{ $link "z-algorithm" } " for details."
|
||||||
|
} ;
|
||||||
|
|
||||||
|
ARTICLE: "z-algorithm" "Z algorithm"
|
||||||
|
{ $heading "Definition" }
|
||||||
|
"Given the sequence " { $snippet "S" } " and the index "
|
||||||
|
{ $snippet "i" } ", let " { $snippet "i" } "-th Z value of "
|
||||||
|
{ $snippet "S" } " be the length of the longest subsequence of "
|
||||||
|
{ $snippet "S" } " that starts at " { $snippet "i" }
|
||||||
|
" and matches the prefix of " { $snippet "S" } "."
|
||||||
|
|
||||||
|
{ $heading "Example" }
|
||||||
|
"Here is an example for string " { $snippet "\"abababaca\"" } ":"
|
||||||
|
{ $table
|
||||||
|
{ { $snippet "i:" } "0" "1" "2" "3" "4" "5" "6" "7" "8" }
|
||||||
|
{ { $snippet "S:" } "a" "b" "a" "b" "a" "b" "a" "c" "a" }
|
||||||
|
{ { $snippet "Z:" } "9" "0" "5" "0" "3" "0" "1" "0" "1" }
|
||||||
|
}
|
||||||
|
|
||||||
|
{ $heading "Summary" }
|
||||||
|
"The " { $vocab-link "z-algorithm" }
|
||||||
|
" vocabulary implements algorithm for finding all Z values for sequence "
|
||||||
|
{ $snippet "S" }
|
||||||
|
" in linear time. In contrast to naive approach which takes "
|
||||||
|
{ $snippet "Θ(n^2)" } " time."
|
||||||
|
;
|
||||||
|
|
||||||
|
ABOUT: "z-algorithm"
|
|
@ -0,0 +1,13 @@
|
||||||
|
! Copyright (C) 2010 Dmitry Shubin.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: tools.test z-algorithm ;
|
||||||
|
IN: z-algorithm.tests
|
||||||
|
|
||||||
|
[ 0 ] [ "qwerty" "" lcp ] unit-test
|
||||||
|
[ 0 ] [ "qwerty" "asdf" lcp ] unit-test
|
||||||
|
[ 3 ] [ "qwerty" "qwe" lcp ] unit-test
|
||||||
|
[ 3 ] [ "qwerty" "qwet" lcp ] unit-test
|
||||||
|
|
||||||
|
[ { } ] [ "" z-values ] unit-test
|
||||||
|
[ { 1 } ] [ "q" z-values ] unit-test
|
||||||
|
[ { 9 0 5 0 3 0 1 0 1 } ] [ "abababaca" z-values ] unit-test
|
|
@ -0,0 +1,37 @@
|
||||||
|
! Copyright (C) 2010 Dmitry Shubin.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: arrays combinators.smart kernel locals math math.ranges
|
||||||
|
sequences sequences.private ;
|
||||||
|
IN: z-algorithm
|
||||||
|
|
||||||
|
: lcp ( seq1 seq2 -- n )
|
||||||
|
[ min-length ] 2keep mismatch [ nip ] when* ;
|
||||||
|
|
||||||
|
<PRIVATE
|
||||||
|
|
||||||
|
:: out-of-zbox ( seq Z l r k -- seq Z l r )
|
||||||
|
seq k tail-slice seq lcp :> Zk
|
||||||
|
Zk Z push seq Z
|
||||||
|
Zk 0 > [ k Zk k + 1 - ] [ l r ] if ; inline
|
||||||
|
|
||||||
|
:: inside-zbox ( seq Z l r k -- seq Z l r )
|
||||||
|
k l - Z nth :> Zk'
|
||||||
|
r k - 1 + :> b
|
||||||
|
seq Z Zk' b <
|
||||||
|
[ Zk' Z push l r ] ! still inside
|
||||||
|
[ seq r 1 + seq b [ tail-slice ] 2bi@ lcp :> q
|
||||||
|
q b + Z push k q r +
|
||||||
|
] if ; inline
|
||||||
|
|
||||||
|
: (z-value) ( seq Z l r k -- seq Z l r )
|
||||||
|
2dup < [ out-of-zbox ] [ inside-zbox ] if ; inline
|
||||||
|
|
||||||
|
:: (z-values) ( seq -- Z )
|
||||||
|
V{ } clone 0 0 seq length :> ( Z l r len )
|
||||||
|
len Z push [ seq Z l r 1 len [a,b) [ (z-value) ] each ]
|
||||||
|
drop-outputs Z ; inline
|
||||||
|
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
|
: z-values ( seq -- Z )
|
||||||
|
dup length 0 > [ (z-values) ] when >array ;
|
Loading…
Reference in New Issue