LCS docs, bug fixes
parent
32d032e8fc
commit
d1545ac929
|
@ -4,3 +4,32 @@ IN: lcs
|
||||||
HELP: levenshtein
|
HELP: levenshtein
|
||||||
{ $values { "old" "a sequence" } { "new" "a sequence" } { "n" "the Levenshtein distance" } }
|
{ $values { "old" "a sequence" } { "new" "a sequence" } { "n" "the Levenshtein distance" } }
|
||||||
{ $description "Calculates the Levenshtein distance between old and new, that is, the minimal number of changes from the old sequence to the new one, in terms of deleting, inserting and replacing characters." } ;
|
{ $description "Calculates the Levenshtein distance between old and new, that is, the minimal number of changes from the old sequence to the new one, in terms of deleting, inserting and replacing characters." } ;
|
||||||
|
|
||||||
|
HELP: lcs
|
||||||
|
{ $values { "seq1" "a sequence" } { "seq2" "a sequence" } { "lcs" "a longest common subsequence" } }
|
||||||
|
{ $description "Given two sequences, calculates a longest common subsequence between them. Note two things: this is only one of the many possible LCSs, and the LCS may not be contiguous." } ;
|
||||||
|
|
||||||
|
HELP: diff
|
||||||
|
{ $values { "old" "a sequence" } { "new" "a sequence" } { "diff" "an edit script" } }
|
||||||
|
{ $description "Given two sequences, find a minimal edit script from the old to the new. There may be more than one minimal edit script, and this chooses one arbitrarily. This script is in the form of an array of the tuples of the classes " { $link retain } ", " { $link delete } " and " { $link insert } " which have their information stored in the 'item' slot." } ;
|
||||||
|
|
||||||
|
HELP: retain
|
||||||
|
{ $class-description "Represents an action in an edit script where an item is kept, going from the initial sequence to the final sequence. This has one slot, called item, containing the thing which is retained" } ;
|
||||||
|
|
||||||
|
HELP: delete
|
||||||
|
{ $class-description "Represents an action in an edit script where an item is deleted, going from the initial sequence to the final sequence. This has one slot, called item, containing the thing which is deleted" } ;
|
||||||
|
|
||||||
|
HELP: insert
|
||||||
|
{ $class-description "Represents an action in an edit script where an item is added, going from the initial sequence to the final sequence. This has one slot, called item, containing the thing which is inserted" } ;
|
||||||
|
|
||||||
|
ARTICLE: "lcs" "LCS, Diffing and Distance"
|
||||||
|
"This vocabulary provides words for three apparently unrelated but in fact very similar problems: finding a longest common subsequence between two sequences, getting a minimal edit script (diff) between two sequences, and calculating the Levenshtein distance between two sequences. The implementations of these algorithms are very closely related, and all running times are O(nm), where n and m are the lengths of the input sequences."
|
||||||
|
{ $subsection lcs }
|
||||||
|
{ $subsection diff }
|
||||||
|
{ $subsection levenshtein }
|
||||||
|
"The " { $link diff } " word returns a sequence of tuples of the following classes. They all hold their contents in the 'item' slot."
|
||||||
|
{ $subsection insert }
|
||||||
|
{ $subsection delete }
|
||||||
|
{ $subsection retain } ;
|
||||||
|
|
||||||
|
ABOUT: "lcs"
|
||||||
|
|
|
@ -7,8 +7,19 @@ USING: tools.test lcs ;
|
||||||
[ 1 ] [ "freshpak" "freshpack" levenshtein ] unit-test
|
[ 1 ] [ "freshpak" "freshpack" levenshtein ] unit-test
|
||||||
[ 1 ] [ "freshpack" "freshpak" levenshtein ] unit-test
|
[ 1 ] [ "freshpack" "freshpak" levenshtein ] unit-test
|
||||||
|
|
||||||
! [ "hell" ] [ "hello" "hell" lcs ] unit-test
|
[ "hell" ] [ "hello" "hell" lcs ] unit-test
|
||||||
! [ "hell" ] [ "hell" "hello" lcs ] unit-test
|
[ "hell" ] [ "hell" "hello" lcs ] unit-test
|
||||||
[ "ell" ] [ "ell" "hell" lcs ] unit-test
|
[ "ell" ] [ "ell" "hell" lcs ] unit-test
|
||||||
[ "ell" ] [ "hell" "ell" lcs ] unit-test
|
[ "ell" ] [ "hell" "ell" lcs ] unit-test
|
||||||
! [ "abd" ] [ "faxbcd" "abdef" lcs ] unit-test
|
[ "abd" ] [ "faxbcd" "abdef" lcs ] unit-test
|
||||||
|
|
||||||
|
[ {
|
||||||
|
T{ delete f CHAR: f }
|
||||||
|
T{ retain f CHAR: a }
|
||||||
|
T{ delete f CHAR: x }
|
||||||
|
T{ retain f CHAR: b }
|
||||||
|
T{ delete f CHAR: c }
|
||||||
|
T{ retain f CHAR: d }
|
||||||
|
T{ insert f CHAR: e }
|
||||||
|
T{ insert f CHAR: f }
|
||||||
|
} ] [ "faxbcd" "abdef" diff ] unit-test
|
||||||
|
|
|
@ -2,21 +2,20 @@ USING: sequences kernel math locals math.order math.ranges
|
||||||
accessors combinators.lib arrays namespaces combinators ;
|
accessors combinators.lib arrays namespaces combinators ;
|
||||||
IN: lcs
|
IN: lcs
|
||||||
|
|
||||||
! Classic dynamic programming O(n^2) algorithm for the
|
<PRIVATE
|
||||||
! Longest Common Subsequence
|
: levenshtein-step ( insert delete change same? -- next )
|
||||||
! Slight modification to get Levenshtein distance
|
0 1 ? + >r [ 1+ ] bi@ r> min min ;
|
||||||
|
|
||||||
! j is row, i is column
|
: lcs-step ( insert delete change same? -- next )
|
||||||
! Going from str1 to str2
|
1 -9999 ? + max max ; ! Replace -9999 with -inf when added
|
||||||
! str1 along side column, str2 along top row
|
|
||||||
|
|
||||||
:: lcs-step ( i j matrix old new change-cost -- )
|
:: loop-step ( i j matrix old new step -- )
|
||||||
i j matrix nth nth
|
i j 1+ matrix nth nth ! insertion
|
||||||
i old nth j new nth = 0 change-cost ? +
|
i 1+ j matrix nth nth ! deletion
|
||||||
i j 1+ matrix nth nth 1+ ! insertion cost
|
i j matrix nth nth ! replace/retain
|
||||||
i 1+ j matrix nth nth 1+ ! deletion cost
|
i old nth j new nth = ! same?
|
||||||
min min
|
step call
|
||||||
i 1+ j 1+ matrix nth set-nth ;
|
i 1+ j 1+ matrix nth set-nth ; inline
|
||||||
|
|
||||||
: lcs-initialize ( |str1| |str2| -- matrix )
|
: lcs-initialize ( |str1| |str2| -- matrix )
|
||||||
[ drop 0 <array> ] with map ;
|
[ drop 0 <array> ] with map ;
|
||||||
|
@ -24,21 +23,24 @@ IN: lcs
|
||||||
: levenshtein-initialize ( |str1| |str2| -- matrix )
|
: levenshtein-initialize ( |str1| |str2| -- matrix )
|
||||||
[ [ + ] curry map ] with map ;
|
[ [ + ] curry map ] with map ;
|
||||||
|
|
||||||
:: run-lcs ( old new quot change-cost -- matrix )
|
:: run-lcs ( old new init step -- matrix )
|
||||||
[let | matrix [ old length 1+ new length 1+ quot call ] |
|
[let | matrix [ old length 1+ new length 1+ init call ] |
|
||||||
old length [0,b) [| i |
|
old length [0,b) [| i |
|
||||||
new length [0,b)
|
new length [0,b)
|
||||||
[| j | i j matrix old new change-cost lcs-step ]
|
[| j | i j matrix old new step loop-step ]
|
||||||
each
|
each
|
||||||
] each matrix ] ;
|
] each matrix ] ; inline
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
: levenshtein ( old new -- n )
|
: levenshtein ( old new -- n )
|
||||||
[ levenshtein-initialize ] 1 run-lcs peek peek ;
|
[ levenshtein-initialize ] [ levenshtein-step ]
|
||||||
|
run-lcs peek peek ;
|
||||||
|
|
||||||
TUPLE: retain item ;
|
TUPLE: retain item ;
|
||||||
TUPLE: delete item ;
|
TUPLE: delete item ;
|
||||||
TUPLE: insert item ;
|
TUPLE: insert item ;
|
||||||
|
|
||||||
|
<PRIVATE
|
||||||
TUPLE: trace-state old new table i j ;
|
TUPLE: trace-state old new table i j ;
|
||||||
|
|
||||||
: old-nth ( state -- elt )
|
: old-nth ( state -- elt )
|
||||||
|
@ -86,9 +88,10 @@ TUPLE: trace-state old new table i j ;
|
||||||
: trace-diff ( old new table -- diff )
|
: trace-diff ( old new table -- diff )
|
||||||
[ ] [ first length 1- ] [ length 1- ] tri trace-state boa
|
[ ] [ first length 1- ] [ length 1- ] tri trace-state boa
|
||||||
[ (trace-diff) ] { } make reverse ;
|
[ (trace-diff) ] { } make reverse ;
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
: diff ( old new -- diff )
|
: diff ( old new -- diff )
|
||||||
2dup [ lcs-initialize ] 2 run-lcs trace-diff ;
|
2dup [ lcs-initialize ] [ lcs-step ] run-lcs trace-diff ;
|
||||||
|
|
||||||
: lcs ( str1 str2 -- lcs )
|
: lcs ( seq1 seq2 -- lcs )
|
||||||
[ diff [ retain? ] filter ] keep [ item>> ] swap map-as ;
|
[ diff [ retain? ] filter ] keep [ item>> ] swap map-as ;
|
||||||
|
|
Loading…
Reference in New Issue