diff --git a/extra/lcs/authors.txt b/extra/lcs/authors.txt new file mode 100755 index 0000000000..504363d316 --- /dev/null +++ b/extra/lcs/authors.txt @@ -0,0 +1 @@ +Daniel Ehrenberg diff --git a/extra/lcs/lcs-docs.factor b/extra/lcs/lcs-docs.factor new file mode 100755 index 0000000000..6c5e2ae992 --- /dev/null +++ b/extra/lcs/lcs-docs.factor @@ -0,0 +1,6 @@ +USING: help.syntax help.markup ; +IN: lcs + +HELP: levenshtein +{ $values { "old" "a sequence" } { "new" "a sequence" } { "n" "the Levenshtein distance" } } +{ $description "Calculates the Levenshtein distance between old and new, that is, the minimal number of changes from the old sequence to the new one, in terms of deleting, inserting and replacing characters." } ; diff --git a/extra/levenshtein/levenshtein-tests.factor b/extra/lcs/lcs-tests.factor old mode 100644 new mode 100755 similarity index 55% rename from extra/levenshtein/levenshtein-tests.factor rename to extra/lcs/lcs-tests.factor index 722ccb86ca..45297c1bff --- a/extra/levenshtein/levenshtein-tests.factor +++ b/extra/lcs/lcs-tests.factor @@ -1,9 +1,14 @@ ! Copyright (C) 2006 Slava Pestov. ! See http://factorcode.org/license.txt for BSD license. -IN: levenshtein.tests -USING: tools.test levenshtein ; +USING: tools.test lcs ; [ 3 ] [ "sitting" "kitten" levenshtein ] unit-test [ 3 ] [ "kitten" "sitting" levenshtein ] unit-test [ 1 ] [ "freshpak" "freshpack" levenshtein ] unit-test [ 1 ] [ "freshpack" "freshpak" levenshtein ] unit-test + +[ "hell" ] [ "hello" "hell" lcs ] unit-test +[ "hell" ] [ "hell" "hello" lcs ] unit-test +[ "ell" ] [ "ell" "hell" lcs ] unit-test +[ "ell" ] [ "hell" "ell" lcs ] unit-test +[ "abd" ] [ "faxbcd" "abdef" lcs ] unit-test diff --git a/extra/lcs/lcs.factor b/extra/lcs/lcs.factor new file mode 100755 index 0000000000..b1584af78b --- /dev/null +++ b/extra/lcs/lcs.factor @@ -0,0 +1,94 @@ +USING: sequences kernel math locals math.order math.ranges +accessors combinators.lib arrays namespaces combinators ; +IN: lcs + +! Classic dynamic programming O(n^2) algorithm for the +! Longest Common Subsequence +! Slight modification to get Levenshtein distance + +! j is row, i is column +! Going from str1 to str2 +! str1 along side column, str2 along top row + +:: lcs-step ( i j matrix old new change-cost -- ) + i j matrix nth nth + i old nth j new nth = 0 change-cost ? + + i j 1+ matrix nth nth 1+ ! insertion cost + i 1+ j matrix nth nth 1+ ! deletion cost + min min + i 1+ j 1+ matrix nth set-nth ; + +: lcs-initialize ( |str1| |str2| -- matrix ) + [ drop 0 ] with map ; + +: levenshtein-initialize ( |str1| |str2| -- matrix ) + [ [ + ] curry map ] with map ; + +:: run-lcs ( old new quot change-cost -- matrix ) + [let | matrix [ old length 1+ new length 1+ quot call ] | + old length [0,b) [| i | + new length [0,b) + [| j | i j matrix old new change-cost lcs-step ] + each + ] each matrix ] ; + +: levenshtein ( old new -- n ) + [ levenshtein-initialize ] 1 run-lcs peek peek ; + +TUPLE: retain item ; +TUPLE: delete item ; +TUPLE: insert item ; + +TUPLE: trace-state old new table i j ; + +: old-nth ( state -- elt ) + [ i>> 1- ] [ old>> ] bi nth ; + +: new-nth ( state -- elt ) + [ j>> 1- ] [ new>> ] bi nth ; + +: top-beats-side? ( state -- ? ) + [ [ i>> ] [ j>> 1- ] [ table>> ] tri nth nth ] + [ [ i>> 1- ] [ j>> ] [ table>> ] tri nth nth ] bi > ; + +: retained? ( state -- ? ) + { + [ i>> 0 > ] [ j>> 0 > ] + [ [ old-nth ] [ new-nth ] bi = ] + } <-&& ; + +: do-retain ( state -- state ) + dup old-nth retain boa , + [ 1- ] change-i [ 1- ] change-j ; + +: inserted? ( state -- ? ) + [ j>> 0 > ] + [ [ i>> zero? ] [ top-beats-side? ] or? ] and? ; + +: do-insert ( state -- state ) + dup new-nth insert boa , [ 1- ] change-j ; + +: deleted? ( state -- ? ) + [ i>> 0 > ] + [ [ j>> zero? ] [ top-beats-side? not ] or? ] and? ; + +: do-delete ( state -- state ) + dup old-nth delete boa , [ 1- ] change-i ; + +: (trace-diff) ( state -- ) + { + { [ dup retained? ] [ do-retain (trace-diff) ] } + { [ dup inserted? ] [ do-insert (trace-diff) ] } + { [ dup deleted? ] [ do-delete (trace-diff) ] } + [ drop ] ! i=j=0 + } cond ; + +: trace-diff ( old new table -- diff ) + [ ] [ first length 1- ] [ length 1- ] tri trace-state boa + [ (trace-diff) ] { } make reverse ; + +: diff ( old new -- diff ) + 2dup [ lcs-initialize ] 2 run-lcs trace-diff ; + +: lcs ( str1 str2 -- lcs ) + [ diff [ retain? ] filter ] keep [ item>> ] swap map-as ; diff --git a/extra/lcs/summary.txt b/extra/lcs/summary.txt new file mode 100755 index 0000000000..9e70fd7e63 --- /dev/null +++ b/extra/lcs/summary.txt @@ -0,0 +1 @@ +Levenshtein distance and diff between sequences diff --git a/extra/lcs/tags.txt b/extra/lcs/tags.txt new file mode 100755 index 0000000000..4d914f4c46 --- /dev/null +++ b/extra/lcs/tags.txt @@ -0,0 +1 @@ +algorithms diff --git a/extra/levenshtein/authors.txt b/extra/levenshtein/authors.txt deleted file mode 100644 index 1901f27a24..0000000000 --- a/extra/levenshtein/authors.txt +++ /dev/null @@ -1 +0,0 @@ -Slava Pestov diff --git a/extra/levenshtein/levenshtein.factor b/extra/levenshtein/levenshtein.factor deleted file mode 100644 index 07731bfb84..0000000000 --- a/extra/levenshtein/levenshtein.factor +++ /dev/null @@ -1,47 +0,0 @@ -! Copyright (C) 2006 Slava Pestov. -! See http://factorcode.org/license.txt for BSD license. -USING: arrays help io kernel math namespaces sequences -math.order ; -IN: levenshtein - -: ( m n -- matrix ) - [ drop 0 ] with map ; inline - -: matrix-> nth nth ; inline -: ->matrix nth set-nth ; inline - -SYMBOL: d - -: ->d ( n i j -- ) d get ->matrix ; inline -: d-> ( i j -- n ) d get matrix-> ; inline - -SYMBOL: costs - -: init-d ( str1 str2 -- ) - [ length 1+ ] bi@ 2dup d set - [ 0 over ->d ] each - [ dup 0 ->d ] each ; inline - -: compute-costs ( str1 str2 -- ) - swap [ - [ = 0 1 ? ] with { } map-as - ] curry { } map-as costs set ; inline - -: levenshtein-step ( i j -- ) - [ 1+ d-> 1+ ] 2keep - [ >r 1+ r> d-> 1+ ] 2keep - [ d-> ] 2keep - [ costs get matrix-> + min min ] 2keep - >r 1+ r> 1+ ->d ; inline - -: levenshtein-result ( -- n ) d get peek peek ; inline - -: levenshtein ( str1 str2 -- n ) - [ - 2dup init-d - 2dup compute-costs - [ length ] bi@ [ - [ levenshtein-step ] curry each - ] with each - levenshtein-result - ] with-scope ; diff --git a/extra/levenshtein/summary.txt b/extra/levenshtein/summary.txt deleted file mode 100644 index 583669a8b0..0000000000 --- a/extra/levenshtein/summary.txt +++ /dev/null @@ -1 +0,0 @@ -Levenshtein edit distance algorithm