extra/lcs replaces extra/levenshtein... not fully debugged

db4
Daniel Ehrenberg 2008-05-06 03:46:44 -05:00
parent d5f63983c3
commit 70ea40681e
9 changed files with 110 additions and 51 deletions

1
extra/lcs/authors.txt Executable file
View File

@ -0,0 +1 @@
Daniel Ehrenberg

6
extra/lcs/lcs-docs.factor Executable file
View File

@ -0,0 +1,6 @@
USING: help.syntax help.markup ;
IN: lcs
HELP: levenshtein
{ $values { "old" "a sequence" } { "new" "a sequence" } { "n" "the Levenshtein distance" } }
{ $description "Calculates the Levenshtein distance between old and new, that is, the minimal number of changes from the old sequence to the new one, in terms of deleting, inserting and replacing characters." } ;

View File

@ -1,9 +1,14 @@
! Copyright (C) 2006 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license.
IN: levenshtein.tests
USING: tools.test levenshtein ;
USING: tools.test lcs ;
[ 3 ] [ "sitting" "kitten" levenshtein ] unit-test
[ 3 ] [ "kitten" "sitting" levenshtein ] unit-test
[ 1 ] [ "freshpak" "freshpack" levenshtein ] unit-test
[ 1 ] [ "freshpack" "freshpak" levenshtein ] unit-test
[ "hell" ] [ "hello" "hell" lcs ] unit-test
[ "hell" ] [ "hell" "hello" lcs ] unit-test
[ "ell" ] [ "ell" "hell" lcs ] unit-test
[ "ell" ] [ "hell" "ell" lcs ] unit-test
[ "abd" ] [ "faxbcd" "abdef" lcs ] unit-test

94
extra/lcs/lcs.factor Executable file
View File

@ -0,0 +1,94 @@
USING: sequences kernel math locals math.order math.ranges
accessors combinators.lib arrays namespaces combinators ;
IN: lcs
! Classic dynamic programming O(n^2) algorithm for the
! Longest Common Subsequence
! Slight modification to get Levenshtein distance
! j is row, i is column
! Going from str1 to str2
! str1 along side column, str2 along top row
:: lcs-step ( i j matrix old new change-cost -- )
i j matrix nth nth
i old nth j new nth = 0 change-cost ? +
i j 1+ matrix nth nth 1+ ! insertion cost
i 1+ j matrix nth nth 1+ ! deletion cost
min min
i 1+ j 1+ matrix nth set-nth ;
: lcs-initialize ( |str1| |str2| -- matrix )
[ drop 0 <array> ] with map ;
: levenshtein-initialize ( |str1| |str2| -- matrix )
[ [ + ] curry map ] with map ;
:: run-lcs ( old new quot change-cost -- matrix )
[let | matrix [ old length 1+ new length 1+ quot call ] |
old length [0,b) [| i |
new length [0,b)
[| j | i j matrix old new change-cost lcs-step ]
each
] each matrix ] ;
: levenshtein ( old new -- n )
[ levenshtein-initialize ] 1 run-lcs peek peek ;
TUPLE: retain item ;
TUPLE: delete item ;
TUPLE: insert item ;
TUPLE: trace-state old new table i j ;
: old-nth ( state -- elt )
[ i>> 1- ] [ old>> ] bi nth ;
: new-nth ( state -- elt )
[ j>> 1- ] [ new>> ] bi nth ;
: top-beats-side? ( state -- ? )
[ [ i>> ] [ j>> 1- ] [ table>> ] tri nth nth ]
[ [ i>> 1- ] [ j>> ] [ table>> ] tri nth nth ] bi > ;
: retained? ( state -- ? )
{
[ i>> 0 > ] [ j>> 0 > ]
[ [ old-nth ] [ new-nth ] bi = ]
} <-&& ;
: do-retain ( state -- state )
dup old-nth retain boa ,
[ 1- ] change-i [ 1- ] change-j ;
: inserted? ( state -- ? )
[ j>> 0 > ]
[ [ i>> zero? ] [ top-beats-side? ] or? ] and? ;
: do-insert ( state -- state )
dup new-nth insert boa , [ 1- ] change-j ;
: deleted? ( state -- ? )
[ i>> 0 > ]
[ [ j>> zero? ] [ top-beats-side? not ] or? ] and? ;
: do-delete ( state -- state )
dup old-nth delete boa , [ 1- ] change-i ;
: (trace-diff) ( state -- )
{
{ [ dup retained? ] [ do-retain (trace-diff) ] }
{ [ dup inserted? ] [ do-insert (trace-diff) ] }
{ [ dup deleted? ] [ do-delete (trace-diff) ] }
[ drop ] ! i=j=0
} cond ;
: trace-diff ( old new table -- diff )
[ ] [ first length 1- ] [ length 1- ] tri trace-state boa
[ (trace-diff) ] { } make reverse ;
: diff ( old new -- diff )
2dup [ lcs-initialize ] 2 run-lcs trace-diff ;
: lcs ( str1 str2 -- lcs )
[ diff [ retain? ] filter ] keep [ item>> ] swap map-as ;

1
extra/lcs/summary.txt Executable file
View File

@ -0,0 +1 @@
Levenshtein distance and diff between sequences

1
extra/lcs/tags.txt Executable file
View File

@ -0,0 +1 @@
algorithms

View File

@ -1 +0,0 @@
Slava Pestov

View File

@ -1,47 +0,0 @@
! Copyright (C) 2006 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license.
USING: arrays help io kernel math namespaces sequences
math.order ;
IN: levenshtein
: <matrix> ( m n -- matrix )
[ drop 0 <array> ] with map ; inline
: matrix-> nth nth ; inline
: ->matrix nth set-nth ; inline
SYMBOL: d
: ->d ( n i j -- ) d get ->matrix ; inline
: d-> ( i j -- n ) d get matrix-> ; inline
SYMBOL: costs
: init-d ( str1 str2 -- )
[ length 1+ ] bi@ 2dup <matrix> d set
[ 0 over ->d ] each
[ dup 0 ->d ] each ; inline
: compute-costs ( str1 str2 -- )
swap [
[ = 0 1 ? ] with { } map-as
] curry { } map-as costs set ; inline
: levenshtein-step ( i j -- )
[ 1+ d-> 1+ ] 2keep
[ >r 1+ r> d-> 1+ ] 2keep
[ d-> ] 2keep
[ costs get matrix-> + min min ] 2keep
>r 1+ r> 1+ ->d ; inline
: levenshtein-result ( -- n ) d get peek peek ; inline
: levenshtein ( str1 str2 -- n )
[
2dup init-d
2dup compute-costs
[ length ] bi@ [
[ levenshtein-step ] curry each
] with each
levenshtein-result
] with-scope ;

View File

@ -1 +0,0 @@
Levenshtein edit distance algorithm