extra/lcs replaces extra/levenshtein... not fully debugged
parent
d5f63983c3
commit
70ea40681e
|
@ -0,0 +1 @@
|
|||
Daniel Ehrenberg
|
|
@ -0,0 +1,6 @@
|
|||
USING: help.syntax help.markup ;
|
||||
IN: lcs
|
||||
|
||||
HELP: levenshtein
|
||||
{ $values { "old" "a sequence" } { "new" "a sequence" } { "n" "the Levenshtein distance" } }
|
||||
{ $description "Calculates the Levenshtein distance between old and new, that is, the minimal number of changes from the old sequence to the new one, in terms of deleting, inserting and replacing characters." } ;
|
9
extra/levenshtein/levenshtein-tests.factor → extra/lcs/lcs-tests.factor
Normal file → Executable file
9
extra/levenshtein/levenshtein-tests.factor → extra/lcs/lcs-tests.factor
Normal file → Executable file
|
@ -1,9 +1,14 @@
|
|||
! Copyright (C) 2006 Slava Pestov.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
IN: levenshtein.tests
|
||||
USING: tools.test levenshtein ;
|
||||
USING: tools.test lcs ;
|
||||
|
||||
[ 3 ] [ "sitting" "kitten" levenshtein ] unit-test
|
||||
[ 3 ] [ "kitten" "sitting" levenshtein ] unit-test
|
||||
[ 1 ] [ "freshpak" "freshpack" levenshtein ] unit-test
|
||||
[ 1 ] [ "freshpack" "freshpak" levenshtein ] unit-test
|
||||
|
||||
[ "hell" ] [ "hello" "hell" lcs ] unit-test
|
||||
[ "hell" ] [ "hell" "hello" lcs ] unit-test
|
||||
[ "ell" ] [ "ell" "hell" lcs ] unit-test
|
||||
[ "ell" ] [ "hell" "ell" lcs ] unit-test
|
||||
[ "abd" ] [ "faxbcd" "abdef" lcs ] unit-test
|
|
@ -0,0 +1,94 @@
|
|||
USING: sequences kernel math locals math.order math.ranges
|
||||
accessors combinators.lib arrays namespaces combinators ;
|
||||
IN: lcs
|
||||
|
||||
! Classic dynamic programming O(n^2) algorithm for the
|
||||
! Longest Common Subsequence
|
||||
! Slight modification to get Levenshtein distance
|
||||
|
||||
! j is row, i is column
|
||||
! Going from str1 to str2
|
||||
! str1 along side column, str2 along top row
|
||||
|
||||
:: lcs-step ( i j matrix old new change-cost -- )
|
||||
i j matrix nth nth
|
||||
i old nth j new nth = 0 change-cost ? +
|
||||
i j 1+ matrix nth nth 1+ ! insertion cost
|
||||
i 1+ j matrix nth nth 1+ ! deletion cost
|
||||
min min
|
||||
i 1+ j 1+ matrix nth set-nth ;
|
||||
|
||||
: lcs-initialize ( |str1| |str2| -- matrix )
|
||||
[ drop 0 <array> ] with map ;
|
||||
|
||||
: levenshtein-initialize ( |str1| |str2| -- matrix )
|
||||
[ [ + ] curry map ] with map ;
|
||||
|
||||
:: run-lcs ( old new quot change-cost -- matrix )
|
||||
[let | matrix [ old length 1+ new length 1+ quot call ] |
|
||||
old length [0,b) [| i |
|
||||
new length [0,b)
|
||||
[| j | i j matrix old new change-cost lcs-step ]
|
||||
each
|
||||
] each matrix ] ;
|
||||
|
||||
: levenshtein ( old new -- n )
|
||||
[ levenshtein-initialize ] 1 run-lcs peek peek ;
|
||||
|
||||
TUPLE: retain item ;
|
||||
TUPLE: delete item ;
|
||||
TUPLE: insert item ;
|
||||
|
||||
TUPLE: trace-state old new table i j ;
|
||||
|
||||
: old-nth ( state -- elt )
|
||||
[ i>> 1- ] [ old>> ] bi nth ;
|
||||
|
||||
: new-nth ( state -- elt )
|
||||
[ j>> 1- ] [ new>> ] bi nth ;
|
||||
|
||||
: top-beats-side? ( state -- ? )
|
||||
[ [ i>> ] [ j>> 1- ] [ table>> ] tri nth nth ]
|
||||
[ [ i>> 1- ] [ j>> ] [ table>> ] tri nth nth ] bi > ;
|
||||
|
||||
: retained? ( state -- ? )
|
||||
{
|
||||
[ i>> 0 > ] [ j>> 0 > ]
|
||||
[ [ old-nth ] [ new-nth ] bi = ]
|
||||
} <-&& ;
|
||||
|
||||
: do-retain ( state -- state )
|
||||
dup old-nth retain boa ,
|
||||
[ 1- ] change-i [ 1- ] change-j ;
|
||||
|
||||
: inserted? ( state -- ? )
|
||||
[ j>> 0 > ]
|
||||
[ [ i>> zero? ] [ top-beats-side? ] or? ] and? ;
|
||||
|
||||
: do-insert ( state -- state )
|
||||
dup new-nth insert boa , [ 1- ] change-j ;
|
||||
|
||||
: deleted? ( state -- ? )
|
||||
[ i>> 0 > ]
|
||||
[ [ j>> zero? ] [ top-beats-side? not ] or? ] and? ;
|
||||
|
||||
: do-delete ( state -- state )
|
||||
dup old-nth delete boa , [ 1- ] change-i ;
|
||||
|
||||
: (trace-diff) ( state -- )
|
||||
{
|
||||
{ [ dup retained? ] [ do-retain (trace-diff) ] }
|
||||
{ [ dup inserted? ] [ do-insert (trace-diff) ] }
|
||||
{ [ dup deleted? ] [ do-delete (trace-diff) ] }
|
||||
[ drop ] ! i=j=0
|
||||
} cond ;
|
||||
|
||||
: trace-diff ( old new table -- diff )
|
||||
[ ] [ first length 1- ] [ length 1- ] tri trace-state boa
|
||||
[ (trace-diff) ] { } make reverse ;
|
||||
|
||||
: diff ( old new -- diff )
|
||||
2dup [ lcs-initialize ] 2 run-lcs trace-diff ;
|
||||
|
||||
: lcs ( str1 str2 -- lcs )
|
||||
[ diff [ retain? ] filter ] keep [ item>> ] swap map-as ;
|
|
@ -0,0 +1 @@
|
|||
Levenshtein distance and diff between sequences
|
|
@ -0,0 +1 @@
|
|||
algorithms
|
|
@ -1 +0,0 @@
|
|||
Slava Pestov
|
|
@ -1,47 +0,0 @@
|
|||
! Copyright (C) 2006 Slava Pestov.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: arrays help io kernel math namespaces sequences
|
||||
math.order ;
|
||||
IN: levenshtein
|
||||
|
||||
: <matrix> ( m n -- matrix )
|
||||
[ drop 0 <array> ] with map ; inline
|
||||
|
||||
: matrix-> nth nth ; inline
|
||||
: ->matrix nth set-nth ; inline
|
||||
|
||||
SYMBOL: d
|
||||
|
||||
: ->d ( n i j -- ) d get ->matrix ; inline
|
||||
: d-> ( i j -- n ) d get matrix-> ; inline
|
||||
|
||||
SYMBOL: costs
|
||||
|
||||
: init-d ( str1 str2 -- )
|
||||
[ length 1+ ] bi@ 2dup <matrix> d set
|
||||
[ 0 over ->d ] each
|
||||
[ dup 0 ->d ] each ; inline
|
||||
|
||||
: compute-costs ( str1 str2 -- )
|
||||
swap [
|
||||
[ = 0 1 ? ] with { } map-as
|
||||
] curry { } map-as costs set ; inline
|
||||
|
||||
: levenshtein-step ( i j -- )
|
||||
[ 1+ d-> 1+ ] 2keep
|
||||
[ >r 1+ r> d-> 1+ ] 2keep
|
||||
[ d-> ] 2keep
|
||||
[ costs get matrix-> + min min ] 2keep
|
||||
>r 1+ r> 1+ ->d ; inline
|
||||
|
||||
: levenshtein-result ( -- n ) d get peek peek ; inline
|
||||
|
||||
: levenshtein ( str1 str2 -- n )
|
||||
[
|
||||
2dup init-d
|
||||
2dup compute-costs
|
||||
[ length ] bi@ [
|
||||
[ levenshtein-step ] curry each
|
||||
] with each
|
||||
levenshtein-result
|
||||
] with-scope ;
|
|
@ -1 +0,0 @@
|
|||
Levenshtein edit distance algorithm
|
Loading…
Reference in New Issue