diff --git a/extra/interval-maps/interval-maps-tests.factor b/extra/interval-maps/interval-maps-tests.factor index 54d2e9d26b..5a4b508939 100755 --- a/extra/interval-maps/interval-maps-tests.factor +++ b/extra/interval-maps/interval-maps-tests.factor @@ -11,3 +11,8 @@ SYMBOL: test [ 2 ] [ 1 test get interval-at ] unit-test [ f ] [ 2 test get interval-at ] unit-test [ f ] [ 0 test get interval-at ] unit-test + +[ { { { 1 4 } 3 } { { 4 8 } 6 } } ] must-fail + +[ { { { 1 3 } 2 } { { 4 5 } 4 } { { 7 8 } 4 } } ] +[ { { 1 2 } { 2 2 } { 3 2 } { 4 4 } { 5 4 } { 7 4 } { 8 4 } } coalesce ] unit-test diff --git a/extra/interval-maps/interval-maps.factor b/extra/interval-maps/interval-maps.factor index bc23d0d346..7dcb9466cc 100755 --- a/extra/interval-maps/interval-maps.factor +++ b/extra/interval-maps/interval-maps.factor @@ -1,5 +1,5 @@ USING: kernel sequences arrays math.intervals accessors -math.order sorting math assocs ; +math.order sorting math assocs locals namespaces ; IN: interval-maps TUPLE: interval-map array ; @@ -24,6 +24,8 @@ M: interval >interval ; : ensure-disjoint ( intervals -- intervals ) dup keys [ interval-intersect not ] monotonic? [ "Intervals are not disjoint" throw ] unless ; + + PRIVATE> : interval-at* ( key map -- value ? ) @@ -35,7 +37,20 @@ PRIVATE> : interval-key? ( key map -- ? ) interval-at* nip ; : ( specification -- map ) - all-intervals ensure-disjoint - [ [ first to>> ] compare ] sort + all-intervals { } assoc-like + [ [ first to>> ] compare ] sort ensure-disjoint [ interval-node boa ] { } assoc>map interval-map boa ; + +:: coalesce ( alist -- specification ) + ! Only works with integer keys, because they're discrete + ! Makes 2array keys + [ + alist sort-keys unclip first2 dupd roll + [| oldkey oldval key val | ! Underneath is start + oldkey 1+ key = + oldval val = and + [ oldkey 2array oldval 2array , key ] unless + key val + ] assoc-each [ 2array ] bi@ , + ] { } make ; diff --git a/extra/lcs/authors.txt b/extra/lcs/authors.txt new file mode 100755 index 0000000000..504363d316 --- /dev/null +++ b/extra/lcs/authors.txt @@ -0,0 +1 @@ +Daniel Ehrenberg diff --git a/extra/lcs/lcs-docs.factor b/extra/lcs/lcs-docs.factor new file mode 100755 index 0000000000..49e46c7641 --- /dev/null +++ b/extra/lcs/lcs-docs.factor @@ -0,0 +1,35 @@ +USING: help.syntax help.markup ; +IN: lcs + +HELP: levenshtein +{ $values { "old" "a sequence" } { "new" "a sequence" } { "n" "the Levenshtein distance" } } +{ $description "Calculates the Levenshtein distance between old and new, that is, the minimal number of changes from the old sequence to the new one, in terms of deleting, inserting and replacing characters." } ; + +HELP: lcs +{ $values { "seq1" "a sequence" } { "seq2" "a sequence" } { "lcs" "a longest common subsequence" } } +{ $description "Given two sequences, calculates a longest common subsequence between them. Note two things: this is only one of the many possible LCSs, and the LCS may not be contiguous." } ; + +HELP: diff +{ $values { "old" "a sequence" } { "new" "a sequence" } { "diff" "an edit script" } } +{ $description "Given two sequences, find a minimal edit script from the old to the new. There may be more than one minimal edit script, and this chooses one arbitrarily. This script is in the form of an array of the tuples of the classes " { $link retain } ", " { $link delete } " and " { $link insert } " which have their information stored in the 'item' slot." } ; + +HELP: retain +{ $class-description "Represents an action in an edit script where an item is kept, going from the initial sequence to the final sequence. This has one slot, called item, containing the thing which is retained" } ; + +HELP: delete +{ $class-description "Represents an action in an edit script where an item is deleted, going from the initial sequence to the final sequence. This has one slot, called item, containing the thing which is deleted" } ; + +HELP: insert +{ $class-description "Represents an action in an edit script where an item is added, going from the initial sequence to the final sequence. This has one slot, called item, containing the thing which is inserted" } ; + +ARTICLE: "lcs" "LCS, Diffing and Distance" +"This vocabulary provides words for three apparently unrelated but in fact very similar problems: finding a longest common subsequence between two sequences, getting a minimal edit script (diff) between two sequences, and calculating the Levenshtein distance between two sequences. The implementations of these algorithms are very closely related, and all running times are O(nm), where n and m are the lengths of the input sequences." +{ $subsection lcs } +{ $subsection diff } +{ $subsection levenshtein } +"The " { $link diff } " word returns a sequence of tuples of the following classes. They all hold their contents in the 'item' slot." +{ $subsection insert } +{ $subsection delete } +{ $subsection retain } ; + +ABOUT: "lcs" diff --git a/extra/lcs/lcs-tests.factor b/extra/lcs/lcs-tests.factor new file mode 100755 index 0000000000..3aa10a0687 --- /dev/null +++ b/extra/lcs/lcs-tests.factor @@ -0,0 +1,25 @@ +! Copyright (C) 2006 Slava Pestov. +! See http://factorcode.org/license.txt for BSD license. +USING: tools.test lcs ; + +[ 3 ] [ "sitting" "kitten" levenshtein ] unit-test +[ 3 ] [ "kitten" "sitting" levenshtein ] unit-test +[ 1 ] [ "freshpak" "freshpack" levenshtein ] unit-test +[ 1 ] [ "freshpack" "freshpak" levenshtein ] unit-test + +[ "hell" ] [ "hello" "hell" lcs ] unit-test +[ "hell" ] [ "hell" "hello" lcs ] unit-test +[ "ell" ] [ "ell" "hell" lcs ] unit-test +[ "ell" ] [ "hell" "ell" lcs ] unit-test +[ "abd" ] [ "faxbcd" "abdef" lcs ] unit-test + +[ { + T{ delete f CHAR: f } + T{ retain f CHAR: a } + T{ delete f CHAR: x } + T{ retain f CHAR: b } + T{ delete f CHAR: c } + T{ retain f CHAR: d } + T{ insert f CHAR: e } + T{ insert f CHAR: f } +} ] [ "faxbcd" "abdef" diff ] unit-test diff --git a/extra/lcs/lcs.factor b/extra/lcs/lcs.factor new file mode 100755 index 0000000000..cdebfc4325 --- /dev/null +++ b/extra/lcs/lcs.factor @@ -0,0 +1,97 @@ +USING: sequences kernel math locals math.order math.ranges +accessors combinators.lib arrays namespaces combinators ; +IN: lcs + +r [ 1+ ] bi@ r> min min ; + +: lcs-step ( insert delete change same? -- next ) + 1 -9999 ? + max max ; ! Replace -9999 with -inf when added + +:: loop-step ( i j matrix old new step -- ) + i j 1+ matrix nth nth ! insertion + i 1+ j matrix nth nth ! deletion + i j matrix nth nth ! replace/retain + i old nth j new nth = ! same? + step call + i 1+ j 1+ matrix nth set-nth ; inline + +: lcs-initialize ( |str1| |str2| -- matrix ) + [ drop 0 ] with map ; + +: levenshtein-initialize ( |str1| |str2| -- matrix ) + [ [ + ] curry map ] with map ; + +:: run-lcs ( old new init step -- matrix ) + [let | matrix [ old length 1+ new length 1+ init call ] | + old length [0,b) [| i | + new length [0,b) + [| j | i j matrix old new step loop-step ] + each + ] each matrix ] ; inline +PRIVATE> + +: levenshtein ( old new -- n ) + [ levenshtein-initialize ] [ levenshtein-step ] + run-lcs peek peek ; + +TUPLE: retain item ; +TUPLE: delete item ; +TUPLE: insert item ; + +> 1- ] [ old>> ] bi nth ; + +: new-nth ( state -- elt ) + [ j>> 1- ] [ new>> ] bi nth ; + +: top-beats-side? ( state -- ? ) + [ [ i>> ] [ j>> 1- ] [ table>> ] tri nth nth ] + [ [ i>> 1- ] [ j>> ] [ table>> ] tri nth nth ] bi > ; + +: retained? ( state -- ? ) + { + [ i>> 0 > ] [ j>> 0 > ] + [ [ old-nth ] [ new-nth ] bi = ] + } <-&& ; + +: do-retain ( state -- state ) + dup old-nth retain boa , + [ 1- ] change-i [ 1- ] change-j ; + +: inserted? ( state -- ? ) + [ j>> 0 > ] + [ [ i>> zero? ] [ top-beats-side? ] or? ] and? ; + +: do-insert ( state -- state ) + dup new-nth insert boa , [ 1- ] change-j ; + +: deleted? ( state -- ? ) + [ i>> 0 > ] + [ [ j>> zero? ] [ top-beats-side? not ] or? ] and? ; + +: do-delete ( state -- state ) + dup old-nth delete boa , [ 1- ] change-i ; + +: (trace-diff) ( state -- ) + { + { [ dup retained? ] [ do-retain (trace-diff) ] } + { [ dup inserted? ] [ do-insert (trace-diff) ] } + { [ dup deleted? ] [ do-delete (trace-diff) ] } + [ drop ] ! i=j=0 + } cond ; + +: trace-diff ( old new table -- diff ) + [ ] [ first length 1- ] [ length 1- ] tri trace-state boa + [ (trace-diff) ] { } make reverse ; +PRIVATE> + +: diff ( old new -- diff ) + 2dup [ lcs-initialize ] [ lcs-step ] run-lcs trace-diff ; + +: lcs ( seq1 seq2 -- lcs ) + [ diff [ retain? ] filter ] keep [ item>> ] swap map-as ; diff --git a/extra/lcs/summary.txt b/extra/lcs/summary.txt new file mode 100755 index 0000000000..9e70fd7e63 --- /dev/null +++ b/extra/lcs/summary.txt @@ -0,0 +1 @@ +Levenshtein distance and diff between sequences diff --git a/extra/lcs/tags.txt b/extra/lcs/tags.txt new file mode 100755 index 0000000000..4d914f4c46 --- /dev/null +++ b/extra/lcs/tags.txt @@ -0,0 +1 @@ +algorithms diff --git a/extra/levenshtein/authors.txt b/extra/levenshtein/authors.txt deleted file mode 100644 index 1901f27a24..0000000000 --- a/extra/levenshtein/authors.txt +++ /dev/null @@ -1 +0,0 @@ -Slava Pestov diff --git a/extra/levenshtein/levenshtein-tests.factor b/extra/levenshtein/levenshtein-tests.factor deleted file mode 100644 index 722ccb86ca..0000000000 --- a/extra/levenshtein/levenshtein-tests.factor +++ /dev/null @@ -1,9 +0,0 @@ -! Copyright (C) 2006 Slava Pestov. -! See http://factorcode.org/license.txt for BSD license. -IN: levenshtein.tests -USING: tools.test levenshtein ; - -[ 3 ] [ "sitting" "kitten" levenshtein ] unit-test -[ 3 ] [ "kitten" "sitting" levenshtein ] unit-test -[ 1 ] [ "freshpak" "freshpack" levenshtein ] unit-test -[ 1 ] [ "freshpack" "freshpak" levenshtein ] unit-test diff --git a/extra/levenshtein/levenshtein.factor b/extra/levenshtein/levenshtein.factor deleted file mode 100644 index 07731bfb84..0000000000 --- a/extra/levenshtein/levenshtein.factor +++ /dev/null @@ -1,47 +0,0 @@ -! Copyright (C) 2006 Slava Pestov. -! See http://factorcode.org/license.txt for BSD license. -USING: arrays help io kernel math namespaces sequences -math.order ; -IN: levenshtein - -: ( m n -- matrix ) - [ drop 0 ] with map ; inline - -: matrix-> nth nth ; inline -: ->matrix nth set-nth ; inline - -SYMBOL: d - -: ->d ( n i j -- ) d get ->matrix ; inline -: d-> ( i j -- n ) d get matrix-> ; inline - -SYMBOL: costs - -: init-d ( str1 str2 -- ) - [ length 1+ ] bi@ 2dup d set - [ 0 over ->d ] each - [ dup 0 ->d ] each ; inline - -: compute-costs ( str1 str2 -- ) - swap [ - [ = 0 1 ? ] with { } map-as - ] curry { } map-as costs set ; inline - -: levenshtein-step ( i j -- ) - [ 1+ d-> 1+ ] 2keep - [ >r 1+ r> d-> 1+ ] 2keep - [ d-> ] 2keep - [ costs get matrix-> + min min ] 2keep - >r 1+ r> 1+ ->d ; inline - -: levenshtein-result ( -- n ) d get peek peek ; inline - -: levenshtein ( str1 str2 -- n ) - [ - 2dup init-d - 2dup compute-costs - [ length ] bi@ [ - [ levenshtein-step ] curry each - ] with each - levenshtein-result - ] with-scope ; diff --git a/extra/levenshtein/summary.txt b/extra/levenshtein/summary.txt deleted file mode 100644 index 583669a8b0..0000000000 --- a/extra/levenshtein/summary.txt +++ /dev/null @@ -1 +0,0 @@ -Levenshtein edit distance algorithm diff --git a/extra/unicode/script/script.factor b/extra/unicode/script/script.factor index 14fba46c4d..d0bb4ac30d 100755 --- a/extra/unicode/script/script.factor +++ b/extra/unicode/script/script.factor @@ -1,12 +1,12 @@ USING: unicode.syntax.backend kernel sequences assocs io.files io.encodings ascii math.ranges io splitting math.parser namespaces byte-arrays locals math sets io.encodings.ascii -words compiler.units ; +words compiler.units arrays interval-maps ; IN: unicode.script num-table -VALUE: num>name-table +VALUE: script-table +SYMBOL: interned : parse-script ( stream -- assoc ) ! assoc is code point/range => name @@ -14,26 +14,18 @@ VALUE: num>name-table ";" split1 [ [ blank? ] trim ] bi@ ] H{ } map>assoc ; -: set-if ( value var -- ) - dup 500000 < [ set ] [ 2drop ] if ; +: range, ( value key -- ) + swap interned get + [ word-name = ] with find nip 2array , ; -: expand-ranges ( assoc -- char-assoc ) - ! char-assoc is code point => name - [ [ - CHAR: . pick member? [ - swap ".." split1 [ hex> ] bi@ [a,b] - [ set-if ] with each - ] [ swap hex> set-if ] if - ] assoc-each ] H{ } make-assoc ; - -: hash>byte-array ( hash -- byte-array ) - [ keys supremum 1+ dup ] keep - [ -rot set-nth ] with assoc-each ; - -: make-char>num ( assoc -- char>num-table ) - expand-ranges - [ num>name-table index ] assoc-map - hash>byte-array ; +: expand-ranges ( assoc -- interval-map ) + [ + [ + CHAR: . pick member? [ + swap ".." split1 [ hex> ] bi@ 2array + ] [ swap hex> ] if range, + ] assoc-each + ] { } make ; : >symbols ( strings -- symbols ) [ @@ -41,9 +33,9 @@ VALUE: num>name-table ] with-compilation-unit ; : process-script ( ranges -- ) - [ values prune \ num>name-table set-value ] - [ make-char>num \ char>num-table set-value ] bi - num>name-table >symbols \ num>name-table set-value ; + dup values prune >symbols interned [ + expand-ranges \ script-table set-value + ] with-variable ; : load-script ( -- ) "resource:extra/unicode/script/Scripts.txt" @@ -52,5 +44,7 @@ VALUE: num>name-table load-script PRIVATE> +SYMBOL: Unknown + : script-of ( char -- script ) - char>num-table nth num>name-table nth ; + script-table interval-at [ Unknown ] unless* ;