diff --git a/extra/machine-learning/decision-trees/decision-trees-tests.factor b/extra/machine-learning/decision-trees/decision-trees-tests.factor
new file mode 100644
index 0000000000..31d2d24970
--- /dev/null
+++ b/extra/machine-learning/decision-trees/decision-trees-tests.factor
@@ -0,0 +1,16 @@
+! Copyright (C) 2018 Björn Lindqvist
+! See http://factorcode.org/license.txt for BSD license
+USING: kernel machine-learning.data-sets
+machine-learning.decision-trees math.extras sequences tools.test ;
+IN: machine-learning.decision-trees.tests
+
+{ { 0.08 0.01 0.0 0.03 0.29 0.0 } } [
+    "monks-1.train" load-monks
+    6 <iota> [
+        average-gain 2 round-to-decimal
+    ] with map
+] unit-test
+
+{ 4 } [
+    "monks-1.train" load-monks highest-gain-index
+] unit-test
diff --git a/extra/machine-learning/decision-trees/decision-trees.factor b/extra/machine-learning/decision-trees/decision-trees.factor
new file mode 100644
index 0000000000..b843e7847f
--- /dev/null
+++ b/extra/machine-learning/decision-trees/decision-trees.factor
@@ -0,0 +1,38 @@
+! Copyright (C) 2018 Björn Lindqvist
+! See http://factorcode.org/license.txt for BSD license
+USING: accessors assocs fry grouping.extras kernel locals math
+math.functions math.statistics sequences sequences.extras sorting ;
+IN: machine-learning.decision-trees
+
+! Divide by log 2 to convert the entropy from nats to bits,
+! the customary unit when building decision trees.
+: entropy2 ( seq -- e )
+    normalized-histogram values entropy 2 log / ;
+
+! group-by only groups adjacent elements, so sort by the key first.
+: group-by-sorted ( seq quot: ( elt -- key ) -- groups )
+    [ sort-with ] keep group-by ; inline
+
+: subsets-weighted-entropy ( data-target idx -- e )
+    ! Group the data according to the value of the feature at idx.
+    '[ first _ swap nth ] group-by-sorted
+    ! Then unpack the partitioned groups of targets.
+    '[ [ second ] map ] assoc-map values
+    ! Finally, sum the entropy of each group weighted by its size.
+    [ [ entropy2 ] [ length ] bi * ] map-sum ; inline
+
+! Information gain of splitting the dataset on the feature at idx:
+! the entropy of the target minus the weighted entropy of the subsets.
+:: average-gain ( dataset idx -- gain )
+    dataset target>> :> target
+    dataset data>> :> data
+    data target zip :> data-target
+    data-target idx subsets-weighted-entropy :> weighted
+
+    target entropy2 weighted data length / - ;
+
+! Index of the feature whose split yields the highest gain.
+: highest-gain-index ( dataset -- idx )
+    dup feature-names>> length <iota> [
+        average-gain
+    ] with map arg-max ;
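
A quick sanity check of entropy2, as a minimal sketch assuming the vocabulary loads as above: a 50/50 two-class target carries exactly one bit of entropy, and a pure group carries none.

    USING: machine-learning.decision-trees prettyprint ;

    { 0 0 1 1 } entropy2 .  ! should print 1.0: a fair split is one bit
    { 0 0 0 0 } entropy2 .  ! should print 0.0: a pure group has no entropy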
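Tying the two public words together, a minimal usage sketch against the MONK's-1 training set bundled with machine-learning.data-sets; per the unit tests above, feature index 4 carries by far the most information about the target (gain of roughly 0.29 bits).

    USING: kernel machine-learning.data-sets
    machine-learning.decision-trees prettyprint sequences ;

    ! Load the training split, then print both the per-feature gains
    ! and the index of the best feature to split on first.
    "monks-1.train" load-monks
    [ 6 <iota> [ average-gain ] with map . ] ! ~{ 0.08 0.01 0.0 0.03 0.29 0.0 }
    [ highest-gain-index . ]                 ! 4
    bi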