machine-learning.decision-trees: vocab for creating decision trees

not done
2018-02-08 19:47:00 +01:00 · 2018-02-08 19:47:00 +01:00 · 6dac2075df
parent 7b2c2a93f5
commit 6dac2075df
2 changed files with 49 additions and 0 deletions
--- a/extra/machine-learning/decision-trees/decision-trees-tests.factor
+++ b/extra/machine-learning/decision-trees/decision-trees-tests.factor
@ -0,0 +1,16 @@
 ! Copyright (C) 2018 Björn Lindqvist
 ! See http://factorcode.org/license.txt for BSD license
 USING: kernel machine-learning.data-sets
 machine-learning.decision-trees math.extras sequences tools.test ;
 IN: machine-learning.decision-trees.tests
 ! Computes average-gain for each of the attribute indices 0..5 of the
 ! "monks-1.train" data set and compares the values, rounded to two
 ! decimals, against the expected gains.
 { { 0.08 0.01 0.0 0.03 0.29 0.0 } } [
    "monks-1.train" load-monks
    6 <iota> [
        average-gain 2 round-to-decimal
    ] with map
 ] unit-test
 ! The attribute with the largest gain on "monks-1.train" is expected
 ! at index 4, which is the 0.29 entry in the per-index gain test.
 { 4 } [
    "monks-1.train" load-monks highest-gain-index
 ] unit-test
--- a/extra/machine-learning/decision-trees/decision-trees.factor
+++ b/extra/machine-learning/decision-trees/decision-trees.factor
@ -0,0 +1,33 @@
 ! Copyright (C) 2018 Björn Lindqvist
 ! See http://factorcode.org/license.txt for BSD license
 USING: accessors assocs fry grouping.extras kernel locals math
 math.functions math.statistics sequences sequences.extras sorting ;
 IN: machine-learning.decision-trees
 ! Entropy of the empirical distribution of the elements of seq,
 ! expressed in base 2 (bits), the conventional unit for information
 ! gain. Dividing by "2 log" is a change of logarithm base; it
 ! presumes that entropy from math.statistics works with the natural
 ! logarithm. The base only scales the result by a constant, so it
 ! never changes which attribute ends up with the highest gain.
 : entropy2 ( seq -- e )
    normalized-histogram values
    entropy 2 log / ;
 ! Sorts seq by the key that quot computes for each element and then
 ! hands the sorted sequence, together with the same quot, to
 ! group-by. Sorting first places elements with equal keys next to
 ! each other before they are grouped.
 : group-by-sorted ( seq quot: ( elt -- key ) -- groups )
    dup [ sort-with ] dip group-by ; inline
 ! data-target is a sequence of { row target } pairs and idx selects
 ! one element of each row. The result is the sum, over the groups of
 ! pairs sharing the same value at idx, of the group's size times the
 ! base-2 entropy of the group's targets.
 : subsets-weighted-entropy ( data-target idx -- seq )
    ! Partition the pairs on the value found at idx in each row.
    '[ _ swap first nth ] group-by-sorted
    ! Keep only the target of every pair inside each group.
    [ [ second ] map ] assoc-map values
    ! Weight each group's entropy by the number of members and add up.
    [ [ length ] [ entropy2 ] bi * ] map-sum ; inline
 ! Gain obtained by splitting dataset on the attribute at idx: the
 ! base-2 entropy of all targets minus the weighted entropy of the
 ! subsets produced by the split, averaged over the number of rows.
 ! Reads the data and target slots of dataset.
 :: average-gain ( dataset idx -- gain )
    dataset [ data>> ] [ target>> ] bi :> ( rows labels )
    ! Weighted subset entropy divided by the number of rows.
    rows labels zip idx subsets-weighted-entropy
    rows length / :> remainder
    labels entropy2 remainder - ;
 ! Index of the attribute with the largest average-gain. One index is
 ! tried per entry in the feature-names slot of dataset.
 : highest-gain-index ( dataset -- idx )
    [ feature-names>> length <iota> ] keep
    '[ _ swap average-gain ] map arg-max ;