machine-learning.rebalancing: Add a way to rebalance an X,y dataset to N samples with an equal or skewed distribution.

db4
Doug Coleman 2012-11-17 12:38:52 -08:00
parent 6e48f8ab15
commit 0a9774248f
3 changed files with 62 additions and 0 deletions

View File

@ -0,0 +1 @@
Doug Coleman

View File

@ -0,0 +1,19 @@
! Copyright (C) 2012 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: assocs kernel machine-learning.rebalancing math
math.statistics math.text.english sequences tools.test
math.vectors ;
IN: machine-learning.rebalancing.tests
[ t ] [
{ 1 1 1 2 } [ [ number>text ] map ] [ ] bi
100,000 balance-labels nip
histogram values first2 - abs 3,000 <
] unit-test
[ t ] [
{ 1 1 1 2 } [ [ number>text ] map ] [ ] bi
{ 1/10 9/10 } 100,000 skew-labels nip
histogram values { 10,000 90,000 } -.05 v~
] unit-test

View File

@ -0,0 +1,42 @@
! Copyright (C) 2012 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: arrays assocs fry kernel math math.functions
math.statistics memoize random sequences sorting ;
IN: machine-learning.rebalancing
ERROR: probability-sum-not-one seq ;
: check-probabilities ( seq -- seq )
dup sum 1.0 .00000000001 ~ [ probability-sum-not-one ] unless ;
: equal-probabilities ( n -- array )
dup recip <array> ; inline
MEMO: probabilities-seq ( seq -- seq' )
check-probabilities [ >float ] map cum-sum ;
: probabilities-quot ( seq -- quot )
probabilities-seq
'[ _ random-unit '[ _ > ] find drop ] ; inline
: stratified-sample ( stratified-sequences probability-sequence -- elt )
probabilities-quot call swap nth random ; inline
: balance-labels ( X y n -- X' y' )
[
dup [ ] collect-index-by
values dup length equal-probabilities
'[
_ _ _ _ stratified-sample
'[ _ swap nth ] bi@ 2array
]
] dip swap replicate [ keys ] [ values ] bi ;
: skew-labels ( X y probs n -- X' y' )
[
[ dup [ ] collect-index-by sort-keys values ] dip
'[
_ _ _ _ stratified-sample
'[ _ swap nth ] bi@ 2array
]
] dip swap replicate [ keys ] [ values ] bi ;