machine-learning.rebalancing: Add a way to rebalance an X,y dataset to N samples with an equal or skewed distribution.
parent
6e48f8ab15
commit
0a9774248f
|
@ -0,0 +1 @@
|
|||
Doug Coleman
|
|
@ -0,0 +1,19 @@
|
|||
! Copyright (C) 2012 Doug Coleman.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: assocs kernel machine-learning.rebalancing math
|
||||
math.statistics math.text.english sequences tools.test
|
||||
math.vectors ;
|
||||
IN: machine-learning.rebalancing.tests
|
||||
|
||||
[ t ] [
|
||||
{ 1 1 1 2 } [ [ number>text ] map ] [ ] bi
|
||||
100,000 balance-labels nip
|
||||
histogram values first2 - abs 3,000 <
|
||||
] unit-test
|
||||
|
||||
|
||||
[ t ] [
|
||||
{ 1 1 1 2 } [ [ number>text ] map ] [ ] bi
|
||||
{ 1/10 9/10 } 100,000 skew-labels nip
|
||||
histogram values { 10,000 90,000 } -.05 v~
|
||||
] unit-test
|
|
@ -0,0 +1,42 @@
|
|||
! Copyright (C) 2012 Doug Coleman.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: arrays assocs fry kernel math math.functions
|
||||
math.statistics memoize random sequences sorting ;
|
||||
IN: machine-learning.rebalancing
|
||||
|
||||
ERROR: probability-sum-not-one seq ;
|
||||
|
||||
: check-probabilities ( seq -- seq )
|
||||
dup sum 1.0 .00000000001 ~ [ probability-sum-not-one ] unless ;
|
||||
|
||||
: equal-probabilities ( n -- array )
|
||||
dup recip <array> ; inline
|
||||
|
||||
MEMO: probabilities-seq ( seq -- seq' )
|
||||
check-probabilities [ >float ] map cum-sum ;
|
||||
|
||||
: probabilities-quot ( seq -- quot )
|
||||
probabilities-seq
|
||||
'[ _ random-unit '[ _ > ] find drop ] ; inline
|
||||
|
||||
: stratified-sample ( stratified-sequences probability-sequence -- elt )
|
||||
probabilities-quot call swap nth random ; inline
|
||||
|
||||
: balance-labels ( X y n -- X' y' )
|
||||
[
|
||||
dup [ ] collect-index-by
|
||||
values dup length equal-probabilities
|
||||
'[
|
||||
_ _ _ _ stratified-sample
|
||||
'[ _ swap nth ] bi@ 2array
|
||||
]
|
||||
] dip swap replicate [ keys ] [ values ] bi ;
|
||||
|
||||
: skew-labels ( X y probs n -- X' y' )
|
||||
[
|
||||
[ dup [ ] collect-index-by sort-keys values ] dip
|
||||
'[
|
||||
_ _ _ _ stratified-sample
|
||||
'[ _ swap nth ] bi@ 2array
|
||||
]
|
||||
] dip swap replicate [ keys ] [ values ] bi ;
|
Loading…
Reference in New Issue