factor/extra/machine-learning/rebalancing/rebalancing.factor

51 lines
1.5 KiB
Factor

! Copyright (C) 2012 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: arrays assocs fry kernel math math.functions
math.statistics memoize random sequences sorting ;
IN: machine-learning.rebalancing
ERROR: probability-sum-not-one seq ;
: check-probabilities ( seq -- seq )
dup sum 1.0 .00000000001 ~ [ probability-sum-not-one ] unless ;
: equal-probabilities ( n -- array )
dup recip <array> ; inline
MEMO: probabilities-seq ( seq -- seq' )
check-probabilities [ >float ] map cum-sum ;
: probabilities-quot ( seq -- quot )
probabilities-seq
'[ _ random-unit '[ _ > ] find drop ] ; inline
: stratified-sample ( stratified-sequences probability-sequence -- elt )
probabilities-quot call swap nth random ; inline
: stratified-samples ( stratified-sequences probability-sequence n -- elt )
[ '[ _ _ stratified-sample ] ] dip swap replicate ;
: equal-stratified-sample ( stratified-sequences -- elt )
random random ; inline
: collect-indices ( seq -- indices )
H{ } clone [ '[ swap _ push-at ] each-index ] keep ;
: balance-labels ( X y n -- X' y' )
[
dup collect-indices
values '[
_ _ _ equal-stratified-sample
'[ _ swap nth ] bi@ 2array
]
] dip swap replicate [ keys ] [ values ] bi ;
: skew-labels ( X y probs n -- X' y' )
[
[ dup collect-indices sort-keys values ] dip
'[
_ _ _ _ stratified-sample
'[ _ swap nth ] bi@ 2array
]
] dip swap replicate [ keys ] [ values ] bi ;