import pandas as pd
import numpy as np
# Toy dataset: 50 rows of random features with an imbalanced binary label
# (6 samples of class 0.0, 44 samples of class 1.0).
feature_names = ['a', 'b', 'c']
data = pd.DataFrame(np.random.random((50, len(feature_names))), columns=feature_names)
data['y'] = np.concatenate([np.zeros(6), np.ones(44)])
# Naive random over-sampling: replicate minority-class samples at random
# until the class ratio is 1:1.
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=0)
# fit_resample replaces fit_sample (deprecated in imbalanced-learn 0.4,
# removed in 0.6). The original duplicated this pair of lines verbatim;
# the second, identical run was redundant and has been removed.
x, y = ros.fit_resample(data[['a', 'b', 'c']], data['y'])
# SMOTE over-sampling.
# SMOTE: for a minority-class sample a, randomly pick one of its nearest
# neighbours b, then synthesise a new minority sample c at a random point
# on the line segment from a to b.
from imblearn.over_sampling import SMOTE

# NOTE: the minority class must have more samples than k_neighbors
# (class 0 has 6 samples here, so k_neighbors=5 is the largest usable value).
x, y = SMOTE(k_neighbors=5).fit_resample(data[['a', 'b', 'c']], data['y'])
# ADASYN over-sampling: concentrates on generating new minority-class
# samples next to original samples that a k-NN classifier misclassifies.
from imblearn.over_sampling import ADASYN

x, y = ADASYN().fit_resample(data[['a', 'b', 'c']], data['y'])
# Under-sampling by prototype generation: ClusterCentroids replaces each
# majority-class cluster with its k-means centroid.
from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids(random_state=0)
x, y = cc.fit_resample(data[['a', 'b', 'c']], data['y'])
# Naive random under-sampling: randomly drop majority-class samples until
# the classes are balanced.
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=0)
x, y = rus.fit_resample(data[['a', 'b', 'c']], data['y'])
typical = {0: 1, 1: 0.2}  # per-class sampling fractions: class label -> frac


def function(group, typical, random_state=None):
    """Randomly sample a per-class fraction of *group*.

    Intended for use with ``DataFrame.groupby(...).apply``; each group's
    class label arrives as ``group.name``.

    Parameters
    ----------
    group : pandas.DataFrame
        One group produced by ``groupby``.
    typical : dict
        Maps each class label to the fraction of rows to keep.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` for reproducible draws.
        ``None`` (the default) keeps the original non-deterministic
        behaviour.

    Returns
    -------
    pandas.DataFrame
        A random sample of ``group`` containing ``frac`` of its rows.
    """
    # The original body had its indentation stripped (SyntaxError); restored.
    frac = typical[group.name]
    return group.sample(frac=frac, random_state=random_state)
# Stratified down-sampling: within each 'y' group keep the fraction given by
# `typical` (all of class 0, 20% of class 1). group_keys=False keeps the rows'
# original index instead of adding the group label as an extra index level.
result = data.groupby('y', group_keys=False).apply(function, typical)
# In plain SMOTE, interpolating from samples that lie near the class boundary
# easily produces noisy points, so the over-sampled data should be cleaned
# afterwards. The TomekLinks and EditedNearestNeighbours methods (Section 3)
# do exactly that, which yields two methods combining over- and under-sampling:
# (i) SMOTEENN
# SMOTE over-sampling followed by Edited Nearest Neighbours cleaning.
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
# fit_resample replaces fit_sample (deprecated in imbalanced-learn 0.4,
# removed in 0.6).
x, y = smote_enn.fit_resample(data[['a', 'b', 'c']], data['y'])
# (ii) SMOTETomek
# SMOTE over-sampling followed by Tomek-link removal.
from imblearn.combine import SMOTETomek

smote_tomek = SMOTETomek(random_state=0)
# fit_resample replaces fit_sample (deprecated in imbalanced-learn 0.4,
# removed in 0.6).
x, y = smote_tomek.fit_resample(data[['a', 'b', 'c']], data['y'])