from sklearn.datasets import load_iris
from sklearn.datasets import load_breast_cancer
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTEN
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import KMeansSMOTE
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from imblearn.under_sampling import AllKNN
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import OneSidedSelection
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.under_sampling import InstanceHardnessThreshold
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import BaggingClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import RUSBoostClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
The iris dataset is fairly balanced, while the breast cancer dataset is imbalanced, so the two make good sample data.
X, y = load_iris(return_X_y=True)  # the iris dataset has three classes (0, 1, 2) and is balanced
print(X, y, X.shape, y.shape, len(X), len(y), sep="\n") # X.shape=(150, 4) y.shape=(150,) len(X)=150 len(y)=150
print(sorted(Counter(y).items())) # [(0, 50), (1, 50), (2, 50)]
X, y = load_breast_cancer(return_X_y=True)  # the breast cancer dataset has two classes (0, 1) and is imbalanced
print(X, y)
print(X.shape, y.shape) # (569, 30) (569,)
print(len(X), len(y)) # 569 569
print(sorted(Counter(y).items())) # [(0, 212), (1, 357)]
count0, count1 = 0, 0
for yy in y:
    if yy == 0:
        count0 += 1
    elif yy == 1:
        count1 += 1
print(count0, count1)  # 212 357
Random over-sampling: duplicate randomly chosen minority-class samples until the classes are balanced.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n") # (714, 30) (714,)
print(len(X_resampled), len(y_resampled)) # 714 714
print(sorted(Counter(y_resampled).items())) # [(0, 357), (1, 357)]
SMOTE: for a minority-class sample a, randomly pick one of its nearest minority neighbours b, then take a random point c on the line segment between a and b as the new minority-class sample (a minimal sketch of this interpolation follows the code below).
ros = SMOTE(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n") # (714, 30) (714,)
print(len(X_resampled), len(y_resampled)) # 714 714
print(sorted(Counter(y_resampled).items())) # [(0, 357), (1, 357)]
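The interpolation rule itself is simple; here is a minimal sketch of it (the variable names are illustrative, not imblearn internals):
import numpy as np

rng = np.random.default_rng(42)
a = np.array([1.0, 2.0])   # a minority-class sample
b = np.array([3.0, 1.0])   # one of its nearest minority-class neighbours
lam = rng.uniform(0, 1)    # random interpolation factor in [0, 1)
c = a + lam * (b - a)      # the new synthetic sample lies on the segment between a and b
print(c)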
If the dataset consists solely of categorical features, SMOTEN can be used.
ros = SMOTEN(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n") # (714, 30) (714,)
print(len(X_resampled), len(y_resampled)) # 714 714
print(sorted(Counter(y_resampled).items())) # [(0, 357), (1, 357)]
SMOTENC is the SMOTE variant that can handle categorical features.
SMOTENC only applies when the data contains both numerical and categorical features (a sketch on genuinely mixed data follows the code below).
ros = SMOTENC(categorical_features=[18, 19], random_state=42)  # features 18 and 19 are treated as categorical purely for demonstration; all breast cancer features are actually continuous
X_resampled, y_resampled = ros.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n") # (714, 30) (714,)
print(len(X_resampled), len(y_resampled)) # 714 714
print(sorted(Counter(y_resampled).items())) # [(0, 357), (1, 357)]
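Since every breast cancer feature is continuous, the example above is only mechanical. Here is a hedged sketch on a small synthetic dataset that genuinely mixes one numeric and one categorical column (the data and counts are made up for illustration):
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTENC

rng = np.random.default_rng(42)
X_mixed = np.empty((60, 2), dtype=object)
X_mixed[:, 0] = rng.normal(size=60)                            # numeric feature
X_mixed[:, 1] = rng.choice(["red", "green", "blue"], size=60)  # categorical feature
y_mixed = np.array([0] * 50 + [1] * 10)                        # imbalanced: 50 vs 10

sm = SMOTENC(categorical_features=[1], random_state=42)  # column 1 is categorical
X_res, y_res = sm.fit_resample(X_mixed, y_mixed)
print(sorted(Counter(y_res).items()))  # expect [(0, 50), (1, 50)]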
Because SMOTE picks minority-class samples at random and ignores their surroundings, two problems can arise:
the chosen minority sample may be surrounded by other minority samples, in which case the synthesized sample adds little useful information;
the chosen minority sample may be surrounded by majority samples, in which case it is likely noise and the synthesized sample may overlap the nearby majority class.
=========================================================
BorderlineSMOTE therefore improves on SMOTE by synthesizing new samples only from minority samples that lie on the class boundary. The algorithm first partitions all minority samples into three groups (a sketch of this partition follows the code below):
noise: all K nearest neighbours of the minority sample belong to the majority class;
danger: half or more of the K nearest neighbours belong to the majority class;
safe: more than half of the K nearest neighbours belong to the minority class.
=========================================================
BorderlineSMOTE then selects randomly only from samples in the "danger" group and applies SMOTE to generate new samples.
"Danger" samples are minority samples that lie near the class boundary, and samples near the boundary are the ones most easily misclassified.
Borderline-SMOTE therefore synthesizes samples only for minority samples near the boundary, whereas plain SMOTE treats all minority samples alike.
=========================================================
Border-line SMOTE comes in two variants: Borderline-1 SMOTE and Borderline-2 SMOTE.
In Borderline-1 SMOTE, the neighbour x^ used in the interpolation formula is a minority-class sample.
In Borderline-2 SMOTE, x^ may be any of the k nearest neighbours, including majority-class samples.
ros = BorderlineSMOTE(kind="borderline-1", random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n") # (714, 30) (714,)
print(len(X_resampled), len(y_resampled)) # 714 714
print(sorted(Counter(y_resampled).items())) # [(0, 357), (1, 357)]
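The noise/danger/safe partition can be reproduced with a few lines of neighbour counting. A minimal sketch, assuming the breast cancer X, y from above (it mimics the definitions, it is not BorderlineSMOTE's internal code):
import numpy as np
from collections import Counter
from sklearn.neighbors import NearestNeighbors

def borderline_partition(X, y, minority_label=0, k=5):
    # Count majority-class samples among each minority sample's k nearest neighbours.
    nn = NearestNeighbors(n_neighbors=k + 1).fit(X)  # +1 because the sample itself is returned
    labels = {}
    for i in np.flatnonzero(y == minority_label):
        neigh = nn.kneighbors(X[i].reshape(1, -1), return_distance=False)[0][1:]
        n_maj = np.sum(y[neigh] != minority_label)
        if n_maj == k:
            labels[i] = "noise"    # every neighbour is majority class
        elif n_maj >= k / 2:
            labels[i] = "danger"   # half or more are majority class
        else:
            labels[i] = "safe"     # more than half are minority class
    return labels

print(Counter(borderline_partition(X, y, minority_label=0).values()))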
SVMSMOTE first fits a support vector machine to find the support vectors, then uses SMOTE to synthesize new minority-class samples near them.
ros = SVMSMOTE(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n") # (714, 30) (714,)
print(len(X_resampled), len(y_resampled)) # 714 714
print(sorted(Counter(y_resampled).items())) # [(0, 357), (1, 357)]
How KMeansSMOTE works: apply KMeans clustering before oversampling with SMOTE.
KMeansSMOTE consists of three steps: clustering, filtering, and oversampling.
The clustering step partitions the data into k groups with k-means. The filtering step selects the clusters to oversample, keeping those with a high proportion of minority-class samples.
It then allocates the number of synthetic samples per cluster, assigning more to clusters where minority samples are sparsely distributed.
Finally, the oversampling step applies SMOTE within each selected cluster to reach the target ratio of minority to majority instances (the knobs controlling these steps are shown after the code below).
ros = KMeansSMOTE(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n") # (719, 30) (719,)
print(len(X_resampled), len(y_resampled)) # 719 719
print(sorted(Counter(y_resampled).items())) # [(0, 362), (1, 357)]
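The three steps can be controlled explicitly through the constructor; a hedged sketch with illustrative values (KMeansSMOTE raises an error if no cluster passes the filter, in which case cluster_balance_threshold needs loosening):
from collections import Counter
from sklearn.cluster import KMeans
from imblearn.over_sampling import KMeansSMOTE

kms = KMeansSMOTE(
    kmeans_estimator=KMeans(n_clusters=10, random_state=42),  # the clustering step
    cluster_balance_threshold=0.1,  # the filtering step: minimum minority share a cluster needs
    random_state=42,
)
X_res, y_res = kms.fit_resample(X, y)
print(sorted(Counter(y_res).items()))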
ADASYN (adaptive synthetic oversampling) focuses on generating new minority-class samples next to the original samples that a K-nearest-neighbour classifier misclassifies.
ros = ADASYN(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n") # (715, 30) (715,)
print(len(X_resampled), len(y_resampled)) # 715 715
print(sorted(Counter(y_resampled).items())) # [(0, 358), (1, 357)]
RandomUnderSampler: random under-sampling of the majority class.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n") # (424, 30) (424,)
print(len(X_resampled), len(y_resampled)) # 424 424
print(sorted(Counter(y_resampled).items())) # [(0, 212), (1, 212)]
ClusterCentroids under-samples a class by replacing its samples with centroids synthesized by the K-means algorithm.
rus = ClusterCentroids(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n") # (424, 30) (424,)
print(len(X_resampled), len(y_resampled)) # 424 424
print(sorted(Counter(y_resampled).items())) # [(0, 212), (1, 212)]
NearMiss adds heuristic rules for choosing which majority samples to keep, selected via version=1, 2, or 3. Versions 1 and 2 both give (424, 30) (424,) 424 424 [(0, 212), (1, 212)]; version 3 gives a different result, shown in the comments below (a comparison loop follows the code block).
rus = NearMiss(version=3)
X_resampled, y_resampled = rus.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n") # (277, 30) (277,)
print(len(X_resampled), len(y_resampled)) # 277 277
print(sorted(Counter(y_resampled).items())) # [(0, 212), (1, 65)]
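To see all three heuristics side by side, the same data can be run through each version in a loop:
for v in (1, 2, 3):
    nm = NearMiss(version=v)
    Xr, yr = nm.fit_resample(X, y)
    print(v, Xr.shape, sorted(Counter(yr).items()))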
EditedNearestNeighbours applies a nearest-neighbour rule to "edit" the dataset, removing samples that disagree with their neighbourhood.
enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n")
print(len(X_resampled), len(y_resampled))
print(sorted(Counter(y_resampled).items()))  # class 0 stays at 212; class 1 shrinks by however many samples the edit removes
RepeatedEditedNearestNeighbours repeats the basic EditedNearestNeighbours algorithm several times.
renn = RepeatedEditedNearestNeighbours()
X_resampled, y_resampled = renn.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n") # (520, 30) (520,)
print(len(X_resampled), len(y_resampled)) # 520 520
print(sorted(Counter(y_resampled).items())) # [(0, 212), (1, 308)]
AllKNN increases the number of nearest neighbours at each iteration of the cleaning.
allknn = AllKNN()
X_resampled, y_resampled = allknn.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n")
print(len(X_resampled), len(y_resampled))
print(sorted(Counter(y_resampled).items()))  # as with ENN/RENN, only the majority class shrinks
CondensedNearestNeighbour iterates with a 1-nearest-neighbour rule to decide whether each sample should be kept or removed. The steps (a sketch follows the code block below):
Put all minority-class samples into a set C; move one sample of the target (to-be-under-sampled) class into C and the remaining samples of that class into a set S. Train a 1-NN classifier on C and classify every sample in S. Add the misclassified samples of S to C. Repeat until no more samples are added to C.
cnn = CondensedNearestNeighbour(random_state=42)
X_resampled, y_resampled = cnn.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n") # (262, 30) (262,)
print(len(X_resampled), len(y_resampled)) # 262 262
print(sorted(Counter(y_resampled).items())) # [(0, 212), (1, 50)]
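A minimal sketch of that iteration, assuming the breast cancer X, y from above (illustrative only, not imblearn's implementation; condensed_nn is a made-up helper name):
import numpy as np
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier

def condensed_nn(X, y, target_label=1, random_state=42):
    rng = np.random.RandomState(random_state)
    C = list(np.flatnonzero(y != target_label))  # keep every sample of the other class
    S = list(np.flatnonzero(y == target_label))  # the class to condense
    C.append(S.pop(rng.randint(len(S))))         # seed C with one target-class sample
    added = True
    while added:                                 # repeat until a full pass adds nothing
        added = False
        knn = KNeighborsClassifier(n_neighbors=1).fit(X[C], y[C])
        for i in list(S):
            if knn.predict(X[i].reshape(1, -1))[0] != y[i]:
                C.append(i)                      # misclassified -> move into C
                S.remove(i)
                added = True
    return X[C], y[C]

Xr, yr = condensed_nn(X, y, target_label=1)      # condense the majority class (label 1)
print(sorted(Counter(yr).items()))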
OneSidedSelection uses the TomekLinks method to remove noisy majority-class samples.
oss = OneSidedSelection(random_state=42)
X_resampled, y_resampled = oss.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n") # (495, 30) (495,)
print(len(X_resampled), len(y_resampled)) # 495 495
print(sorted(Counter(y_resampled).items())) # [(0, 212), (1, 283)]
NeighbourhoodCleaningRule focuses on cleaning the data rather than condensing it;
the algorithm therefore removes the union of the samples rejected by EditedNearestNeighbours and the samples misclassified by a 3-NN classifier.
ncr = NeighbourhoodCleaningRule()
X_resampled, y_resampled = ncr.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n") # (541, 30) (541,)
print(len(X_resampled), len(y_resampled)) # 541 541
print(sorted(Counter(y_resampled).items())) # [(0, 212), (1, 329)]
InstanceHardnessThreshold is a rather special method: it fits a classifier on the data and removes the samples whose predicted class probability falls below a threshold (a sketch with an explicit estimator follows the code block below).
iht = InstanceHardnessThreshold(random_state=42)
X_resampled, y_resampled = iht.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n") # (427, 30) (427,)
print(len(X_resampled), len(y_resampled)) # 427 427
print(sorted(Counter(y_resampled).items())) # [(0, 212), (1, 215)]
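The classifier that scores the "hardness" can be supplied explicitly; a hedged sketch (the choice of RandomForestClassifier here is illustrative):
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import InstanceHardnessThreshold

# Majority samples to which the estimator assigns a low probability for
# their own class are treated as "hard" and removed.
iht = InstanceHardnessThreshold(
    estimator=RandomForestClassifier(n_estimators=100, random_state=42),
    random_state=42,
)
X_res, y_res = iht.fit_resample(X, y)
print(sorted(Counter(y_res).items()))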
Combining over-sampling with under-sampling: SMOTEENN applies SMOTE, then cleans the result with EditedNearestNeighbours.
smoteenn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smoteenn.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n") # (621, 30) (621,)
print(len(X_resampled), len(y_resampled)) # 621 621
print(sorted(Counter(y_resampled).items())) # [(0, 307), (1, 314)]
Combining over-sampling with under-sampling: SMOTETomek applies SMOTE, then removes Tomek links.
smotetomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smotetomek.fit_resample(X, y)
print(X_resampled, y_resampled, X_resampled.shape, y_resampled.shape, sep="\n") # (696, 30) (696,)
print(len(X_resampled), len(y_resampled)) # 696 696
print(sorted(Counter(y_resampled).items())) # [(0, 348), (1, 348)]
The X and y here are still the breast cancer dataset from load_breast_cancer.
trainX, testX, trainY, testY = train_test_split(X, y, random_state=42)
BaggingClassifier does not balance each bootstrap subset, so when trained on imbalanced data the classifier is biased toward the majority class.
bc = BaggingClassifier(estimator=DecisionTreeClassifier(), random_state=42)  # on sklearn < 1.2 this parameter is named base_estimator
bc.fit(trainX, trainY)
preY = bc.predict(testX)
print(preY)
print(confusion_matrix(testY, preY))
BalancedBaggingClassifier resamples each subset before training each base learner;
in short, it combines an EasyEnsemble-style sampler with a bagging classifier such as BaggingClassifier.
Its parameters are the same as sklearn's BaggingClassifier, plus two extras that control the random under-sampling: sampling_strategy and replacement (shown explicitly after the code block below).
bbc = BalancedBaggingClassifier(estimator=DecisionTreeClassifier(), random_state=42)  # on older imblearn versions this parameter is named base_estimator
bbc.fit(trainX, trainY)
preY = bbc.predict(testX)
print(preY)
print(confusion_matrix(testY, preY))
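The two extra parameters can be set explicitly; the values below are only illustrative:
bbc = BalancedBaggingClassifier(
    estimator=DecisionTreeClassifier(),
    sampling_strategy="auto",  # resample every bootstrap subset down to a 1:1 class ratio
    replacement=False,         # under-sample without replacement
    random_state=42,
)
bbc.fit(trainX, trainY)
print(confusion_matrix(testY, bbc.predict(testX)))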
RUSBoostClassifier performs a random under-sampling step before each boosting iteration.
rbc = RUSBoostClassifier(estimator=DecisionTreeClassifier(), random_state=42)  # on older imblearn versions this parameter is named base_estimator
rbc.fit(trainX, trainY)
preY = rbc.predict(testX)
print(preY)
print(confusion_matrix(testY, preY))
EasyEnsemble builds an ensemble by repeatedly random-under-sampling the original dataset.
EasyEnsembleClassifier uses AdaBoost as its base learner:
each round computes the weak learner's error rate, assigns larger weights to misclassified samples and smaller weights to correctly classified ones; any weak learner with accuracy above 0.5 can join the final ensemble, and the higher a weak learner's accuracy, the larger its weight (a numeric sketch of this update follows the code block below).
eec = EasyEnsembleClassifier(random_state=42)  # the default base learner is AdaBoostClassifier, matching the description above
eec.fit(trainX, trainY)
preY = eec.predict(testX)
print(preY)
print(confusion_matrix(testY, preY))
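The weight update described above, as a small numeric sketch of one AdaBoost round (the sample values are made up):
import numpy as np

w = np.ones(5) / 5                                  # initial sample weights
miss = np.array([False, True, False, False, True])  # which samples the weak learner got wrong
err = w[miss].sum()                                 # weighted error rate, must be < 0.5
alpha = 0.5 * np.log((1 - err) / err)               # learner weight: the more accurate, the larger
w *= np.exp(np.where(miss, alpha, -alpha))          # up-weight errors, down-weight correct samples
w /= w.sum()                                        # renormalise to a distribution
print(err, alpha, w)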
BalancedRandomForestClassifier uses a balanced bootstrap subset of the data when building each tree.
brfc = BalancedRandomForestClassifier(random_state=42)
brfc.fit(trainX, trainY)
preY = brfc.predict(testX)
print(preY)
print(confusion_matrix(testY, preY))
Thanks for reading and for your support; I hope this article was useful to you.
If anything is lacking, corrections are welcome!