

      • 一、第一种思路:平衡采样
        • 1.SMOTE算法
        • 2.SMOTE与RandomUnderSampler进行结合
        • 3.Borderline-SMOTE与SVMSMOTE
        • 4.ADASYN
        • 5.平衡采样与决策树结合
  1. 利用最近邻算法进行采样,计算出每个少数类样本的K个近邻
  2. 从K个近邻中随机挑选N个样本进行随机线性插值
  3. 构造新的少数类样本
    $New = x_i + rand(0,1) \times (y_j - x_i),\quad j = 1, 2, \dots, N$
    其中 $x_i$ 是少数类中的一个观测点,$y_j$ 是从 $x_i$ 的K个近邻中随机抽取的样本
  4. 将新样本与原数据组合,组成新的数据集

  SMOTE过采样算法实现 python有一个专门实现的库—imblearn.over_sampling.SMOTE

import pandas as pd
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

# Build a synthetic, heavily imbalanced binary dataset with
# make_classification: class 0 gets weight 0.01 and class 1 gets 0.99,
# so the class ratio is roughly 1:99.
X, y = make_classification(
    n_classes=2,
    class_sep=2,
    weights=[0.01, 0.99],
    n_informative=3,
    n_redundant=1,
    flip_y=0,
    n_features=20,
    n_clusters_per_class=1,
    n_samples=10000,
    random_state=10,
)
print('Original dataset shape %s' % Counter(y))

# Goal: oversample the minority class until majority:minority is 10:1.
# NOTE: a float sampling_strategy is the desired ratio
# N_minority / N_majority *after* resampling; it is NOT the share of the
# minority class in the whole dataset.
# n_jobs is deliberately not passed: the parameter was deprecated in
# imbalanced-learn 0.10 and dropped in later releases, and it does not
# affect the resampled result.
sm = SMOTE(sampling_strategy=0.1, random_state=10, k_neighbors=5)

X_res, y_res = sm.fit_resample(X, y)
print('Resampled dataset shape %s' % Counter(y_res))
Original dataset shape Counter({1: 9900, 0: 100})
Resampled dataset shape Counter({1: 9900, 0: 990})



from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Synthetic imbalanced dataset: class 0 has weight 0.01, class 1 has 0.99
# (roughly 1:99), this time with 100000 samples.
X, y = make_classification(n_classes=2, class_sep=2,
                           weights=[0.01, 0.99], n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1, n_samples=100000, random_state=10)
print('Original dataset shape %s' % Counter(y))

# With a very large dataset the two techniques can be combined. The
# pipeline below first oversamples the minority class with SMOTE and then
# randomly undersamples the majority class.
# For both samplers a float sampling_strategy is the desired ratio
# N_minority / N_majority after that step (not the minority share of the
# whole dataset): 0.1 -> 1:10 after SMOTE, 0.5 -> 1:2 after undersampling.
pipeline = Pipeline([
    ('over', SMOTE(sampling_strategy=0.1)),
    ('under', RandomUnderSampler(sampling_strategy=0.5)),
])  # the original snippet never closed this list/call (SyntaxError)

X_res, y_res = pipeline.fit_resample(X, y)
print('Resampled dataset shape %s' % Counter(y_res))
Original dataset shape Counter({1: 99000, 0: 1000})
Resampled dataset shape Counter({1: 19800, 0: 9900})

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import BorderlineSMOTE

# Generate the synthetic imbalanced dataset (class 0 weight 0.01,
# class 1 weight 0.99).
X, y = make_classification(
    n_samples=100000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.01, 0.99],
    class_sep=2,
    flip_y=0,
    random_state=10,
)

print('Original dataset shape %s' % (Counter(y),))

# Oversample with Borderline-SMOTE. The float sampling_strategy is the
# target minority/majority ratio after resampling. X and y are rebound to
# the resampled data.
oversample = BorderlineSMOTE(sampling_strategy=0.5)
X, y = oversample.fit_resample(X, y)
print('Resampled dataset shape %s' % (Counter(y),))
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SVMSMOTE

# Generate the synthetic imbalanced dataset (class 0 weight 0.01,
# class 1 weight 0.99).
X, y = make_classification(
    n_samples=100000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.01, 0.99],
    class_sep=2,
    flip_y=0,
    random_state=10,
)

print('Original dataset shape %s' % (Counter(y),))

# Oversample with SVM-SMOTE. The float sampling_strategy is the target
# minority/majority ratio after resampling. X and y are rebound to the
# resampled data.
oversample = SVMSMOTE(sampling_strategy=0.5)
X, y = oversample.fit_resample(X, y)
print('Resampled dataset shape %s' % (Counter(y),))
Original dataset shape Counter({1: 99000, 0: 1000})
Resampled dataset shape Counter({1: 99000, 0: 49500})



from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import ADASYN

# Generate the synthetic imbalanced dataset (class 0 weight 0.01,
# class 1 weight 0.99).
X, y = make_classification(
    n_samples=100000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.01, 0.99],
    class_sep=2,
    flip_y=0,
    random_state=10,
)
print('Original dataset shape %s' % (Counter(y),))

# Oversample the minority class with ADASYN and rebind X, y to the
# resampled data. The float sampling_strategy is the target
# minority/majority ratio.
oversample = ADASYN(sampling_strategy=0.5)
X, y = oversample.fit_resample(X, y)
print('Resampled dataset shape %s' % (Counter(y),))

Original dataset shape Counter({1: 99000, 0: 1000})
Resampled dataset shape Counter({1: 99000, 0: 49708})


from numpy import mean  # needed by the final print; missing in the original snippet
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier

# Baseline: a plain decision tree on the imbalanced data, no resampling.
# NOTE(review): this snippet uses weights=[0.01, 0.99] while the later
# pipeline snippets use weights=[0.99]; confirm whether the baseline is
# meant to be evaluated on exactly the same dataset.
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.01, 0.99], flip_y=0, random_state=1)

model = DecisionTreeClassifier()
# Repeated stratified k-fold: 10 splits x 3 repeats = 30 evaluations.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# Stratification keeps the class distribution of each fold close to that
# of the full dataset.
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))
Mean ROC AUC: 0.769


from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Synthetic two-feature imbalanced dataset (a single class weight of 0.99
# is supplied to make_classification).
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

# SMOTE with default settings followed by a decision tree. Putting the
# sampler inside the pipeline means it is applied as part of fitting in
# each cross-validation split.
steps = [
    ('over', SMOTE()),
    ('model', DecisionTreeClassifier()),
]
pipeline = Pipeline(steps=steps)

# 10 stratified folds repeated 3 times, scored with ROC AUC.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
print('Mean ROC AUC: %.3f' % (mean(scores),))
Mean ROC AUC: 0.825


from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Synthetic two-feature imbalanced dataset (a single class weight of 0.99
# is supplied to make_classification).
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

model = DecisionTreeClassifier()
# SMOTE raises the minority/majority ratio to 0.1, then random
# undersampling raises it to 0.5, i.e. majority:minority ends up at 2:1.
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [
    ('over', over),
    ('under', under),
    ('model', model),
]
pipeline = Pipeline(steps=steps)

# 10 stratified folds repeated 3 times, scored with ROC AUC.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
print('Mean ROC AUC: %.3f' % (mean(scores),))
Mean ROC AUC: 0.849


from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Synthetic two-feature imbalanced dataset (a single class weight of 0.99
# is supplied to make_classification).
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

# Candidate k_neighbors values for SMOTE: 1 through 7.
k_values = list(range(1, 8))
for k in k_values:
    # Same evaluation protocol for every k: 10 folds x 3 repeats.
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # Oversample with SMOTE(k), undersample the majority, fit a tree.
    over = SMOTE(k_neighbors=k, sampling_strategy=0.1)
    under = RandomUnderSampler(sampling_strategy=0.5)
    model = DecisionTreeClassifier()
    steps = [
        ('over', over),
        ('under', under),
        ('model', model),
    ]
    pipeline = Pipeline(steps=steps)
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
    score = mean(scores)
    print('> k=%d, Mean ROC AUC: %.3f' % (k, score))
> k=1, Mean ROC AUC: 0.823
> k=2, Mean ROC AUC: 0.825
> k=3, Mean ROC AUC: 0.842
> k=4, Mean ROC AUC: 0.846
> k=5, Mean ROC AUC: 0.840
> k=6, Mean ROC AUC: 0.843
> k=7, Mean ROC AUC: 0.855


from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler

# Synthetic two-feature imbalanced dataset (a single class weight of 0.99
# is supplied to make_classification).
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

# Candidate k_neighbors values for SVM-SMOTE: 1 through 7.
k_values = list(range(1, 8))
for k in k_values:
    # Same evaluation protocol for every k: 10 folds x 3 repeats.
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # Oversample with SVMSMOTE(k), undersample the majority, fit a tree.
    over = SVMSMOTE(k_neighbors=k, sampling_strategy=0.1)
    under = RandomUnderSampler(sampling_strategy=0.5)
    model = DecisionTreeClassifier()
    steps = [
        ('over', over),
        ('under', under),
        ('model', model),
    ]
    pipeline = Pipeline(steps=steps)
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
    score = mean(scores)
    print('> k=%d, Mean ROC AUC: %.3f' % (k, score))
> k=1, Mean ROC AUC: 0.839
> k=2, Mean ROC AUC: 0.848
> k=3, Mean ROC AUC: 0.840
> k=4, Mean ROC AUC: 0.855
> k=5, Mean ROC AUC: 0.844
> k=6, Mean ROC AUC: 0.846
> k=7, Mean ROC AUC: 0.846


from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SVMSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler

# Synthetic two-feature imbalanced dataset (a single class weight of 0.99
# is supplied to make_classification).
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

# ADASYN oversampling, then random undersampling, then a decision tree.
model = DecisionTreeClassifier()
over = ADASYN(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [
    ('over', over),
    ('under', under),
    ('model', model),
]
pipeline = Pipeline(steps=steps)

# Evaluate with 10 stratified folds repeated 3 times, scored with ROC AUC.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
score = mean(scores)
print(' Mean ROC AUC: %.3f' % (score,))
 Mean ROC AUC: 0.826






