Synthetic Minority Oversampling Technique (SMOTE) is an oversampling method that synthesizes minority-class samples: it generates additional positive examples by interpolating between the positive examples already in the training set.
Basic idea:
Analyze the minority-class samples, interpolate between existing minority-class samples to synthesize new ones, and add the synthetic samples to the data set used for training.
SMOTE is currently a common technique for handling imbalanced data and is widely accepted in both academia and industry.
1. Use a nearest-neighbor search to find the K nearest neighbors of every minority-class sample;
2. For each sample, randomly pick N of its K neighbors and perform random linear interpolation between the sample and each chosen neighbor (a minimal sketch of this interpolation follows the list);
3. Construct the new minority-class samples from these interpolations;
4. Merge the new samples with the original data to form the new training set.
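A minimal NumPy sketch of the interpolation step; the function name smote_interpolate and the toy vectors are illustrative, not taken from the original:

import numpy as np

def smote_interpolate(x_i, x_nn, rng=None):
    # A synthetic sample lies on the segment between a minority sample x_i
    # and one of its minority-class neighbors x_nn:
    #   x_new = x_i + lam * (x_nn - x_i),  lam drawn uniformly from [0, 1]
    rng = rng or np.random.default_rng(0)
    lam = rng.uniform(0.0, 1.0)
    return x_i + lam * (x_nn - x_i)

x_i = np.array([1.0, 2.0])
x_nn = np.array([3.0, 4.0])
print(smote_interpolate(x_i, x_nn))  # a point on the segment between x_i and x_nn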
Because SMOTE interpolates directly in the sample space, it amplifies any noise and outliers in the data, so the training samples should be cleaned first. Here a LightGBM model is fitted to the data, and the samples it predicts poorly are down-weighted so that they do not take part in the SMOTE interpolation.
Reference: 不均衡学习和异常检测_正负样本不均衡异常检测 (CSDN blog)
1. Sample weighting: flag the samples that are not identified accurately
import numpy as np
import pandas as pd

class imbalanceData():  # utilities for handling imbalanced data
    def __init__(self, train, test, mmin, mmax, label, lis=[]):  # lis: columns excluded from modeling
        self.label = label
        self.train_x = train.drop([label] + lis, axis=1)
        self.train_y = train[label]
        self.test_x = test.drop([label] + lis, axis=1)
        self.test_y = test[label]
        self.columns = list(self.train_x.columns)
        self.keep = self.columns + [self.label]
        self.mmin = mmin  # e.g. 0.1: rank cut-off near the top of the score ranking
        self.mmax = mmax  # e.g. 0.7: rank cut-off toward the bottom of the score ranking

    # Down-weight samples at the head and tail of the ranking whose predictions are inaccurate
    def weight(self, x, y):
        if x == 0 and y < self.mmin:
            return 0.1  # label 0 ranked among the highest scores: treated as noise, excluded from oversampling
        elif x == 1 and y > self.mmax:
            return 0.1  # label 1 ranked among the lowest scores: hard to identify, excluded from oversampling
        else:
            return 1    # everything else keeps full weight and takes part in SMOTE
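For illustration only, a standalone re-statement of the rule with hypothetical rank values, assuming mmin=0.1 and mmax=0.7:

def weight_rule(x, y, mmin=0.1, mmax=0.7):
    # standalone copy of the weighting rule above, for illustration only
    if x == 0 and y < mmin:
        return 0.1
    elif x == 1 and y > mmax:
        return 0.1
    return 1

print(weight_rule(0, 0.05))  # 0.1 -> label 0 ranked in the top 10% of scores: treated as noise
print(weight_rule(1, 0.85))  # 0.1 -> label 1 ranked in the bottom 30%: excluded from SMOTE
print(weight_rule(1, 0.02))  # 1   -> confidently identified positive: kept for SMOTE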
2. Sample selection
Use a LightGBM model together with the weight function to select samples; only the accurately predicted part is kept for the subsequent SMOTE oversampling.
    def data_cleaning(self):
        lgb_model, lgb_auc = self.lgb_test()
        sample = self.train_x.copy()
        sample[self.label] = self.train_y
        sample['pred'] = lgb_model.predict_proba(self.train_x)[:, 1]
        # rank samples by predicted probability, highest first; 'rank' is the position as a fraction of the sample size
        sample = sample.sort_values(by=['pred'], ascending=False).reset_index(drop=True)
        sample['rank'] = np.array(sample.index) / len(sample)
        sample['weight'] = sample.apply(lambda x: self.weight(x[self.label], x['rank']), axis=1)
        smote_sample = sample[sample.weight == 1][self.keep]              # accurately predicted part: used for SMOTE
        drop_sample = sample[sample.weight < 1][self.keep + ['weight']]   # down-weighted part: kept aside
        train_x_smote = smote_sample[self.columns]
        train_y_smote = smote_sample[self.label]
        return train_x_smote, train_y_smote, drop_sample
3. SMOTE oversampling
Only the selected samples are oversampled.
    def apply_smote(self):
        train_x_smote, train_y_smote, drop_sample = self.data_cleaning()
        rex, rey = self.smote(train_x_smote, train_y_smote)
        print('badpctn:', rey.sum() / len(rey))  # positive-class ratio after SMOTE, 0.5
        df_rex = pd.DataFrame(rex)
        df_rex.columns = self.columns
        df_rex['weight'] = 1
        df_rex[self.label] = rey
        df_aff_smote = pd.concat([df_rex, drop_sample])  # add the down-weighted samples back
        return df_aff_smote, rex, rey
4. Define the LightGBM function
    def lgb_test(self):
        import lightgbm as lgb
        clf = lgb.LGBMClassifier(boosting_type='gbdt',
                                 objective='binary',
                                 metric='auc',
                                 learning_rate=0.1,
                                 n_estimators=24,
                                 max_depth=4,
                                 num_leaves=25,
                                 max_bin=40,
                                 min_data_in_leaf=5,
                                 bagging_fraction=0.6,
                                 bagging_freq=0,
                                 feature_fraction=0.8)
        clf.fit(self.train_x, self.train_y,
                eval_set=[(self.train_x, self.train_y), (self.test_x, self.test_y)],
                eval_metric='auc')
        return clf, clf.best_score_['valid_1']['auc']  # AUC on the second eval set (the test set)
5. Call the SMOTE function from imblearn
    def smote(self, train_x_smote, train_y_smote, K=15, random_state=0):
        from imblearn.over_sampling import SMOTE
        sm = SMOTE(k_neighbors=K, random_state=random_state)
        rex, rey = sm.fit_resample(train_x_smote, train_y_smote)
        return rex, rey
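A hypothetical end-to-end run on a small synthetic frame, shown only to illustrate how the pieces above fit together; the column names f1..f3 and bad_ind are made up, and lightgbm, imbalanced-learn and pandas are assumed to be installed. The evaluation code that follows uses the original (unshown) train / val frames, not this demo.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
demo = pd.DataFrame(rng.normal(size=(2000, 3)), columns=['f1', 'f2', 'f3'])
demo['bad_ind'] = (rng.uniform(size=2000) < 0.1).astype(int)   # ~10% positives
train, test = demo.iloc[:1600], demo.iloc[1600:]

imb = imbalanceData(train, test, mmin=0.1, mmax=0.7, label='bad_ind')
df_aff_smote, rex, rey = imb.apply_smote()   # prints badpctn: 0.5 after oversampling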
Finally, fit a logistic regression on the oversampled data and evaluate it with the KS statistic on the training and validation sets.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve

# feature_lst, train and val come from the surrounding modeling context (not shown here)
x_smote = rex[feature_lst]
y_smote = rey
lr_model = LogisticRegression(C=0.1)
lr_model.fit(x_smote, y_smote)

x = train[feature_lst]
y = train['bad_ind']
val_x = val[feature_lst]
val_y = val['bad_ind']

y_pred = lr_model.predict_proba(x)[:, 1]
fpr_lr_train, tpr_lr_train, _ = roc_curve(y, y_pred)   # TPR and FPR on the training set
train_ks = abs(fpr_lr_train - tpr_lr_train).max()      # KS on the training set
print('train_ks : ', train_ks)

y_pred = lr_model.predict_proba(val_x)[:, 1]           # predicted probabilities on the validation set
fpr_lr, tpr_lr, _ = roc_curve(val_y, y_pred)           # TPR and FPR on the validation set
val_ks = abs(fpr_lr - tpr_lr).max()                    # KS on the validation set
print('val_ks : ', val_ks)
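A cross-check that is not part of the original code: the two-sample KS statistic between the class-conditional score distributions equals max|TPR - FPR|, so scipy should reproduce the validation KS above (assuming scipy is available).

from scipy.stats import ks_2samp

# KS between the scores of positives and negatives; should match val_ks up to ties in the scores
res = ks_2samp(y_pred[val_y == 1], y_pred[val_y == 0])
print('val_ks (ks_2samp):', res.statistic)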