weight = n_samples/(n_classes * np.bincount(y))
n_samples is the total number of samples, n_classes is the number of classes, and np.bincount(y) returns the number of samples in each class. Setting class_weight = 'balanced' adjusts the weights of the positive and negative samples so that the two classes carry the same total weight.
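As a quick check of the formula, a minimal sketch on a made-up label vector:

import numpy as np
y_toy = np.array([0, 0, 0, 1])                # 3 negatives, 1 positive
w = len(y_toy) / (2 * np.bincount(y_toy))     # n_samples / (n_classes * counts)
print(w)  # [0.66666667 2.        ] -- each class now carries total weight 2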
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,roc_curve,auc
data = pd.read_csv('data/Bcard.txt')
feature_lst = ['person_info','finance_info','credit_info','act_info']
train = data[data.obs_mth != '2018-11-30'].reset_index().copy()
val = data[data.obs_mth == '2018-11-30'].reset_index().copy()
x = train[feature_lst]
y = train['bad_ind']
val_x = val[feature_lst]
val_y = val['bad_ind']
lr_model = LogisticRegression(C=0.1)
lr_model.fit(x,y)
y_pred = lr_model.predict_proba(x)[:,1]  # positive-class scores on the training set
fpr_lr_train,tpr_lr_train,_ = roc_curve(y,y_pred)  # compute FPR and TPR
train_ks = abs(fpr_lr_train - tpr_lr_train).max()  # training-set KS
print('train_ks : ',train_ks)
y_pred = lr_model.predict_proba(val_x)[:,1]  # positive-class scores on the validation set
fpr_lr,tpr_lr,_ = roc_curve(val_y,y_pred)  # compute FPR and TPR on the validation set
val_ks = abs(fpr_lr - tpr_lr).max()  # validation-set KS
print('val_ks : ',val_ks)
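Since KS (the maximum gap between TPR and FPR over all score thresholds) is recomputed the same way several times below, a small helper could wrap it; this function is a sketch, not part of the original code:

def ks_score(model, X, y_true):
    # KS = max |TPR - FPR| across all thresholds on the positive-class score
    prob = model.predict_proba(X)[:, 1]
    fpr, tpr, _ = roc_curve(y_true, prob)
    return abs(tpr - fpr).max()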
print('Training set:\n',y.value_counts())
print('Out-of-time validation set:\n',val_y.value_counts())
import numpy as np
print(np.bincount(y)[1]/np.bincount(y)[0])          # bad/good ratio in the training set
print(np.bincount(val_y)[1]/np.bincount(val_y)[0])  # bad/good ratio in the validation set
Now refit the model with class_weight = 'balanced':
lr_model = LogisticRegression(C=0.1,class_weight = 'balanced')
lr_model.fit(x,y)
y_pred = lr_model.predict_proba(x)[:,1]  # positive-class scores on the training set
fpr_lr_train,tpr_lr_train,_ = roc_curve(y,y_pred)  # compute FPR and TPR
train_ks = abs(fpr_lr_train - tpr_lr_train).max()  # training-set KS
print('train_ks : ',train_ks)
y_pred = lr_model.predict_proba(val_x)[:,1]  # positive-class scores on the validation set
fpr_lr,tpr_lr,_ = roc_curve(val_y,y_pred)  # compute FPR and TPR on the validation set
val_ks = abs(fpr_lr - tpr_lr).max()  # validation-set KS
print('val_ks : ',val_ks)
Setting class_weight = 'balanced' increases the weight of the y = 1 samples; the resulting KS lift is summarized below. The per-class weights can also be computed directly with sklearn:
from sklearn.utils.class_weight import compute_class_weight
class_weight = 'balanced'
classes = np.array([0, 1, 2])  # label classes
y_demo = np.array([0]*8 + [1]*6 + [2]*2)  # toy labels: 8, 6 and 2 samples per class
weight = compute_class_weight(class_weight=class_weight, classes=classes, y=y_demo)
print(weight)
# [0.66666667 0.88888889 2.66666667]  i.e. 16/(3*8), 16/(3*6), 16/(3*2)
Logistic regression's class_weight = 'balanced' option adjusts the positive and negative sample weights so that the two classes carry equal total weight. With the earlier scorecard example:
train_ks : 0.41573985983413414
val_ks : 0.3928959732014397
Using the same features and data, but adding class_weight = 'balanced':
train_ks : 0.4482325608488951
val_ks : 0.4198642457760936
Raising the weight of the y = 1 samples lifts KS by roughly 3 percentage points on both the training set and the out-of-time validation set.
def lgb_test(train_x,train_y,test_x,test_y):
    import lightgbm as lgb
    clf = lgb.LGBMClassifier(boosting_type = 'gbdt',
                             objective = 'binary',
                             metric = 'auc',
                             learning_rate = 0.1,
                             n_estimators = 24,
                             max_depth = 4,
                             num_leaves = 25,
                             max_bin = 40,
                             min_data_in_leaf = 5,
                             bagging_fraction = 0.6,
                             bagging_freq = 0,
                             feature_fraction = 0.8)
    clf.fit(train_x,train_y,eval_set=[(train_x,train_y),(test_x,test_y)],eval_metric = 'auc')
    return clf,clf.best_score_['valid_1']['auc']
feature_lst = ['person_info','finance_info','credit_info','act_info']
train_x = train[feature_lst]
train_y = train['bad_ind']
test_x = val[feature_lst]
test_y = val['bad_ind']
lgb_model,lgb_auc = lgb_test(train_x,train_y,test_x,test_y)
sample = train_x.copy()
sample['bad_ind'] = train_y
sample['pred'] = lgb_model.predict_proba(train_x)[:,1]
sample = sample.sort_values(by=['pred'],ascending=False).reset_index()
sample['rank'] = np.array(sample.index)/len(sample)  # rank in [0,1): 0 = highest predicted risk
sample
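The weight function below down-weights likely label noise found by the LightGBM ranking: label-0 samples falling in the top 10% of predicted risk, and label-1 samples falling below the top 70%, get weight 0.1; all other samples keep weight 1 and are passed on to SMOTE.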
def weight(x, y):
    if x == 0 and y < 0.1:    # good sample ranked in the top 10% of predicted risk
        return 0.1
    elif x == 1 and y > 0.7:  # bad sample ranked in the bottom 30% of predicted risk
        return 0.1
    else:
        return 1
sample['weight'] = sample.apply(lambda x:weight(x.bad_ind,x['rank']),axis = 1)
smote_sample = sample[sample.weight == 1]  # clean samples, kept for oversampling
drop_sample = sample[sample.weight < 1]    # noise samples, excluded from oversampling
train_x_smote = smote_sample[feature_lst]
train_y_smote = smote_sample['bad_ind']
smote_sample.shape
def smote(train_x_smote,train_y_smote,K=15,random_state=0):
    from imblearn.over_sampling import SMOTE
    smote = SMOTE(k_neighbors=K, n_jobs=1, random_state=random_state)
    rex,rey = smote.fit_resample(train_x_smote,train_y_smote)
    return rex,rey
Applying SMOTE yields a dataset with balanced positive and negative classes:
rex,rey = smote(train_x_smote,train_y_smote)
print('badpctn:',rey.sum()/len(rey))  # bad rate after resampling; balanced classes give 0.5
x_smote = rex[feature_lst]
y_smote = rey
lr_model = LogisticRegression(C=0.1)
lr_model.fit(x_smote,y_smote)
x = train[feature_lst]
y = train['bad_ind']
val_x = val[feature_lst]
val_y = val['bad_ind']
y_pred = lr_model.predict_proba(x)[:,1]  # positive-class scores on the training set
fpr_lr_train,tpr_lr_train,_ = roc_curve(y,y_pred)  # compute FPR and TPR
train_ks = abs(fpr_lr_train - tpr_lr_train).max()  # training-set KS
print('train_ks : ',train_ks)
y_pred = lr_model.predict_proba(val_x)[:,1]  # positive-class scores on the validation set
fpr_lr,tpr_lr,_ = roc_curve(val_y,y_pred)  # compute FPR and TPR on the validation set
val_ks = abs(fpr_lr - tpr_lr).max()  # validation-set KS
print('val_ks : ',val_ks)
class imbalanceData():
"""
处理不均衡数据
train训练集
test测试集
mmin低分段错分比例
mmax高分段错分比例
bad_ind样本标签
lis不参与建模变量列表
"""
def __init__(self, train,test,mmin,mmax, bad_ind,lis=[]):
self.bad_ind = bad_ind
self.train_x = train.drop([bad_ind]+lis,axis=1)
self.train_y = train[bad_ind]
self.test_x = test.drop([bad_ind]+lis,axis=1)
self.test_y = test[bad_ind]
self.columns = list(self.train_x.columns)
self.keep = self.columns + [self.bad_ind]
self.mmin = 0.1
self.mmax = 0.7
    '''
    Set the two cutoffs: samples at the head and tail of the score distribution
    whose labels disagree with the model's ranking are treated as noise.
    0.1 is the noise weight; these samples are excluded from oversampling.
    1 is the normal-sample weight; these samples are passed to oversampling.
    '''
    def weight(self, x, y):
        if x == 0 and y < self.mmin:
            return 0.1
        elif x == 1 and y > self.mmax:
            return 0.1
        else:
            return 1
    '''
    Use a LightGBM model and the weight function to select samples:
    only the accurately predicted part is kept for the subsequent SMOTE oversampling.
    '''
    def data_cleaning(self):
        lgb_model,lgb_auc = self.lgb_test()
        sample = self.train_x.copy()
        sample[self.bad_ind] = self.train_y
        sample['pred'] = lgb_model.predict_proba(self.train_x)[:,1]
        sample = sample.sort_values(by=['pred'],ascending=False).reset_index()
        sample['rank'] = np.array(sample.index)/len(sample)
        sample['weight'] = sample.apply(lambda x:self.weight(x[self.bad_ind],x['rank']),
                                        axis = 1)
        smote_sample = sample[sample.weight == 1][self.keep]
        drop_sample = sample[sample.weight < 1][self.keep]
        train_x_smote = smote_sample[self.columns]
        train_y_smote = smote_sample[self.bad_ind]
        return train_x_smote,train_y_smote,drop_sample
    '''
    Apply SMOTE oversampling.
    '''
    def apply_smote(self):
        '''
        Select samples: only part of the data is oversampled.
        train_x_smote, train_y_smote are the samples passed to SMOTE;
        drop_sample holds the samples excluded from oversampling.
        '''
        train_x_smote,train_y_smote,drop_sample = self.data_cleaning()
        rex,rey = self.smote(train_x_smote,train_y_smote)
        print('badpctn:',rey.sum()/len(rey))
        df_rex = pd.DataFrame(rex)
        df_rex.columns = self.columns
        df_rex['weight'] = 1
        df_rex[self.bad_ind] = rey
        df_aff_smote = pd.concat([df_rex, drop_sample])  # DataFrame.append was removed in pandas 2.0
        return df_aff_smote,rex,rey
    '''
    Define the LightGBM model.
    '''
    def lgb_test(self):
        import lightgbm as lgb
        clf = lgb.LGBMClassifier(boosting_type = 'gbdt',
                                 objective = 'binary',
                                 metric = 'auc',
                                 learning_rate = 0.1,
                                 n_estimators = 24,
                                 max_depth = 4,
                                 num_leaves = 25,
                                 max_bin = 40,
                                 min_data_in_leaf = 5,
                                 bagging_fraction = 0.6,
                                 bagging_freq = 0,
                                 feature_fraction = 0.8)
        clf.fit(self.train_x,self.train_y,eval_set=[(self.train_x,self.train_y),
                                                    (self.test_x,self.test_y)],
                eval_metric = 'auc')
        return clf,clf.best_score_['valid_1']['auc']
    '''
    Call the SMOTE function from imblearn.
    '''
    def smote(self, train_x_smote, train_y_smote, K=15, random_state=0):
        from imblearn.over_sampling import SMOTE
        smote = SMOTE(k_neighbors=K, n_jobs=1, random_state=random_state)
        rex,rey = smote.fit_resample(train_x_smote,train_y_smote)
        return rex,rey
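A minimal usage sketch of the class, reusing the train/val frames built earlier (the list of non-feature columns is derived rather than hard-coded, since the exact column names beyond feature_lst and bad_ind are an assumption here):

# Hypothetical usage; mmin/mmax follow the 0.1 / 0.7 cutoffs used above.
non_features = [c for c in train.columns if c not in feature_lst + ['bad_ind']]
imb = imbalanceData(train, val, mmin=0.1, mmax=0.7,
                    bad_ind='bad_ind', lis=non_features)
df_aff_smote, rex, rey = imb.apply_smote()
print(df_aff_smote.shape)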