import pandas as pd
import numpy as np
import warnings
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None  # show all columns
pd.set_option('display.float_format', lambda x: '%.2f' % x)  # disable scientific notation
train_data = pd.read_csv('Training.csv')
test_data = pd.read_csv('Test.csv')
# concatenate training and test data
total_data = pd.concat([train_data, test_data])
# replace the '.' and '?' placeholders with NaN
total_data = total_data.replace(['.', '?'], np.nan)
total_data[['area']].info()
From the data dictionary we can note several points:
1) area is a string: the first digit of the code is the major category, the first two digits the middle category, and so on. In theory the code should be 3 digits, so anomalous values need to be converted to NaN, and the leading digits can be extracted separately (see the sketch after the cleaning step below);
2) the fields highlighted in yellow are correlated, e.g. ck-saveall = ck-savetime × ck-saveavg, so missing values among them can be filled by computation;
3) the ck field is related to every field whose name contains ck: in theory, if any of those fields is > 0, ck should be 1;
4) the other fields probably have internal relationships too, but as I don't know the banking business, I don't dare touch them;
5) replacing missing values with the mean is not recommended. I use LightGBM later, which handles missing values natively, so no imputation is needed; with RandomForest, fill missing values with -1 instead.
total_data['area'] = total_data['area'].where(total_data['area'].str.len() >= 3)
total_data[['area']].info()
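Since the area code is hierarchical, its leading digits could also be split out as coarser categorical features. A minimal sketch of that idea (area_l1/area_l2 are hypothetical names of my own; these columns are not used further below):
# hypothetical prefix features: first digit = major category, first two digits = middle category
area_str = total_data['area']
total_data['area_l1'] = area_str.str[:1]  # major category
total_data['area_l2'] = area_str.str[:2]  # middle category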
# convert some object columns to numeric
total_data.rename(columns={'depsaveavg': 'dep-saveavg', 'depdrawavg': 'dep-drawavg'}, inplace=True)
num_features = list(set(total_data.columns) - set(['ID', 'area', 'ck', 'comp', 'VV']))
for col in num_features:
    total_data[col] = pd.to_numeric(total_data[col])
total_data.info()
def cap(x, quantile=[0.05, 0.95]):
    """Cap outliers at the given quantiles (winsorizing).
    Args:
        x: pd.Series, a continuous variable
        quantile: lower and upper quantiles used as the caps
    """
    # compute the cap values
    Q05, Q95 = x.quantile(quantile).values.tolist()
    # replace outliers with the quantile values
    x = x.copy()
    if Q05 > x.min():
        x.loc[x < Q05] = Q05
    if Q95 < x.max():
        x.loc[x > Q95] = Q95
    return x
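A quick sanity check of cap on a toy Series: values outside the 5th–95th percentile are clipped to those percentiles.
s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 100])
print(cap(s).max())  # 59.05, the 95th percentile of s, instead of the outlier 100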
columns = total_data.columns.tolist()
# apply capping to the numeric columns (train and test combined)
total_data_1 = total_data[num_features]
total_data_2 = total_data_1.apply(cap)
new_total_data = pd.concat([total_data[['ID', 'area', 'ck', 'comp', 'VV']], total_data_2], axis=1)
total_data = new_total_data[columns]
total_data.head()
# fill the correlated 'ck'/'dep' fields using the rule: all = time × avg
for way in ['ck-save', 'ck-draw', 'dep-save', 'dep-draw']:
    total_data['new_{}all'.format(way)] = total_data['{}time'.format(way)] * total_data['{}avg'.format(way)]
    total_data['new_{}time'.format(way)] = total_data['{}all'.format(way)] / total_data['{}avg'.format(way)]
    total_data['new_{}avg'.format(way)] = total_data['{}all'.format(way)] / total_data['{}time'.format(way)]
    # fill each missing value from its recomputed counterpart
    for part in ['all', 'time', 'avg']:
        col = '{}{}'.format(way, part)
        total_data[col] = total_data[col].fillna(total_data['new_' + col])
total_data.info()
# fill 'ck' by rule: if any ck-* activity field is positive, ck must be 1
print(total_data['ck'].value_counts())
ck_cols = ['ck-saveall', 'ck-savetime', 'ck-saveavg', 'ck-drawall', 'ck-drawtime',
           'ck-drawavg', 'ck-changame', 'ck-changtime', 'ck-avg']
total_data.loc[(total_data[ck_cols] > 0).any(axis=1), 'ck'] = '1'
print(total_data['ck'].value_counts())
Note: submissions are scored by the F-Measure on SME corporate accounts with funding needs, and the classes are extremely imbalanced, so the training distribution needs to be adjusted by oversampling. I use SMOTE here (during the exam there was no time for this, so I simply replicated the positive samples 40×).
# keep only the useful features
cate_features = ['area', 'ck', 'comp']
predictors = num_features + cate_features
all_columns = predictors + ['ID', 'VV']
total_data = total_data[all_columns]
total_data = total_data.fillna(-1)
for col in cate_features:
    total_data[col] = pd.to_numeric(total_data[col])
new_train_data = total_data[total_data['VV'] != 'Withheld'].copy()
new_test_data = total_data[total_data['VV'] == 'Withheld'].copy()
# oversample the minority class with SMOTE
smo = SMOTE(random_state=42)
new_train_data['VV'] = new_train_data['VV'].astype(int)
X_smo, y_smo = smo.fit_resample(new_train_data[predictors], new_train_data['VV'])  # fit_sample was removed from imblearn
last_train_data = pd.concat([X_smo, y_smo], axis=1)
last_train_data.info()
last_train_data.head()
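For comparison, the exam-time shortcut mentioned above (simply replicating the positive samples roughly 40×) is a crude sketch, assuming VV == 1 marks the positive class:
# naive oversampling: repeat minority rows instead of synthesizing new ones like SMOTE
pos = new_train_data[new_train_data['VV'] == 1]
naive_train = pd.concat([new_train_data] + [pos] * 39, ignore_index=True)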
# a simple baseline prediction
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import lightgbm as lgb
params = {'num_leaves': 30,  # strongly affects the result; larger is usually better, but too large overfits
          'min_data_in_leaf': 30,
          'objective': 'binary',  # objective function
          'max_depth': -1,
          'learning_rate': 0.01,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction": 0.8,  # fraction of features sampled per tree
          "bagging_freq": 1,
          "bagging_fraction": 0.8,
          "bagging_seed": 11,
          "lambda_l1": 0.1,  # L1 regularization
          # 'lambda_l2': 0.001,  # L2 regularization
          "verbosity": -1,
          "nthread": -1,  # number of threads; -1 uses all of them, more threads run faster
          'metric': {'binary_logloss'},  # evaluation metric
          "random_state": 2020,  # random seed, keeps runs reproducible
          # 'device': 'gpu'  # speeds things up if the GPU build of lightgbm is installed
          }
X_train, X_val, y_train, y_val = train_test_split(last_train_data[predictors], last_train_data["VV"],
test_size=0.2, random_state=2020)
training_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=training_data)
evals_result = {}  # stores the evaluation history
model = lgb.train(params,
                  training_data,
                  num_boost_round=10000,
                  valid_sets=[val_data],
                  categorical_feature=cate_features,
                  # early_stopping_rounds/verbose_eval/evals_result moved to callbacks in lightgbm 4.x
                  callbacks=[lgb.early_stopping(100),
                             lgb.log_evaluation(500),
                             lgb.record_evaluation(evals_result)])
val_pred = model.predict(X_val)
val_pred = np.where(val_pred>=0.5, 1, 0)
val_true = y_val.to_numpy()  # Series.as_matrix() was removed from pandas
print(classification_report(val_true, val_pred))
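Since the score is an F-Measure, the fixed 0.5 cut-off is not necessarily optimal. A small sketch of my own (not part of the original pipeline) that scans thresholds on the validation set:
# pick the probability threshold that maximizes F1 on the validation set
probs = model.predict(X_val)
thresholds = np.arange(0.1, 0.9, 0.02)
scores = [f1_score(val_true, np.where(probs >= t, 1, 0)) for t in thresholds]
best_t = thresholds[int(np.argmax(scores))]
print('best threshold: %.2f, F1: %.4f' % (best_t, max(scores)))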
test_pred = model.predict(new_test_data[predictors])
test_pred = np.where(test_pred>=0.5, 1, 0)
print(sum(test_pred))
print(len(test_pred))
answer = new_test_data.copy()
answer['VV'] = test_pred
answer[['ID', 'VV']].to_csv('results.csv', index=False)
# a quick prediction with RandomForest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
X_train, X_val, y_train, y_val = train_test_split(new_train_data[predictors], new_train_data["VV"],
test_size=0.2, random_state=2020, stratify=new_train_data["VV"])
rf = RandomForestClassifier(n_estimators=50, min_samples_split=5, min_samples_leaf=3)
rf.fit(X_train, y_train)
print(accuracy_score(rf.predict(X_val), y_val))
pred = rf.predict(new_test_data[predictors])
new_test_data['VV'] = pred
new_test_data[['ID', 'VV']].to_csv('up_answer.csv', index=False)
print(sum(pred))
print(len(pred))
# 5-fold cross-validation
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
train_x = last_train_data[predictors]
train_y = last_train_data['VV']
test_x = new_test_data[predictors]
X, y, X_test = train_x.values, train_y.values, test_x.values  # convert to np.ndarray
def self_metric(preds, train_data):
    """Custom F1 metric for lgb.train: feval receives (preds, Dataset)."""
    labels = train_data.get_label()
    preds = np.where(preds >= 0.5, 1, 0)
    return 'f1', f1_score(labels, preds), True
param = {'num_leaves': 30,  # strongly affects the result; larger is usually better, but too large overfits
         'min_data_in_leaf': 30,
         'objective': 'binary',  # objective function
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_sum_hessian_in_leaf": 6,
         "boosting": "gbdt",
         "feature_fraction": 0.8,  # fraction of features sampled per tree
         "bagging_freq": 1,
         "bagging_fraction": 0.8,
         "bagging_seed": 11,
         "lambda_l1": 0.1,  # L1 regularization
         # 'lambda_l2': 0.001,  # L2 regularization
         "verbosity": -1,
         "nthread": -1,  # number of threads; -1 uses all of them, more threads run faster
         'metric': {'binary_logloss'},  # evaluation metric
         "random_state": 2020,  # random seed, keeps runs reproducible
         # 'device': 'gpu'  # speeds things up if the GPU build of lightgbm is installed
         }
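With the corrected self_metric above, the custom F1 can be plugged into lgb.train via feval (see the commented line in the loop below). To report only the custom metric, the built-in one can be disabled first; a sketch:
param_f1 = dict(param, metric='None')  # the string 'None' turns off LightGBM's built-in metrics
# e.g. lgb.train(param_f1, training_data, feval=self_metric, valid_sets=[validation_data])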
# five-fold cross-validation
folds = KFold(n_splits=5, shuffle=True, random_state=36)
predictions = []  # per-fold predictions on the test set
for fold_, (train_index, test_index) in enumerate(folds.split(X, y)):
    print("Fold {}:".format(fold_ + 1))
    X_train, X_valid, y_train, y_valid = X[train_index], X[test_index], y[train_index], y[test_index]
    training_data = lgb.Dataset(X_train, label=y_train)  # training data
    validation_data = lgb.Dataset(X_valid, label=y_valid)  # validation data
    clf = lgb.train(param,
                    training_data,
                    num_boost_round=10000,
                    valid_sets=[validation_data],
                    callbacks=[lgb.early_stopping(100),
                               lgb.log_evaluation(1000)],
                    # feval=self_metric
                    )
    x_pred = clf.predict(X_valid, num_iteration=clf.best_iteration)
    x_pred = np.where(x_pred > 0.5, 1, 0)
    print(f1_score(y_valid, x_pred))
    y_test = clf.predict(X_test, num_iteration=clf.best_iteration)  # predict on the test set
    # print(y_test[:10])
    predictions.append(y_test)
# average the five folds' predicted probabilities, then threshold at 0.5
pred1 = np.mean(predictions, axis=0)
pred = np.where(pred1 >= 0.5, 1, 0)
print(sum(pred))
print(len(pred))
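The averaged predictions above are only printed; to turn them into a submission file, mirroring the earlier steps (cv_results.csv is a filename of my own choosing):
cv_answer = new_test_data.copy()
cv_answer['VV'] = pred
cv_answer[['ID', 'VV']].to_csv('cv_results.csv', index=False)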
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier
import lightgbm as lgb
X, y, X_test = last_train_data[predictors].values, last_train_data['VV'].values, new_test_data[predictors].values  # convert to np.ndarray
folds = KFold(n_splits=5, shuffle=True, random_state=36)
predictions = []  # per-fold test predictions
for k, (train_index, test_index) in enumerate(folds.split(X, y)):
    print("Fold {}:".format(k + 1))
    X_train, X_valid, y_train, y_valid = X[train_index], X[test_index], y[train_index], y[test_index]
    clg = LGBMClassifier(
        boosting="gbdt",
        learning_rate=0.1,
        colsample_bytree=0.8,
        # max_depth=5,
        # n_estimators=100,
        num_leaves=31,
        lambda_l1=0.1,
        lambda_l2=0.1,
        seed=0
    )
    # verbose= in fit() was removed in lightgbm 4.x; silence logging via a callback instead
    clg.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
            callbacks=[lgb.log_evaluation(period=0)])
    train_pred = clg.predict(X_train)
    valid_pred = clg.predict(X_valid)
    print("Train F1 this fold: %.2f%%" % (f1_score(y_train, train_pred) * 100))
    print("Validation F1 this fold: %.2f%%" % (f1_score(y_valid, valid_pred) * 100))
    pred = clg.predict(X_test)
    predictions.append(pred)
# majority vote across the five folds
last_pred = (np.mean(predictions, axis=0) >= 0.5).astype(int)
new_test_data['VV'] = last_pred
new_test_data[['ID', 'VV']].to_csv('answer.csv', index=False)
print(sum(last_pred))
print(len(last_pred))