Project repository: team-learning-data-mining/FinancialRiskControl at master · datawhalechina/team-learning-data-mining · GitHub
This article works through a loan-default prediction task based on records from a consumer-lending platform. Against the background of personal credit in financial risk control, the goal is to predict from an applicant's information whether the loan is likely to default, and hence whether to approve it; this is a typical binary classification problem. The article covers the full pipeline from data exploration to feature engineering to model building.
The dataset contains more than 1.2 million records with 47 columns, 15 of which are anonymized. 800,000 records form the training set, 200,000 form test set A and 200,000 form test set B; fields such as employmentTitle, purpose, postCode and title are desensitized.
train.csv
import pandas as pd
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')
#Load the data
train = pd.read_csv('D:/myP/financial_risk/train.csv')
testA = pd.read_csv('D:/myP/financial_risk/testA.csv')
print('Train data shape:',train.shape)
print('TestA data shape:',testA.shape)
print('the columns name of training dataset:\n',train.columns)
print('the columns name of test dataset:\n',testA.columns)
Analysis:
train.info()
Analysis:
train.describe()
# Visualize missing values
missing = train.isnull().sum()/len(train)
missing = missing[missing > 0]
missing.sort_values(inplace=True) # sort by missing ratio
missing.plot.bar()
Check which features take only a single value in the training and test sets
#Find all features that have only one unique value in the training set (train) and the test set (testA)
one_value_fea = [col for col in train.columns if train[col].nunique() <= 1]
one_value_fea_test = [col for col in testA.columns if testA[col].nunique() <= 1]
print(one_value_fea, one_value_fea_test)
Analysis:
#Identify and print the numeric features (all columns except object dtype)
numerical_fea = list(train.select_dtypes(exclude=['object']).columns)
print(numerical_fea)
#Print the object-dtype (categorical) features
category_fea = list(filter(lambda x: x not in numerical_fea, list(train.columns)))
print(category_fea)
#Define a function to separate continuous and discrete numeric features (based on the number of unique values)
def get_numerical_serial_fea(data,feas):
numerical_serial_fea = []
numerical_noserial_fea = []
for fea in feas:
temp = data[fea].nunique()
if temp <= 10:
numerical_noserial_fea.append(fea)
continue
numerical_serial_fea.append(fea)
return numerical_serial_fea,numerical_noserial_fea
#Apply it to the training set
numerical_serial_fea,numerical_noserial_fea = get_numerical_serial_fea(train,numerical_fea)
#Print the continuous features and their count
print(numerical_serial_fea,len(numerical_serial_fea))
#Print the discrete features and their count
print(numerical_noserial_fea, len(numerical_noserial_fea))
Analysis: there are 9 discrete and 33 continuous numeric features.
#Discrete numeric features
for fea in numerical_noserial_fea:
a = train[fea].value_counts()
print([fea], 'values and their counts:\n', a)
f = pd.melt(train, value_vars=numerical_serial_fea)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")
#Plot the distribution of the loan amount values
plt.figure(figsize=(16,12))
plt.suptitle('Transaction Values Distribution', fontsize=22)
plt.subplot(221) # 2 rows x 2 columns, i.e. 4 subplots in total; this draws the first one
sub_plot_1 = sns.distplot(train['loanAmnt'])
sub_plot_1.set_title("loanAmnt Distribuition", fontsize=18)
sub_plot_1.set_xlabel("")
sub_plot_1.set_ylabel("Probability", fontsize=15)
plt.subplot(222)
sub_plot_2 = sns.distplot(np.log(train['loanAmnt']))
sub_plot_2.set_title("loanAmnt (Log) Distribuition", fontsize=18)
sub_plot_2.set_xlabel("")
sub_plot_2.set_ylabel("Probability", fontsize=15)
for fea in category_fea:
a = train[fea].value_counts()
print([fea], 'values and their counts:\n', a)
plt.figure(figsize=(8, 8))
sns.barplot(train["employmentLength"].value_counts(dropna=False)[:20],
train["employmentLength"].value_counts(dropna=False).keys()[:20])
plt.show()
train_loan_fr = train.loc[train['isDefault'] == 1]
train_loan_nofr = train.loc[train['isDefault'] == 0]
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 8))
train_loan_fr.groupby('grade')['grade'].count().plot(kind='barh', ax=ax1, title='Count of grade fraud')
train_loan_nofr.groupby('grade')['grade'].count().plot(kind='barh', ax=ax2, title='Count of grade non-fraud')
train_loan_fr.groupby('employmentLength')['employmentLength'].count().plot(kind='barh', ax=ax3, title='Count of employmentLength fraud')
train_loan_nofr.groupby('employmentLength')['employmentLength'].count().plot(kind='barh', ax=ax4, title='Count of employmentLength non-fraud')
plt.show()
fig, ((ax1, ax2)) = plt.subplots(1, 2, figsize=(15, 6))
#For the training data, select rows where the target is 1, take 'loanAmnt', apply log, then plot
train.loc[train['isDefault'] == 1] \
['loanAmnt'].apply(np.log) \
.plot(kind='hist',
bins=100,
title='Log Loan Amt - Fraud',
color='r',
xlim=(-3, 10),
ax= ax1)
train.loc[train['isDefault'] == 0] \
['loanAmnt'].apply(np.log) \
.plot(kind='hist',
bins=100,
title='Log Loan Amt - Not Fraud',
color='b',
xlim=(-3, 10),
ax=ax2)
total = len(train) # number of rows in the training set (800,000)
#Total loan amount over the whole training set (used below to compute percentages)
total_amt = train['loanAmnt'].sum()
plt.figure(figsize=(12,5))
plt.subplot(121)
#Count plot showing the class distribution of the target 'isDefault'
plot_tr = sns.countplot(x='isDefault',data=train)
plot_tr.set_title("Fraud Loan Distribution \n 0: good user | 1: bad user", fontsize=14)
plot_tr.set_xlabel("Is fraud by count", fontsize=16)
plot_tr.set_ylabel('Count', fontsize=16)
for p in plot_tr.patches:
height = p.get_height()
plot_tr.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/total*100),
ha="center", fontsize=15)
percent_amt = (train.groupby(['isDefault'])['loanAmnt'].sum())
percent_amt = percent_amt.reset_index()
plt.subplot(122)
plot_tr_2 = sns.barplot(x='isDefault', y='loanAmnt', dodge=True, data=percent_amt)
plot_tr_2.set_title("Total Amount in loanAmnt \n 0: good user | 1: bad user", fontsize=14)
plot_tr_2.set_xlabel("Is fraud by percent", fontsize=16)
plot_tr_2.set_ylabel('Total Loan Amount Scalar', fontsize=16)
for p in plot_tr_2.patches:
height = p.get_height()
plot_tr_2.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/total_amt * 100),
ha="center", fontsize=15)
Convert 'issueDate' into 'issueDateDT' (the earliest date in 'issueDate' is 2007-06-01, so 'startdate' is set to 2007-06-01).
#Convert the 'issueDate' column of train to datetime, then to the number of days since the start date
train['issueDate'] = pd.to_datetime(train['issueDate'],format='%Y-%m-%d')
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
train['issueDateDT'] = train['issueDate'].apply(lambda x: x-startdate).dt.days
#Do the same for testA: convert 'issueDate' to datetime, then to days since the start date
testA['issueDate'] = pd.to_datetime(testA['issueDate'],format='%Y-%m-%d')
testA['issueDateDT'] = testA['issueDate'].apply(lambda x: x-startdate).dt.days
#Plot
plt.hist(train['issueDateDT'], label='train');
plt.hist(testA['issueDateDT'], label='test');
plt.legend();
plt.title('Distribution of issueDateDT dates');
#The issueDateDT ranges of train and test overlap, so a time-based split is not a sensible validation strategy
index, columns and values correspond to the rows, columns and cell values of the pivot table
pivot = pd.pivot_table(train, index=['grade'], columns=['issueDateDT'], values=['loanAmnt'], aggfunc=np.sum)
pivot
import pandas_profiling
pfr = pandas_profiling.ProfileReport(train)
pfr.to_file("./example.html")
#Remove the target from the list of numeric features
numerical_fea.remove('isDefault')
train.isnull().sum()
#Fill missing values of numeric features with the medians computed on the training set
train[numerical_fea] = train[numerical_fea].fillna(train[numerical_fea].median())
testA[numerical_fea] = testA[numerical_fea].fillna(train[numerical_fea].median())
'issueDate' is a date feature, so it must first be converted to datetime format.
#Convert to datetime
for data in [train, testA]:
data['issueDate'] = pd.to_datetime(data['issueDate'],format='%Y-%m-%d')
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
#Construct the time feature: days since the start date
data['issueDateDT'] = data['issueDate'].apply(lambda x: x-startdate).dt.days
4.2.2.1 Convert the object-type feature 'employmentLength' to numeric
train['employmentLength'].value_counts(dropna=False).sort_index()
#Define a function: for non-null values, take the number before the first space and cast it to int8
def employmentLength_to_int(s):
if pd.isnull(s):
return s
else:
return np.int8(s.split()[0])
for data in [train, testA]:
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
data['employmentLength'].value_counts(dropna=False).sort_index()
4.2.2.2 Preprocess 'earliesCreditLine' (keep only the year)
data['earliesCreditLine'].value_counts(dropna=False).sort_index()
#Take the last 4 characters as the year
for data in [train, testA]:
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
data['earliesCreditLine'].value_counts(dropna=False).sort_index()
# Some of the categorical features
cate_features = ['grade', 'subGrade', 'verificationStatus', 'purpose', 'postCode', 'regionCode','applicationType', 'initialListStatus', 'title', 'policyCode']
for f in cate_features:
print(f, 'number of unique values:', train[f].nunique())
4.2.3.1 Map 'grade' to a numeric range:
for data in [train, testA]:
data['grade'] = data['grade'].map({'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7})
train['grade']
4.2.3.2 One-hot encode the pure categorical features that have more than 2 levels and are not high-dimensional and sparse
# categorical features with more than 2 levels that are not high-dimensional and sparse
train = pd.get_dummies(train, columns=['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)
In statistics, if a distribution is approximately normal, about 68% of the values fall within one standard deviation of the mean, about 95% within two standard deviations, and about 99.7% within three standard deviations.
#Flag normal values vs. outliers
#Define a function that marks values more than 3 standard deviations away from the mean as outliers (the flagged rows are dropped later)
def find_outliers_by_3segama(data,fea):
data_std = np.std(data[fea]) # standard deviation
data_mean = np.mean(data[fea]) # mean
outliers_cut_off = data_std * 3
#use mean ± 3 * standard deviation as the lower/upper bounds
lower_rule = data_mean - outliers_cut_off
upper_rule = data_mean + outliers_cut_off
data[fea+'_outliers'] = data[fea].apply(lambda x:str('异常值') if x > upper_rule or x < lower_rule else '正常值')
return data
for fea in numerical_serial_fea:
train = find_outliers_by_3segama(train,fea)
print(train[fea+'_outliers'].value_counts())
print(train.groupby(fea+'_outliers')['isDefault'].sum())
print('*分隔符*'*4)
Output, per feature: counts of '正常值' (normal) / '异常值' (outlier) rows, followed by the number of defaults (isDefault sum) in the normal / outlier groups:
id: 800000 / 0; defaults 159610 / 0
loanAmnt: 800000 / 0; defaults 159610 / 0
term: 800000 / 0; defaults 159610 / 0
interestRate: 794259 / 5741; defaults 156694 / 2916
installment: 792046 / 7954; defaults 157458 / 2152
employmentTitle: 800000 / 0; defaults 159610 / 0
homeOwnership: 799701 / 299; defaults 159548 / 62
annualIncome: 793973 / 6027; defaults 158854 / 756
verificationStatus: 800000 / 0; defaults 159610 / 0
purpose: 783003 / 16997; defaults 155975 / 3635
postCode: 798931 / 1069; defaults 159389 / 221
regionCode: 799994 / 6; defaults 159609 / 1
dti: 798440 / 1560; defaults 159144 / 466
delinquency_2years: 778245 / 21755; defaults 154521 / 5089
ficoRangeLow: 788261 / 11739; defaults 158832 / 778
ficoRangeHigh: 788261 / 11739; defaults 158832 / 778
openAcc: 790889 / 9111; defaults 157415 / 2195
pubRec: 792471 / 7529; defaults 157909 / 1701
pubRecBankruptcies: 794120 / 5880; defaults 158187 / 1423
revolBal: 790001 / 9999; defaults 158251 / 1359
revolUtil: 799948 / 52; defaults 159587 / 23
totalAcc: 791663 / 8337; defaults 157942 / 1668
initialListStatus: 800000 / 0; defaults 159610 / 0
applicationType: 784586 / 15414; defaults 155735 / 3875
title: 775134 / 24866; defaults 155710 / 3900
policyCode: 800000 / 0; defaults 159610 / 0
n0: 782773 / 17227; defaults 156125 / 3485
n1: 790500 / 9500; defaults 157119 / 2491
n2: 789067 / 10933; defaults 156405 / 3205
n3: 789067 / 10933; defaults 156405 / 3205
n4: 788660 / 11340; defaults 157134 / 2476
n5: 790355 / 9645; defaults 157752 / 1858
n6: 786006 / 13994; defaults 156428 / 3182
n7: 788430 / 11570; defaults 156864 / 2746
n8: 789625 / 10375; defaults 157479 / 2131
n9: 786384 / 13616; defaults 155657 / 3953
n10: 788979 / 11021; defaults 156971 / 2639
n11: 799434 / 566; defaults 159498 / 112
n12: 797585 / 2415; defaults 159065 / 545
n13: 788907 / 11093; defaults 157128 / 2482
n14: 788884 / 11116; defaults 156246 / 3364
Remove the outliers
#Drop the rows flagged as outliers ('异常值')
for fea in numerical_serial_fea:
train = train[train[fea+'_outliers']=='正常值']
train = train.reset_index(drop=True)
train.shape
(612742, 89)
Advantages of binning:
Basic principles to keep in mind when binning:
When values span several orders of magnitude, it is best to group by powers of 10 (or powers of any other constant): 0–9, 10–99, 100–999, 1000–9999, and so on. Fixed-width binning is very easy to compute, but if the counts contain large gaps, many bins will end up empty.
# Fixed-width binning by division: each bin covers a range of loanAmnt/1000
# (note: after the loops above, `data` refers to the last DataFrame processed; in practice apply this to both train and testA)
data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000)
# Exponential-width binning via the logarithm
data['loanAmnt_bin2'] = np.floor(np.log10(data['loanAmnt']))
# Equal-frequency (quantile) binning into 10 bins
data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False)
for col in ['grade', 'subGrade']:
#Group the training data by col, take the target 'isDefault', compute its mean per group, reset the index, and rename the column to col + '_target_mean'
temp_dict = train.groupby([col])['isDefault'].agg(['mean']).reset_index().rename(columns={'mean': col + '_target_mean'})
#Convert temp_dict into a dict of the form {grade value: mean isDefault}
temp_dict.index = temp_dict[col].values
temp_dict = temp_dict[col + '_target_mean'].to_dict()
#Create the new col_target_mean column via the dict
train[col + '_target_mean'] = train[col].map(temp_dict)
testA[col + '_target_mean'] = testA[col].map(temp_dict)
# Other derived features: mean and std
for df in [train, testA]:
for item in ['n0','n1','n2','n3','n4','n5','n6','n7','n8','n9','n10','n11','n12','n13','n14']:
df['grade_to_mean_' + item] = df['grade'] / df.groupby([item])['grade'].transform('mean')
df['grade_to_std_' + item] = df['grade'] / df.groupby([item])['grade'].transform('std')
Label encoding: the encoded features can be fed directly into tree models
# High-cardinality categorical features need label encoding: employmentTitle, postCode, title, subGrade
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
for col in tqdm(['employmentTitle', 'postCode', 'title','subGrade']):
le = LabelEncoder()
le.fit(list(train[col].astype(str).values) + list(testA[col].astype(str).values))
train[col] = le.transform(list(train[col].astype(str).values))
testA[col] = le.transform(list(testA[col].astype(str).values))
print('Label Encoding done')
# Example: min-max normalization
#pseudocode
for fea in [list_of_features_to_normalize]:
    data[fea] = ((data[fea] - np.min(data[fea])) / (np.max(data[fea]) - np.min(data[fea])))
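As a runnable counterpart to the pseudocode above, here is a minimal sketch using scikit-learn's MinMaxScaler (imported at the top of the article); the column list cols_to_scale is a hypothetical placeholder and should be replaced with the continuous features you actually want to scale.
# Minimal min-max normalization sketch; `cols_to_scale` is a hypothetical placeholder list
from sklearn.preprocessing import MinMaxScaler
cols_to_scale = ['loanAmnt', 'annualIncome']  # example columns, adjust as needed
scaler = MinMaxScaler()
# fit on the training data only, then apply the same transform to the test data
train[cols_to_scale] = scaler.fit_transform(train[cols_to_scale])
testA[cols_to_scale] = scaler.transform(testA[cols_to_scale])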
Variance-threshold method
from sklearn.feature_selection import VarianceThreshold
#The threshold parameter is the variance threshold
VarianceThreshold(threshold=3).fit_transform(train,target_train)
Correlation-coefficient method
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr
#Select the K best features and return the reduced data
#The first argument is a scoring function that takes the feature matrix and the target vector and
#returns an array of (score, p-value) pairs, one per feature; here it is meant to be the Pearson correlation
#k is the number of features to keep
#(as written below no score function is passed, so SelectKBest falls back to its default, f_classif;
# to actually use the Pearson correlation, pass a pearsonr-based score function as the first argument)
SelectKBest(k=5).fit_transform(train,target_train)
Chi-squared test
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
#k is the number of features to keep
SelectKBest(chi2, k=5).fit_transform(train,target_train)
Mutual information method
from sklearn.feature_selection import SelectKBest
from minepy import MINE
#MINE is not designed as a plain scoring function, so wrap it in a mic() helper that
#returns a (score, p-value) pair, with the p-value fixed at 0.5
def mic(x, y):
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)
#k is the number of features to keep (np.array(list(map(...))) is used so this also runs under Python 3)
SelectKBest(lambda X, Y: np.array(list(map(lambda x: mic(x, Y), X.T))).T, k=2).fit_transform(train,target_train)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
#Recursive feature elimination: returns the data after feature selection
#estimator is the base model
#n_features_to_select is the number of features to keep
RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(train,target_train)
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
#Feature selection with L1-penalized logistic regression as the base model
#(newer scikit-learn versions require a solver that supports L1, e.g. solver='liblinear')
SelectFromModel(LogisticRegression(penalty="l1", C=0.1, solver='liblinear')).fit_transform(train,target_train)
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
#Feature selection with GBDT as the base model
SelectFromModel(GradientBoostingClassifier()).fit_transform(train,target_train)
For this dataset, after dropping the features that will not enter the model and filling the missing values, we look at the correlation between the features and the target, and then train the models.
#Forward-fill: replace each missing value with the value above it in the same column
train = train.fillna(axis=0,method='ffill')
x_train = train.drop(['isDefault','id'], axis=1)
#Compute the correlation with the target
data_corr = x_train.corrwith(train.isDefault) # correlation (not covariance)
result = pd.DataFrame(columns=['features', 'corr'])
result['features'] = data_corr.index
result['corr'] = data_corr.values
# Or simply inspect it with a heatmap
data_numeric = train[numerical_fea]
correlation = data_numeric.corr()
f , ax = plt.subplots(figsize = (7, 7))
plt.title('Correlation of Numeric Features',y=1,size=16)
sns.heatmap(correlation,square = True, vmax=0.8)
from sklearn.model_selection import KFold
#Drop 'id', 'issueDate', 'isDefault' and every column ending in '_outliers'
#All remaining usable columns are defined as features
features = [f for f in train.columns if f not in ['id','issueDate','isDefault'] and '_outliers' not in f]
#Split into training/test features and the training target
x_train = train[features]
x_test = testA[features]
y_train = train['isDefault']
def cv_model(clf, train_x, train_y, test_x, clf_name):
folds = 5 # number of folds
seed = 2020 # random seed
kf = KFold(n_splits=folds, shuffle=True, random_state=seed) # instantiate the k-fold splitter
data_train = np.zeros(train_x.shape[0]) # out-of-fold predictions: one value per training sample (not a square matrix)
data_test = np.zeros(test_x.shape[0])
cv_scores = []
#i is the fold index; train_index and valid_index are the row indices of the training/validation parts of each fold
for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
print('************************************ {} ************************************'.format(str(i+1))) # print the fold number
#Split into trn_x / trn_y (training features/target) and val_x / val_y (validation features/target)
trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
#LightGBM model
if clf_name == "lgb":
#clf here is the lightgbm module; build the training and validation Dataset objects
train_matrix = clf.Dataset(trn_x, label=trn_y)
valid_matrix = clf.Dataset(val_x, label=val_y)
#parameters
params = {
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': 'auc',
'min_child_weight': 5,
'num_leaves': 2 ** 5,
'lambda_l2': 10,
'feature_fraction': 0.8,
'bagging_fraction': 0.8,
'bagging_freq': 4,
'learning_rate': 0.1,
'seed': 2020,
'nthread': -1,
'n_jobs':24,
'verbose': -1,
}
#train the model
model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
#predictions on the validation fold
val_pred = model.predict(val_x, num_iteration=model.best_iteration)
#predictions on the test set
test_pred = model.predict(test_x, num_iteration=model.best_iteration)
#1. zip features with their gain importance; 2. sort in descending order; 3. take the top 20
# print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
#XGBoost model
if clf_name == "xgb":
train_matrix = clf.DMatrix(trn_x , label=trn_y)
valid_matrix = clf.DMatrix(val_x , label=val_y)
params = {'booster': 'gbtree',
'objective': 'binary:logistic',
'eval_metric': 'auc',
'gamma': 1,
'min_child_weight': 1.5,
'max_depth': 5,
'lambda': 10,
'subsample': 0.7,
'colsample_bytree': 0.7,
'colsample_bylevel': 0.7,
'eta': 0.04,
'tree_method': 'exact',
'seed': 2020,
'nthread': -1,
"silent": True,
}
watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
test_pred = model.predict(test_x , ntree_limit=model.best_ntree_limit)
#CatBoost model
if clf_name == "cat":
params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli', 'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
model = clf(iterations=20000, **params)
model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
cat_features=[], use_best_model=True, verbose=500)
val_pred = model.predict(val_x)
test_pred = model.predict(test_x)
data_train[valid_index] = val_pred
data_test += test_pred / kf.n_splits # accumulate the fold-averaged test predictions (+=, otherwise only the last fold is kept)
cv_scores.append(roc_auc_score(val_y, val_pred))
print(cv_scores)
print("%s_scotrainre_list:" % clf_name, cv_scores)
print("%s_score_mean:" % clf_name, np.mean(cv_scores))
print("%s_score_std:" % clf_name, np.std(cv_scores))
return data_train, data_test
def lgb_model(x_train, y_train, x_test):
lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
return lgb_train, lgb_test
def xgb_model(x_train, y_train, x_test):
xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
return xgb_train, xgb_test
def cat_model(x_train, y_train, x_test):
cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
return cat_train, cat_test
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)
Fold 1: [200] train auc 0.749225, valid auc 0.729679; [400] 0.765075 / 0.730496; [600] 0.778745 / 0.730435; early stopping, best iteration 455: train 0.769202, valid 0.730686
Fold 2: [200] 0.749221 / 0.731315; [400] 0.765117 / 0.731658; [600] 0.778542 / 0.731333; best iteration 407: 0.765671 / 0.73173
Fold 3: [200] 0.748436 / 0.732775; [400] 0.764216 / 0.733173; best iteration 386: 0.763261 / 0.733261
Fold 4: [200] 0.749631 / 0.728327; [400] 0.765139 / 0.728845; best iteration 286: 0.756978 / 0.728976
Fold 5: [200] 0.748414 / 0.732727; [400] 0.763727 / 0.733531; [600] 0.777489 / 0.733566; best iteration 524: 0.772372 / 0.733772
lgb_scotrainre_list: [0.7306859913754798, 0.7317304414673989, 0.7332610441015461, 0.7289759386807912, 0.7337723979789789]
lgb_score_mean: 0.7316851627208389
lgb_score_std: 0.0017424259863954693
testA_result = pd.read_csv('D:/myP/financial_risk/sample_submit.csv')
roc_auc_score(testA_result['isDefault'].values, lgb_test)
0.7290917729487896
Feature engineering is one of the most important parts of machine learning (and even deep learning), and in practice it is usually the most time-consuming step. Textbooks tend to cover it only briefly, because feature engineering is tightly coupled to the specific data, which makes it hard to cover every scenario systematically. This chapter introduced some commonly used methods: the handling of missing values and outliers described in detail applies to essentially any dataset, whereas for operations such as binning only a few concrete ideas were given and readers need to explore further on their own. Feature engineering in competitions also differs from real applications: when building credit scorecards in actual financial risk control, feature interpretability is emphasized, so feature binning is especially important. Readers with spare time are encouraged to experiment more; hopefully this section has been useful.
Logistic regression model:
Tree models:
Ensemble models
Model comparison and performance evaluation:
Model tuning:
Advantages
Disadvantages
Advantages
Disadvantages
Ensemble learning completes the learning task by combining multiple learners; through ensembling, several weak learners can be combined into a strong classifier, so the generalization ability of an ensemble is generally better than that of a single classifier.
Ensemble methods mainly include Bagging and Boosting. Both combine existing classification or regression algorithms in a certain way to form a more powerful classifier; the difference lies in how the base learners are combined, which leads to different results. Common Bagging-based ensemble models include random forests; common Boosting-based ensemble models include AdaBoost, GBDT, XGBoost and LightGBM.
The differences between Bagging and Boosting are usually summarized as follows (see also the small sketch below): Bagging trains its base learners independently (and in parallel) on bootstrap samples and combines them by equal-weight voting or averaging, mainly reducing variance; Boosting trains its base learners sequentially, reweighting samples (or fitting residuals) so that later learners focus on previous mistakes, combines them with weighted voting, and mainly reduces bias.
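As an illustration of the difference (not part of the competition baseline), the following sketch compares a Bagging ensemble and a Boosting ensemble of decision trees on a small synthetic dataset; both estimators come from sklearn.ensemble and use their default tree base learners.
# Illustrative comparison of Bagging vs Boosting on synthetic data (not the competition data)
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.model_selection import cross_val_score
X_demo, y_demo = make_classification(n_samples=2000, n_features=20, random_state=0)
bagging = BaggingClassifier(n_estimators=50, random_state=0)    # trees trained independently on bootstrap samples
boosting = AdaBoostClassifier(n_estimators=50, random_state=0)  # trees trained sequentially on reweighted samples
print('Bagging  AUC:', cross_val_score(bagging, X_demo, y_demo, cv=5, scoring='roc_auc').mean())
print('Boosting AUC:', cross_val_score(boosting, X_demo, y_demo, cv=5, scoring='roc_auc').mean())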
The error of a model on the training set is called the training error (or empirical error), and its error on the test set is called the test error.
What we really care about is the model's ability to generalize to new samples: we want to learn, from the available samples, regularities that hold for all potential samples. If the model fits the training samples too well, it may mistake peculiarities of the training set for general patterns; this is the problem of overfitting.
We therefore usually split the available data into a training set and a test set: the training set is used to fit the model, and the test set is used to estimate how well the model discriminates on new samples.
When splitting the dataset, we usually want to satisfy the following two conditions:
There are three common ways to split a dataset: the hold-out method, cross-validation and bootstrapping, introduced in turn below:
① Hold-out method
The hold-out method directly partitions the dataset D into two disjoint sets, one used as the training set S and the other as the test set T. When splitting, the data distribution should be kept as consistent as possible, i.e. the split itself should not introduce extra bias that affects the final result. To keep the distributions consistent, stratified sampling is usually used; a minimal sketch is given below.
Tips: usually about 2/3 to 4/5 of the samples in D are used for training and the rest for testing.
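A minimal hold-out sketch, assuming the train DataFrame from the earlier sections; stratify=y keeps the isDefault ratio consistent between the two parts.
# Hold-out split with stratified sampling (illustrative)
from sklearn.model_selection import train_test_split
X_all = train.drop(['isDefault'], axis=1)
y_all = train['isDefault']
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, test_size=0.2, stratify=y_all, random_state=2020)
print(y_tr.mean(), y_te.mean())  # the default rates of the two parts should be almost identical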
② Cross-validation
k-fold cross-validation splits the dataset D into k parts; k-1 parts form the training set and the remaining part forms the validation set. This yields k train/validation pairs, so the model can be trained and evaluated k times, and the final result is the mean of the k evaluation scores. The splits are again made with stratified sampling.
The choice of k largely determines the stability and fidelity of the evaluation; k = 10 is a common choice.
When k equals the number of samples in the dataset (each fold contains a single sample), the procedure is called leave-one-out. A short stratified k-fold sketch follows.
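A short sketch of stratified k-fold splitting, reusing X_all and y_all from the hold-out sketch above.
# Stratified k-fold splitting (illustrative): k-1 folds train the model, 1 fold validates it, repeated k times
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2020)
for fold, (trn_idx, val_idx) in enumerate(skf.split(X_all, y_all)):
    print('fold {}: train size {}, valid size {}'.format(fold + 1, len(trn_idx), len(val_idx)))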
③ Bootstrapping
Each time we draw one sample from the dataset D, put it into the training set, and then put it back; repeating this m times gives a training set of size m in which some samples appear several times and others never appear. The samples that never appear are used as the test set.
This works as a split because roughly 36.8% of the samples in D never appear in the bootstrap training set. The hold-out method and cross-validation both use stratified sampling to split the data, whereas bootstrapping uses sampling with replacement; a quick numerical check of the 36.8% figure is given below.
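A quick numerical check of the 36.8% figure, which is the limit of (1 - 1/m)^m, i.e. roughly 1/e:
# Bootstrap sampling demo: roughly 1/e ≈ 36.8% of the items are never drawn
import numpy as np
m = 100000
rng = np.random.default_rng(0)
sample = rng.integers(0, m, size=m)           # draw m indices with replacement
oob_ratio = 1 - len(np.unique(sample)) / m    # fraction of items never drawn (out-of-bag)
print(oob_ratio)  # ~0.368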
Summary of dataset splitting
For this competition we use AUC as the model evaluation metric; similar metrics include KS and F1-score, whose details and implementations were covered in Task 1.
So what exactly is AUC?
In logistic regression, positives and negatives are separated by a threshold: samples scoring above the threshold are predicted positive, those below it negative. Lowering the threshold lets more samples be predicted positive, which raises the recall of the positive class but also causes more negatives to be misclassified as positives. The ROC curve is introduced to visualize this trade-off.
The points in ROC space are computed from the classification results at different thresholds and connected to form the ROC curve; the x-axis is the False Positive Rate (FPR) and the y-axis is the True Positive Rate (TPR), as defined in the small example below. In general the curve should lie above the diagonal connecting (0,0) and (1,1), as shown in the figure:
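For reference, a small sketch of how one point on the ROC curve is obtained at a single threshold, with FPR = FP / (FP + TN) and TPR = TP / (TP + FN); the toy labels and scores below are made up purely for illustration.
# One ROC point at a fixed threshold (illustrative toy data)
import numpy as np
from sklearn.metrics import confusion_matrix
y_true = np.array([0, 0, 1, 1, 0, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.6, 0.7])
y_pred = (y_score >= 0.5).astype(int)                      # apply one particular threshold
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print('FPR =', fp / (fp + tn), 'TPR =', tp / (tp + fn))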
Four notable points on the ROC curve:
In short: the closer the ROC curve is to the top-left corner, the better the classifier and the better its generalization performance. Moreover, if the ROC curve is smooth, there is usually no severe overfitting.
But how do we decide which of two models generalizes better? There are two main criteria:
If the ROC curve of model A completely encloses that of model B, model A is considered better than model B;
If the two curves cross, we compare the areas enclosed by each curve and the axes: the larger the area, the better the model. This area is called the AUC (area under the ROC curve).
import pandas as pd
import numpy as np
import warnings
import os
import seaborn as sns
import matplotlib.pyplot as plt
"""
sns 相关设置
@return:
"""
# 声明使用 Seaborn 样式
sns.set()
# 有五种seaborn的绘图风格,它们分别是:darkgrid, whitegrid, dark, white, ticks。默认的主题是darkgrid。
sns.set_style("whitegrid")
# 有四个预置的环境,按大小从小到大排列分别为:paper, notebook, talk, poster。其中,notebook是默认的。
sns.set_context('talk')
# 中文字体设置-黑体
plt.rcParams['font.sans-serif'] = ['SimHei']
# 解决保存图像是负号'-'显示为方块的问题
plt.rcParams['axes.unicode_minus'] = False
# 解决Seaborn中文显示问题并调整字体大小
sns.set(font='SimHei')
The reduce_mem_usage function reduces the memory footprint of the data by downcasting column dtypes
# reduce_mem_usage downcasts column dtypes to reduce the memory used by the data
def reduce_mem_usage(df):
""" iterate through all the columns of a dataframe and modify the data type
to reduce memory usage.
"""
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} bytes'.format(start_mem))
for col in df.columns:
col_type = df[col].dtype
if col_type != object:
c_min = df[col].min()
c_max = df[col].max()
if str(col_type)[:3] == 'int':
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
df[col] = df[col].astype(np.int8)
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
df[col] = df[col].astype(np.int16)
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
df[col] = df[col].astype(np.int32)
elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
df[col] = df[col].astype(np.int64)
else:
if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
df[col] = df[col].astype(np.float16)
elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
df[col] = df[col].astype(np.float32)
else:
df[col] = df[col].astype(np.float64)
else:
df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} bytes'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
return df
data = pd.read_csv('dataset/data_for_model.csv')
data = reduce_mem_usage(data)
Memory usage of dataframe is 928000128.00 bytes
Memory usage after optimization is: 165006456.00 bytes
Decreased by 82.2%
5.3.3 Simple modelling
Tip 1: real financial risk-control projects usually involve credit scoring, so the model features need good interpretability; in practice logistic regression is therefore still mostly used as the base model. In a competition, however, only the score matters and strict interpretability is not required, so most solutions are built on ensemble algorithms.
Tip 2: because of the nature of the logistic regression algorithm, outliers and missing values must be handled beforehand [see Task 3].
Tip 3: tree-based algorithms can skip outlier and missing-value handling, but readers who know the business well may handle them themselves, which can work even better than letting the model deal with them.
Note: the source data used for the modelling below went through the feature engineering of the baseline; missing values and outliers were not treated further.
Pre-modelling preparation
from sklearn.model_selection import KFold
# Separate the datasets for cross-validation
X_train = data.loc[data['sample']=='train', :].drop(['id','issueDate','isDefault', 'sample'], axis=1)
X_test = data.loc[data['sample']=='test', :].drop(['id','issueDate','isDefault', 'sample'], axis=1)
y_train = data.loc[data['sample']=='train', 'isDefault']
# 5-fold cross-validation
folds = 5
seed = 2020
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
"""对训练集数据进行划分,分成训练集和验证集,并进行相应的操作"""
from sklearn.model_selection import train_test_split
import lightgbm as lgb
# 数据集划分
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
valid_matrix = lgb.Dataset(X_val, label=y_val)
params = {
'boosting_type': 'gbdt',
'objective': 'binary',
'learning_rate': 0.1,
'metric': 'auc',
'min_child_weight': 1e-3,
'num_leaves': 31,
'max_depth': -1,
'reg_lambda': 0,
'reg_alpha': 0,
'feature_fraction': 1,
'bagging_fraction': 1,
'bagging_freq': 0,
'seed': 2020,
'nthread': 8,
'silent': True,
'verbose': -1,
}
"""使用训练集数据进行模型训练"""
model = lgb.train(params, train_set=train_matrix, valid_sets=valid_matrix, num_boost_round=20000, verbose_eval=1000, early_stopping_rounds=200)
D:\software\install\Anaconda3\lib\site-packages\lightgbm\basic.py:794: UserWarning: silent keyword has been found in `params` and will be ignored. Please use silent argument of the Dataset constructor to pass this parameter. .format(key))
Training until validation scores don't improve for 200 rounds Early stopping, best iteration is: [427] valid_0's auc: 0.724947
Predict on the validation set
from sklearn import metrics
from sklearn.metrics import roc_auc_score
"""预测并计算roc的相关指标"""
val_pre_lgb = model.predict(X_val, num_iteration=model.best_iteration)
fpr, tpr, threshold = metrics.roc_curve(y_val, val_pre_lgb)
roc_auc = metrics.auc(fpr, tpr)
print('未调参前lightgbm单模型在验证集上的AUC:{}'.format(roc_auc))
"""画出roc曲线图"""
plt.figure(figsize=(8, 8))
plt.title('Validation ROC')
plt.plot(fpr, tpr, 'b', label = 'Val AUC = %0.4f' % roc_auc)
plt.ylim(0,1)
plt.xlim(0,1)
plt.legend(loc='best')
plt.title('ROC')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
# plot the diagonal
plt.plot([0,1],[0,1],'r--')
plt.show()
AUC of the untuned LightGBM model on the validation set: 0.7249469360631181
Going one step further, evaluate the model's performance with 5-fold cross-validation
import lightgbm as lgb
"""Model and predict with LightGBM using 5-fold cross-validation"""
cv_scores = []
for i, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
print('************************************ {} ************************************'.format(str(i+1)))
X_train_split, y_train_split, X_val, y_val = X_train.iloc[train_index], y_train[train_index], X_train.iloc[valid_index], y_train[valid_index]
train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
valid_matrix = lgb.Dataset(X_val, label=y_val)
params = {
'boosting_type': 'gbdt',
'objective': 'binary',
'learning_rate': 0.1,
'metric': 'auc',
'min_child_weight': 1e-3,
'num_leaves': 31,
'max_depth': -1,
'reg_lambda': 0,
'reg_alpha': 0,
'feature_fraction': 1,
'bagging_fraction': 1,
'bagging_freq': 0,
'seed': 2020,
'nthread': 8,
'silent': True,
'verbose': -1,
}
model = lgb.train(params, train_set=train_matrix, num_boost_round=20000, valid_sets=valid_matrix, verbose_eval=1000, early_stopping_rounds=200)
val_pred = model.predict(X_val, num_iteration=model.best_iteration)
cv_scores.append(roc_auc_score(y_val, val_pred))
print(cv_scores)
print("lgb_scotrainre_list:{}".format(cv_scores))
print("lgb_score_mean:{}".format(np.mean(cv_scores)))
print("lgb_score_std:{}".format(np.std(cv_scores)))
************************************ 1 ************************************
D:\software\install\Anaconda3\lib\site-packages\lightgbm\basic.py:794: UserWarning: silent keyword has been found in `params` and will be ignored. Please use silent argument of the Dataset constructor to pass this parameter. .format(key))
Training until validation scores don't improve for 200 rounds Early stopping, best iteration is: [548] valid_0's auc: 0.730384 [0.7303837315833632] ************************************ 2 ************************************
D:\software\install\Anaconda3\lib\site-packages\lightgbm\basic.py:794: UserWarning: silent keyword has been found in `params` and will be ignored. Please use silent argument of the Dataset constructor to pass this parameter. .format(key))
Training until validation scores don't improve for 200 rounds Early stopping, best iteration is: [643] valid_0's auc: 0.725869 [0.7303837315833632, 0.7258692125145638] ************************************ 3 ************************************
D:\software\install\Anaconda3\lib\site-packages\lightgbm\basic.py:794: UserWarning: silent keyword has been found in `params` and will be ignored. Please use silent argument of the Dataset constructor to pass this parameter. .format(key))
Training until validation scores don't improve for 200 rounds Early stopping, best iteration is: [362] valid_0's auc: 0.730515 [0.7303837315833632, 0.7258692125145638, 0.7305149209921737] ************************************ 4 ************************************
D:\software\install\Anaconda3\lib\site-packages\lightgbm\basic.py:794: UserWarning: silent keyword has been found in `params` and will be ignored. Please use silent argument of the Dataset constructor to pass this parameter. .format(key))
Training until validation scores don't improve for 200 rounds Early stopping, best iteration is: [700] valid_0's auc: 0.729612 [0.7303837315833632, 0.7258692125145638, 0.7305149209921737, 0.7296117869375041] ************************************ 5 ************************************
D:\software\install\Anaconda3\lib\site-packages\lightgbm\basic.py:794: UserWarning: silent keyword has been found in `params` and will be ignored. Please use silent argument of the Dataset constructor to pass this parameter. .format(key))
Training until validation scores don't improve for 200 rounds [1000] valid_0's auc: 0.729339 Early stopping, best iteration is: [833] valid_0's auc: 0.729444 [0.7303837315833632, 0.7258692125145638, 0.7305149209921737, 0.7296117869375041, 0.7294438695369077] lgb_scotrainre_list:[0.7303837315833632, 0.7258692125145638, 0.7305149209921737, 0.7296117869375041, 0.7294438695369077] lgb_score_mean:0.7291647043129024 lgb_score_std:0.0016998349834934656
Greedy tuning
Tune the parameter with the largest influence on the model first until the model is optimal under that parameter, then tune the parameter with the next largest influence, and so on until all parameters have been adjusted.
The drawback of this method is that it may converge to a local rather than a global optimum, but it only requires optimizing the parameters one step at a time, which makes it easy to understand.
Pay attention to the order in which tree-model parameters are tuned, i.e. how strongly each parameter affects the model; a commonly used set of parameters and tuning order in day-to-day work is listed below:
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor
# The candidate lists `objective`, `num_leaves` and `max_depth` must be defined beforehand, e.g. (illustrative values only):
# objective = ['regression', 'regression_l1', 'huber']; num_leaves = [10, 20, 31, 50, 80]; max_depth = [3, 5, 7, 9, -1]
# Note: since the cross-validated score is AUC (higher is better), the best parameter is the one with the highest score, hence max() rather than min()
# tune objective
best_obj = dict()
for obj in objective:
    model = LGBMRegressor(objective=obj)
    """predict and compute the cross-validated AUC"""
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc').mean()
    best_obj[obj] = score
# num_leaves
best_leaves = dict()
for leaves in num_leaves:
    model = LGBMRegressor(objective=max(best_obj.items(), key=lambda x:x[1])[0], num_leaves=leaves)
    """predict and compute the cross-validated AUC"""
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc').mean()
    best_leaves[leaves] = score
# max_depth
best_depth = dict()
for depth in max_depth:
    model = LGBMRegressor(objective=max(best_obj.items(), key=lambda x:x[1])[0],
                          num_leaves=max(best_leaves.items(), key=lambda x:x[1])[0],
                          max_depth=depth)
    """predict and compute the cross-validated AUC"""
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc').mean()
    best_depth[depth] = score
"""
The remaining parameters can be tuned one after another in the same way, visualizing the model's score under each best parameter.
"""
Grid search
sklearn provides GridSearchCV for grid search: feed in the model's parameter grid and it returns the best result and parameters. Compared with greedy tuning, grid search gives better results, but it is only suitable for small datasets; once the data volume grows it becomes very hard to obtain results.
Again taking the LightGBM algorithm as an example, tune the parameters with grid search:
"""通过网格搜索确定最优参数"""
from sklearn.model_selection import GridSearchCV
def get_best_cv_params(learning_rate=0.1, n_estimators=581, num_leaves=31, max_depth=-1, bagging_fraction=1.0,
feature_fraction=1.0, bagging_freq=0, min_data_in_leaf=20, min_child_weight=0.001,
min_split_gain=0, reg_lambda=0, reg_alpha=0, param_grid=None):
# 5-fold (stratified) cross-validation
cv_fold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True, )
model_lgb = lgb.LGBMClassifier(learning_rate=learning_rate,
n_estimators=n_estimators,
num_leaves=num_leaves,
max_depth=max_depth,
bagging_fraction=bagging_fraction,
feature_fraction=feature_fraction,
bagging_freq=bagging_freq,
min_data_in_leaf=min_data_in_leaf,
min_child_weight=min_child_weight,
min_split_gain=min_split_gain,
reg_lambda=reg_lambda,
reg_alpha=reg_alpha,
n_jobs= 8
)
grid_search = GridSearchCV(estimator=model_lgb,
cv=cv_fold,
param_grid=param_grid,
scoring='roc_auc'
)
grid_search.fit(X_train, y_train)
print('Current best parameters: {}'.format(grid_search.best_params_))
print('Current best score: {}'.format(grid_search.best_score_))
"""以下代码未运行,耗时较长,请谨慎运行,且每一步的最优参数需要在下一步进行手动更新,请注意"""
"""
需要注意一下的是,除了获取上面的获取num_boost_round时候用的是原生的lightgbm(因为要用自带的cv)
下面配合GridSearchCV时必须使用sklearn接口的lightgbm。
"""
"""设置n_estimators 为581,调整num_leaves和max_depth,这里选择先粗调再细调"""
lgb_params = {'num_leaves': range(10, 80, 5), 'max_depth': range(3,10,2)}
get_best_cv_params(learning_rate=0.1, n_estimators=581, num_leaves=None, max_depth=None, min_data_in_leaf=20,
min_child_weight=0.001,bagging_fraction=1.0, feature_fraction=1.0, bagging_freq=0,
min_split_gain=0, reg_lambda=0, reg_alpha=0, param_grid=lgb_params)
"""num_leaves为30,max_depth为7,进一步细调num_leaves和max_depth"""
lgb_params = {'num_leaves': range(25, 35, 1), 'max_depth': range(5,9,1)}
get_best_cv_params(learning_rate=0.1, n_estimators=85, num_leaves=None, max_depth=None, min_data_in_leaf=20,
min_child_weight=0.001,bagging_fraction=1.0, feature_fraction=1.0, bagging_freq=0,
min_split_gain=0, reg_lambda=0, reg_alpha=0, param_grid=lgb_params)
"""
确定min_data_in_leaf为45,min_child_weight为0.001 ,下面进行bagging_fraction、feature_fraction和bagging_freq的调参
"""
lgb_params = {'bagging_fraction': [i/10 for i in range(5,10,1)],
'feature_fraction': [i/10 for i in range(5,10,1)],
'bagging_freq': range(0,81,10)
}
get_best_cv_params(learning_rate=0.1, n_estimators=85, num_leaves=29, max_depth=7, min_data_in_leaf=45,
min_child_weight=0.001,bagging_fraction=None, feature_fraction=None, bagging_freq=None,
min_split_gain=0, reg_lambda=0, reg_alpha=0, param_grid=lgb_params)
"""
确定bagging_fraction为0.4、feature_fraction为0.6、bagging_freq为 ,下面进行reg_lambda、reg_alpha的调参
"""
lgb_params = {'reg_lambda': [0,0.001,0.01,0.03,0.08,0.3,0.5], 'reg_alpha': [0,0.001,0.01,0.03,0.08,0.3,0.5]}
get_best_cv_params(learning_rate=0.1, n_estimators=85, num_leaves=29, max_depth=7, min_data_in_leaf=45,
min_child_weight=0.001,bagging_fraction=0.9, feature_fraction=0.9, bagging_freq=40,
min_split_gain=0, reg_lambda=None, reg_alpha=None, param_grid=lgb_params)
"""
确定reg_lambda、reg_alpha都为0,下面进行min_split_gain的调参
"""
lgb_params = {'min_split_gain': [i/10 for i in range(0,11,1)]}
get_best_cv_params(learning_rate=0.1, n_estimators=85, num_leaves=29, max_depth=7, min_data_in_leaf=45,
min_child_weight=0.001,bagging_fraction=0.9, feature_fraction=0.9, bagging_freq=40,
min_split_gain=None, reg_lambda=0, reg_alpha=0, param_grid=lgb_params)
"""
参数确定好了以后,我们设置一个比较小的learning_rate 0.005,来确定最终的num_boost_round
"""
# 设置5折交叉验证
# cv_fold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True, )
final_params = {
'boosting_type': 'gbdt',
'learning_rate': 0.01,
'num_leaves': 29,
'max_depth': 7,
'min_data_in_leaf':45,
'min_child_weight':0.001,
'bagging_fraction': 0.9,
'feature_fraction': 0.9,
'bagging_freq': 40,
'min_split_gain': 0,
'reg_lambda':0,
'reg_alpha':0,
'nthread': 6
}
cv_result = lgb.cv(train_set=lgb_train,  # lgb_train: an lgb.Dataset built on the full training data (e.g. lgb.Dataset(X_train, label=y_train))
early_stopping_rounds=20,
num_boost_round=5000,
nfold=5,
stratified=True,
shuffle=True,
params=final_params,
metrics='auc',
seed=0,
)
print('Number of boosting rounds: {}'.format(len(cv_result['auc-mean'])))
print('Cross-validated AUC: {}'.format(max(cv_result['auc-mean'])))
In practice, first set a relatively large learning rate (0.1 in the example above) and determine the number of trees with LightGBM's native cv function, then tune the parameters with the example code above.
Finally, set a smaller learning rate (e.g. 0.05) for the best parameters and determine the number of trees with the cv function again to obtain the final parameters.
Note that for large datasets (such as the data in this project), each layer of parameter tuning above can take a long time.
Bayesian tuning
The bayesian-optimization package needs to be installed before use.
The main idea of Bayesian tuning is: given an objective function to optimize (in the broad sense, only its inputs and outputs need to be specified, not its internal structure or mathematical properties), keep adding sample points to update the posterior distribution of the objective function (a Gaussian process) until the posterior essentially fits the true distribution. Put simply, it uses the information from previous parameter evaluations to choose the next parameters to try.
The steps of Bayesian tuning are:
from sklearn.model_selection import cross_val_score
"""定义优化函数"""
def rf_cv_lgb(num_leaves, max_depth, bagging_fraction, feature_fraction, bagging_freq, min_data_in_leaf,
min_child_weight, min_split_gain, reg_lambda, reg_alpha):
# Build the model
model_lgb = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', metric='auc',
learning_rate=0.1, n_estimators=5000,
num_leaves=int(num_leaves), max_depth=int(max_depth),
bagging_fraction=round(bagging_fraction, 2), feature_fraction=round(feature_fraction, 2),
bagging_freq=int(bagging_freq), min_data_in_leaf=int(min_data_in_leaf),
min_child_weight=min_child_weight, min_split_gain=min_split_gain,
reg_lambda=reg_lambda, reg_alpha=reg_alpha,
n_jobs= 8
)
val = cross_val_score(model_lgb, X_train_split, y_train_split, cv=5, scoring='roc_auc').mean()
return val
from bayes_opt import BayesianOptimization
"""定义优化参数"""
bayes_lgb = BayesianOptimization(
rf_cv_lgb,
{
'num_leaves':(10, 200),
'max_depth':(3, 20),
'bagging_fraction':(0.5, 1.0),
'feature_fraction':(0.5, 1.0),
'bagging_freq':(0, 100),
'min_data_in_leaf':(10,100),
'min_child_weight':(0, 10),
'min_split_gain':(0.0, 1.0),
'reg_alpha':(0.0, 10),
'reg_lambda':(0.0, 10),
}
)
"""开始优化"""
bayes_lgb.maximize(n_iter=10)
| iter | target | bagging_fraction | bagging_freq | feature_fraction | max_depth | min_child_weight | min_data_in_leaf | min_split_gain | num_leaves | reg_alpha | reg_lambda |
| 1 | 0.7263 | 0.7196 | 80.73 | 0.7988 | 19.17 | 5.751 | 40.71 | 0.9548 | 176.2 | 2.939 | 7.212 |
| 2 | 0.7279 | 0.8997 | 74.72 | 0.5904 | 7.259 | 6.175 | 92.03 | 0.4027 | 51.65 | 6.404 | 4.781 |
| 3 | 0.7207 | 0.5133 | 16.53 | 0.9536 | 4.974 | 2.37 | 98.08 | 0.7909 | 52.12 | 4.443 | 4.429 |
| 4 | 0.7276 | 0.6265 | 53.12 | 0.7307 | 10.67 | 1.824 | 18.98 | 0.954 | 60.47 | 6.963 | 1.999 |
| 5 | 0.6963 | 0.6509 | 11.58 | 0.5386 | 11.21 | 7.85 | 11.4 | 0.4269 | 153.0 | 0.5227 | 2.257 |
| 6 | 0.7276 | 0.6241 | 49.76 | 0.6057 | 10.34 | 1.718 | 22.43 | 0.8294 | 55.68 | 6.759 | 2.6 |
| 7 | 0.7283 | 0.9815 | 96.15 | 0.6961 | 19.45 | 1.627 | 37.7 | 0.4185 | 14.22 | 7.057 | 9.924 |
| 8 | 0.7278 | 0.7139 | 96.83 | 0.5063 | 3.941 | 1.469 | 97.28 | 0.07553 | 196.9 | 7.988 | 2.159 |
| 9 | 0.7195 | 0.5352 | 98.72 | 0.9699 | 4.445 | 1.767 | 13.91 | 0.1647 | 191.5 | 4.003 | 2.027 |
| 10 | 0.7281 | 0.7281 | 73.63 | 0.5598 | 19.29 | 0.5344 | 99.66 | 0.933 | 101.4 | 8.836 | 0.9222 |
| 11 | 0.7279 | 0.8213 | 0.05856 | 0.7626 | 17.49 | 8.447 | 10.71 | 0.3252 | 13.64 | 9.319 | 0.4747 |
| 12 | 0.7281 | 0.8372 | 95.71 | 0.9598 | 10.32 | 8.394 | 15.23 | 0.4909 | 94.48 | 9.486 | 9.044 |
| 13 | 0.6993 | 0.5183 | 99.02 | 0.542 | 15.5 | 8.35 | 38.15 | 0.4079 | 58.01 | 0.2668 | 1.652 |
| 14 | 0.7267 | 0.7933 | 4.459 | 0.79 | 7.557 | 2.43 | 27.91 | 0.8725 | 28.32 | 9.967 | 9.885 |
| 15 | 0.6979 | 0.9419 | 1.22 | 0.835 | 11.56 | 9.962 | 93.79 | 0.018 | 197.6 | 9.711 | 3.78 |
"""显示优化结果"""
bayes_lgb.max
{'target': 0.7282530196283977, 'params': {'bagging_fraction': 0.9815471914843896, 'bagging_freq': 96.14757648686668, 'feature_fraction': 0.6961281791730929, 'max_depth': 19.45450235568963, 'min_child_weight': 1.6266132496156782, 'min_data_in_leaf': 37.697878831472295, 'min_split_gain': 0.4184947943942168, 'num_leaves': 14.221122487200399, 'reg_alpha': 7.056502173310882, 'reg_lambda': 9.924023764203156}}
"""调整一个较小的学习率,并通过cv函数确定当前最优的迭代次数"""
base_params_lgb = {
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': 'auc',
'learning_rate': 0.01,
'num_leaves': 14,
'max_depth': 19,
'min_data_in_leaf': 37,
'min_child_weight':1.6,
'bagging_fraction': 0.98,
'feature_fraction': 0.69,
'bagging_freq': 96,
'reg_lambda': 9,
'reg_alpha': 7,
'min_split_gain': 0.4,
'nthread': 8,
'seed': 2020,
'silent': True,
'verbose': -1,
}
cv_result_lgb = lgb.cv(
train_set=train_matrix,
early_stopping_rounds=1000,
num_boost_round=20000,
nfold=5,
stratified=True,
shuffle=True,
params=base_params_lgb,
metrics='auc',
seed=0
)
print('Number of boosting rounds: {}'.format(len(cv_result_lgb['auc-mean'])))
print('Final cross-validated AUC: {}'.format(max(cv_result_lgb['auc-mean'])))
Number of boosting rounds: 14269  Final cross-validated AUC: 0.7315032037635779
With the model parameters fixed, build the final model and validate it on the validation set
import lightgbm as lgb
"""使用lightgbm 5折交叉验证进行建模预测"""
cv_scores = []
for i, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
print('************************************ {} ************************************'.format(str(i+1)))
X_train_split, y_train_split, X_val, y_val = X_train.iloc[train_index], y_train[train_index], X_train.iloc[valid_index], y_train[valid_index]
train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
valid_matrix = lgb.Dataset(X_val, label=y_val)
params = {
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': 'auc',
'learning_rate': 0.01,
'num_leaves': 14,
'max_depth': 19,
'min_data_in_leaf': 37,
'min_child_weight':1.6,
'bagging_fraction': 0.98,
'feature_fraction': 0.69,
'bagging_freq': 96,
'reg_lambda': 9,
'reg_alpha': 7,
'min_split_gain': 0.4,
'nthread': 8,
'seed': 2020,
'silent': True,
}
model = lgb.train(params, train_set=train_matrix, num_boost_round=14269, valid_sets=valid_matrix, verbose_eval=1000, early_stopping_rounds=200)
val_pred = model.predict(X_val, num_iteration=model.best_iteration)
cv_scores.append(roc_auc_score(y_val, val_pred))
print(cv_scores)
print("lgb_scotrainre_list:{}".format(cv_scores))
print("lgb_score_mean:{}".format(np.mean(cv_scores)))
print("lgb_score_std:{}".format(np.std(cv_scores)))
************************************ 1 ************************************
D:\software\install\Anaconda3\lib\site-packages\lightgbm\basic.py:794: UserWarning: silent keyword has been found in `params` and will be ignored. Please use silent argument of the Dataset constructor to pass this parameter. .format(key))
Training until validation scores don't improve for 200 rounds [1000] valid_0's auc: 0.724676 [2000] valid_0's auc: 0.728477 [3000] valid_0's auc: 0.729863 [4000] valid_0's auc: 0.730581 [5000] valid_0's auc: 0.731193 [6000] valid_0's auc: 0.731638 [7000] valid_0's auc: 0.731966 [8000] valid_0's auc: 0.732294 [9000] valid_0's auc: 0.732471 [10000] valid_0's auc: 0.732644 [11000] valid_0's auc: 0.732756 [12000] valid_0's auc: 0.732911 Early stopping, best iteration is: [12687] valid_0's auc: 0.732973 [0.7329726464187137] ************************************ 2 ************************************
D:\software\install\Anaconda3\lib\site-packages\lightgbm\basic.py:794: UserWarning: silent keyword has been found in `params` and will be ignored. Please use silent argument of the Dataset constructor to pass this parameter. .format(key))
Training until validation scores don't improve for 200 rounds [1000] valid_0's auc: 0.721247 [2000] valid_0's auc: 0.724996 [3000] valid_0's auc: 0.726328 [4000] valid_0's auc: 0.727099 [5000] valid_0's auc: 0.727597 [6000] valid_0's auc: 0.727968 [7000] valid_0's auc: 0.728409 [8000] valid_0's auc: 0.728648 [9000] valid_0's auc: 0.728889 [10000] valid_0's auc: 0.729017 [11000] valid_0's auc: 0.72913 [12000] valid_0's auc: 0.7293 [13000] valid_0's auc: 0.729426 Early stopping, best iteration is: [13014] valid_0's auc: 0.729429 [0.7329726464187137, 0.7294292852806246] ************************************ 3 ************************************
D:\software\install\Anaconda3\lib\site-packages\lightgbm\basic.py:794: UserWarning: silent keyword has been found in `params` and will be ignored. Please use silent argument of the Dataset constructor to pass this parameter. .format(key))
Training until validation scores don't improve for 200 rounds [1000] valid_0's auc: 0.725542 [2000] valid_0's auc: 0.729287 [3000] valid_0's auc: 0.730627 [4000] valid_0's auc: 0.731514 [5000] valid_0's auc: 0.7321 [6000] valid_0's auc: 0.732653 [7000] valid_0's auc: 0.732959 [8000] valid_0's auc: 0.733206 [9000] valid_0's auc: 0.733471 [10000] valid_0's auc: 0.733665 [11000] valid_0's auc: 0.733858 [12000] valid_0's auc: 0.734001 [13000] valid_0's auc: 0.734086 Early stopping, best iteration is: [13640] valid_0's auc: 0.734151 [0.7329726464187137, 0.7294292852806246, 0.7341505801564857] ************************************ 4 ************************************
D:\software\install\Anaconda3\lib\site-packages\lightgbm\basic.py:794: UserWarning: silent keyword has been found in `params` and will be ignored. Please use silent argument of the Dataset constructor to pass this parameter. .format(key))
Training until validation scores don't improve for 200 rounds [1000] valid_0's auc: 0.723942 [2000] valid_0's auc: 0.727793 [3000] valid_0's auc: 0.729261 [4000] valid_0's auc: 0.730247 [5000] valid_0's auc: 0.730821 [6000] valid_0's auc: 0.731352 [7000] valid_0's auc: 0.731793 [8000] valid_0's auc: 0.732006 [9000] valid_0's auc: 0.732191 [10000] valid_0's auc: 0.732363 [11000] valid_0's auc: 0.732604 [12000] valid_0's auc: 0.732778 Early stopping, best iteration is: [12787] valid_0's auc: 0.732833 [0.7329726464187137, 0.7294292852806246, 0.7341505801564857, 0.7328331383185244] ************************************ 5 ************************************
D:\software\install\Anaconda3\lib\site-packages\lightgbm\basic.py:794: UserWarning: silent keyword has been found in `params` and will be ignored. Please use silent argument of the Dataset constructor to pass this parameter. .format(key))
Training until validation scores don't improve for 200 rounds [1000] valid_0's auc: 0.723676 [2000] valid_0's auc: 0.727282 [3000] valid_0's auc: 0.728593 [4000] valid_0's auc: 0.729493 [5000] valid_0's auc: 0.730087 [6000] valid_0's auc: 0.730515 [7000] valid_0's auc: 0.730872 [8000] valid_0's auc: 0.731121 [9000] valid_0's auc: 0.731351 [10000] valid_0's auc: 0.731502 [11000] valid_0's auc: 0.731707 Early stopping, best iteration is: [11192] valid_0's auc: 0.731741 [0.7329726464187137, 0.7294292852806246, 0.7341505801564857, 0.7328331383185244, 0.7317405262608612] lgb_scotrainre_list:[0.7329726464187137, 0.7294292852806246, 0.7341505801564857, 0.7328331383185244, 0.7317405262608612] lgb_score_mean:0.732225235287042 lgb_score_std:0.0015929470575114753
The 5-fold cross-validation shows that the model stops improving at around 13,000 iterations, so when building the new model we set the maximum number of iterations directly and evaluate it on the validation set
""""""
base_params_lgb = {
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': 'auc',
'learning_rate': 0.01,
'num_leaves': 14,
'max_depth': 19,
'min_data_in_leaf': 37,
'min_child_weight':1.6,
'bagging_fraction': 0.98,
'feature_fraction': 0.69,
'bagging_freq': 96,
'reg_lambda': 9,
'reg_alpha': 7,
'min_split_gain': 0.4,
'nthread': 8,
'seed': 2020,
'silent': True,
}
"""使用训练集数据进行模型训练"""
final_model_lgb = lgb.train(base_params_lgb, train_set=train_matrix, valid_sets=valid_matrix, num_boost_round=13000, verbose_eval=1000, early_stopping_rounds=200)
"""预测并计算roc的相关指标"""
val_pre_lgb = final_model_lgb.predict(X_val)
fpr, tpr, threshold = metrics.roc_curve(y_val, val_pre_lgb)
roc_auc = metrics.auc(fpr, tpr)
print('调参后lightgbm单模型在验证集上的AUC:{}'.format(roc_auc))
"""画出roc曲线图"""
plt.figure(figsize=(8, 8))
plt.title('Validation ROC')
plt.plot(fpr, tpr, 'b', label = 'Val AUC = %0.4f' % roc_auc)
plt.ylim(0,1)
plt.xlim(0,1)
plt.legend(loc='best')
plt.title('ROC')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
# plot the diagonal
plt.plot([0,1],[0,1],'r--')
plt.show()
Training until validation scores don't improve for 200 rounds [1000] valid_0's auc: 0.723676 [2000] valid_0's auc: 0.727282 [3000] valid_0's auc: 0.728593 [4000] valid_0's auc: 0.729493 [5000] valid_0's auc: 0.730087 [6000] valid_0's auc: 0.730515 [7000] valid_0's auc: 0.730872 [8000] valid_0's auc: 0.731121 [9000] valid_0's auc: 0.731351 [10000] valid_0's auc: 0.731502 [11000] valid_0's auc: 0.731707 Early stopping, best iteration is: [11192] valid_0's auc: 0.731741 AUC of the tuned LightGBM model on the validation set: 0.7317405262608612
Compared with the original, untuned parameters, the model's performance has indeed improved
"""Save the model locally"""
# Save the model
import pickle
pickle.dump(final_model_lgb, open('dataset/model_lgb_best.pkl', 'wb'))
Brief summary on model tuning
The built-in cv functions of the ensemble libraries can tune a single parameter fairly quickly and are generally used first to determine the number of boosting iterations of the tree model
When the data volume is large (as in this project), grid search becomes extremely slow and is not recommended
The native APIs and the sklearn wrappers of these ensemble libraries use partly different parameter names; be careful and refer to the official XGBoost and LightGBM APIs (a few common LightGBM alias pairs are sketched below)
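For reference, here is a small sketch of common parameter alias pairs between LightGBM's native API and its sklearn wrapper; the mapping follows the LightGBM documentation and should be double-checked against the version you actually use.
# Common LightGBM parameter aliases: native API name -> sklearn wrapper name (verify against your LightGBM version)
lgb_param_aliases = {
    'num_iterations / num_boost_round': 'n_estimators',
    'feature_fraction': 'colsample_bytree',
    'bagging_fraction': 'subsample',
    'bagging_freq': 'subsample_freq',
    'lambda_l1': 'reg_alpha',
    'lambda_l2': 'reg_lambda',
    'min_data_in_leaf': 'min_child_samples',
}
for native, sk in lgb_param_aliases.items():
    print('{:35s} -> {}'.format(native, sk))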
In this section we completed the modelling and tuning work: during modelling we evaluated model performance through dataset splitting and cross-validation, and visualized the model by plotting ROC curves.
Finally we tuned the model, covering three approaches: greedy tuning, grid search and Bayesian tuning, with Bayesian tuning used for a simple optimization of this project. In practice you can follow these tuning ideas for your own optimization without sticking rigidly to the concrete examples given above.