题目嘛,大概就是一个信用评分预测,评判指标为AUC。(官网已经查不到了,大家找找别的帖子看看吧)
def pre_data(data):
    """Clean the user tag table: normalise missing-value markers, drop two
    sparse education columns, label-encode categorical strings and fill the
    remaining gaps.

    Parameters
    ----------
    data : pd.DataFrame
        Concatenated train+test tag table; modified in place.

    Returns
    -------
    pd.DataFrame
        The cleaned table.
    """
    # "\N" and "~" are the raw export's missing-value markers.
    # Use np.nan (lowercase): the np.NaN alias was removed in NumPy 2.0.
    data.replace("\\N", np.nan, inplace=True)
    data.replace("~", np.nan, inplace=True)
    # Drop two education columns: heavily missing and redundant with acdm_deg_cd.
    data.drop(['edu_deg_cd', 'deg_cd'], axis=1, inplace=True)
    lab = LabelEncoder()
    str_col = ['gdr_cd', 'mrg_situ_cd', 'acdm_deg_cd', 'atdd_type']
    # LabelEncoder rejects NaN, so fill with a sentinel string first.
    data[str_col] = data[str_col].fillna('None')
    # Encode each categorical string column to integer codes (refit per column).
    data["gdr_cd"] = lab.fit_transform(data.gdr_cd)
    data["mrg_situ_cd"] = lab.fit_transform(data.mrg_situ_cd)
    data["acdm_deg_cd"] = lab.fit_transform(data.acdm_deg_cd)
    data["atdd_type"] = lab.fit_transform(data.atdd_type.astype(str))
    # Binary 0/1 indicator columns: treat "missing" as its own category '2'.
    col_012 = ['ic_ind', 'fr_or_sh_ind', 'dnl_mbl_bnk_ind', 'dnl_bind_cmb_lif_ind', 'hav_car_grp_ind',
               'hav_hou_grp_ind',
               'l6mon_agn_ind', 'vld_rsk_ases_ind']
    data[col_012] = data[col_012].fillna('2')
    # Grade-code columns: fill with the mode value '-1'.
    col_11 = ['frs_agn_dt_cnt', 'fin_rsk_ases_grd_cd', 'confirm_rsk_ases_lvl_typ_cd',
              'tot_ast_lvl_cd', 'pot_ast_lvl_cd', 'hld_crd_card_grd_cd']
    data[col_11] = data[col_11].fillna('-1')
    data['cust_inv_rsk_endu_lvl_cd'] = data['cust_inv_rsk_endu_lvl_cd'].fillna('1')  # fill with the mode, 1
    # Everything left gets '0'. NOTE: fills are strings, matching the raw
    # string-typed columns; numeric conversion happens downstream.
    data = data.fillna('0')
    return data
针对上面第3行,先合并后进入pre_data函数,再拆分。
# Concatenate the train and test tag tables so preprocessing (missing-value
# handling, label encoding) is applied consistently to both, then split back.
data=pd.concat([train_tag,test_tag],axis=0,ignore_index=True,sort=False)
# Test rows have no label; mark them with -1 so they can be separated later.
data['flag']=data['flag'].fillna(-1)
data=pre_data(data)
train_tag=data[data.flag != -1].copy()
test_tag=data[data.flag == -1].copy()
test_tag.drop(['flag'],axis=1,inplace=True)  # the test set keeps no label column
test_tag=test_tag.reset_index(drop=True)
强特:信用卡天数、借记卡天数、信用卡等级等一系列特征。对这些特征进行相乘,目的使用户之间的差距增大,强特增益。
(B榜构造的,助我从0.777–>0.779)
def tag(data):
    """Interaction features on the tag table.

    Multiplies the strong features (credit-card holding days, card count,
    grade codes) pairwise to widen the gap between users. Adds eight new
    columns to *data* in place and returns it.
    """
    days = data['cur_credit_min_opn_dt_cnt']
    cards = data['cur_credit_cnt']
    csm_grade = pd.to_numeric(data['l1y_crd_card_csm_amt_dlm_cd'])
    lmt_grade = pd.to_numeric(data['perm_crd_lmt_cd'])
    crd_grade = pd.to_numeric(data['hld_crd_card_grd_cd'])
    # credit card: holding days * grade
    data['credit_level1'] = days * csm_grade
    data['credit_level2'] = days * lmt_grade
    data['credit_level3'] = days * crd_grade
    data['level_level'] = csm_grade * lmt_grade  # grade * grade
    data['credit_amount'] = days * cards         # holding days * card count
    # credit card: card count * grade
    data['amount_level1'] = cards * lmt_grade
    data['amount_level2'] = cards * csm_grade
    data['amount_level3'] = cards * crd_grade
    return data
trd表是交易流水表,能够挖掘很多统计特征,可以说是本次最重要的一个表。
def trd(data):
    """Per-user statistics from the transaction-flow (trd) table.

    Builds income/expense/overall sums, extrema, means, counts, day counts
    and derived per-day / per-trade averages and ratios, one row per `id`.
    NaNs (users missing a side, e.g. no expenses) are filled with 0 at the
    end, so derived columns that saw a NaN also end up 0.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain 'id', 'trx_tm' (%Y/%m/%d strings) and 'cny_trx_amt'
        (signed amount: income > 0, expense < 0). Modified in place
        (trx_tm reformatted).

    Returns
    -------
    pd.DataFrame
        One row per id with the aggregate features.
    """
    # Normalise timestamps to 8-digit YYYYMMDD strings.
    data['trx_tm'] = pd.to_datetime(data['trx_tm'], format='%Y/%m/%d')
    data['trx_tm'] = data['trx_tm'].dt.strftime('%Y%m%d')
    income = data[data['cny_trx_amt'] > 0]
    expend = data[data['cny_trx_amt'] < 0]
    # Income stats: sum, max, min, mean, std, count.
    # Dict-renaming .agg({...}) was removed in pandas 1.0 -> named aggregation.
    temp1 = income.groupby(by=['id'])['cny_trx_amt'].agg(
        income_sum='sum', income_max='max', income_min='min',
        income_mean='mean', income_count='count', income_std='std').reset_index()
    # Expenses are negative, so min and max swap roles.
    temp2 = expend.groupby(by=['id'])['cny_trx_amt'].agg(
        expend_sum='sum', expend_max='min', expend_min='max',
        expend_mean='mean', expend_count='count',
        ecpend_std='std').reset_index()  # 'ecpend' typo kept: downstream expects this column name
    temp3 = data.groupby(by=['id'])['cny_trx_amt'].agg(
        trd_mean='mean', trd_std='std').reset_index()
    # Trading-day counts (distinct days).
    temp4 = data.groupby(by=['id'])['trx_tm'].agg(
        trd_day_count='nunique').reset_index()
    temp5 = income.groupby(by=['id'])['trx_tm'].agg(
        income_day_count='nunique').reset_index()
    temp6 = expend.groupby(by=['id'])['trx_tm'].agg(
        expend_day_count='nunique').reset_index()
    data_t = temp3
    for t in (temp1, temp2, temp4, temp5, temp6):
        data_t = pd.merge(data_t, t, on=['id'], how='left')
    # Totals (expend_sum is negative).
    data_t['trd_sum'] = data_t['income_sum'] + data_t['expend_sum']    # net balance
    data_t['trd_total'] = data_t['income_sum'] - data_t['expend_sum']  # total turnover
    data_t['trd_count'] = data_t['income_count'] + data_t['expend_count']
    # Per-trade / per-day averages.
    data_t['trd_every_total'] = data_t['trd_total'] / data_t['trd_count']
    data_t['trd_day_total'] = data_t['trd_total'] / data_t['trd_day_count']
    data_t['income_every_sum'] = data_t['income_sum'] / data_t['income_count']
    data_t['income_day_sum'] = data_t['income_sum'] / data_t['income_day_count']
    data_t['expend_every_sum'] = data_t['expend_sum'] / data_t['expend_count']
    data_t['expend_day_sum'] = data_t['expend_sum'] / data_t['expend_day_count']
    # Per-day trade counts.
    data_t['trd_daymean_count'] = data_t['trd_count'] / data_t['trd_day_count']
    data_t['income_daymean_count'] = data_t['income_count'] / data_t['income_day_count']
    data_t['expend_daymean_count'] = data_t['expend_count'] / data_t['expend_day_count']
    # Count and amount share ratios.
    data_t['income_count_ratio'] = data_t['income_count'] / data_t['trd_count']
    data_t['expend_count_ratio'] = data_t['expend_count'] / data_t['trd_count']
    data_t['income_total_ratio'] = data_t['income_sum'] / data_t['trd_total']
    data_t['expend_total_ratio'] = -data_t['expend_sum'] / data_t['trd_total']
    data_t = data_t.fillna(0)
    return data_t
def trd2(data):
    """Daily extrema features from the transaction-flow table, per user.

    Produces: max trades in a single day, max/min single-day absolute
    turnover, max/min single trade (absolute), max/min single-day signed
    balance, and the first/last trading day.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain 'id', 'trx_tm' (%Y/%m/%d strings) and 'cny_trx_amt'.
        Modified in place (trx_tm reformatted).

    Returns
    -------
    pd.DataFrame
        One row per id.
    """
    # Normalise timestamps to 8-digit YYYYMMDD strings.
    data['trx_tm'] = pd.to_datetime(data['trx_tm'], format='%Y/%m/%d')
    data['trx_tm'] = data['trx_tm'].dt.strftime('%Y%m%d')
    # Trade-count features: pivot id x day, then take the row max.
    aa = data.groupby(['id', 'trx_tm'])['cny_trx_amt'].count().unstack().reset_index()
    day_cols = aa.columns.tolist()[1:]
    aa['trdnum_day_max'] = aa[day_cols].max(axis=1)  # max trades in one day
    aa = aa[['id', 'trdnum_day_max']]
    # Amount features, sign ignored (absolute turnover).
    temp = data[['id', 'trx_tm', 'cny_trx_amt']].copy()
    temp['cny_trx_amt'] = temp['cny_trx_amt'].abs()
    a1 = temp.groupby(['id', 'trx_tm'])['cny_trx_amt'].sum().unstack().reset_index()
    day_cols = a1.columns.tolist()[1:]
    a1['trdtotal_day_max'] = a1[day_cols].max(axis=1)  # max single-day turnover
    a1['trdtotal_day_min'] = a1[day_cols].min(axis=1)  # min single-day turnover
    a1 = a1[['id', 'trdtotal_day_max', 'trdtotal_day_min']]
    # Single-trade extrema; dict-renaming agg removed in pandas 1.0 -> named agg.
    a2 = temp.groupby(by=['id'])['cny_trx_amt'].agg(
        every_max='max', every_min='min').reset_index()
    a1 = pd.merge(a1, a2, on=['id'], how='left')
    # Balance features, keeping signs.
    a3 = data.groupby(['id', 'trx_tm'])['cny_trx_amt'].sum().unstack().reset_index()
    day_cols = a3.columns.tolist()[1:]
    a3['trdsum_day_max'] = a3[day_cols].max(axis=1)  # max single-day balance
    a3['trdsum_day_min'] = a3[day_cols].min(axis=1)  # min single-day balance
    a3 = a3[['id', 'trdsum_day_max', 'trdsum_day_min']]
    # First and last trading day (lexicographic min/max works on YYYYMMDD).
    a4 = data.groupby(by=['id'])['trx_tm'].agg(
        trx_date_min='min', trx_date_max='max').reset_index()
    aa = pd.merge(aa, a1, on=['id'], how='left')
    aa = pd.merge(aa, a3, on=['id'], how='left')
    aa = pd.merge(aa, a4, on=['id'], how='left')
    return aa
“交易最后一天”也是一个强特。
def month_trd(data):
    """Per-month version of trd(): income/expense/turnover statistics per
    user and month, with month-prefixed column names (e.g. '05_income_sum').

    Fixes two issues in the original:
    - dict-renaming .agg({...}) was removed in pandas 1.0 -> named aggregation
      via **{...} (column names are dynamic);
    - the original `if i == month_value[1]` reset data_t from the third month
      onward; now the first month seeds data_t and later months merge in
      (identical behaviour when exactly two months are present).

    NOTE(review): data_t is seeded from the first month's ids and all merges
    are left joins, so users active only in later months are dropped — this
    matches the original 2-month behaviour; confirm it is intended.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain 'id', 'trx_tm' (%Y/%m/%d) and 'cny_trx_amt'.
        Modified in place (trx_tm reformatted, 'month' column added).

    Returns
    -------
    pd.DataFrame
        One row per id with month-prefixed aggregate features.
    """
    data['trx_tm'] = pd.to_datetime(data['trx_tm'], format='%Y/%m/%d')
    data['trx_tm'] = data['trx_tm'].dt.strftime('%Y%m%d')
    data['month'] = data['trx_tm'].apply(lambda x: x[4:6])  # 'MM' slice of YYYYMMDD
    month_value = sorted(data.month.unique().tolist())
    data_t = None
    for i in month_value:
        temp_data = data[data['month'] == i]
        income = temp_data[temp_data['cny_trx_amt'] > 0]
        expend = temp_data[temp_data['cny_trx_amt'] < 0]
        # Income stats: sum, max, min, mean, std, count.
        temp1 = income.groupby(by=['id'])['cny_trx_amt'].agg(**{
            i + '_income_sum': 'sum', i + '_income_max': 'max', i + '_income_min': 'min',
            i + '_income_mean': 'mean', i + '_income_count': 'count',
            i + '_income_std': 'std'}).reset_index()
        # Expenses are negative, so min and max swap roles.
        temp2 = expend.groupby(by=['id'])['cny_trx_amt'].agg(**{
            i + '_expend_sum': 'sum', i + '_expend_max': 'min', i + '_expend_min': 'max',
            i + '_expend_mean': 'mean', i + '_expend_count': 'count',
            i + '_ecpend_std': 'std'}).reset_index()  # 'ecpend' typo kept for downstream compatibility
        temp3 = temp_data.groupby(by=['id'])['cny_trx_amt'].agg(**{
            i + '_trd_sum': 'sum', i + '_trd_count': 'count',
            i + '_trd_mean': 'mean', i + '_trd_std': 'std'}).reset_index()
        # Trading-day counts (distinct days).
        temp4 = temp_data.groupby(by=['id'])['trx_tm'].agg(**{
            i + '_trd_day_count': 'nunique'}).reset_index()
        temp5 = income.groupby(by=['id'])['trx_tm'].agg(**{
            i + '_income_day_count': 'nunique'}).reset_index()
        temp6 = expend.groupby(by=['id'])['trx_tm'].agg(**{
            i + '_expend_day_count': 'nunique'}).reset_index()
        # First month seeds data_t; subsequent months are merged in.
        if data_t is None:
            data_t = pd.merge(temp3, temp1, on=['id'], how='left')
        else:
            data_t = pd.merge(data_t, temp3, on=['id'], how='left')
            data_t = pd.merge(data_t, temp1, on=['id'], how='left')
        for t in (temp2, temp4, temp5, temp6):
            data_t = pd.merge(data_t, t, on=['id'], how='left')
        # Derived features (NaNs from a missing side propagate, then the
        # fillna(0) below zeroes them — same as the original).
        data_t[i + '_trd_total'] = data_t[i + '_income_sum'] - data_t[i + '_expend_sum']  # turnover
        data_t[i + '_trd_every_total'] = data_t[i + '_trd_total'] / data_t[i + '_trd_count']
        data_t[i + '_trd_day_total'] = data_t[i + '_trd_total'] / data_t[i + '_trd_day_count']
        data_t[i + '_income_every_sum'] = data_t[i + '_income_sum'] / data_t[i + '_income_count']
        data_t[i + '_income_day_sum'] = data_t[i + '_income_sum'] / data_t[i + '_income_day_count']
        data_t[i + '_expend_every_sum'] = data_t[i + '_expend_sum'] / data_t[i + '_expend_count']
        data_t[i + '_expend_day_sum'] = data_t[i + '_expend_sum'] / data_t[i + '_expend_day_count']
        # Per-day counts.
        data_t[i + '_trd_daymean_count'] = data_t[i + '_trd_count'] / data_t[i + '_trd_day_count']
        data_t[i + '_income_daymean_count'] = data_t[i + '_income_count'] / data_t[i + '_income_day_count']
        data_t[i + '_expend_daymean_count'] = data_t[i + '_expend_count'] / data_t[i + '_expend_day_count']
        # Count and amount share ratios.
        data_t[i + '_income_count_ratio'] = data_t[i + '_income_count'] / data_t[i + '_trd_count']
        data_t[i + '_expend_count_ratio'] = data_t[i + '_expend_count'] / data_t[i + '_trd_count']
        data_t[i + '_income_total_ratio'] = data_t[i + '_income_sum'] / data_t[i + '_trd_total']
        data_t[i + '_expend_total_ratio'] = -data_t[i + '_expend_sum'] / data_t[i + '_trd_total']
        data_t = data_t.fillna(0)
    return data_t
def month_trd2(data):
    """Per-month first/last trading day and their interval, per user.

    For each month present in the data, adds '<MM>_trx_date_min',
    '<MM>_trx_date_max' and '<MM>_trx_date_interval' columns. The overall
    first/last day only serves as the per-id merge skeleton and is dropped
    before returning.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain 'id' and 'trx_tm' (%Y/%m/%d strings). Modified in
        place (trx_tm reformatted, 'month' column added).

    Returns
    -------
    pd.DataFrame
        One row per id.
    """
    data['trx_tm'] = pd.to_datetime(data['trx_tm'], format='%Y/%m/%d')
    data['trx_tm'] = data['trx_tm'].dt.strftime('%Y%m%d')
    data['month'] = data['trx_tm'].apply(lambda x: x[4:6])
    month_value = sorted(data.month.unique().tolist())
    # Dict-renaming agg removed in pandas 1.0 -> named aggregation.
    temp = data.groupby(by=['id'])['trx_tm'].agg(
        trx_date_min='min', trx_date_max='max').reset_index()
    for i in month_value:
        aa = data[data['month'] == i].groupby(by=['id'])['trx_tm'].agg(**{
            i + '_trx_date_min': 'min', i + '_trx_date_max': 'max'}).reset_index()
        # Crude interval: numeric difference of the YYYYMMDD strings
        # (only valid within one month, which is the case here).
        aa[i + '_trx_date_interval'] = pd.to_numeric(aa[i + '_trx_date_max']) - pd.to_numeric(aa[i + '_trx_date_min'])
        temp = pd.merge(temp, aa, on=['id'], how='left')
    temp = temp.fillna(0)
    temp.drop(['trx_date_min', 'trx_date_max'], axis=1, inplace=True)
    return temp
def trx_cod1(data):
    """Per-user count/amount statistics for each value of the Trx_Cod1_Cd
    transaction-category code, plus ratios against the user's totals.

    For each category value i, produces count + count ratio, signed-amount
    stats (sum/min/max/mean/std), absolute turnover and its share of the
    user's total turnover. The two denominator columns are dropped at the end.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain 'id', 'Trx_Cod1_Cd' and 'cny_trx_amt'.

    Returns
    -------
    pd.DataFrame
        One row per id.
    """
    cod_values = sorted(data.Trx_Cod1_Cd.unique().tolist())
    temp_data = data[['id', 'Trx_Cod1_Cd', 'cny_trx_amt']].copy()
    temp_data['cny_trx_amt'] = temp_data['cny_trx_amt'].abs()
    # Denominators: total trade count and total absolute turnover per user.
    # Dict-renaming agg removed in pandas 1.0 -> named aggregation.
    aa = data.groupby(by=['id'])['Trx_Cod1_Cd'].agg(trd_count='count').reset_index()
    bb = temp_data.groupby(by=['id'])['cny_trx_amt'].agg(trd_total='sum').reset_index()
    aa = pd.merge(aa, bb, on='id', how='left')
    for i in cod_values:
        sub = data[data['Trx_Cod1_Cd'] == i]
        prefix = 'Trx_Cod1_' + str(i)
        # Per-category count and its share of the user's trades.
        temp1 = sub.groupby(by=['id'])['Trx_Cod1_Cd'].agg(
            **{prefix + '_count': 'count'}).reset_index()
        aa = pd.merge(aa, temp1, on='id', how='left')
        aa[prefix + '_ratio'] = aa[prefix + '_count'] / aa['trd_count']
        # Per-category signed-amount statistics.
        temp2 = sub.groupby(by=['id'])['cny_trx_amt'].agg(**{
            prefix + '_sum': 'sum', prefix + '_min': 'min', prefix + '_max': 'max',
            prefix + '_mean': 'mean', prefix + '_std': 'std'}).reset_index()
        aa = pd.merge(aa, temp2, on='id', how='left')
        # Per-category absolute turnover and its share of the user's total.
        temp3 = temp_data[temp_data['Trx_Cod1_Cd'] == i].groupby(by=['id'])['cny_trx_amt'].agg(
            **{prefix + '_total': 'sum'}).reset_index()
        aa = pd.merge(aa, temp3, on='id', how='left')
        aa['total_' + str(i) + '_ratio'] = aa[prefix + '_total'] / aa['trd_total']
    aa = aa.fillna(0)
    aa.drop(['trd_count', 'trd_total'], axis=1, inplace=True)
    return aa
def dat_flg3(data):
    """Per-user count/amount statistics for each value of the Dat_Flg3_Cd
    flag code, plus ratios against the user's totals.

    Mirrors trx_cod1() but keyed on Dat_Flg3_Cd. For each flag value i,
    produces count + count ratio, signed-amount stats (sum/min/max/mean/std),
    absolute turnover and its share of the user's total turnover. The two
    denominator columns are dropped at the end.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain 'id', 'Dat_Flg3_Cd' and 'cny_trx_amt'.

    Returns
    -------
    pd.DataFrame
        One row per id.
    """
    flg_values = sorted(data.Dat_Flg3_Cd.unique().tolist())
    temp_data = data[['id', 'Dat_Flg3_Cd', 'cny_trx_amt']].copy()
    temp_data['cny_trx_amt'] = temp_data['cny_trx_amt'].abs()
    # Denominators: total trade count and total absolute turnover per user.
    # Dict-renaming agg removed in pandas 1.0 -> named aggregation.
    aa = data.groupby(by=['id'])['Dat_Flg3_Cd'].agg(trd_count='count').reset_index()
    bb = temp_data.groupby(by=['id'])['cny_trx_amt'].agg(trd_total='sum').reset_index()
    aa = pd.merge(aa, bb, on='id', how='left')
    for i in flg_values:
        sub = data[data['Dat_Flg3_Cd'] == i]
        prefix = 'Dat_Flg3_' + str(i)
        # Per-flag count and its share of the user's trades.
        temp1 = sub.groupby(by=['id'])['Dat_Flg3_Cd'].agg(
            **{prefix + '_count': 'count'}).reset_index()
        aa = pd.merge(aa, temp1, on='id', how='left')
        aa[prefix + '_ratio'] = aa[prefix + '_count'] / aa['trd_count']
        # Per-flag signed-amount statistics.
        temp2 = sub.groupby(by=['id'])['cny_trx_amt'].agg(**{
            prefix + '_sum': 'sum', prefix + '_min': 'min', prefix + '_max': 'max',
            prefix + '_mean': 'mean', prefix + '_std': 'std'}).reset_index()
        aa = pd.merge(aa, temp2, on='id', how='left')
        # Per-flag absolute turnover and its share of the user's total.
        temp3 = temp_data[temp_data['Dat_Flg3_Cd'] == i].groupby(by=['id'])['cny_trx_amt'].agg(
            **{prefix + '_total': 'sum'}).reset_index()
        aa = pd.merge(aa, temp3, on='id', how='left')
        aa['total_' + str(i) + '_ratio'] = aa[prefix + '_total'] / aa['trd_total']
    aa = aa.fillna(0)
    aa.drop(['trd_count', 'trd_total'], axis=1, inplace=True)
    return aa
def trx_cod2(data):
    """Per-user signed-amount sum for each value of the fine-grained
    Trx_Cod2_Cd transaction code.

    Adds one 'Trx_Cod2_<i>_sum' column per code value; missing combinations
    are filled with 0. The helper count column is dropped before returning.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain 'id', 'Trx_Cod2_Cd' and 'cny_trx_amt'.

    Returns
    -------
    pd.DataFrame
        One row per id.
    """
    cod_values = sorted(data.Trx_Cod2_Cd.unique().tolist())
    # Merge skeleton (one row per id); dict-renaming agg removed in
    # pandas 1.0 -> named aggregation.
    aa = data.groupby(by=['id'])['Trx_Cod2_Cd'].agg(trd_count='count').reset_index()
    for i in cod_values:
        temp2 = data[data['Trx_Cod2_Cd'] == i].groupby(by=['id'])['cny_trx_amt'].agg(
            **{'Trx_Cod2_' + str(i) + '_sum': 'sum'}).reset_index()
        aa = pd.merge(aa, temp2, on='id', how='left')
    aa = aa.fillna(0)
    aa.drop(['trd_count'], axis=1, inplace=True)
    return aa
此表为APP页面点击表,确实感觉没什么挖的空间,只对基本的点击事件,次数,小类统计等进行构造。(小类部分A榜掉分,B榜上分)
def beh(data):
    """Aggregate the APP click table per user.

    Produces: total click count, per-page-type click ratio (the raw counts
    are dropped), max clicks in a single day, number of active days, mean
    clicks per day, first/last click day and their numeric interval.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain 'id', 'page_tm' (%Y/%m/%d strings) and 'page_no'
        (string page codes). Modified in place (page_tm reformatted).

    Returns
    -------
    pd.DataFrame
        One row per id.
    """
    # Normalise timestamps to 8-digit YYYYMMDD strings.
    data['page_tm'] = pd.to_datetime(data['page_tm'], format='%Y/%m/%d')
    data['page_tm'] = data['page_tm'].dt.strftime('%Y%m%d')
    # Total clicks; dict-renaming agg removed in pandas 1.0 -> named agg.
    aa = data.groupby(by=['id'])['page_no'].agg(pageno_count='count').reset_index()
    pageno_values = sorted(data.page_no.unique().tolist())
    # Click ratio per page type (only the ratio is kept).
    for i in pageno_values:
        temp = data[data['page_no'] == i].groupby(by=['id'])['page_no'].agg(
            **{i + '_count': 'count'}).reset_index()
        aa = pd.merge(aa, temp, on='id', how='left')
        aa[i + '_ratio'] = aa[i + '_count'] / aa['pageno_count']
        aa.drop([i + '_count'], axis=1, inplace=True)
    aa = aa.fillna(0)
    # Daily features: pivot id x day and take the row max.
    a1 = data.groupby(['id', 'page_tm'])['page_no'].count().unstack().reset_index()
    day_cols = a1.columns.tolist()[1:]
    a1['click_day_max'] = a1[day_cols].max(axis=1)  # max clicks in one day
    a1 = a1[['id', 'click_day_max']]
    a2 = data.groupby(by=['id'])['page_tm'].agg(click_daynum='nunique').reset_index()  # active days
    aa = pd.merge(aa, a1, on='id', how='left')
    aa = pd.merge(aa, a2, on='id', how='left')
    aa['click_day_count'] = aa['pageno_count'] / aa['click_daynum']  # mean clicks per day
    # First/last click day and their crude numeric interval.
    a3 = data.groupby(by=['id'])['page_tm'].agg(
        click_date_min='min', click_date_max='max').reset_index()
    a3['date_interval'] = pd.to_numeric(a3['click_date_max']) - pd.to_numeric(a3['click_date_min'])
    aa = pd.merge(aa, a3, on='id', how='left')
    return aa
简单的根据lightgbm特征重要性,筛选删除46个特征。
# 46 features to drop, selected by inspecting LightGBM feature importances
# (low-importance / noisy columns produced by the feature functions above).
bad=['expend_day_count','total_C_ratio','06_income_day_count','06_expend_count','05_income_day_count','Dat_Flg3_C_ratio',
'CQC_ratio','05_expend_day_count','trd_count','Dat_Flg3_C_std','Dat_Flg3_C_total','Dat_Flg3_C_sum','pl_crd_lmt_cd',
'loan_act_ind','cur_debit_crd_lvl','06_income_count','Dat_Flg3_C_count','EGA_ratio','Dat_Flg3_C_mean','Dat_Flg3_C_min',
'XAG_ratio','l12mon_buy_fin_mng_whl_tms','cust_inv_rsk_endu_lvl_cd','ic_ind','JJK_ratio','05_expend_mean','Trx_Cod1_2_total',
'05_income_mean','expend_mean','SYK_ratio','MTA_ratio','LCT_ratio','06_expend_mean','LC0_ratio','JJD_ratio','06_income_mean',
'ovd_30d_loan_tot_cnt','JF2_ratio','FLS_ratio','income_mean','FDA_ratio','EGB_ratio','l12_mon_gld_buy_whl_tms','ZY1_ratio',
'l6mon_agn_ind','l12_mon_insu_buy_whl_tms']
前期使用XGBoost,上分上到0.741进入瓶颈,后面就换lightgbm模型,并结合十折交叉验证。
#----------------------------LightGBM model------------------------------
# Binary classifier tuned for AUC: moderately sized trees (30 leaves,
# unlimited depth), slow learning rate with many estimators (early stopping
# trims them per fold), row subsampling and fairly strong L1/L2 regularisation.
# NOTE(review): `seed`, `silent` and constructor-level `verbose` are legacy
# aliases; LightGBM >= 4.0 removed `silent` — confirm the installed version.
model = lgb.LGBMClassifier(boosting_type='gbdt',
num_leaves=30,
max_depth=-1,
learning_rate=0.02,
n_estimators=3000,
max_bin=500,
subsample_for_bin=100000,
objective='binary',
min_split_gain=0,
min_child_weight=5,
min_child_samples=10,
subsample=0.8,
subsample_freq=1,
colsample_bytree=1,
reg_alpha=3,
reg_lambda=5,
seed=2018,
n_jobs=10,
verbose=10,
silent=True)
# Ten-fold stratified cross-validation: train one model per fold, collect
# out-of-fold predictions for the local AUC, and average the ten test-set
# probability vectors into the submission column.
# Fix: the original used the pre-0.18 sklearn API
# `StratifiedKFold(y, n_folds=10, ...)`; modern sklearn takes `n_splits`
# and yields (train_idx, test_idx) pairs from `.split(X, y)`.
skf = list(StratifiedKFold(n_splits=10, shuffle=True, random_state=888).split(X_loc_train, y_loc_train))
base_auc = []
loss = 0
oof_preds = np.zeros(train.shape[0])  # out-of-fold predictions
for i, (train_index, test_index) in enumerate(skf):
    print("Fold", i)
    # NOTE(review): LightGBM >= 4.0 moved `early_stopping_rounds` to
    # callbacks=[lgb.early_stopping(100)] — confirm the installed version.
    lgb_model = model.fit(X_loc_train[train_index], y_loc_train[train_index],
                          eval_names=['train', 'valid'],
                          eval_metric='auc',
                          eval_set=[(X_loc_train[train_index], y_loc_train[train_index]),
                                    (X_loc_train[test_index], y_loc_train[test_index])],
                          early_stopping_rounds=100)
    base_auc.append(lgb_model.best_score_['valid']['auc'])
    loss += lgb_model.best_score_['valid']['auc']
    # Predict at the early-stopped best iteration.
    oof_preds[test_index] = lgb_model.predict_proba(train.iloc[test_index], num_iteration=lgb_model.best_iteration_)[:, 1]
    test_pred = lgb_model.predict_proba(X_loc_test, num_iteration=lgb_model.best_iteration_)[:, 1]
    print('test mean:', test_pred.mean())
    res['prob_%s' % str(i)] = test_pred
print('roc_auc_score:', base_auc, loss/10)
# Average the ten folds' test probabilities into the final submission column.
res['flag'] = 0
for i in range(10):
    res['flag'] += res['prob_%s' % str(i)]
res['flag'] = res['flag']/10
mean = res['flag'].mean()
print('mean:', mean)
endtime = datetime.datetime.now()
线下0.76299(十折平均),线上0.78026,排名57,招行放榜53名。
0.68—>0.72:加入trd函数特征
0.72—>0.732:数据预处理+trd交易数特征
0.732—>0.733:trd2函数+beh函数
0.733—>0.737:小类特征 flg3+cod1函数
0.737—>0.741:加入占比、比率特征
0.741—>0.744:换模型lgb+tag强特增益(天数*等级)
0.744—>0.747:month函数,月平均特征
--------换榜---------
0.7767:基本上和大家的平均值差不多,高0.03,所以对我影响不大。
0.7767—>0.7806:强特继续构造+month2函数+cod2函数
这一阶段上分就很难了,也是最后收尾阶段,思路受限,没进前50小遗憾。
一次很不错的比赛,也十分重视。期间五一回老家烧烤间隙,也要拿出电脑上上分,小菜鸡单模能到这个分数已经知足了。(求轻喷)
写在最后:
感谢朋友“爱撒谎的小超”大力支持。
感谢群里两位大佬“hello world”,“华夏狼崽”的思路分享,受益匪浅。
nku_zhengty
2020.5.15