其中,payment表中有44854条数据,有42433条数据Y值非空。
备注:1.贷款次数特定时间段指的是还款日期前7天、半个月、一个月、三个月、六个月、九个月、一年;
2.预期预警、地理位置变化、每个人设备数、每一设备多少人用的特定时间段指的是还款日期前3天、7天、15天、30天、90、180、270、360天。
'device_code', 'customer_id','SSMONTH', 'DLSBH_ENCODER', 'RZQS','QC', 'notified', 'num_of_posting_before_payment_7days',
'num_of_posting_before_payment_15days', 'num_of_posting_before_payment_30days', 'num_of_posting_before_payment_90days',
'num_of_posting_before_payment_180days','num_of_posting_before_payment_270days','num_of_posting_before_payment_365days',
'num_of_posting_before_payment_all_day', 'work_sum_time','num_of_longitude_int_before_7day', 'num_of_latitude_int_before_7day',
'num_of_longitude_int_before_15day', 'num_of_latitude_int_before_15day','num_of_longitude_int_before_30day', 'num_of_latitude_int_before_30day','num_of_longitude_int_before_90day', 'num_of_latitude_int_before_90day',
'num_of_longitude_int_before_180day','num_of_latitude_int_before_180day','num_of_longitude_int_before_270day',
'num_of_latitude_int_before_270day','num_of_longitude_int_before_365day','num_of_latitude_int_before_365day', 'latitude_int', 'longitude_int','if_first_post_and_notify', 'QC_RZQS', 'customer_id_D3_num','customer_id_D3_orgnum', 'customer_id_D7_num', 'customer_id_D7_orgnum','customer_id_D15_num', 'customer_id_D15_orgnum', 'customer_id_D30_num','customer_id_D30_orgnum', 'customer_id_D90_num','customer_id_D90_orgnum', 'customer_id_D180_num','customer_id_D180_orgnum', 'customer_id_D270_num',
'customer_id_D270_orgnum', 'customer_id_D360_num','customer_id_D360_orgnum', 'device_code_D3_num','device_code_D3_orgnum', 'device_code_D7_num', 'device_code_D7_orgnum','device_code_D15_num', 'device_code_D15_orgnum', 'device_code_D30_num',
'device_code_D30_orgnum', 'device_code_D90_num','device_code_D90_orgnum', 'device_code_D180_num','device_code_D180_orgnum', 'device_code_D270_num','device_code_D270_orgnum', 'device_code_D360_num','device_code_D360_orgnum', 'num_of_notify_before_7day',
'num_of_notify_before_15day', 'num_of_notify_before_30day','num_of_notify_before_90day', 'num_of_notify_before_180day',
'num_of_notify_before_270day', 'num_of_notify_before_365day','num_of_notify_before_all_day',
'num_of_longitude_int_before_7dayper_person',
'num_of_latitude_int_before_7dayper_person',
'num_of_longitude_int_before_15dayper_person',
'num_of_latitude_int_before_15dayper_person',
'num_of_longitude_int_before_30dayper_person',
'num_of_latitude_int_before_30dayper_person',
'num_of_longitude_int_before_90dayper_person',
'num_of_latitude_int_before_90dayper_person',
'num_of_longitude_int_before_180dayper_person',
'num_of_latitude_int_before_180dayper_person',
'num_of_longitude_int_before_270dayper_person',
'num_of_latitude_int_before_270dayper_person',
'num_of_longitude_int_before_365dayper_person',
'num_of_latitude_int_before_365dayper_person'
lgbm=lightgbm.train(train_set=lightgbm.Dataset(data_x, label=data_y), num_boost_round=500, params={"objective": "binary", "learning_rate": 0.01, "max_depth": 5, "num_leaves": 128, "verbose": -1, "bagging_fraction": 0.8, "feature_fraction": 0.9,'lambda_l1':0.4,'lambda_l2':0.5})
joblib.dump(lgbm,'/home/workspace/output/process_dir/lgbm_0614.model')
lgbm=joblib.load(os.path.join(path,"lgbm_0614.model"))
payment_1=payment.copy()
payment_1["预测得分"] = lgbm.predict(data_x)
payment_1 = payment_1.sort_values("预测得分", ascending=False, ignore_index=True)
print('预测得分计算出并赋值!')
payment_1['Y']=0
payment_1.loc[:int(0.15 * len(payment_1)), ["Y"]] = 1
print('Y已生成!')
#获取每个还款日期前每个设备、每个人贷款次数;每个人用多少台设备;每台设备多少人用
df_iot=pd.read_csv('/home/workspace/output/iot_a.csv')
df_order=pd.read_csv('/home/workspace/output/orders_a.csv')
df_payment=pd.read_csv('/home/workspace/output/payment_a.csv')
df_order_1=df_order.copy()
df_order_1['posting_date'] = pd.to_datetime(df_order_1['posting_date'])
df_payment_1=df_payment[['device_code','customer_id','SSMONTH']].copy()
df_payment_1['SSMONTH_date'] = df_payment_1['SSMONTH'].astype('str').apply(lambda x: x+'31' if x[-2:] in['01','03','05','07','08','10',12']else x+'28' if x[-2:] == '02' else x+'30')
df_payment_1['SSMONTH_date'] = pd.to_datetime(df_payment_1['SSMONTH_date'])
day_lst = [3,7,15,30,90,180,270,360]
day_name = ['D3', 'D7', 'D15', 'D30', 'D90', 'D180', 'D270', 'D360']
day_dic = dict(zip(day_lst,day_name))
# 拼接左表为还款记录
df_left = df_payment_1[['device_code','customer_id','SSMONTH','SSMONTH_date']].copy()
# 右表为过账记录表
df_right = df_order_1.copy()
# 还款记录的3个主键用于接收变量
dt = df_payment_1[['device_code','customer_id','SSMONTH','SSMONTH_date']].copy()
# 从customer_id维度拼接
df_merge = pd.merge(df_left,df_right,on = 'customer_id')
df_merge.rename(columns = {'device_code_x':'device_code'},inplace = True)
# 日期相减
df_merge['time_delta'] =df_merge['SSMONTH_date'] - df_merge['posting_date']
for i in day_lst:
df_time = df_merge.loc[(df_merge['time_delta'] <= timedelta(days = i))&(df_merge['time_delta'] >= timedelta(days = 0))]
# customer计数
dt_tmp = df_time.groupby(['device_code','customer_id','SSMONTH_date']).size().reset_index()
dt = pd.merge(dt,dt_tmp,on = ['device_code','customer_id','SSMONTH_date'],how = 'left')
dt.rename(columns = {0:'customer_id'+'_'+day_dic[i]+'_num'},inplace = True)
# dt
# customer 对应的device的个数
dt_tmp = df_time.groupby(['customer_id','SSMONTH_date'])['device_code_y'].nunique().reset_index()
dt = pd.merge(dt,dt_tmp,on = ['customer_id','SSMONTH_date'],how = 'left')
dt.rename(columns = {'device_code_y':'customer_id'+'_'+day_dic[i]+'_orgnum'},inplace = True)
dt.columns
# 生成在还款表中出现的设备列表 df_payment_iot_list
df_payment_6=df_payment[['device_code']].copy()
df_payment_6.drop_duplicates('device_code',inplace=True)
df_iot_2=df_iot_1[['device_code_iot']].copy()
df_iot_2.drop_duplicates('device_code_iot',inplace=True)
df_payment_6['if_exist']=1
df_payment_iot=pd.merge(df_payment_6,df_iot_2,\
left_on='device_code',right_on='device_code_iot',how='right')
df_payment_iot=df_payment_iot.loc[df_payment_iot['if_exist'].notnull()]
df_payment_iot.drop(columns=['if_exist','device_code'],inplace=True)
df_payment_iot.drop_duplicates('device_code_iot',inplace=True)
df_payment_iot_list=df_payment_iot['device_code_iot'].tolist()
begin_time=time.time()
month_list=[3,7,15,30,90,180,270,360]
location_list=['latitude_int','longitude_int']
for i in month_list:
globals()['df_payment_iot_concat_'+str(i)]=pd.DataFrame()
for h in range(0,len(df_payment_iot_list),10):
df_payment_excess=df_payment.loc[df_payment['device_code'].isin(df_payment_iot_list[h:h+10])]
df_payment_excess['SSMONTH_date']=df_payment_excess['SSMONTH'].astype('str').apply(lambda x:\
x+'31' if x[-2:] in ['01','03','05','07','08','10','12']\
else x+'28' \
if x[-2:]=='02' else x+'30')
df_payment_excess['SSMONTH_date']=pd.to_datetime(df_payment_excess['SSMONTH_date'])
df_iot_excess=df_iot_1.loc[df_iot_1['device_code_iot'].isin(df_payment_iot_list[h:h+10])]
df_iot_excess.columns=['device_code', 'work_sum_time', 'reporttime', 'latitude_int', 'longitude_int']
df_payment_iot_excess=pd.merge(df_payment_excess[['device_code','SSMONTH_date','SSMONTH']],df_iot_excess\
[['device_code','reporttime','latitude_int','longitude_int']],\
on='device_code',how='left')
df_payment_iot_excess['day_delta']=df_payment_iot_excess.apply(lambda x : (x['SSMONTH_date']-x['reporttime']).days,axis=1)
df_payment_iot_excess['if_before_'+str(i)+'day' ]=df_payment_iot_excess.apply(lambda x:1 if (x['day_delta']<=i and x\
['day_delta']>=0) else 0,axis=1)
df_payment_iot_excess_merge_lat=df_payment_iot_excess.loc[df_payment_iot_excess['if_before_'+str(i)+'day']==1].\
drop_duplicates(['latitude_int','device_code','SSMONTH'])
df_payment_iot_excess_merge_lat=df_payment_iot_excess_merge_lat[['latitude_int','device_code','SSMONTH']].\
groupby(['device_code', 'SSMONTH']).count()
df_payment_iot_excess_merge_lat.columns=['num_of_latitude_int_before_'+str(i)+'day']
df_payment_iot_excess_merge_long=df_payment_iot_excess.loc[df_payment_iot_excess['if_before_'+str(i)+'day']==1].\
drop_duplicates(['longitude_int','device_code','SSMONTH'])
df_payment_iot_excess_merge_long=df_payment_iot_excess_merge_long[['longitude_int','device_code','SSMONTH']].\
groupby(['device_code', 'SSMONTH']).count()
df_payment_iot_excess_merge_long.columns=['num_of_longitude_int_before_'+str(i)+'day']
df_payment_iot_excess_merge=pd.merge(df_payment_iot_excess_merge_long,df_payment_iot_excess_merge_lat,\
on=['device_code', 'SSMONTH'],how='inner')
df_payment_iot_excess_merge.reset_index(inplace=True)
globals()['df_payment_iot_concat_'+str(i)]=pd.concat([globals()['df_payment_iot_concat_'+str(i)],df_payment_iot_excess_merge])
print(len(globals()['df_payment_iot_concat_'+str(i)]))
print(globals()['df_payment_iot_concat_'+str(i)].info())
print(globals()['df_payment_iot_concat_'+str(i)].shape)
globals()['df_payment_iot_concat_'+str(i)].to_csv('/home/workspace/output/feature_derive/process_location_change/df_payment_iot_concat_%s_fillna.csv' \
%(str(i)),index=False)
end_time=time.time()
print((end_time-begin_time)/3600)
df_payment_5为左表,df_payment_6为右表。左表保存customer_id,SSMONTH,右表保存customer_id,SSMONTH,notified。
# 获取每个人还款日期前逾期预警次数变量
df_payment_5=df_payment[['customer_id', 'SSMONTH']].copy()
df_payment_5.columns=['customer_id', 'SSMONTH_1']
df_payment_6=df_payment[['customer_id', 'SSMONTH','notified']].copy()
df_payment_5['SSMONTH_date']=df_payment_5['SSMONTH_1'].astype('str').apply(lambda x:x+'31' if x[-2:] in ['01','03','05','07','08','10','12'] \
else x+'28' if x[-2:]=='02' else x+'30')
df_payment_5['SSMONTH_date']=pd.to_datetime(df_payment_5['SSMONTH_date'])
df_payment_6['SSMONTH_date']=df_payment_6['SSMONTH'].astype('str').apply(lambda x:x+'31' if x[-2:] in ['01','03','05','07','08','10','12'] \
else x+'28' if x[-2:]=='02' else x+'30')
df_payment_6['SSMONTH_date']=pd.to_datetime(df_payment_6['SSMONTH_date'])
df_payment_5.rename(columns={'SSMONTH_date':'SSMONTH_date_1'},inplace=True)
df_payment_6.rename(columns={'SSMONTH_date':'SSMONTH_date_2'},inplace=True)
df_payment_3=pd.merge(df_payment_5[['customer_id','SSMONTH_1','SSMONTH_date_1']],df_payment_6[['customer_id','notified','SSMONTH_date_2']]\
,on='customer_id',how='left')
# 计算还款日期前7,15,30,90,180,270,365天、一直的预期预警个数
month_list=[7,15,30,90,180,270,365]
df_payment_3['day_delta']=df_payment_3.apply(lambda x : (x['SSMONTH_date_1']-x['SSMONTH_date_2']).days ,axis=1)
for i in month_list:
df_payment_3['if_notify_before_'+str(i)+'day' ]=df_payment_3.apply(lambda x:1 if (x['day_delta']<=i and x['day_delta']>=0) else 0,axis=1)
# 生成if_notifu_before*day变量,用于选取处在特定时间段的样本,用于计算该时间段内预警个数
df_payment_3_merge=df_payment_3.loc[df_payment_3['if_notify_before_'+str(i)+'day']==1][['customer_id', 'SSMONTH_1', 'notified']]\
.groupby(['customer_id', 'SSMONTH_1']).sum()
df_payment_3_merge.columns=['num_of_notify_before_'+str(i)+'day']
df_payment_3.drop(columns='if_notify_before_'+str(i)+'day',inplace=True)
#将customer_id,SSMONTH_1作为连接键
df_payment_3=pd.merge(df_payment_3,df_payment_3_merge,on=['customer_id', 'SSMONTH_1'],how='left')
df_payment_3['if_notify_before_all_day' ]=df_payment_3.apply(lambda x:1 if x['day_delta']>=0 else 0,axis=1)
df_payment_3_merge=df_payment_3.loc[df_payment_3['if_notify_before_all_day']==1][['customer_id', 'SSMONTH_1', 'notified']]\
.groupby(['customer_id', 'SSMONTH_1']).sum()
df_payment_3_merge.columns=['num_of_notify_before_all_day']
df_payment_3.drop(columns='if_notify_before_all_day',inplace=True)
df_payment_3=pd.merge(df_payment_3,df_payment_3_merge,on=['customer_id', 'SSMONTH_1'],how='left')
print(df_payment_3.columns)
df_payment_3.head(20)
df_payment_3.drop(columns=['SSMONTH_date_2','day_delta','notified'],inplace=True)
df_payment_3.drop_duplicates(['customer_id','SSMONTH_date_1'],inplace=True)
print(df_payment_3.columns)
print(df_payment_3.info())
df_left=df_payment[['device_code','customer_id','SSMONTH']].copy()
df_right=df_payment_iot_concat.copy()
df_tmp=pd.merge(df_left,df_right,on=['device_code','SSMONTH'],how='inner')
df_tmp.drop(columns='device_code',inplace=True)
df_payment_iot_concat_dropdup_per_person=(df_tmp.groupby(['customer_id','SSMONTH']).sum()-df_tmp.groupby(\
['customer_id','SSMONTH']).\
count()).reset_index()
for i in df_payment_iot_concat_dropdup_per_person.columns[2:]:
df_payment_iot_concat_dropdup_per_person.rename(columns={i:i+'per_person'},inplace=True)
df_all=pd.merge(df_all,df_payment_iot_concat_dropdup_per_person,on=['customer_id','SSMONTH'],how='left')
fill_col=['num_of_longitude_int_before_7dayper_person',
'num_of_latitude_int_before_7dayper_person',
'num_of_longitude_int_before_15dayper_person',
'num_of_latitude_int_before_15dayper_person',
'num_of_longitude_int_before_30dayper_person',
'num_of_latitude_int_before_30dayper_person',
'num_of_longitude_int_before_90dayper_person',
'num_of_latitude_int_before_90dayper_person',
'num_of_longitude_int_before_180dayper_person',
'num_of_latitude_int_before_180dayper_person',
'num_of_longitude_int_before_270dayper_person',
'num_of_latitude_int_before_270dayper_person',
'num_of_longitude_int_before_365dayper_person',
'num_of_latitude_int_before_365dayper_person']
for i in fill_col:
df_all[i].fillna(-99,inplace=True)