# 《Python数据科学:技术详解与商业实践》商业数据挖掘案例
import numpy as np
import pandas as pd
import os
import datetime
# Data import: load every CSV file in the project folder into a module-level
# DataFrame named after the file (text before the first '.').
# The path is a raw string because '\p' and '\个' are not valid escape
# sequences; a plain literal triggers an invalid-escape warning on current
# Python versions. The resulting path value is unchanged.
os.chdir(r'D:\python2\个人贷款预测模型')  # set the working directory
loanfile = os.listdir()
# globals() is the same mapping locals() returns at module level, but it makes
# explicit that the loop below creates module-level variables.
createVar = globals()
for i in loanfile:
    if i.endswith('csv'):
        table_name = i.split('.')[0]
        createVar[table_name] = pd.read_csv(i, encoding='gbk')
        print(table_name)  # report which table was just loaded
# Build the 'bad_good' label column from the loan status code.
pd.set_option('display.max_columns', 8)  # cap the number of columns shown when printing

# Status 'A' maps to 0, 'B' and 'D' map to 1, 'C' maps to 2.
# NOTE(review): presumably 0 = good, 1 = bad, 2 = still to be predicted -- confirm
# against the data dictionary.
bad_good = {'B': 1, 'D': 1, 'A': 0, 'C': 2}
loans['bad_good'] = loans['status'].map(bad_good)

# Left-join loans to the disposition table on account id, then to the client
# table on client id.
data2 = (
    loans
    .merge(disp, on='account_id', how='left')
    .merge(clients, on='client_id', how='left')
)

# Left-join the district table (its key column is 'A1') on the district id to
# bring in the district-level variables.
data3 = data2.merge(
    district,
    left_on='district_id',
    right_on='A1',
    how='left',
)
# Inner-join the loan table with the transaction table on account id.
# Both sides carry a 'date' column, so the merge yields 'date_x' (from loans)
# and 'date_y' (from trans); they are renamed to 'date' and 't_date'.
loan_part = loans[['account_id', 'date']]
trans_part = trans[['account_id', 'type', 'amount', 'balance', 'date']]
data4_t1 = loan_part.merge(trans_part, on='account_id').rename(
    columns={'date_x': 'date', 'date_y': 't_date'}
)

# Order by account and transaction date. The sort runs on the raw values,
# before the date columns are parsed below.
data4_t1 = data4_t1.sort_values(by=['account_id', 't_date'])

# Parse both date columns into datetimes.
for date_col in ('date', 't_date'):
    data4_t1[date_col] = pd.to_datetime(data4_t1[date_col])


def _money_to_int(text):
    """Drop the first character of *text*, remove the commas and cast to int.

    NOTE(review): the first character is presumably a currency symbol -- confirm.
    """
    return int(text[1:].replace(',', ''))


# Clean the balance and transaction-amount strings into integer columns.
data4_t1['balance2'] = data4_t1['balance'].map(_money_to_int)
data4_t1['amount2'] = data4_t1['amount'].map(_money_to_int)
data4_t2=data4_t1[data4_t1.date>data4_t1.t_date][data4_t1.date best_new_score:
remaining.remove(best_candidate)
selected.append(best_candidate)
current_score = best_new_score
print('aic is {},continuing!'.format(current_score))
else:
print('forward selection over!')
break
formula = "{} ~ {} ".format(response, ' + '.join(selected))
print('final formula is {}'.format(formula))
model = smf.glm(
formula=formula, data=data,
family=sm.families.Binomial(sm.families.links.logit)
).fit()
return (model)
# Columns passed to forward selection; 'bad_good' is the response and is
# included so that it is present in the data handed to forward_select.
candidates = [
    'bad_good',
    'A1', 'GDP', 'A4', 'A10', 'A11', 'A12',
    'amount', 'duration',
    'A13', 'A14', 'A15', 'a16',
    'avg_balance', 'st_balance', 'cv_balance',
    'income', 'out', 'out_in',
    'r_lb', 'r_lincome',
]
data_for_select = train.loc[:, candidates]

# Inspection helpers: the bare expression only displays in an interactive
# session; the print lists every column label of the training frame.
train.index
print(list(train.columns))

# Run forward selection with 'bad_good' as the response and keep the fitted model.
lg_m1 = forward_select(data=data_for_select, response='bad_good')
# Coefficient table of the fitted model (displayed only in an interactive session).
lg_m1.summary().tables[1]
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

# Predict on the test frame and compute the ROC curve against the observed
# 'bad_good' labels.
test_scores = lg_m1.predict(test)
fpr, tpr, th = metrics.roc_curve(test.bad_good, test_scores)

# Draw the ROC curve as a blue dashed line on a 6x6 figure.
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, 'b--')
plt.title('ROC curve')
plt.show()

# Report the area under the ROC curve.
auc_value = metrics.auc(fpr, tpr)
print('AUC = %.4f' % auc_value)

# Store the model's prediction for the for_predict frame in a new 'prob' column.
for_predict['prob'] = lg_m1.predict(for_predict)
# Preview of the first rows (displayed only in an interactive session).
for_predict[['account_id', 'prob']].head()