Python笔记:个人贷款预测模型

《Python数据科学:技术详解与商业实践》商业数据挖掘案例

import  numpy as np
import  pandas as pd
import  os
import  datetime
# Data import
# The path is a raw string: '\p' and '\个' are not valid escape sequences, so a
# plain literal only works by accident and triggers an invalid-escape warning
# on current Python versions. The resulting path text is unchanged.
os.chdir(r'D:\python2\个人贷款预测模型')  # set the working directory
loanfile = os.listdir()
# NOTE(review): assumes this runs at module level, where locals() is the module
# namespace -- each CSV then becomes a module-level DataFrame named after the
# file (the code below refers to loans, disp, clients, district and trans).
createVar = locals()
for i in loanfile:
    # Match on the '.csv' extension so that a name merely ending in the
    # letters "csv" is not picked up.
    if i.endswith('.csv'):
        table_name = i.split('.')[0]  # text before the first dot
        createVar[table_name] = pd.read_csv(i, encoding='gbk')
        print(table_name)
# Build the target variable
pd.set_option('display.max_columns', 8)  # cap the number of columns shown when a frame is displayed
# Recode the loan status: B and D -> 1, A -> 0, C -> 2.
bad_good = {'B': 1, 'D': 1, 'A': 0, 'C': 2}
loans['bad_good'] = loans['status'].map(bad_good)
# Left-join loans to the disposition table on the account id, then to the
# client table on the client id.
data2 = (
    loans
    .merge(disp, on='account_id', how='left')
    .merge(clients, on='client_id', how='left')
)
# Left-join the district survey table (its key column is A1) on the district id
# to bring in the residential-area variables such as the unemployment rate.
data3 = data2.merge(district, left_on='district_id', right_on='A1', how='left')
 #将贷款表和交易表以账号号进行内连接
# Inner-join every loan with all transactions recorded on the same account.
loan_dates = loans[['account_id', 'date']]
trans_detail = trans[['account_id', 'type', 'amount', 'balance', 'date']]
data4_t1 = loan_dates.merge(trans_detail, on='account_id')
# Both inputs carry a 'date' column; relabel positionally so the loan date is
# 'date' and the transaction date is 't_date'.
data4_t1.columns = ['account_id', 'date', 'type', 'amount', 'balance', 't_date']
# Order by account, then by transaction date.
data4_t1 = data4_t1.sort_values(by=['account_id', 't_date'])
# Convert both date columns to datetime values.
for date_col in ('date', 't_date'):
    data4_t1[date_col] = pd.to_datetime(data4_t1[date_col])
# Clean the balance and amount strings: drop the first character (presumably a
# currency symbol -- confirm against the data) and the thousands separators,
# then store the integer value in 'balance2' / 'amount2'.
for raw_col in ('balance', 'amount'):
    data4_t1[raw_col + '2'] = data4_t1[raw_col].map(
        lambda x: int(x[1:].replace(',', ''))
    )
#获取贷款日期前365到前1天的交易数据
data4_t2=data4_t1[data4_t1.date>data4_t1.t_date][data4_t1.date best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
            print('aic is {},continuing!'.format(current_score))
        else:
            print('forward selection over!')
            break

    formula = "{} ~ {} ".format(response, ' + '.join(selected))
    print('final formula is {}'.format(formula))
    model = smf.glm(
        formula=formula, data=data,
        family=sm.families.Binomial(sm.families.links.logit)
    ).fit()
    return (model)


# Columns handed to forward selection: the response ('bad_good') first,
# followed by the candidate predictors.
candidates = [
    'bad_good',
    'A1', 'GDP', 'A4', 'A10', 'A11', 'A12',
    'amount', 'duration',
    'A13', 'A14', 'A15', 'a16',
    'avg_balance', 'st_balance', 'cv_balance',
    'income', 'out', 'out_in',
    'r_lb', 'r_lincome',
]
data_for_select = train[candidates]
train.index  # bare expression: only displays when run interactively
# Print every column label of the training frame.
print(list(train))
# Fit the model via forward selection with 'bad_good' as the response.
lg_m1 = forward_select(data=data_for_select, response='bad_good')
lg_m1.summary().tables[1]  # bare expression: coefficient table, shown only interactively


import sklearn.metrics as metrics
import matplotlib.pyplot as plt
# Score the test set with the fitted model and compute the ROC curve points.
test_scores = lg_m1.predict(test)
fpr, tpr, th = metrics.roc_curve(test.bad_good, test_scores)

# Draw the ROC curve as a blue dashed line on a 6x6 figure.
plt.figure(figsize=[6, 6])
plt.plot(fpr, tpr, 'b--')
plt.title('ROC curve')
plt.show()


# Report the area under the ROC curve to four decimal places.
auc_value = metrics.auc(fpr, tpr)
print('AUC = %.4f' % auc_value)

# Store the model's prediction for each row of for_predict in a new 'prob' column.
predicted_prob = lg_m1.predict(for_predict)
for_predict['prob'] = predicted_prob
# Bare expression: previews the first rows of account id and 'prob' when run interactively.
for_predict[['account_id', 'prob']].head()

你可能感兴趣的:(Python笔记:个人贷款预测模型)