金融反欺诈模型

整个项目做下来,感受最深的其实是如何从众多的特征中选出最实用的特征变量。

数据处理:下面通过代码来感受这个过程。

# Load the raw loan statistics CSV. skiprows=1 skips the first line of the
# file, so reading starts from the second line.
# The path is a raw string so the backslashes are taken literally instead of
# relying on Python leaving unknown escapes such as "\d" and "\L" untouched.
data = pd.read_csv(
    r"G:\data\LoanStats_2016Q2\LoanStats_2016Q2.csv",
    skiprows=1,
    low_memory=True,
)

# Drop the identifier columns (the author notes they are mostly empty).
data = data.drop(columns=["id", "member_id"])

# Drop text columns with too many distinct values: one-hot encoding them
# later would generate a very large number of dummy columns.
data = data.drop(columns=["sub_grade", "emp_title"])

# Keep only the numeric part of each text value and store it as a number.
# Results are assigned back to the frame rather than using
# `data.col.replace(..., inplace=True)`, which writes to a temporary Series
# and is not guaranteed to update `data`.
# errors="coerce" turns anything that still cannot be parsed into NaN.
data["term"] = pd.to_numeric(
    data["term"].replace(r"[^0-9]+", "", regex=True),
    errors="coerce",
)

# The original replace("%", "") had no regex=True, so it only matched cells
# whose whole value was "%" and never removed the percent sign from a rate.
# Strip every character that is not a digit or a decimal point instead.
data["int_rate"] = pd.to_numeric(
    data["int_rate"].replace(r"[^0-9.]+", "", regex=True),
    errors="coerce",
)

# Employment length: treat "n/a" as missing, then keep the digits only.
emp_length = data["emp_length"].replace("n/a", np.nan)
data["emp_length"] = pd.to_numeric(
    emp_length.replace(r"[^0-9]+", "", regex=True),
    errors="coerce",
)
# Remove columns that are entirely empty, then rows that are entirely empty.
# how="all" matches only when every cell is empty; how="any" would match as
# soon as a single cell is empty. axis=1 works per column, axis=0 per row.
data = data.dropna(axis=1, how="all")
data = data.dropna(axis=0, how="all")

# Report the non-null count of every column.
# info() prints the report itself and returns None, so it is not wrapped in
# print(). `show_counts` replaces `null_counts`, which was deprecated in
# pandas 1.2 and removed in pandas 2.0.
data.info(verbose=True, show_counts=True)

# Columns in which most values are empty; drop them all in one call.
mostly_empty_columns = [
    "hardship_type",
    "hardship_reason",
    "hardship_status",
    "deferral_term",
    "hardship_amount",
    "hardship_start_date",
    "hardship_end_date",
    "payment_plan_start_date",
    "hardship_length",
    "hardship_dpd",
    "hardship_loan_status",
    "orig_projected_additional_accrued_interest",
    "hardship_payoff_balance_amount",
    "hardship_last_payment_amount",
    "debt_settlement_flag_date",
    "settlement_status",
    "settlement_date",
    "settlement_amount",
    "settlement_percentage",
    "settlement_term",
]
data = data.drop(columns=mostly_empty_columns)
# Pairwise correlation between columns. Only numeric columns are passed in:
# DataFrame.corr() raises on text columns in pandas >= 2.0.
cor = data.select_dtypes(include=["number"]).corr()

# Keep the strictly lower triangle of the matrix. k=-1 also zeroes the
# diagonal, so each pair of columns appears once and self-correlation is gone.
cor.iloc[:, :] = np.tril(cor, k=-1)

# Stack all rows into a single long Series, one entry per column pair.
cor = cor.stack()
# To list the strongly correlated pairs, inspect `cor[cor > 0.95]`.

# For each group of highly correlated columns, drop all but one of them.
highly_correlated_columns = [
    "funded_amnt",
    "funded_amnt_inv",
    "out_prncp_inv",
    "total_pymnt_inv",
    "total_rec_prncp",
    "collection_recovery_fee",
    "num_rev_tl_bal_gt_0",
    "num_sats",
    "tot_hi_cred_lim",
    "total_il_high_credit_limit",
]
data = data.drop(columns=highly_correlated_columns)
# Text (object) columns with very few or very many distinct values are
# candidates for removal, so print the distinct-value count of each one.
report_line = "col {} has {}"
for column_name in data.select_dtypes(include=["object"]).columns:
    distinct_values = data[column_name].unique()
    print(report_line.format(column_name, len(distinct_values)))

# Drop the columns judged to have too few or too many distinct values.
uninformative_columns = [
    "grade",
    "home_ownership",
    "verification_status",
    "issue_d",
    "pymnt_plan",
    "desc",
    "zip_code",
    "initial_list_status",
    "next_pymnt_d",
    "application_type",
    "verification_status_joint",
    "hardship_flag",
    "disbursement_method",
    "debt_settlement_flag",
    "earliest_cr_line",
    "revol_util",
]
data.drop(columns=uninformative_columns, inplace=True)
# Target variable: only the binary problem is considered for now, so keep
# "Fully Paid" (1) and "Charged Off" (0) and discard every other status.
# map() turns any status not listed here into NaN, so unexpected values are
# dropped too instead of surviving as text and being one-hot encoded below.
status_to_label = {"Fully Paid": 1, "Charged Off": 0}
data = data.assign(loan_status=data["loan_status"].map(status_to_label))
data = data.dropna(subset=["loan_status"], axis=0, how="any")
# Store the label as an integer column so get_dummies leaves it untouched.
data = data.astype({"loan_status": int})

# Fill the remaining missing values with 0.0.
data = data.fillna(0.0)

# One-hot (dummy) encode the remaining categorical columns.
data = pd.get_dummies(data)

# index=False keeps the row index out of the file; otherwise it is read back
# as an extra "Unnamed: 0" column and ends up among the model features.
# The path is a raw string so the backslashes are taken literally.
data.to_csv(r"G:\data\LoanStats_2016Q2\LoanStats_2016Q2_3.csv", index=False)

模型构建及预测

下面使用逻辑回归来进行金融反欺诈模型的构建。

# Load the preprocessed data set written by the data-processing step.
# The path is a raw string so the backslashes are taken literally.
path = r"G:\data\LoanStats_2016Q2\LoanStats_2016Q2_3.csv"
data = pd.read_csv(path)

# Split into target (loan_status) and features (every other column).
Y = data["loan_status"]
X = data.drop(columns=["loan_status"])
# If the CSV was saved together with its row index, read_csv exposes it as an
# "Unnamed: 0" column; it carries no information, so keep it out of the features.
X = X.loc[:, ~X.columns.str.startswith("Unnamed:")]

# Hold out 30% of the rows for testing; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0
)

# Fit a logistic regression classifier and predict on the held-out rows.
lr = LogisticRegression()
lr.fit(x_train, y_train)
test_predict = lr.predict(x_test)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments swapped recall_score actually reports
# precision, so the true labels must come first.
print(metrics.accuracy_score(y_test, test_predict))
print(metrics.recall_score(y_test, test_predict))

你可能感兴趣的:(金融反欺诈模型)