# Source code download:
# http://download.csdn.net/download/adam_zs/10230326
import pandas as pd
# loans_2007 = pd.read_csv('LoanStats3a.csv', skiprows=1)
# print(loans_2007.shape) #(42538, 111)
# print(len(loans_2007)) #42538
# half_count = len(loans_2007) / 2
# loans_2007 = loans_2007.dropna(thresh=half_count, axis=1)
# loans_2007 = loans_2007.drop(['desc', 'url'], axis=1)
# loans_2007.to_csv('loans_2007.csv', index=False)
# LoanStats3a = pd.read_csv('LoanStats3a.csv', skiprows=1)
# loans_2007 = pd.read_csv('loans_2007.csv')
# print(LoanStats3a.shape)
# print(loans_2007.shape)
# loans_2007 = pd.read_csv('loans_2007.csv')
# print(loans_2007.iloc[0])
# print(loans_2007.shape) #(42538, 52)
# drop_columns = ["id", "member_id", "funded_amnt", "funded_amnt_inv", "grade", "sub_grade", "emp_title", "issue_d",
# "zip_code", "out_prncp", "out_prncp_inv", "total_pymnt", "total_pymnt_inv", "total_rec_prncp",
# "total_rec_int", "total_rec_late_fee", "recoveries", "collection_recovery_fee", "last_pymnt_d",
# "last_pymnt_amnt"]
# loans_2007 = loans_2007.drop(drop_columns, axis=1)
# print(loans_2007.shape) #(42538, 32)
# loan_status: the status of the loan
# print(loans_2007['loan_status'].value_counts())
'''
sys:1: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
Fully Paid 33902 (loan repaid in full)
Charged Off 5658 (written off as a loss; no further payments expected)
Does not meet the credit policy. Status:Fully Paid 1988
Does not meet the credit policy. Status:Charged Off 761
Current 201
Late (31-120 days) 10
In Grace Period 9
Late (16-30 days) 5
Default 1
'''
# loans_2007 = loans_2007[(loans_2007['loan_status'] == 'Fully Paid') | (loans_2007['loan_status'] == 'Charged Off')]
# status_replace = {
# "loan_status": {
# "Fully Paid": 1,
# "Charged Off": 0
# }
# }
# loans_2007 = loans_2007.replace(status_replace)
# Drop columns that contain only a single distinct (non-null) value
# orig_columns = loans_2007.columns
# drop_columns = []
# for col in orig_columns:
# col_series = loans_2007[col].dropna().unique()
# if len(col_series) == 1:
# drop_columns.append(col)
# print(drop_columns)
# loans_2007 = loans_2007.drop(drop_columns, axis=1)
# loans_2007.to_csv("filtered_loans_2007.csv", index=False)
# loans = pd.read_csv('filtered_loans_2007.csv')
# null_counts = loans.isnull().sum()
# print(null_counts)
# print(loans.shape) #(39560, 24)
# loans.drop(['pub_rec_bankruptcies'], inplace=True, axis=1)
# loans.dropna(axis=0, inplace=True)
# print(loans.shape) #(38428, 23)
# print(loans.dtypes.value_counts())
'''
object 12
float64 10
int64 1
dtype: int64
'''
# object_columns_df = loans.select_dtypes(include=['object'])
# print(object_columns_df.iloc[0])
# loans_columns = loans.columns
# for col in loans_columns:
# print(loans[col].value_counts())
# loans = loans.drop(["last_credit_pull_d", "earliest_cr_line", "addr_state", "title"], axis=1)
# print(loans['emp_length'].value_counts())
# Numeric codes for the textual "emp_length" values, shaped as
# {column name: {original value: replacement}}.
# Both "< 1 year" and "n/a" are coded as 0.
mapping_dict = {
    "emp_length": {
        "10+ years": 10,
        # "9 years" down to "2 years" follow one regular pattern.
        **{"{} years".format(years): years for years in range(9, 1, -1)},
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0,
    },
}
# loans["int_rate"] = loans["int_rate"].str.rstrip("%").astype("float")
# loans["revol_util"] = loans["revol_util"].str.rstrip("%").astype("float")
# loans.replace(mapping_dict)
# cat_columns = ["home_ownership", "verification_status", "emp_length", "purpose", "term"]
# dummy_df = pd.get_dummies(loans[cat_columns])
# loans = pd.concat([loans, dummy_df], axis=1)
# loans = loans.drop(cat_columns, axis=1)
# loans = loans.drop("pymnt_plan", axis=1)
# loans.to_csv('cleaned_loans2007.csv', index=False)
# loans = pd.read_csv("cleaned_loans2007.csv")
# print(loans.info())
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import KFold, cross_val_predict
#
# Load the cleaned loan data and keep a small random sample of it
# (presumably to keep the experiment below fast - confirm).
loans = pd.read_csv('cleaned_loans2007.csv')
# - n is capped at the row count because sample() raises when asked for more
#   rows than exist (replace=False).
# - random_state makes the sample repeatable from run to run.
# - reset_index(drop=True) renumbers the sampled rows 0..n-1; without it the
#   rows keep their original labels and do not align with positional results
#   (such as an array of predictions wrapped in a default-indexed Series).
loans = loans.sample(n=min(100, len(loans)), random_state=1).reset_index(drop=True)
cols = loans.columns
# Every column except the label is used as a model input.
features = loans[cols.drop('loan_status')]
# The label column: 'loan_status'.
target = loans['loan_status']
# lr = LogisticRegression(class_weight='balanced') # class_weight sets the relative weight of positive and negative samples; 'balanced' weights each class inversely to its frequency
# kf = KFold(len(features), random_state=1)
# predictions = cross_val_predict(lr, features, target, cv=kf)
# predictions = pd.Series(predictions)
#
# # True positives.
# tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
# tp = len(predictions[tp_filter])
#
# # False positives.
# fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
# fp = len(predictions[fp_filter])
#
# # False negatives.
# fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
# fn = len(predictions[fn_filter])
#
# # True negatives
# tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
# tn = len(predictions[tn_filter])
#
# # Rates
# tpr = tp / float((tp + fn))
# fpr = fp / float((fp + tn))
# Random forest: out-of-fold predictions for every row, then the
# confusion-matrix counts and the true/false positive rates.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_predict

# model_selection.KFold takes the number of folds (n_splits) as its first
# argument, not the number of samples, and random_state is only honoured when
# shuffle=True (recent scikit-learn releases raise if it is set without it).
# NOTE(review): 3 folds mirrors the default of the legacy
# sklearn.cross_validation.KFold(n, n_folds=3) call this appears to be ported
# from - confirm the intended fold count.
kf = KFold(n_splits=3, shuffle=True, random_state=1)
rf = RandomForestClassifier(class_weight='balanced', random_state=1)

# cross_val_predict returns a plain array in row order. Give it the index of
# `target` so the element-wise comparisons below line up row by row even when
# the frame does not carry a 0..n-1 index.
predictions = pd.Series(
    cross_val_predict(rf, features, target, cv=kf),
    index=target.index,
)

# False positives.
fp_filter = (predictions == 1) & (target == 0)
fp = int(fp_filter.sum())
# True positives.
tp_filter = (predictions == 1) & (target == 1)
tp = int(tp_filter.sum())
# False negatives.
fn_filter = (predictions == 0) & (target == 1)
fn = int(fn_filter.sum())
# True negatives
tn_filter = (predictions == 0) & (target == 0)
tn = int(tn_filter.sum())

# Rates. A small sample may contain only one class, which makes one of the
# denominators zero; report NaN in that case instead of raising.
tpr = tp / float(tp + fn) if (tp + fn) else float("nan")
fpr = fp / float(fp + tn) if (fp + tn) else float("nan")