根据提供的金融数据,分别使用SVM和决策树两种算法预测贷款用户是否会逾期。
一共4754行,89列(除去首行、首列)
代码实现与zuolinye一起完成。首先是数据处理,包括删除不要信息、缺失值填充、映射替换以及数据归一化。
"""1. 导包"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,r2_score
from sklearn.svm import LinearSVC
"""2. 读取数据"""
dataset = pd.read_csv('F:\AI\mission_data\mission_data\data.csv',encoding='gbk')
"""3. 数据处理"""
# 删除固定信息列
dataset = dataset.drop(["custid","trade_no","bank_card_no","id_name","first_transaction_time","latest_query_time","loans_latest_time","source"],axis=1)
# 对于sstudent_feature列,我们进行NAN转成0,2转为0
# 缺失值填充
dataset["student_feature"] = dataset["student_feature"].fillna(0)
# 2替换为0
dataset["student_feature"] = dataset["student_feature"].replace([2],[0])
# 针对城市列'reg_preference_for_trad',进行数据替换
dataset["reg_preference_for_trad"] = dataset["reg_preference_for_trad"].replace("一线城市", "1")
dataset["reg_preference_for_trad"] = dataset["reg_preference_for_trad"].replace("二线城市", "2")
dataset["reg_preference_for_trad"] = dataset["reg_preference_for_trad"].replace("三线城市", "3")
dataset["reg_preference_for_trad"] = dataset["reg_preference_for_trad"].replace("其他城市", "4")
dataset["reg_preference_for_trad"] = dataset["reg_preference_for_trad"].replace("境外", "0")
# 填充其他空值
# 使用均值进行填充
# dataset.fillna(dataset.mean(), inplace=True)
# 使用众数进行填充
dataset = dataset.fillna(0) # 使用 0 替换所有 NaN 的值
col = dataset.columns.tolist()[1:]
def missing(df, columns):
"""
使用众数填充缺失值
df[i].mode()[0] 获取众数第一个值
"""
col = columns
for i in col:
df[i].fillna(df[i].mode()[0], inplace=True)
df[i] = df[i].astype('float')
missing(dataset, col)
# 将object类型转成folat
dataset = dataset.convert_objects(convert_numeric=True)
"""4. 数据划分"""
X = dataset.drop(["status"],axis=1)
Y = dataset["status"]
# 数据按正常的2、8划分
X_train, X_test, y_train, y_test = train_test_split(X, Y,test_size=0.2, random_state=666)
# not enough values to unpack (expected 4, got 2)
from sklearn.preprocessing import minmax_scale # minmax_scale归一化,缩放到0-1
X_train = minmax_scale(X_train)
X_test = minmax_scale(X_test)
# Input contains NaN, infinity or a value too large for dtype('float64').
"""5. 数据归一化"""
from sklearn.preprocessing import minmax_scale
# 归一化,缩放到0-1
X_train = minmax_scale(X_train)
X_test = minmax_scale(X_test)
然后对数据采用SVM模型预测并评分分析
"""6. 模型训练"""
linearSVC = LinearSVC()
linearSVC.fit(X_train, y_train)
linearSVC_predict = linearSVC.predict(X_test)
"""7. 输出结果"""
# print("predict:",log_reg.score(X_test, y_test))
print("predict:",linearSVC.score(X_test, y_test))
print("f1_score:",f1_score(y_test, linearSVC_predict))
print("r2_score:",r2_score(y_test, linearSVC_predict))
得到如下结果:
使用决策树模型预测并得到结果:
"""6. 模型训练"""
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
dtree_predict = dtree.predict(X_test)
"""7. 输出结果"""
print("decision_tree准确率:", dtree.score(X_test, y_test))
print("f1_score准确率:", f1_score(y_test, dtree_predict))
print("r2_score准确率:", r2_score(y_test, dtree_predict))