一、数据预处理
1.导入需要的包且读取数据
# 1. Imports, plotting config, and data loading.
import pandas as pd
import numpy as np  # fixed: was "import numpy as numpy"
import matplotlib.pyplot as plt
import seaborn as sns

# SimHei font so Chinese labels render; "SinHei" was a typo and would make
# matplotlib fall back with a missing-font warning.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False  # render minus signs with this font

# The original snippet used `data` before defining it; load it here
# (the same file is re-read as `data2` later in the script).
data = pd.read_csv("data.csv")

print('数据基本信息:')
data.info()
print('\n数据前五行预览')
print(data.head())
2.缺失值与异常值处理
# 2. Inspect the discrete (object-dtype) features and their value counts.
# (The pasted original had the loop body unindented — a SyntaxError.)
discrete_features = data.select_dtypes(include=['object']).columns.tolist()
discrete_features  # notebook-style echo; no effect when run as a script
for feature in discrete_features:
    print(f"\n{feature}的唯一值:")
    print(data[feature].value_counts())
紧接着,对各离散特征进行数值映射编码:
# Ordinal-encode "Home Ownership": categories get 1-based integer codes in
# this fixed order (unlisted categories become NaN via .map).
home_ownership_mapping = {
    label: code
    for code, label in enumerate(
        ['Own Home', 'Rent', 'Have Mortgage', 'Home Mortgage'], start=1)
}
data['Home Ownership'] = data['Home Ownership'].map(home_ownership_mapping)
# Ordinal-encode job tenure: '< 1 year' -> 1, '1 year' -> 2, ...,
# '10 years' -> 11 (i.e. code = years + 2 for the plural labels).
years_in_job_mapping = {'< 1 year': 1, '1 year': 2}
years_in_job_mapping.update({f'{i} years': i + 1 for i in range(2, 11)})
data['Years in current job'] = data['Years in current job'].map(years_in_job_mapping)

# One-hot encode the loan purpose column.
data = pd.get_dummies(data, columns=['Purpose'])
# get_dummies produced new (boolean) columns. Re-read the raw file to find
# which columns are new, then cast those dummies to int (0/1).
# (The pasted original had both loop bodies unindented — a SyntaxError —
# and built list_final with a manual append loop.)
data2 = pd.read_csv("data.csv")
list_final = [col for col in data.columns if col not in data2.columns]
for col in list_final:
    data[col] = data[col].astype(int)
# Binary-encode the loan term.
term_mapping = {
    'Short Term': 0,
    'Long Term': 1,
}
data['Term'] = data['Term'].map(term_mapping)

# Impute missing values in every numeric column with that column's mode.
# NOTE(review): using the mode for continuous features is a modeling choice
# carried over from the original; verify it suits each column.
continuous_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
for feature in continuous_features:
    mode_value = data[feature].mode()[0]
    # Assign back rather than fillna(..., inplace=True) on the column:
    # column-level inplace fillna is deprecated in pandas >= 2.1 and may
    # operate on a temporary under copy-on-write.
    data[feature] = data[feature].fillna(mode_value)
使用SVM、KNN、逻辑回归、朴素贝叶斯、决策树、随机森林、XGBoost、LightGBM训练模型,通过混淆矩阵,对比各模型的准确率、精确率、召回率、F1值
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")
# Features/target split: "Credit Default" is the label, everything else input.
X = data.drop(columns=['Credit Default'])
y = data['Credit Default']

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
print(f'训练集形状:{X_train.shape},测试集形状:{X_test.shape}')
1.支持向量机
# SVM — support-vector classifier with library defaults and a fixed seed.
svm_model = SVC(random_state=42)
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

print("\nSVM 分类报告:")
print(classification_report(y_test, svm_pred))
print("SVM 混淆矩阵:")
print(confusion_matrix(y_test, svm_pred))

svm_accuracy = accuracy_score(y_test, svm_pred)
svm_precision = precision_score(y_test, svm_pred)
svm_recall = recall_score(y_test, svm_pred)
svm_f1 = f1_score(y_test, svm_pred)
print("SVM 模型评估指标:")
print(f"准确率:{svm_accuracy:.4f}")
# Bug fixes: the precision line was mislabelled "准确率" (accuracy), and the
# F1 line printed svm_accuracy instead of svm_f1.
print(f"精确率:{svm_precision:.4f}")
print(f"召回率:{svm_recall:.4f}")
print(f"F1 值:{svm_f1:.4f}")
2.K近邻方法
# knn — K-nearest-neighbours classifier with library defaults.
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)

print("\nKNN 分类报告:")
print(classification_report(y_test, knn_pred))
print("KNN 混淆矩阵:")
print(confusion_matrix(y_test, knn_pred))

# Compute the four scalar metrics in one sweep over the metric functions.
knn_accuracy, knn_precision, knn_recall, knn_f1 = (
    metric(y_test, knn_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print("KNN 模型评估指标:")
print(f"准确率:{knn_accuracy:.4f}")
print(f"精确率:{knn_precision:.4f}")
print(f"召回率:{knn_recall:.4f}")
print(f"F1 值:{knn_f1:.4f}")
3.逻辑回归
# 逻辑回归 — logistic regression with library defaults.
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)
logreg_pred = logreg_model.predict(X_test)

# Text report and confusion matrix first, then the scalar metrics.
report = classification_report(y_test, logreg_pred)
print("\n逻辑回归 分类报告:")
print(report)
print("逻辑回归 混淆矩阵:")
print(confusion_matrix(y_test, logreg_pred))

logreg_accuracy, logreg_precision, logreg_recall, logreg_f1 = [
    score(y_test, logreg_pred)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
]
print("逻辑回归 模型评估指标:")
print(f"准确率: {logreg_accuracy:.4f}")
print(f"精确率: {logreg_precision:.4f}")
print(f"召回率: {logreg_recall:.4f}")
print(f"F1 值: {logreg_f1:.4f}")
4.朴素贝叶斯
# 朴素贝叶斯 — Gaussian naive Bayes with library defaults.
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)

# Print header/body pairs for the report and the confusion matrix.
for header, body in (
    ("\n朴素贝叶斯 分类报告:", classification_report(y_test, nb_pred)),
    ("朴素贝叶斯 混淆矩阵:", confusion_matrix(y_test, nb_pred)),
):
    print(header)
    print(body)

nb_accuracy = accuracy_score(y_test, nb_pred)
nb_precision = precision_score(y_test, nb_pred)
nb_recall = recall_score(y_test, nb_pred)
nb_f1 = f1_score(y_test, nb_pred)
print("朴素贝叶斯 模型评估指标:")
print(f"准确率: {nb_accuracy:.4f}")
print(f"精确率: {nb_precision:.4f}")
print(f"召回率: {nb_recall:.4f}")
print(f"F1 值: {nb_f1:.4f}")
5.决策树
# 决策树 — decision-tree classifier, fixed seed for a reproducible tree.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

print("\n决策树 分类报告:")
print(classification_report(y_test, dt_pred))
print("决策树 混淆矩阵:")
print(confusion_matrix(y_test, dt_pred))

# Scalar metrics, unpacked from one pass over the metric functions.
dt_accuracy, dt_precision, dt_recall, dt_f1 = (
    fn(y_test, dt_pred)
    for fn in (accuracy_score, precision_score, recall_score, f1_score)
)
print("决策树 模型评估指标:")
print(f"准确率: {dt_accuracy:.4f}")
print(f"精确率: {dt_precision:.4f}")
print(f"召回率: {dt_recall:.4f}")
print(f"F1 值: {dt_f1:.4f}")
6.随机森林
# 随机森林 — random-forest classifier, fixed seed for reproducibility.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

rf_report = classification_report(y_test, rf_pred)
rf_cm = confusion_matrix(y_test, rf_pred)
print("\n随机森林 分类报告:")
print(rf_report)
print("随机森林 混淆矩阵:")
print(rf_cm)

rf_accuracy, rf_precision, rf_recall, rf_f1 = (
    fn(y_test, rf_pred)
    for fn in (accuracy_score, precision_score, recall_score, f1_score)
)
print("随机森林 模型评估指标:")
print(f"准确率: {rf_accuracy:.4f}")
print(f"精确率: {rf_precision:.4f}")
print(f"召回率: {rf_recall:.4f}")
print(f"F1 值: {rf_f1:.4f}")
7.XGBoost
# XGBoost — gradient-boosted trees, fixed seed for reproducibility.
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

print("\nXGBoost 分类报告:")
print(classification_report(y_test, xgb_pred))
print("XGBoost 混淆矩阵:")
print(confusion_matrix(y_test, xgb_pred))

# Evaluate on the held-out test set.
xgb_accuracy, xgb_precision, xgb_recall, xgb_f1 = [
    metric(y_test, xgb_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
]
print("XGBoost 模型评估指标:")
print(f"准确率: {xgb_accuracy:.4f}")
print(f"精确率: {xgb_precision:.4f}")
print(f"召回率: {xgb_recall:.4f}")
print(f"F1 值: {xgb_f1:.4f}")
8.LightGBM
# LightGBM — gradient-boosted trees, fixed seed for reproducibility.
lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_model.fit(X_train, y_train)
lgb_pred = lgb_model.predict(X_test)

# Bug fix: header was misspelled "LIghtGBm", inconsistent with the other
# LightGBM labels below.
print("\nLightGBM 分类报告:")
print(classification_report(y_test, lgb_pred))
print("LightGBM 混淆矩阵:")
print(confusion_matrix(y_test, lgb_pred))

lgb_accuracy = accuracy_score(y_test, lgb_pred)
lgb_precision = precision_score(y_test, lgb_pred)
lgb_recall = recall_score(y_test, lgb_pred)
lgb_f1 = f1_score(y_test, lgb_pred)
print("LightGBM 模型评估指标:")
print(f"准确率: {lgb_accuracy:.4f}")
print(f"精确率: {lgb_precision:.4f}")
print(f"召回率: {lgb_recall:.4f}")
print(f"F1 值: {lgb_f1:.4f}")