python打卡day10@浙大疏锦行

一、数据预处理

1.导入需要的包且读取数据

import pandas as pd
import numpy as np  # BUG FIX: conventional alias is "np", not "as numpy" (a no-op alias)
import matplotlib.pyplot as plt
import seaborn as sns

# BUG FIX: the CJK font is "SimHei" (黑体); "SinHei" does not exist, so Chinese
# labels would silently render as empty boxes.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False  # keep minus signs readable with a CJK font

# BUG FIX: `data` was inspected below without ever being loaded (the original
# only read the CSV later, into `data2`). Load it first.
data = pd.read_csv("data.csv")

print('数据基本信息:')
data.info()
print('\n数据前五行预览')
print(data.head())

2.缺失值与异常值处理

# Collect the string-typed (object dtype) columns — these are the discrete features.
discrete_features = data.select_dtypes(include=['object']).columns.tolist()
discrete_features

# Print the frequency table of every discrete column for a quick sanity check.
for col in discrete_features:
    print(f"\n{col}的唯一值:")
    print(data[col].value_counts())

紧接着

# Ordinal-encode the home-ownership category.
home_ownership_mapping = {'Own Home': 1, 'Rent': 2, 'Have Mortgage': 3, 'Home Mortgage': 4}
data['Home Ownership'] = data['Home Ownership'].map(home_ownership_mapping)
 
# Ordinal-encode job tenure: '< 1 year' -> 1, '1 year' -> 2, ..., '10 years' -> 11.
years_in_job_mapping = {'< 1 year': 1, '1 year': 2}
years_in_job_mapping.update({f'{n} years': n + 1 for n in range(2, 11)})
data['Years in current job'] = data['Years in current job'].map(years_in_job_mapping)
 
 
# One-hot encode 'Purpose'. The dummy columns pandas creates are bool, so find
# every column absent from the raw CSV and cast it to 0/1 integers.
data = pd.get_dummies(data, columns=['Purpose'])
data2 = pd.read_csv("data.csv")
list_final = [col for col in data.columns if col not in data2.columns]
for col in list_final:
    data[col] = data[col].astype(int)
 
 
# Binary-encode the loan term.
term_mapping = {'Short Term': 0, 'Long Term': 1}
data['Term'] = data['Term'].map(term_mapping)
# data.rename(columns={'Term':'Long Term'},inplace=True)
 
 
 
# Impute missing values in every numeric column with that column's mode.
continuous_features = data.select_dtypes(include=['int64','float64']).columns.tolist()

for feature in continuous_features:
    mode_value = data[feature].mode()[0]
    # BUG FIX: `data[feature].fillna(..., inplace=True)` is the chained-assignment
    # pattern that is deprecated in pandas >= 2.1 and silently does nothing under
    # copy-on-write. Assign the filled Series back explicitly.
    data[feature] = data[feature].fillna(mode_value)
 
 

二、模型建模

使用SVM、KNN、逻辑回归、朴素贝叶斯、决策树、随机森林、XGBoost、LightGBM去训练模型,通过混淆矩阵对比各模型的准确率、精确率、召回率、F1值

from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import LogisticRegression 
import xgboost as xgb 
import lightgbm as lgb 
from sklearn.ensemble import RandomForestClassifier 
from catboost import CatBoostClassifier 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.naive_bayes import GaussianNB 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 
from sklearn.metrics import classification_report, confusion_matrix 
import warnings 
from sklearn.model_selection import train_test_split
 
warnings.filterwarnings("ignore")

# Separate features from the target and hold out 20% for testing
# (fixed seed so the split is reproducible).
target_col = 'Credit Default'
X = data.drop([target_col], axis=1)
y = data[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'训练集形状:{X_train.shape},测试集形状:{X_test.shape}')

1.支持向量机

# SVM
# SVM: fit on the training split, evaluate on the held-out test split.
svm_model = SVC(random_state=42)
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

print("\nSVM 分类报告:")
print(classification_report(y_test, svm_pred))
print("SVM 混淆矩阵:")
print(confusion_matrix(y_test, svm_pred))

svm_accuracy = accuracy_score(y_test, svm_pred)
svm_precision = precision_score(y_test, svm_pred)
svm_recall = recall_score(y_test, svm_pred)
svm_f1 = f1_score(y_test, svm_pred)
print("SVM 模型评估指标:")
print(f"准确率:{svm_accuracy:.4f}")
# BUG FIX: this line was labelled 准确率 (accuracy) but prints the precision.
print(f"精确率:{svm_precision:.4f}")
print(f"召回率:{svm_recall:.4f}")
# BUG FIX: the F1 line printed svm_accuracy instead of svm_f1.
print(f"F1 值:{svm_f1:.4f}")

 2.K近邻方法

#knn
# KNN: fit on the training split, evaluate on the held-out test split.
knn_model = KNeighborsClassifier()
knn_pred = knn_model.fit(X_train, y_train).predict(X_test)

print("\nKNN 分类报告:")
print(classification_report(y_test, knn_pred))
print("KNN 混淆矩阵:")
print(confusion_matrix(y_test, knn_pred))

# Compute the four headline metrics in one pass.
knn_accuracy, knn_precision, knn_recall, knn_f1 = (
    metric(y_test, knn_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print("KNN 模型评估指标:")
print(f"准确率:{knn_accuracy:.4f}")
print(f"精确率:{knn_precision:.4f}")
print(f"召回率:{knn_recall:.4f}")
print(f"F1 值:{knn_f1:.4f}")

3.逻辑回归

# 逻辑回归
# Logistic regression baseline (library defaults).
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)
logreg_pred = logreg_model.predict(X_test)

print("\n逻辑回归 分类报告:")
print(classification_report(y_test, logreg_pred))
print("逻辑回归 混淆矩阵:")
print(confusion_matrix(y_test, logreg_pred))

# Headline metrics on the test split.
logreg_accuracy, logreg_precision, logreg_recall, logreg_f1 = (
    fn(y_test, logreg_pred)
    for fn in (accuracy_score, precision_score, recall_score, f1_score)
)
print("逻辑回归 模型评估指标:")
print(f"准确率: {logreg_accuracy:.4f}")
print(f"精确率: {logreg_precision:.4f}")
print(f"召回率: {logreg_recall:.4f}")
print(f"F1 值: {logreg_f1:.4f}")

 4.朴素贝叶斯

#朴素贝叶斯
# Gaussian naive Bayes (library defaults).
nb_model = GaussianNB()
nb_pred = nb_model.fit(X_train, y_train).predict(X_test)

print("\n朴素贝叶斯 分类报告:")
print(classification_report(y_test, nb_pred))
print("朴素贝叶斯 混淆矩阵:")
print(confusion_matrix(y_test, nb_pred))

# Headline metrics on the test split.
nb_f1 = f1_score(y_test, nb_pred)
nb_recall = recall_score(y_test, nb_pred)
nb_precision = precision_score(y_test, nb_pred)
nb_accuracy = accuracy_score(y_test, nb_pred)
print("朴素贝叶斯 模型评估指标:")
print(f"准确率: {nb_accuracy:.4f}")
print(f"精确率: {nb_precision:.4f}")
print(f"召回率: {nb_recall:.4f}")
print(f"F1 值: {nb_f1:.4f}")

5.决策树

# 决策树
# Decision tree (fixed seed for reproducibility).
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

print("\n决策树 分类报告:")
print(classification_report(y_test, dt_pred))
print("决策树 混淆矩阵:")
print(confusion_matrix(y_test, dt_pred))

# Headline metrics on the test split.
dt_accuracy, dt_precision, dt_recall, dt_f1 = (
    score_fn(y_test, dt_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)
print("决策树 模型评估指标:")
print(f"准确率: {dt_accuracy:.4f}")
print(f"精确率: {dt_precision:.4f}")
print(f"召回率: {dt_recall:.4f}")
print(f"F1 值: {dt_f1:.4f}")

 6.随机森林

# 随机森林
# Random forest (fixed seed for reproducibility).
rf_model = RandomForestClassifier(random_state=42)
rf_pred = rf_model.fit(X_train, y_train).predict(X_test)

print("\n随机森林 分类报告:")
print(classification_report(y_test, rf_pred))
print("随机森林 混淆矩阵:")
print(confusion_matrix(y_test, rf_pred))

# Headline metrics on the test split.
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_precision = precision_score(y_test, rf_pred)
rf_recall = recall_score(y_test, rf_pred)
rf_f1 = f1_score(y_test, rf_pred)
print("随机森林 模型评估指标:")
print(f"准确率: {rf_accuracy:.4f}")
print(f"精确率: {rf_precision:.4f}")
print(f"召回率: {rf_recall:.4f}")
print(f"F1 值: {rf_f1:.4f}")

7.XGBoost

# XGBoost
# XGBoost (fixed seed for reproducibility).
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

print("\nXGBoost 分类报告:")
print(classification_report(y_test, xgb_pred))
print("XGBoost 混淆矩阵:")
print(confusion_matrix(y_test, xgb_pred))

# Headline metrics on the test split.
xgb_accuracy, xgb_precision, xgb_recall, xgb_f1 = (
    m(y_test, xgb_pred) for m in (accuracy_score, precision_score, recall_score, f1_score)
)
print("XGBoost 模型评估指标:")
print(f"准确率: {xgb_accuracy:.4f}")
print(f"精确率: {xgb_precision:.4f}")
print(f"召回率: {xgb_recall:.4f}")
print(f"F1 值: {xgb_f1:.4f}")

 8.LightGBM

# LightGBM (fixed seed for reproducibility).
lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_model.fit(X_train, y_train)
lgb_pred = lgb_model.predict(X_test)

# BUG FIX: the report header was misspelled "LIghtGBm".
print("\nLightGBM 分类报告:")
print(classification_report(y_test, lgb_pred))
print("LightGBM 混淆矩阵:")
print(confusion_matrix(y_test, lgb_pred))

lgb_accuracy = accuracy_score(y_test, lgb_pred)
lgb_precision = precision_score(y_test, lgb_pred)
lgb_recall = recall_score(y_test, lgb_pred)
lgb_f1 = f1_score(y_test, lgb_pred)
print("LightGBM 模型评估指标:")
print(f"准确率: {lgb_accuracy:.4f}")
print(f"精确率: {lgb_precision:.4f}")
print(f"召回率: {lgb_recall:.4f}")
print(f"F1 值: {lgb_f1:.4f}")

你可能感兴趣的:(python打卡60天行动,python,开发语言)