#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
@Time : 2021/6/15 7:52
@Author : Loden
@FileName: customer_default_demo.py
@Software: PyCharm
@desc : Train, evaluate and tune a LightGBM classifier that predicts customer default.
"""
import matplotlib.pyplot as plt
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
# Load the customer information / default-performance workbook.
df = pd.read_excel(
    r'D:\python_data\MuffetTrader\data_file\客户信息及违约表现.xlsx'
)

# Target is the '是否违约' ("defaulted or not") column; every other column
# is used as a feature.
Y = df['是否违约']
X = df.drop(columns='是否违约')

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)
# Build and train a LightGBM classifier with its default hyper-parameters.
model = LGBMClassifier()
model.fit(X_train, Y_train)

# Predict class labels for the held-out test set.
y_pred = model.predict(X_test)
print(y_pred)

# Put predicted and actual labels side by side for a quick visual check.
# Y_test.values drops the original row index so both columns line up
# positionally.
a = pd.DataFrame({'预测值': y_pred, '实际值': Y_test.values})
print(a.head(10))

# Overall prediction accuracy. scikit-learn metrics take (y_true, y_pred),
# in that order.
score = accuracy_score(Y_test, y_pred)
print(score)
# The estimator's own score on the same data (expected to match the value
# printed above).
print(model.score(X_test, Y_test))

# Predicted probability of each class; column 1 is used as the
# positive-class score for the ROC curve and AUC below.
y_pred_proba = model.predict_proba(X_test)
fpr, tpr, thres = roc_curve(Y_test, y_pred_proba[:, 1])
# The curve is drawn on the current figure but not displayed; call
# plt.show() after this line to view it.
plt.plot(fpr, tpr)

# Area under the ROC curve for the baseline model.
score = roc_auc_score(Y_test.values, y_pred_proba[:, 1])
print(score)
# Rank the features by the importance the trained model assigned to them.
features = X.columns
importances = model.feature_importances_
importances_df = pd.DataFrame({
    '特征名称': features,
    '特征重要性': importances,
})
# sort_values returns a new frame rather than sorting in place, so the
# result must be assigned back for the printed table to be ordered.
importances_df = importances_df.sort_values('特征重要性', ascending=False)
print(importances_df)
# Hyper-parameter tuning: exhaustive grid search with 5-fold cross-validation,
# selecting the combination with the best ROC AUC.
parameters = {
    'num_leaves': [10, 15, 31],
    'n_estimators': [10, 20, 30],
    'learning_rate': [0.05, 0.1, 0.2],
}
model = LGBMClassifier()
grid_search = GridSearchCV(model, parameters, scoring='roc_auc', cv=5)
grid_search.fit(X_train, Y_train)
print('最优参数:')
print(grid_search.best_params_)

# Retrain with the parameters the search actually selected instead of a
# hard-coded copy, so the final model cannot drift from the tuning result
# when the data or the grid changes.
model = LGBMClassifier(**grid_search.best_params_)
model.fit(X_train, Y_train)

# AUC of the tuned model on the held-out test set.
y_pred_proba = model.predict_proba(X_test)
score = roc_auc_score(Y_test, y_pred_proba[:, 1])
print(score)