# 在终端(命令行)中运行以下命令安装XGBoost(这是shell命令,不是Python代码):pip install xgboost
# 如果是在Jupyter Notebook编辑器中,则可输入如下内容,然后运行该代码块即可(需取消注释):
# !pip install xgboost
# XGBoost分类模型的引入方式:
from xgboost import XGBClassifier
# 在Jupyter Notebook编辑器中,在引入该库后,可以通过如下代码获取官方讲解内容(需取消注释):
# XGBClassifier?
import numpy as np
np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
array([[ 1, 2],
[ 3, 4],
[ 5, 6],
[ 7, 8],
[ 9, 10]])
# --- XGBoost quick-start: a tiny classifier, then a tiny regressor ---
import numpy as np
from xgboost import XGBClassifier, XGBRegressor
# In a Jupyter Notebook, running `XGBRegressor?` after the import shows the
# built-in help for the class.

# Classification demo on five two-feature samples.
# The original author notes that, since a 2020 upgrade, the features must be
# passed as a numpy array or a DataFrame.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
y = [0, 0, 0, 1, 1]
model = XGBClassifier().fit(X, y)
print(model.predict(np.array([[5, 5]])))

# Regression demo: same feature rows, numeric target.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
y = list(range(1, 6))
model = XGBRegressor().fit(X, y)
print(model.predict(np.array([[5, 5]])))
12.2.1 案例背景
12.2.2 模型搭建
import pandas as pd
df = pd.read_excel('信用卡交易数据.xlsx')
换设备次数 | 支付失败次数 | 换IP次数 | 换IP国次数 | 交易金额 | 欺诈标签 | |
0 | 0 | 11 | 3 | 5 | 28836 | 1 |
1 | 5 | 6 | 1 | 4 | 21966 | 1 |
2 | 6 | 2 | 0 | 0 | 18199 | 1 |
3 | 5 | 8 | 2 | 2 | 24803 | 1 |
4 | 7 | 10 | 5 | 0 | 26277 | 1 |
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Target is the fraud label column; every other column is a feature.
y = df['欺诈标签']
X = df.drop(columns='欺诈标签')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Train the XGBoost classifier on the training portion only.
clf = XGBClassifier(learning_rate=0.05, n_estimators=100)
clf.fit(X_train, y_train)
XGBClassifier(base_score=None, booster=None, callbacks=None,In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.05, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=None, ...)
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.05, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=None, ...)
12.2.3 模型预测及评估
# 模型搭建完毕后,通过如下代码预测测试集数据:
y_pred = clf.predict(X_test)
y_pred # 打印预测结果
array([0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
1, 1])
# Put predictions and ground truth side by side for a row-by-row comparison.
a = pd.DataFrame({'预测值': list(y_pred), '实际值': list(y_test)})
预测值 | 实际值 | |
0 | 0 | 1 |
1 | 1 | 1 |
2 | 1 | 1 |
3 | 0 | 0 |
4 | 0 | 1 |
# The first five rows shown above are 60% correct; accuracy_score reports the
# accuracy over the whole test set.
from sklearn.metrics import accuracy_score
# scikit-learn's documented order is accuracy_score(y_true, y_pred): true labels first.
score = accuracy_score(y_test, y_pred)
# The classifier's own score() method reports the same accuracy
# (0.875 according to the original notes).
clf.score(X_test, y_test)
# The classifier does not predict a hard 0/1 directly; it estimates the
# probability of each class, available through predict_proba().
y_pred_proba = clf.predict_proba(X_test)
print(y_pred_proba[0:5])  # probabilities for the first five test rows
[[0.8265032 0.1734968 ]
[0.02098632 0.9790137 ]
[0.0084281 0.9915719 ]
[0.8999369 0.1000631 ]
[0.8290514 0.17094862]]
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve

# y_pred_proba is two-dimensional: column 0 is the probability of class 0
# (not fraud), column 1 the probability of class 1 (fraud).
# y_pred_proba[:, 1] therefore selects the fraud probability.

# ROC curve (same approach as the section 4.3 code referenced by the author).
fpr, tpr, thres = roc_curve(y_test, y_pred_proba[:, 1])
plt.plot(fpr, tpr)

# Area under the ROC curve.
score = roc_auc_score(y_test, y_pred_proba[:, 1])
# 我们可以通过查看各个特征的特征重要性(feature importance)来得出信用卡欺诈行为判断中最重要的特征变量:
array([0.40674362, 0.19018467, 0.04100984, 0.33347663, 0.02858528], dtype=float32)
# Tabulate the feature importances (same layout as the section 5.2.2 material
# referenced by the author) so they are easier to read.
features = X.columns  # feature names
importances = clf.feature_importances_  # importance of each feature
# Two-column table: feature name and its importance.
importances_df = pd.DataFrame({'特征名称': features, '特征重要性': importances})
importances_df.sort_values(by='特征重要性', ascending=False)
特征名称 | 特征重要性 | |
0 | 换设备次数 | 0.406744 |
3 | 换IP国次数 | 0.333477 |
1 | 支付失败次数 | 0.190185 |
2 | 换IP次数 | 0.041010 |
4 | 交易金额 | 0.028585 |
12.2.4 模型参数调优
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyper-parameter of the classifier.
parameters = {
    'max_depth': [1, 3, 5],
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
}
clf = XGBClassifier()  # base estimator for the search
grid_search = GridSearchCV(estimator=clf, param_grid=parameters, scoring='roc_auc', cv=5)
# Run the search on the training data, then show the best combination found.
grid_search.fit(X_train, y_train)
grid_search.best_params_
{'learning_rate': 0.05, 'max_depth': 1, 'n_estimators': 100}
# 下面我们根据新的参数建模,首先重新搭建XGBoost分类器,并将训练集数据传入其中:
clf = XGBClassifier(max_depth=1, n_estimators=100, learning_rate=0.05)
clf.fit(X_train, y_train)
XGBClassifier(base_score=None, booster=None, callbacks=None,In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.05, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=1, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=None, ...)
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.05, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=1, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=None, ...)
from sklearn.metrics import roc_auc_score

# The search was scored on ROC AUC, so evaluate the re-trained model on the
# same metric using the held-out test set.
y_pred_proba = clf.predict_proba(X_test)
score = roc_auc_score(y_test, y_pred_proba[:, 1])
12.3.1 案例背景
12.3.2 多元线性回归模型
import pandas as pd
df = pd.read_excel('信用评分卡模型.xlsx')
月收入 | 年龄 | 性别 | 历史授信额度 | 历史违约次数 | 信用评分 | |
0 | 7783 | 29 | 0 | 32274 | 3 | 73 |
1 | 7836 | 40 | 1 | 6681 | 4 | 72 |
2 | 6398 | 25 | 0 | 26038 | 2 | 74 |
3 | 6483 | 23 | 1 | 24584 | 4 | 65 |
4 | 5167 | 23 | 1 | 6710 | 3 | 73 |
# Separate the features from the target (the credit score column).
X = df.drop(columns='信用评分')
Y = df['信用评分']
# Train a multiple linear regression model from scikit-learn.
from sklearn.linear_model import LinearRegression
model = LinearRegression()
# The model must be fitted before coef_ / intercept_ exist; the fit call was
# missing even though the code that follows reads both attributes.
model.fit(X, Y)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
# 4.线性回归方程构造
print('各系数为:' + str(model.coef_))
print('常数项系数k0为:' + str(model.intercept_))
各系数为:[ 5.58658996e-04 1.62842002e-01 2.18430276e-01 6.69996665e-05
# Evaluate the multiple linear regression with statsmodels (the approach from
# the section 3.2 material referenced by the author).
import statsmodels.api as sm
X2 = sm.add_constant(X)  # add the intercept column expected by OLS
est = sm.OLS(Y, X2).fit()
# Display the regression report (R-squared, coefficients, p-values); this call
# was missing although its output is what the text below discusses.
est.summary()
Dep. Variable: | 信用评分 | R-squared: | 0.629 |
Model: | OLS | Adj. R-squared: | 0.628 |
Method: | Least Squares | F-statistic: | 337.6 |
Date: | Mon, 17 Apr 2023 | Prob (F-statistic): | 2.32e-211 |
Time: | 10:56:44 | Log-Likelihood: | -2969.8 |
No. Observations: | 1000 | AIC: | 5952. |
Df Residuals: | 994 | BIC: | 5981. |
Df Model: | 5 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
const | 67.1669 | 1.121 | 59.906 | 0.000 | 64.967 | 69.367 |
月收入 | 0.0006 | 8.29e-05 | 6.735 | 0.000 | 0.000 | 0.001 |
年龄 | 0.1628 | 0.022 | 7.420 | 0.000 | 0.120 | 0.206 |
性别 | 0.2184 | 0.299 | 0.730 | 0.466 | -0.369 | 0.806 |
历史授信额度 | 6.7e-05 | 7.78e-06 | 8.609 | 0.000 | 5.17e-05 | 8.23e-05 |
历史违约次数 | -1.5106 | 0.140 | -10.811 | 0.000 | -1.785 | -1.236 |
Omnibus: | 13.180 | Durbin-Watson: | 1.996 |
Prob(Omnibus): | 0.001 | Jarque-Bera (JB): | 12.534 |
Skew: | -0.236 | Prob(JB): | 0.00190 |
Kurtosis: | 2.721 | Cond. No. | 4.27e+05 |
可以看到模型整体的R-squared为0.629,Adj. R-Squared为0.628,整体拟合效果一般,可能是因为数据量偏少的原因。同时我们再来观察P值,可以发现大部分特征变量的P值都较小(小于0.05),的确是和目标变量:信用评分显著相关,而性别这一特征变量的P值达到了0.466,即与目标变量没有显著相关性,这个也的确符合经验认知,所以在多元线性回归模型中,我们其实可以把性别这一特征变量舍去。
12.3.3 GBDT回归模型
# GBDT regression (introduced in chapter 9 per the author) on the same
# credit-score data; loading and feature/target extraction mirror the linear
# regression section.
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

# 1. Load the data.
df = pd.read_excel('信用评分卡模型.xlsx')

# 2. Target is the credit score; all remaining columns are features.
y = df['信用评分']
X = df.drop(columns='信用评分')

# 3. 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# 4. Fit a GBDT regressor with its default hyper-parameters.
model = GradientBoostingRegressor()
model.fit(X_train, y_train)
GradientBoostingRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
# 模型搭建完毕后,通过如下代码预测测试集数据:
y_pred = model.predict(X_test)
[70.77631652 71.40032104 73.73465155 84.52533945 71.09188294 84.9327599
73.72232388 83.44560704 82.61221486 84.86927209]
# Put predictions and ground truth side by side for a row-by-row comparison.
a = pd.DataFrame({'预测值': list(y_pred), '实际值': list(y_test)})
预测值 | 实际值 | |
0 | 70.776317 | 79 |
1 | 71.400321 | 80 |
2 | 73.734652 | 62 |
3 | 84.525339 | 89 |
4 | 71.091883 | 80 |
from sklearn.metrics import r2_score

# A regression model is judged here by its R-squared on the test set.
r2 = r2_score(y_true=y_test, y_pred=model.predict(X_test))
# The regressor's own score() method is an alternative way to check the fit.
model.score(X_test, y_test)
12.3.4 XGBoost回归模型
# Steps 1-3 (load, extract features/target, split) are the same as in the GBDT
# section; the new part is step 4, the XGBoost model.
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# 1. Load the data.
df = pd.read_excel('信用评分卡模型.xlsx')

# 2. Target is the credit score; all remaining columns are features.
y = df['信用评分']
X = df.drop(columns='信用评分')

# 3. 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# 4. Train XGBRegressor (imported from the xgboost package) with an explicit
#    number of trees and learning rate rather than the library defaults.
model = XGBRegressor(
    n_estimators=20,
    learning_rate=0.3,
)
model.fit(X_train, y_train)
XGBRegressor(base_score=None, booster=None, callbacks=None,In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.3, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=20, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=None, ...)
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBRegressor(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.3, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=20, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=None, ...)
# 模型搭建完毕后,通过如下代码预测测试集数据:
y_pred = model.predict(X_test)
[71.72233 71.767944 74.16581 84.49712 71.25674 84.90718 74.56596
82.27333 81.87485 84.925186]
# Put predictions and ground truth side by side for a row-by-row comparison.
a = pd.DataFrame({'预测值': list(y_pred), '实际值': list(y_test)})
预测值 | 实际值 | |
0 | 71.722328 | 79 |
1 | 71.767944 | 80 |
2 | 74.165810 | 62 |
3 | 84.497124 | 89 |
4 | 71.256737 | 80 |
from sklearn.metrics import r2_score

# A regression model is judged here by its R-squared on the test set.
r2 = r2_score(y_true=y_test, y_pred=model.predict(X_test))
# The regressor's own score() method is an alternative way to check the fit.
model.score(X_test, y_test)

# Feature importances via the feature_importances_ attribute, as in the
# classification example earlier.
features = X.columns  # feature names
importances = model.feature_importances_  # importance of each feature
# Two-column table: feature name and its importance.
importances_df = pd.DataFrame({'特征名称': features, '特征重要性': importances})
importances_df.sort_values(by='特征重要性', ascending=False)
特征名称 | 特征重要性 | |
0 | 月收入 | 0.372411 |
4 | 历史违约次数 | 0.328623 |
3 | 历史授信额度 | 0.190565 |
1 | 年龄 | 0.066498 |
2 | 性别 | 0.041902 |
# Tune the XGBoost regressor with the same grid-search approach used for the
# classifier earlier.
from sklearn.model_selection import GridSearchCV
# Candidate values for each hyper-parameter.
parameters = {'max_depth': [1, 3, 5], 'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.05, 0.1, 0.2]}
clf = XGBRegressor()  # fresh, un-tuned regressor used as the search's base estimator
# Pass the estimator built on the line above; previously the already-trained
# `model` was passed and `clf` was left unused.
grid_search = GridSearchCV(clf, parameters, scoring='r2', cv=5)
grid_search.fit(X_train, y_train)  # run the search on the training data
grid_search.best_params_  # best parameter combination found
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
{‘learning_rate’: 0.1, ‘max_depth’: 3, ‘n_estimators’: 50}
# 在模型中设置参数,代码如下:
model = XGBRegressor(max_depth=3, n_estimators=50, learning_rate=0.1)
model.fit(X_train, y_train)
XGBRegressor(base_score=None, booster=None, callbacks=None,In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.1, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=3, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=50, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=None, ...)
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBRegressor(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.1, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=3, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=50, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=None, ...)
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

# Re-evaluate the tuned model with r2_score (model.score(X_test, y_test) is
# an equivalent way to get the score, per the author).
r2 = r2_score(y_true=y_test, y_pred=model.predict(X_test))

# Standardize every feature column.
# NOTE(review): the scaler is fitted on the full X, before any train/test
# split - confirm this is acceptable for the comparison being made.
X_new = StandardScaler().fit_transform(X)
X_new  # show the standardized data
array([[-0.88269208, -1.04890243, -1.01409939, -0.60873764, 0.63591822],
[-0.86319167, 0.09630122, 0.98609664, -1.55243002, 1.27956013],
[-1.39227834, -1.46534013, -1.01409939, -0.83867808, -0.0077237 ],
[ 1.44337605, 0.61684833, 0.98609664, 1.01172301, -0.0077237 ],
[ 0.63723633, -0.21602705, 0.98609664, -0.32732239, -0.0077237 ],
[ 1.57656755, 0.61684833, -1.01409939, 1.30047599, -0.0077237 ]])
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# 3. Split the standardized features 80/20 with the same fixed seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.2, random_state=123
)

# 4. Train an XGBRegressor (from the xgboost package) with default parameters.
model = XGBRegressor()
model.fit(X_train, y_train)

# Judge the fit by R-squared on the test set.
r2 = r2_score(y_true=y_test, y_pred=model.predict(X_test))
# 在终端(命令行)中运行以下命令安装LightGBM(这是shell命令,不是Python代码):pip install lightgbm
# 如果是在Jupyter Notebook编辑器中,则可输入如下内容(需取消注释),然后运行该代码块即可:
# !pip install lightgbm
# 在Jupyter Notebook编辑器中,在引入该库后,可以通过如下代码获取官方讲解内容(需取消注释):
# LGBMClassifier?
# --- LightGBM quick-start: a tiny classifier, then a tiny regressor ---
from lightgbm import LGBMClassifier, LGBMRegressor
# In a Jupyter Notebook, running `LGBMRegressor?` after the import shows the
# built-in help for the class.

# Classification demo on five two-feature samples.
X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
y = [0, 0, 0, 1, 1]
model = LGBMClassifier()
model.fit(X, y)
print(model.predict([[5, 5]]))

# Regression demo: same feature rows, numeric target.
X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
y = list(range(1, 6))
model = LGBMRegressor()
model.fit(X, y)
print(model.predict([[5, 5]]))
12.5.1 案例背景
12.5.2 模型搭建
# 1.读取数据
import pandas as pd
df = pd.read_excel('客户信息及违约表现.xlsx')
收入 | 年龄 | 性别 | 历史授信额度 | 历史违约次数 | 是否违约 | |
0 | 462087 | 26 | 1 | 0 | 1 | 1 |
1 | 362324 | 32 | 0 | 13583 | 0 | 1 |
2 | 332011 | 52 | 1 | 0 | 1 | 1 |
3 | 252895 | 39 | 0 | 0 | 1 | 1 |
4 | 352355 | 50 | 1 | 0 | 0 | 1 |
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

# 2. Target is the default flag; all remaining columns are features.
Y = df['是否违约']
X = df.drop(columns='是否违约')

# 3. 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=123
)

# 4. Train a LightGBM classifier with default parameters.
model = LGBMClassifier()
model.fit(X_train, y_train)
LGBMClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
# 通过如下代码可以查看官方讲解
# LGBMClassifier?
12.5.3 模型预测及评估
# 预测测试集数据
y_pred = model.predict(X_test)
[1 0 1 0 1 0 1 0 0 0 1 1 1 0 1 0 1 1 0 0 1 0 1 1 0 0 0 1 0 0 0 1 0 1 0 1 0
1 1 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 1
0 1 1 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 1 1 1 0 1 0 0 0 0 0 0 1 0 1 0 1 1 0
0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 1
0 1 0 1 0 0 1 0 0 0 1 0 1 0 0 1 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0
0 0 1 1 0 1 0 0 1 1 0 1 0 1 1]
# Put predictions and ground truth side by side for a row-by-row comparison.
a = pd.DataFrame({'预测值': list(y_pred), '实际值': list(y_test)})
预测值 | 实际值 | |
0 | 1 | 1 |
1 | 0 | 1 |
2 | 1 | 1 |
3 | 0 | 0 |
4 | 1 | 1 |
from sklearn.metrics import accuracy_score
# scikit-learn's documented order is accuracy_score(y_true, y_pred): true labels first.
score = accuracy_score(y_test, y_pred)
# The classifier's own score() method is another way to get the accuracy.
model.score(X_test, y_test)
# Per-class probabilities for each test row.
y_pred_proba = model.predict_proba(X_test)
# ROC curve, built from the probability of class 1 (column index 1).
from sklearn.metrics import roc_curve
fpr, tpr, thres = roc_curve(y_test, y_pred_proba[:,1])
import matplotlib.pyplot as plt
plt.plot(fpr, tpr)
# Area under the ROC curve.
from sklearn.metrics import roc_auc_score
score = roc_auc_score(y_test, y_pred_proba[:,1])
# 特征重要性
array([1175, 668, 118, 895, 126])
features = X.columns  # feature names
importances = model.feature_importances_  # importance of each feature
# Two-column table: feature name and its importance.
importances_df = pd.DataFrame({'特征名称': features, '特征重要性': importances})
importances_df.sort_values(by='特征重要性', ascending=False)
特征名称 | 特征重要性 | |
0 | 收入 | 1175 |
3 | 历史授信额度 | 895 |
1 | 年龄 | 668 |
4 | 历史违约次数 | 126 |
2 | 性别 | 118 |
12.5.4 模型参数调优
# Hyper-parameter tuning via grid search.
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyper-parameter of the classifier.
parameters = {
    'num_leaves': [10, 15, 31],
    'n_estimators': [10, 20, 30],
    'learning_rate': [0.05, 0.1, 0.2],
}
model = LGBMClassifier()  # base estimator for the search
# cv=5 requests 5-fold cross-validation; scoring='roc_auc' ranks candidates by ROC AUC.
grid_search = GridSearchCV(estimator=model, param_grid=parameters, scoring='roc_auc', cv=5)
# Run the search on the training data, then show the best combination found.
grid_search.fit(X_train, y_train)
grid_search.best_params_
{'learning_rate': 0.2, 'n_estimators': 10, 'num_leaves': 10}
# 重新搭建分类器
model = LGBMClassifier(num_leaves=10, n_estimators=10,learning_rate=0.2)
model.fit(X_train, y_train)
LGBMClassifier(learning_rate=0.2, n_estimators=10, num_leaves=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LGBMClassifier(learning_rate=0.2, n_estimators=10, num_leaves=10)
# ROC curve of the re-trained classifier.
y_pred_proba = model.predict_proba(X_test)
from sklearn.metrics import roc_curve
fpr, tpr, thres = roc_curve(y_test, y_pred_proba[:, 1])
import matplotlib.pyplot as plt
plt.plot(fpr, tpr)
# AUC of the same predictions. The probabilities computed above are reused:
# neither the model nor X_test changes in between, so a second
# predict_proba call would only repeat the work.
from sklearn.metrics import roc_auc_score
score = roc_auc_score(y_test, y_pred_proba[:, 1])
12.6.1 案例背景
12.6.2 模型搭建
# 读取数据
import pandas as pd
df = pd.read_excel('广告收益数据.xlsx')
电视 | 广播 | 报纸 | 收益 | |
0 | 230.1 | 37.8 | 69.2 | 331.5 |
1 | 44.5 | 39.3 | 45.1 | 156.0 |
2 | 17.2 | 45.9 | 69.3 | 139.5 |
3 | 151.5 | 41.3 | 58.5 | 277.5 |
4 | 180.8 | 10.8 | 58.4 | 193.5 |
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split

# 1. Target is the revenue column; all remaining columns are features.
y = df['收益']
X = df.drop(columns='收益')

# 2. 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# 3. Train a LightGBM regressor with default parameters.
model = LGBMRegressor()
model.fit(X_train, y_train)
LGBMRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
12.6.3 模型预测及评估
# 预测测试数据
y_pred = model.predict(X_test)
array([192.6139063 , 295.11999665, 179.92649365, 293.45888909,
# Put predictions and ground truth side by side for a row-by-row comparison.
a = pd.DataFrame({'预测值': list(y_pred), '实际值': list(y_test)})
预测值 | 实际值 | |
0 | 192.613906 | 190.5 |
1 | 295.119997 | 292.5 |
2 | 179.926494 | 171.0 |
3 | 293.458889 | 324.0 |
4 | 166.861594 | 144.0 |
# Predict for one hand-entered sample. The three values are presumably in the
# feature column order of the training data (电视, 广播, 报纸) - verify.
X = [[71, 11, 2]]
# The prediction call itself was missing; without it the sample is never used.
model.predict(X)
# R-squared on the test set.
from sklearn.metrics import r2_score
r2 = r2_score(y_test, model.predict(X_test))
# The regressor's own score() method is an alternative way to check the fit.
model.score(X_test, y_test)
# 特征重要性
array([ 950, 1049, 963])
12.6.4 模型参数调优
# Hyper-parameter tuning via grid search.
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyper-parameter of the regressor.
parameters = {
    'num_leaves': [15, 31, 62],
    'n_estimators': [20, 30, 50, 70],
    'learning_rate': [0.1, 0.2, 0.3, 0.4],
}
model = LGBMRegressor()  # base estimator for the search
# cv=5 requests 5-fold cross-validation; scoring='r2' ranks candidates by R-squared.
grid_search = GridSearchCV(estimator=model, param_grid=parameters, scoring='r2', cv=5)
# Run the search on the training data, then show the best combination found.
grid_search.fit(X_train, y_train)
grid_search.best_params_
{'learning_rate': 0.3, 'n_estimators': 50, 'num_leaves': 31}
# Rebuild the LightGBM regressor with the tuned hyper-parameters and retrain it.
model = LGBMRegressor(
    num_leaves=31,
    n_estimators=50,
    learning_rate=0.3,
)
model.fit(X_train, y_train)
# Score the tuned model on the test set.
model.score(X_test, y_test)