导入需要的一些包,并给定简单的测试数据集
from sklearn.model_selection import train_test_split
from pandas import DataFrame
from xgboost import XGBRegressor
from xgboost import plot_tree
import matplotlib.pyplot as plt
import shap
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_iris
from sklearn import tree
import sys
import os
# NOTE(review): appends /usr/lib to PATH — presumably so the Graphviz binaries
# needed by xgboost.plot_tree can be found; confirm this path on the target machine.
os.environ["PATH"] += os.pathsep + '/usr/lib'
import pandas as pd
import numpy as np
# Toy dataset: five samples, three features each, plus a numeric target.
# (Same values as np.array([[1,2,4,5,4],[2,3,6,4,2],[3,2,5,3,4]]).reshape(-1, 3),
# written out row by row for readability.)
feature_rows = [
    [1, 2, 4],
    [5, 4, 2],
    [3, 6, 4],
    [2, 3, 2],
    [5, 3, 4],
]
X = pd.DataFrame(np.array(feature_rows), columns=['a', 'b', 'c'])
y = np.array([2, 3, 3, 5, 4])
# Notebook-style echoes; these have no effect when run as a script.
X
y
# Hold out one of the five samples (20%) with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
print(y_train)
def train_predict():
    """Train a small XGBoost regressor on the global train split and return its raw margin scores.

    Uses the module-level ``X_train``/``y_train`` produced by ``train_test_split``.

    Side effects:
        - prints the training features, the model parameters, and the booster object
        - dumps the tree text to ``./xgboost_test.txt``
        - plots each of the four boosted trees (one blocking ``plt.show()`` per tree)

    Returns:
        np.ndarray: raw (untransformed) margin predictions for ``X_train``.
    """
    print(X_train)
    # Four shallow trees with a 0.3 learning rate, so every leaf value can be
    # verified by hand against the formulas in the accompanying notes.
    clf = XGBRegressor(
        n_estimators=4,      # four boosted trees
        learning_rate=0.3,   # shrinkage applied to each leaf value
        max_depth=4,
    )
    model_sklearn = clf.fit(X_train, y_train)
    # output_margin=True returns the raw sum of leaf values (plus base_score),
    # which is exactly what the hand calculations below reproduce.
    y_sklearn = clf.predict(X_train, output_margin=True)
    print(clf.get_params())
    model_sklearn.get_booster().dump_model('./xgboost_test.txt')
    # Visualize each of the four trees in turn.
    for tree_idx in range(4):
        plot_tree(model_sklearn, num_trees=tree_idx)
        plt.show()
    print(model_sklearn.get_booster())
    return y_sklearn
# Run only when executed as a script, not as an import-time side effect.
if __name__ == "__main__":
    train_predict()
以下为模型生成的四棵树,其中第一棵只有一个叶子节点,该叶子对应的预测值为 0.72。第二棵树深度为 2,有左右两个叶子节点。
xgboost 模型默认的初始预测值(base_score)为 0.5,所以第一棵树的叶子值可以计算为:(2*(y_train-0.5).sum()/(2*(4+1)))*0.3 = 0.72,其中 0.5 是默认的初始值,0.3 是学习率,4 是训练样本的个数,1 是正则项系数 λ 的默认值。
对于第二棵树的左边叶子,条件 a < 2 将样本分成两个部分。
0.3*2*(2-0.72-0.5)/4 = 0.3*(1.28*2-1)/(2+2)
## 输出的数值为0.11699999999999999
## 第二颗树左边叶子节点为0.11699999
对于第二棵树的右边叶子:
0.3*2*(np.array([3.28, 4.28, 2.28])-0.5).sum()/8
## 输出结果为0.6255
## 第二颗树的右节点数值0.625500023
以此类推可以计算第三棵树和第四棵树的叶子节点值。
有了叶子节点值后,就可以预测样本了。
未完待续!