from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()      # load the iris dataset
X = iris.data[:, 2:]    # keep the last two feature columns (petal length and width)
y = iris.target         # class labels
decision_tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
decision_tree_clf.fit(X, y)
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=2, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')
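Once fitted, the tree can be queried directly; a minimal sketch (the point [[5, 1.5]] is just an illustrative petal length/width pair, not from the original text):

sample = [[5, 1.5]]  # hypothetical flower: petal length 5 cm, petal width 1.5 cm
print(decision_tree_clf.predict(sample))        # predicted class index
print(decision_tree_clf.predict_proba(sample))  # class proportions at the reached leaf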
import os
from graphviz import Source
from sklearn.tree import export_graphviz

IMAGES_PATH = "."  # directory for the exported .dot file; adjust as needed
export_graphviz(decision_tree_clf,
                out_file=os.path.join(IMAGES_PATH, "iris_tree.dot"),
                feature_names=iris.feature_names[2:],
                class_names=iris.target_names,
                rounded=True,
                filled=True)
Source.from_file(os.path.join(IMAGES_PATH, "iris_tree.dot"))
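Outside a notebook, the Source object will not display inline; one option is to write an image file instead, a sketch using graphviz's render method (the output filename is illustrative):

# Render the .dot file to iris_tree.png in IMAGES_PATH (filename is illustrative)
Source.from_file(os.path.join(IMAGES_PATH, "iris_tree.dot")).render(
    filename="iris_tree", directory=IMAGES_PATH, format="png", cleanup=True)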
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

def plot_decision_boundary(classifier, X, y, axs=[0, 7.5, 0, 3]):
    # 1. Build a grid covering the axis ranges passed in via axs
    xx1 = np.linspace(axs[0], axs[1], 100)
    xx2 = np.linspace(axs[2], axs[3], 100)
    x1, x2 = np.meshgrid(xx1, xx2)
    X_new = np.array([x1.ravel(), x2.ravel()]).T
    # 2. Predict the class of every grid point (x1.shape == (100, 100))
    y_pred = classifier.predict(X_new).reshape(x1.shape)
    # 3. Assign one color per class and shade the regions from the grid predictions
    colors = ('red', 'blue', 'green', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])
    plt.contourf(x1, x2, y_pred, alpha=0.5, cmap=cmap)
    # 4. Plot the training samples
    plt.plot(X[:, 0][y == 0], X[:, 1][y == 0], "ro", label="Iris setosa")
    plt.plot(X[:, 0][y == 1], X[:, 1][y == 1], "bs", label="Iris versicolor")
    plt.plot(X[:, 0][y == 2], X[:, 1][y == 2], "g^", label="Iris virginica")
    # Default axis labels (callers can override them afterwards)
    plt.xlabel("Petal Length")
    plt.ylabel("Petal Width")
plt.figure(figsize=(8, 4))
# Recall: X = iris.data[:, 2:] (petal length and width), y = iris.target
plot_decision_boundary(decision_tree_clf, X, y)
# Each plt.plot below takes [x_start, x_end], [y_start, y_end] for one line segment
plt.plot([2.45, 2.45], [0, 3], "k-", linewidth=2)
plt.plot([2.45, 7.5], [1.75, 1.75], "k--", linewidth=2)
plt.plot([4.95, 4.95], [0, 1.75], "k:", linewidth=2)
plt.plot([4.85, 4.85], [1.75, 3], "k:", linewidth=2)
plt.text(1.40, 1.0, "Depth=0", fontsize=15)
plt.text(3.2, 1.80, "Depth=1", fontsize=13)
plt.text(4.05, 0.5, "(Depth=2)", fontsize=11)
plt.show()
Gini impurity:
$G_i = 1 - \sum_{k=1}^{n} p_{i,k}^2$
Entropy:
$H_i = -\sum_{k=1,\ p_{i,k} \ne 0}^{n} p_{i,k} \log_2(p_{i,k})$
Classification error:
$I_E = 1 - \max\{p(i \mid t)\}$
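As a quick numeric check, all three impurity measures follow directly from a node's class proportions; a minimal sketch with made-up class counts:

import numpy as np

counts = np.array([0, 49, 5])   # hypothetical class counts at one node
p = counts / counts.sum()       # class proportions p_{i,k}

gini = 1 - np.sum(p ** 2)                        # G_i
entropy = -np.sum(p[p > 0] * np.log2(p[p > 0]))  # H_i, skipping p = 0 terms
class_error = 1 - p.max()                        # I_E
print(gini, entropy, class_error)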
CART cost function for classification:
$J(k, t_k) = \dfrac{m_{\text{left}}}{m} G_{\text{left}} + \dfrac{m_{\text{right}}}{m} G_{\text{right}}$
Goal of training:
At each node, CART greedily searches for the feature $k$ and threshold $t_k$ that minimize this cost, then recurses on the two resulting subsets; training thus amounts to finding the set of thresholds that keeps the cost as small as possible.
Prediction complexity: $O(\log_2 m)$ — with $m = 10^6$ training samples, a prediction traverses only about 20 nodes.
Training complexity: $O(n \times m \log_2 m)$
Training is therefore comparatively expensive; for small training sets, presorting the data can speed it up (the presort parameter, which recent scikit-learn versions have deprecated, as the fitted-model output above shows).
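To make the cost function concrete, here is a hedged sketch that scores a single candidate split $(k, t_k)$ on the iris training data from above; the helper gini() and the threshold 2.45 (the root split the tree learned earlier) are for illustration:

def gini(labels):
    # Gini impurity of one node: G = 1 - sum_k p_k^2
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1 - np.sum(p ** 2)

def cart_split_cost(X, y, k, t_k):
    # J(k, t_k) = (m_left / m) * G_left + (m_right / m) * G_right
    left = X[:, k] <= t_k
    m = len(y)
    return left.sum() / m * gini(y[left]) + (~left).sum() / m * gini(y[~left])

# Score the root split the tree found above: petal length (column 0 of X) <= 2.45
print(cart_split_cost(X, y, k=0, t_k=2.45))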
max_depth: the maximum depth of the tree
min_samples_split: the minimum number of samples a node must have before it can be split
min_samples_leaf: the minimum number of samples a leaf node must have
min_weight_fraction_leaf: the same constraint as min_samples_leaf, expressed as a fraction of the total number of samples
max_leaf_nodes: the maximum number of leaf nodes
max_features: the maximum number of features evaluated for splitting at each node
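For illustration only, these regularization hyperparameters can be combined freely when constructing a tree; the values below are arbitrary, not tuned for any dataset:

regularized_tree = DecisionTreeClassifier(
    max_depth=4,           # at most 4 levels deep
    min_samples_split=10,  # a node needs at least 10 samples to be split
    min_samples_leaf=4,    # every leaf keeps at least 4 samples
    max_leaf_nodes=16,     # at most 16 leaves in total
    max_features=2,        # evaluate at most 2 features per split
    random_state=42,
)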
Observing the effect of min_samples_leaf on the decision tree's fit
from sklearn.datasets import make_moons
Xmoon, ymoon = make_moons(n_samples=100, noise=0.25, random_state=53)
# min_samples_leaf defaults to 1 when not specified
decision_tree_clf1 = DecisionTreeClassifier(random_state=42)
decision_tree_clf2 = DecisionTreeClassifier(min_samples_leaf=2, random_state=42)
decision_tree_clf3 = DecisionTreeClassifier(min_samples_leaf=10, random_state=42)
# Fit the data with three trees that differ only in min_samples_leaf
decision_tree_clf1.fit(Xmoon, ymoon)
decision_tree_clf2.fit(Xmoon, ymoon)
decision_tree_clf3.fit(Xmoon, ymoon)
figure, axs = plt.subplots(ncols=3, figsize=(15, 4), sharey=True)
# Left
plt.sca(axs[0])
plot_decision_boundary(decision_tree_clf1, Xmoon, ymoon, axs=[-1.5, 2.4, -1, 1.5])
plt.title("min_samples_leaf = {}".format(decision_tree_clf1.min_samples_leaf),
          fontsize=14)
plt.xlabel("x1")
plt.ylabel("x2")
# Middle
plt.sca(axs[1])
plot_decision_boundary(decision_tree_clf2, Xmoon, ymoon, axs=[-1.5, 2.4, -1, 1.5])
plt.title("min_samples_leaf = {}".format(decision_tree_clf2.min_samples_leaf),
          fontsize=14)
plt.xlabel("x1")
plt.ylabel("")
# Right
plt.sca(axs[2])
plot_decision_boundary(decision_tree_clf3, Xmoon, ymoon, axs=[-1.5, 2.4, -1, 1.5])
plt.title("min_samples_leaf = {}".format(decision_tree_clf3.min_samples_leaf),
          fontsize=14)
plt.xlabel("x1")
plt.ylabel("")
plt.show()
Observing the effect of max_depth on the decision tree's fit
from sklearn.datasets import make_moons
Xm2, ym2 = make_moons(n_samples=100, noise=0.3, random_state=53)
decision_tree_clf4 = DecisionTreeClassifier(max_depth=2, random_state=42)
decision_tree_clf5 = DecisionTreeClassifier(max_depth=6, random_state=42)
decision_tree_clf6 = DecisionTreeClassifier(max_depth=10, random_state=42)
decision_tree_clf4.fit(Xm2, ym2)
decision_tree_clf5.fit(Xm2, ym2)
decision_tree_clf6.fit(Xm2, ym2)
figure, axs = plt.subplots(ncols=3, figsize=(15, 4))
# Left
plt.sca(axs[0])
plot_decision_boundary(decision_tree_clf4, Xm2, ym2, axs=[-1.5, 2.4, -1, 1.5])
plt.title("max_depth = {}".format(decision_tree_clf4.max_depth), fontsize=14)
plt.xlabel("x1")
plt.ylabel("x2")
# Middle
plt.sca(axs[1])
plot_decision_boundary(decision_tree_clf5, Xm2, ym2, axs=[-1.5, 2.4, -1, 1.5])
plt.title("max_depth = {}".format(decision_tree_clf5.max_depth), fontsize=14)
plt.xlabel("x1")
plt.ylabel("")
# Right
plt.sca(axs[2])
plot_decision_boundary(decision_tree_clf6, Xm2, ym2, axs=[-1.5, 2.4, -1, 1.5])
plt.title("max_depth = {}".format(decision_tree_clf6.max_depth), fontsize=14)
plt.xlabel("x1")
plt.ylabel("")
plt.show()
The effect of different max_depth values on a regression tree's fit
from sklearn.tree import DecisionTreeRegressor
import numpy as np
# Generate noisy quadratic training data: y = 4(x - 0.5)^2 plus Gaussian noise
np.random.seed(42)
X = np.random.rand(150, 1)
y = 4 * (X - 0.5) ** 2
y = y + np.random.randn(150, 1) / 10
# Define three regression trees with different maximum depths
regre_tree1 = DecisionTreeRegressor(random_state=42, max_depth=2)
regre_tree2 = DecisionTreeRegressor(random_state=42, max_depth=3)
regre_tree3 = DecisionTreeRegressor(random_state=42, max_depth=5)
# Fit the training data with each of the three trees
regre_tree1.fit(X,y)
regre_tree2.fit(X,y)
regre_tree3.fit(X,y)
# Plot the regression predictions
def draw_regression_prediction(regre_tree, X, y, axs=[0, 1, -0.2, 1]):
x1 = np.linspace(axs[0], axs[1], 500).reshape(-1, 1)
y_pred = regre_tree.predict(x1)
    # Ensure the plot's axis limits match the ranges given in axs
plt.axis(axs)
plt.plot(X, y, "b.")
plt.plot(x1, y_pred, "r-", linewidth=2, label=r"$\hat{y}$")
# Draw the figure
figure, axs = plt.subplots(ncols=3, figsize=(18,4))
# Left
plt.sca(axs[0])
draw_regression_prediction(regre_tree1, X, y)
plt.legend(loc="upper center", fontsize=14)
plt.title("max_depth={}".format(regre_tree1.max_depth), fontsize=14)
plt.xlabel('x1',fontsize=20)
plt.ylabel("y", fontsize=20, rotation=0 )
# Middle
plt.sca(axs[1])
draw_regression_prediction(regre_tree2, X, y)
plt.title("max_depth={}".format(regre_tree2.max_depth), fontsize=14)
plt.xlabel('x1',fontsize=20)
# Right
plt.sca(axs[2])
draw_regression_prediction(regre_tree3, X, y)
plt.title("max_depth={}".format(regre_tree3.max_depth), fontsize=14)
plt.xlabel("x1", fontsize=20)
plt.show()
The effect of different min_samples_leaf values on a regression tree's fit
# Without specifying min_samples_leaf, leaves are unrestricted (default is 1)
regre_tree4 = DecisionTreeRegressor(random_state=42)
regre_tree5 = DecisionTreeRegressor(random_state=42, min_samples_leaf=10)
regre_tree6 = DecisionTreeRegressor(random_state=42, min_samples_leaf=20)
# Fit the data with each of the three trees
regre_tree4.fit(X, y)
regre_tree5.fit(X, y)
regre_tree6.fit(X, y)
x1 = np.linspace(0, 1, 500).reshape(-1, 1)
y_pred4 = regre_tree4.predict(x1)
y_pred5 = regre_tree5.predict(x1)
y_pred6 = regre_tree6.predict(x1)
figure, axs = plt.subplots(ncols=3, figsize=(18, 4))
# Left
plt.sca(axs[0])
plt.plot(X, y, "b.")
plt.plot(x1, y_pred4, "r.-", linewidth=2, label=r"$\hat{y}$")
plt.axis([0, 1, -0.2, 1.1])
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", fontsize=18, rotation=0)
plt.legend(loc="upper center", fontsize=18)
plt.title("min_samples_leaf=1", fontsize=14)
# Middle
plt.sca(axs[1])
plt.plot(X, y, "b.")
plt.plot(x1, y_pred5, "r.-", linewidth=2, label=r"$\hat{y}$")
plt.axis([0, 1, -0.2, 1.1])
plt.xlabel("$x_1$", fontsize=18)
plt.title("min_samples_leaf={}".format(tree_reg2.min_samples_leaf), fontsize=14)
# Right
plt.sca(axs[2])
plt.plot(X, y, "b.")
plt.plot(x1, y_pred6, "r.-", linewidth=2, label=r"$\hat{y}$")
plt.axis([0, 1, -0.2, 1.1])
plt.xlabel("$x_1$", fontsize=18)
plt.title("min_samples_leaf={}".format(tree_reg3.min_samples_leaf), fontsize=14)
plt.show()
CART cost function for regression:
$J(k, t_k) = \dfrac{m_{\text{left}}}{m} \mathrm{MSE}_{\text{left}} + \dfrac{m_{\text{right}}}{m} \mathrm{MSE}_{\text{right}}$
Goal of training:
Exactly as in the classification case, CART picks, at each node, the feature $k$ and threshold $t_k$ that minimize this weighted-MSE cost, so training again searches for the set of thresholds that makes the cost as small as possible.
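As with classification, the weighted-MSE cost can be evaluated by hand; a minimal sketch over the quadratic X, y generated above (the brute-force threshold grid is just for illustration):

def mse(values):
    # MSE of one node around its own mean (0 for an empty side)
    return float(np.mean((values - values.mean()) ** 2)) if len(values) else 0.0

def cart_regression_cost(X, y, k, t_k):
    # J(k, t_k) = (m_left / m) * MSE_left + (m_right / m) * MSE_right
    left = X[:, k] <= t_k
    m = len(y)
    return left.sum() / m * mse(y[left]) + (~left).sum() / m * mse(y[~left])

# Brute-force scan of candidate thresholds on feature 0, as the root split would do
thresholds = np.linspace(0.01, 0.99, 99)
costs = [cart_regression_cost(X, y, 0, t) for t in thresholds]
print("best threshold:", thresholds[int(np.argmin(costs))])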