

class sklearn.tree.DecisionTreeClassifier(*, criterion=‘gini’, splitter=‘best’, max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort=‘deprecated’, ccp_alpha=0.0)
scikit-learn 使用 CART 算法的优化版本


criterion: {‘gini’,‘entropy’}, default=‘gini’

注意:当使用entropy时,sklearn实际计算的是基于信息熵的信息增益(information gain),即父节点的信息熵和子节点的信息熵之差。比起基尼系数,信息熵对不纯度更加敏感,对不纯度的惩罚最强。但在实际使用中,信息熵和基尼系数的效果基本相同。信息熵的计算比基尼系数缓慢一些,因为基尼系数的计算不涉及对数。另外,由于信息熵对不纯度更加敏感,所以信息熵作为指标时,决策树的生长会更加精细,因此对于高维数据或者噪音数据很多的数据,信息熵容易过拟合,基尼系数在这种情况下往往效果比较好。当模型欠拟合的时候,可考虑使用信息熵。

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split #训练集和测试集的切分函数
wine = load_wine() #实例化数据集
(178, 13) 
# Preview the first rows: the 13 feature columns followed by the target column.
pd.concat([pd.DataFrame(wine.data), pd.DataFrame(wine.target)], axis=1).head()
0 1 2 3 4 5 6 7 8 9 10 11 12 0
0 14.23 1.71 2.43 15.6 127.0 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065.0 0
1 13.20 1.78 2.14 11.2 100.0 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050.0 0
2 13.16 2.36 2.67 18.6 101.0 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185.0 0
3 14.37 1.95 2.50 16.8 113.0 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480.0 0
4 13.24 2.59 2.87 21.0 118.0 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735.0 0
array(['class_0', 'class_1', 'class_2'], dtype='<U7')
# Hold out 30% of the 178 samples as the test set (124 train / 54 test rows).
xtrain, xtest, ytrain, ytest = train_test_split(wine.data, wine.target, test_size=0.3)
(124, 13)
(54, 13)
# Train an entropy-based tree and measure its accuracy on the held-out split.
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(xtrain, ytrain)
score = clf.score(xtest, ytest)  # mean accuracy on the test set
# Draw the fitted tree. The already-fitted estimator is passed (rather than
# refitting inline) so the plot shows exactly the model that was scored above;
# without a fixed random_state a second fit could grow a different tree.
tree.plot_tree(clf, feature_names=wine.feature_names, class_names=wine.target_names)
[Text(167.4, 195.696, 'proline <= 953.5\nentropy = 1.571\nsamples = 124\nvalue = [40, 49, 35]\nclass = class_1'),
 Text(111.60000000000001, 152.208, 'color_intensity <= 3.825\nentropy = 1.202\nsamples = 88\nvalue = [4, 49, 35]\nclass = class_1'),
 Text(55.800000000000004, 108.72, 'entropy = 0.0\nsamples = 44\nvalue = [0, 44, 0]\nclass = class_1'),
 Text(167.4, 108.72, 'flavanoids <= 1.4\nentropy = 0.934\nsamples = 44\nvalue = [4, 5, 35]\nclass = class_2'),
 Text(111.60000000000001, 65.232, 'entropy = 0.0\nsamples = 35\nvalue = [0, 0, 35]\nclass = class_2'),
 Text(223.20000000000002, 65.232, 'proline <= 679.0\nentropy = 0.991\nsamples = 9\nvalue = [4, 5, 0]\nclass = class_1'),
 Text(167.4, 21.744, 'entropy = 0.0\nsamples = 5\nvalue = [0, 5, 0]\nclass = class_1'),
 Text(279.0, 21.744, 'entropy = 0.0\nsamples = 4\nvalue = [4, 0, 0]\nclass = class_0'),
 Text(223.20000000000002, 152.208, 'entropy = 0.0\nsamples = 36\nvalue = [36, 0, 0]\nclass = class_0')]


random_state:int, RandomState instance, default=None

  1. 随机数生成器的种子,默认为None,此时每次训练得到的模型结果都可能有所不同。生成一棵最优决策树的问题在于:不管你需要的是一棵在多方面都最优的决策树,还是仅仅需要一棵理论上最优的决策树,它都是一个NP完全问题。所以实际使用的决策树学习算法都是基于启发式的贪婪算法,在每个节点上都取局部最优值。当然,这样的算法并不能保证产生的树是全局最优树,所以可以让一组训练器训练出多棵树,其中每个训练器使用的特征和数据点都是随机抽取的,然后再评估这些树以选出"全局最优树"。
  2. random_state在高维度数据集中随机性会表现得更为明显,低维度的数据集(如鸢尾花)中,随机性几乎不会显现。输入任意整数,会一直长出同一棵树,让模型稳定。
  3. 一般会设置0或者42
# Same experiment with the Gini criterion; random_state is pinned so that
# repeated runs grow the same tree.
clf = tree.DecisionTreeClassifier(criterion='gini', random_state=42)
clf = clf.fit(xtrain, ytrain)
score = clf.score(xtest, ytest)  # mean accuracy on the test set

splitter:{“best”, “random”}, default=”best”

用于在每个节点上选择拆分的策略。 best在特征的所有划分点中找出最优的划分点。random是随机的在部分划分点中找局部最优的划分点。默认的"best"适合样本量不大的时候,而如果样本数据量非常大,此时决策树构建推荐"random"。

# splitter='random' chooses split thresholds at random instead of searching for
# the best one at each node.
clf = tree.DecisionTreeClassifier(criterion='entropy', splitter='random', random_state=42)
clf = clf.fit(xtrain, ytrain)
score = clf.score(xtest, ytest)  # mean accuracy on the test set
# Plot the model that was just fitted and scored.
tree.plot_tree(clf, feature_names=wine.feature_names, class_names=wine.target_names)
[Text(139.5, 199.32, 'od280/od315_of_diluted_wines <= 2.123\nentropy = 1.571\nsamples = 124\nvalue = [40, 49, 35]\nclass = class_1'),
 Text(63.77142857142857, 163.07999999999998, 'hue <= 0.936\nentropy = 0.669\nsamples = 40\nvalue = [0, 7, 33]\nclass = class_2'),
 Text(31.885714285714286, 126.83999999999999, 'od280/od315_of_diluted_wines <= 1.785\nentropy = 0.323\nsamples = 34\nvalue = [0, 2, 32]\nclass = class_2'),
 Text(15.942857142857143, 90.6, 'entropy = 0.0\nsamples = 24\nvalue = [0, 0, 24]\nclass = class_2'),
 Text(47.82857142857143, 90.6, 'flavanoids <= 0.872\nentropy = 0.722\nsamples = 10\nvalue = [0, 2, 8]\nclass = class_2'),
 Text(31.885714285714286, 54.359999999999985, 'entropy = 0.0\nsamples = 7\nvalue = [0, 0, 7]\nclass = class_2'),
 Text(63.77142857142857, 54.359999999999985, 'ash <= 2.617\nentropy = 0.918\nsamples = 3\nvalue = [0, 2, 1]\nclass = class_1'),
 Text(47.82857142857143, 18.119999999999976, 'entropy = 0.0\nsamples = 2\nvalue = [0, 2, 0]\nclass = class_1'),
 Text(79.71428571428572, 18.119999999999976, 'entropy = 0.0\nsamples = 1\nvalue = [0, 0, 1]\nclass = class_2'),
 Text(95.65714285714286, 126.83999999999999, 'alcohol <= 13.386\nentropy = 0.65\nsamples = 6\nvalue = [0, 5, 1]\nclass = class_1'),
 Text(79.71428571428572, 90.6, 'entropy = 0.0\nsamples = 5\nvalue = [0, 5, 0]\nclass = class_1'),
 Text(111.6, 90.6, 'entropy = 0.0\nsamples = 1\nvalue = [0, 0, 1]\nclass = class_2'),
 Text(215.22857142857143, 163.07999999999998, 'alcohol <= 13.013\nentropy = 1.138\nsamples = 84\nvalue = [40, 42, 2]\nclass = class_1'),
 Text(159.42857142857144, 126.83999999999999, 'flavanoids <= 0.622\nentropy = 0.324\nsamples = 42\nvalue = [1, 40, 1]\nclass = class_1'),
 Text(143.4857142857143, 90.6, 'entropy = 0.0\nsamples = 1\nvalue = [0, 0, 1]\nclass = class_2'),
 Text(175.37142857142857, 90.6, 'od280/od315_of_diluted_wines <= 3.584\nentropy = 0.165\nsamples = 41\nvalue = [1, 40, 0]\nclass = class_1'),
 Text(159.42857142857144, 54.359999999999985, 'entropy = 0.0\nsamples = 39\nvalue = [0, 39, 0]\nclass = class_1'),
 Text(191.31428571428572, 54.359999999999985, 'nonflavanoid_phenols <= 0.233\nentropy = 1.0\nsamples = 2\nvalue = [1, 1, 0]\nclass = class_0'),
 Text(175.37142857142857, 18.119999999999976, 'entropy = 0.0\nsamples = 1\nvalue = [0, 1, 0]\nclass = class_1'),
 Text(207.25714285714287, 18.119999999999976, 'entropy = 0.0\nsamples = 1\nvalue = [1, 0, 0]\nclass = class_0'),
 Text(271.0285714285714, 126.83999999999999, 'hue <= 0.772\nentropy = 0.437\nsamples = 42\nvalue = [39, 2, 1]\nclass = class_0'),
 Text(239.14285714285714, 90.6, 'total_phenols <= 1.633\nentropy = 1.0\nsamples = 2\nvalue = [0, 1, 1]\nclass = class_1'),
 Text(223.2, 54.359999999999985, 'entropy = 0.0\nsamples = 1\nvalue = [0, 0, 1]\nclass = class_2'),
 Text(255.0857142857143, 54.359999999999985, 'entropy = 0.0\nsamples = 1\nvalue = [0, 1, 0]\nclass = class_1'),
 Text(302.9142857142857, 90.6, 'hue <= 1.308\nentropy = 0.169\nsamples = 40\nvalue = [39, 1, 0]\nclass = class_0'),
 Text(286.9714285714286, 54.359999999999985, 'entropy = 0.0\nsamples = 39\nvalue = [39, 0, 0]\nclass = class_0'),
 Text(318.8571428571429, 54.359999999999985, 'entropy = 0.0\nsamples = 1\nvalue = [0, 1, 0]\nclass = class_1')]


score = clf.score(xtrain, ytrain)



  1. max_depth

  2. min_samples_leaf
    该参数限定了一个结点在分枝后的子节点中都必须包含至少min_samples_leaf个训练样本,否则分枝就不会发生。与max_depth一起搭配使用,可以让模型变得更加平滑。这个参数的数量设置得太小容易引起过拟合,太大则会阻止模型学习数据。一般可从min_samples_leaf=5开始尝试。如果叶子结点中的样本量变化很大,可输入浮点数作为样本量的百分比来使用。同时,该参数可以保证每个叶子的最小尺寸,可以在回归问题中避免低方差、过拟合的叶子结点出现。对于类别不多的分类问题,1通常就是最佳选择。

  3. min_samples_split

# Pre-pruned tree: depth capped at 3, and every split needs at least 10 samples
# in the node and must leave at least 10 samples in each child.
clf = tree.DecisionTreeClassifier(criterion='entropy', random_state=42, splitter='random', max_depth=3, min_samples_leaf=10, min_samples_split=10)
clf = clf.fit(xtrain, ytrain)
score = clf.score(xtest, ytest)  # mean accuracy on the test set
# No feature/class names are passed, so nodes are labelled X[i].
tree.plot_tree(clf)
[Text(153.45000000000002, 190.26, 'X[11] <= 2.123\nentropy = 1.571\nsamples = 124\nvalue = [40, 49, 35]'),
 Text(83.7, 135.9, 'X[1] <= 3.601\nentropy = 0.669\nsamples = 40\nvalue = [0, 7, 33]'),
 Text(55.800000000000004, 81.53999999999999, 'X[9] <= 6.71\nentropy = 0.84\nsamples = 26\nvalue = [0, 7, 19]'),
 Text(27.900000000000002, 27.180000000000007, 'entropy = 0.997\nsamples = 15\nvalue = [0, 7, 8]'),
 Text(83.7, 27.180000000000007, 'entropy = 0.0\nsamples = 11\nvalue = [0, 0, 11]'),
 Text(111.60000000000001, 81.53999999999999, 'entropy = 0.0\nsamples = 14\nvalue = [0, 0, 14]'),
 Text(223.20000000000002, 135.9, 'X[0] <= 13.155\nentropy = 1.138\nsamples = 84\nvalue = [40, 42, 2]'),
 Text(167.4, 81.53999999999999, 'X[5] <= 1.774\nentropy = 0.574\nsamples = 46\nvalue = [4, 41, 1]'),
 Text(139.5, 27.180000000000007, 'entropy = 0.469\nsamples = 10\nvalue = [0, 9, 1]'),
 Text(195.3, 27.180000000000007, 'entropy = 0.503\nsamples = 36\nvalue = [4, 32, 0]'),
 Text(279.0, 81.53999999999999, 'X[7] <= 0.306\nentropy = 0.35\nsamples = 38\nvalue = [36, 1, 1]'),
 Text(251.10000000000002, 27.180000000000007, 'entropy = 0.235\nsamples = 26\nvalue = [25, 1, 0]'),
 Text(306.90000000000003, 27.180000000000007, 'entropy = 0.414\nsamples = 12\nvalue = [11, 0, 1]')]


clf.score(xtrain, ytrain)
  1. max_features限制分枝时考虑的特征个数,超过限制个数的特征都会被舍弃。和max_depth异曲同工。max_features是用来限制高维度数据的过拟合的剪枝参数,但其方法比较暴力。在不知道决策树中各个特征的重要性的情况下,强行设定这个参数可能会导致模型学习不足。如果希望通过降维的方式防止过拟合,建议使用PCA、ICA或者特征选择模块中的降维算法。

  2. min_impurity_decrease限制信息增益的大小,信息增益小于设定数值的分枝不会发生。



import matplotlib.pyplot as plt

# Learning curve over max_depth: fit one tree per depth from 1 to 10 and record
# its test accuracy, then plot accuracy against depth.
test = []  # test-set accuracy for each depth, in order
for depth in range(1, 11):
    clf = tree.DecisionTreeClassifier(max_depth=depth, criterion='entropy', random_state=42, splitter='random')
    clf = clf.fit(xtrain, ytrain)
    score = clf.score(xtest, ytest)
    test.append(score)
plt.plot(range(1, 11), test, color='red', label='max_depth')
plt.legend()  # without a legend the label= above is never displayed




  1. 对决策树来说,最重要的属性是feature_importances_,能够查看各个特征对模型的重要性。

  2. fit

  3. score

  4. apply: 输入测试集,返回每个测试样本所在的叶子结点的索引

  1. predict:输入测试集, 返回每个测试样本的标签/分类结果
# Refit with max_depth=4 and score the model on the test split.
clf = tree.DecisionTreeClassifier(max_depth=4, criterion='entropy', random_state=42, splitter="random")
clf = clf.fit(xtrain, ytrain)
score = clf.score(xtest, ytest)  # mean accuracy on the test set








  1. class_weight: dict, list of dict or “balanced”, default=None
    默认None, 此模式表示假设数据集中的所有标签是均衡的,即自动认为标签的比例为1:1。当样本不均衡的时候,可以使用形如{‘标签1’:权重1, ‘标签2’:权重2}的字典来输入真实的样本标签比例。或者使用‘balanced’模式,直接使用n_samples/(n_classes*np.bincount(y))作为权重,可以比较好的修正样本不均衡情况

有了权重之后,样本量就不再单纯地记录数目,而是受输入的权重影响。因此,这时候的剪枝需要使用基于权重的剪枝参数min_weight_fraction_leaf。该参数的使用将比min_samples_leaf更少偏向主类。若样本是加权的,则使用基于权重的预修剪标准更容易优化树结构,这确保叶结点至少包含样本权重的总和的一小部分。

import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.datasets import make_blobs #创建团状数据
class_1 = 500 # majority class (label 0): 500 samples
class_2 = 50  # minority class (label 1): only 50 samples
centers = [[0,0],[2.0,2.0]] # centre of each of the two blobs
clusters_std = [1.5, 0.5] # per-blob spread passed as cluster_std; the larger class is made the looser one
x,y = make_blobs(n_samples=[class_1, class_2], centers=centers, cluster_std=clusters_std, random_state=430, shuffle=False)
plt.scatter(x[:, 0], x[:, 1], c=y, cmap='rainbow', s=10)


# Baseline tree: every sample carries the same weight.
clf = DTC(max_depth=4)
clf.fit(x, y)
# Weighted tree: class 1 samples get weight 10; class 0 keeps the default weight of 1.
wclf = DTC(max_depth=4, class_weight={1: 10})
wclf.fit(x, y)
clf.score(x, y)  # accuracy of the unweighted tree on the full data set







表示所有被预测为少数类的样本中,真正属于少数类的样本所占的比例。该比值可以衡量“将多数类判错后所需付出的成本”。

# Precision by hand: samples correctly predicted as class 1 divided by all
# samples predicted as class 1.
# For the tree trained without class_weight:
pred = clf.predict(x)
((pred == y) & (y == 1)).sum() / (pred == 1).sum()

召回率(敏感度、真正率)recall(sensitivity, true positive rate)


# Recall by hand for the tree trained with class_weight: samples correctly
# predicted as class 1 divided by all samples that truly are class 1.
wpred = wclf.predict(x)
((wpred == y) & (y == 1)).sum() / (y == 1).sum()



F1 measure

是精确度和召回率的调和平均数。该参数倾向于靠近两个数中比较小的一个数,因此追求高F1 measure能够保证精确度和召回率都比较高。该参数的值分布在[0,1]之间。
$$F\text{-}measure=\frac{2}{\frac{1}{Precision}+\frac{1}{Recall}}=\frac{2 \cdot Precision \cdot Recall}{Precision+Recall}$$



# Specificity by hand: samples correctly predicted as class 0 divided by all
# samples that truly are class 0.
negatives = (y == 0).sum()
# For the tree trained without class_weight:
((clf.predict(x) == y) & (y == 0)).sum() / negatives
# For the tree trained with class_weight:
((wclf.predict(x) == y) & (y == 0)).sum() / negatives
# The same metrics computed with scikit-learn's helpers.
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

base_pred = clf.predict(x)  # predictions of the unweighted tree, reused below
cm(y, base_pred)
cm(y, wclf.predict(x))
accuracy_score(y, base_pred)
precision_score(y, base_pred)
recall_score(y, base_pred)
# Precision-recall curve of the unweighted tree, evaluated on (x, y).
# NOTE: plot_precision_recall_curve was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions use PrecisionRecallDisplay.from_estimator.
from sklearn.metrics import plot_precision_recall_curve
from matplotlib import pyplot as plt

disp = plot_precision_recall_curve( clf,x,y)
disp.ax_.set_title('2-class Precision-Recall curve')
Text(0.5, 1.0, '2-class Precision-Recall curve')


# Precision-recall curve of the class-weighted tree, evaluated on (x, y).
# NOTE: plot_precision_recall_curve was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions use PrecisionRecallDisplay.from_estimator.
from sklearn.metrics import plot_precision_recall_curve
from matplotlib import pyplot as plt

disp = plot_precision_recall_curve( wclf,x,y)
disp.ax_.set_title('2-class Precision-Recall curve')
Text(0.5, 1.0, '2-class Precision-Recall curve')


#sklearn中的F1 measure
from sklearn.metrics import f1_score
f1_score(y, clf.predict(x))


class sklearn.tree.DecisionTreeRegressor(*, criterion=‘mse’, splitter=‘best’, max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, presort=‘deprecated’, ccp_alpha=0.0)



criterion:{“mse”, “friedman_mse”, “mae”}, default=”mse”

  1. “mse”:使用均方误差mean squared error, 父节点和子节点之间的均方误差的差额将被用来作为特征选择的标准,这种方法通过使用叶子结点的均值来最小化损失。在回归树中,MSE不仅是衡量分枝质量的指标,也是衡量回归树回归质量的指标。
  2. “friedman_mse”:使用弗里德曼均方误差,该指标使用弗里德曼针对潜在分枝中的问题改进后的均方误差。
  3. “mae”:使用平均绝对误差(mean absolute error),这种指标使用叶结点的中值来最小化L1损失。

feature_importances_: ndarray of shape (n_features,)



from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score  #导入交叉验证的函数
from sklearn.tree import DecisionTreeRegressor
# 10-fold cross-validation of a regression tree on the Boston housing data.
# NOTE: load_boston was removed in scikit-learn 1.2; this cell needs an older release.
boston = load_boston()
regressor = DecisionTreeRegressor(random_state=0)
# scoring='neg_mean_squared_error' returns the negated MSE for each fold
# (closer to 0 is better).
cross_val_score(regressor, boston.data, boston.target, cv=10, scoring='neg_mean_squared_error')
boston = load_boston()
(506, 13)
# Regression tree with a fixed seed, scored by 10-fold cross-validation;
# each fold's score is the negated mean squared error.
regressor = DecisionTreeRegressor(random_state=0)
cross_val_score(regressor, boston.data, boston.target, cv=10, scoring='neg_mean_squared_error')
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
# Synthetic 1-D regression data: a sine curve with noise on every fifth sample.
rng = np.random.RandomState(1)  # fixed seed so the data set is reproducible
x = 5 * rng.rand(80, 1)  # 80 points drawn uniformly from [0, 5), as a column
x.sort(axis=0)  # order the samples along the x axis
y = np.sin(x[:, 0])  # 1-D targets, one per sample
noise = 3 * (0.5 - rng.rand(16))  # one offset per perturbed point
y[::5] += noise  # perturb every fifth target (80 / 5 = 16 points)

plt.scatter(x, y, s=20, edgecolor='black', c='darkorange', label='data')


# Two regression trees of very different capacity: depth 1 versus depth 10.
regr_1 = DecisionTreeRegressor(max_depth=1)
regr_2 = DecisionTreeRegressor(max_depth=10)
regr_1.fit(x, y)  # the estimator repr printed below is this call's output
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=1,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')
regr_2.fit(x, y)
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=10,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')
np.arange(0.0, 5.0, 0.01)
# Predict on a dense grid over [0, 5) and compare both trees with the data.
x_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]  # column vector of 500 points
y_1 = regr_1.predict(x_test)
y_2 = regr_2.predict(x_test)
plt.scatter(x, y, s=20, edgecolor='black', c='darkorange', label='data')
# regr_1 was built with max_depth=1, so its curve is labelled accordingly.
plt.plot(x_test, y_1, color='cornflowerblue', label='max_depth=1', linewidth=2)
plt.plot(x_test, y_2, color='yellowgreen', label='max_depth=10', linewidth=2)
plt.title('Decision Tree Regression')
plt.legend()  # without a legend the label= arguments above are never displayed



