This post shows how to build, train, and evaluate a decision tree model with sklearn, and how to visualize the tree, either with the official API or with custom methods written by experts on sites such as Stack Overflow.
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import pandas as pd
import numpy as np
path = "your.csv"  # a 5-class dataset
data = pd.read_csv(path)
# Shuffle the dataset
from sklearn import utils
data = utils.shuffle(data)
Y = data["score"].values
X = data.drop("score", axis=1).values
# Split into training and test sets
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.3, random_state=0)
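Since the target has 5 classes, it can be worth stratifying the split so each class keeps roughly the same proportion in both sets. A minimal variant of the line above (stratify is a standard train_test_split parameter; using it here is my suggestion, not part of the original post):
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.3, random_state=0, stratify=Y)  # variant: class-proportion-preserving split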
# Decision tree
from sklearn.tree import DecisionTreeClassifier
# Instantiate the tree with Shannon entropy as the split criterion, then fit on the training set
dtc = DecisionTreeClassifier(criterion="entropy", min_samples_leaf=3, max_depth=15)
# clf is the fitted model (fit returns the estimator itself, so clf and dtc refer to the same object)
clf = dtc.fit(X=train_X, y=train_Y)
# Predict on the test-set feature matrix
predict_Y = dtc.predict(X=test_X)
print(f"predict_Y={predict_Y}")
# Model evaluation
from sklearn.metrics import classification_report, make_scorer, accuracy_score, f1_score
# print(dtc.score(test_X, test_Y.astype(int)))
print(accuracy_score(y_true=test_Y.astype(int), y_pred=predict_Y))
# print(classification_report(predict_Y, test_Y.astype(int)))
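The imports at the top also bring in cross_val_score and KFold, which are never used above. A minimal k-fold evaluation sketch (the 5 splits and accuracy scoring are my assumptions, not from the original):
kf = KFold(n_splits=5, shuffle=True, random_state=0)
cv_scores = cross_val_score(dtc, X, Y.astype(int), cv=kf, scoring="accuracy")  # refits the tree on each fold
print(f"CV accuracy: {cv_scores.mean():.3f} +/- {cv_scores.std():.3f}")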
from sklearn.tree import export_text, export_graphviz
# Add the Graphviz binaries to PATH
import os
os.environ["PATH"] += os.pathsep + "G:/24_graphviz_msi/bin"  # adjust to your Graphviz install location
# Render the tree and export it
dot_data = export_graphviz(clf, out_file=None,
                           feature_names=list(data.drop("score", axis=1).columns))  # the first argument can be dtc or clf (same object)
import pydotplus
graph = pydotplus.graph_from_dot_data(dot_data)
# Color one node as an example (assumes the tree has at least 8 nodes)
graph.get_nodes()[7].set_fillcolor("#FFF2DD")
# Write out.png to the current directory, unless it already exists
if not os.path.exists("out.png"):
    graph.write_png("out.png")
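If Graphviz is not installed, scikit-learn's built-in plot_tree (added in version 0.21) can draw the same tree with matplotlib. A minimal sketch; the figure size and output file name are arbitrary choices:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(clf, feature_names=list(data.drop("score", axis=1).columns), filled=True, ax=ax)  # same fitted tree as above
fig.savefig("out_plot_tree.png")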
# Print the decision rules as text
r = export_text(dtc, feature_names=list(data.drop("score", axis=1).columns))  # the first argument can be dtc or clf (same object)
print(r)
# Print the DOT source as a plain string (same content as dot_data above)
string_data = export_graphviz(clf, out_file=None, feature_names=list(data.drop("score", axis=1).columns))
print(f"string_data={string_data}")
If you hit an import error when using export_text (something like ImportError: cannot import name 'export_text' from 'sklearn.tree'), simply upgrade scikit-learn:
pip install scikit-learn --upgrade
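To check which version is installed (export_text first appeared in scikit-learn 0.21):
import sklearn
print(sklearn.__version__)  # needs >= 0.21 for export_text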
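(2) Custom method 1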
def get_lineage(tree, feature_names):
left = tree.tree_.children_left
print(f"left={left}")
right = tree.tree_.children_right
print(f"right={right}")
threshold = tree.tree_.threshold
print(f"threshold={threshold}")
    # Map each split node's feature index to its name; leaf nodes have
    # feature = -2 (TREE_UNDEFINED), so their entries here are meaningless
    features = [feature_names[i] for i in tree.tree_.feature]
    print(f"features={features}")
    # get ids of the leaf nodes (children_left == -1 marks a leaf)
    idx = np.argwhere(left == -1)[:, 0]
    print(f"idx={idx}")
def recurse(left, right, child, lineage=None):
if lineage is None:
lineage = [child]
print(f"当前lineage={lineage}")
if child in left:
parent = np.where(left == child)[0].item()
print(f"当前左,parent={parent}")
split = 'l'
else:
parent = np.where(right == child)[0].item()
print(f"当前右,parent={parent}")
split = 'r'
lineage.append((parent, split, threshold[parent], features[parent]))
if parent == 0:
lineage.reverse()
return lineage
else:
return recurse(left, right, parent, lineage)
for child in idx:
for node in recurse(left, right, child):
print(node)
get_lineage(clf, list(data.drop("score", axis=1).columns))  # pass only the feature columns; including "score" would misalign the names
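For every leaf, get_lineage prints the path from the root: each tuple reads as (parent_node_id, 'l' or 'r', threshold, feature), i.e. the split taken at that ancestor, and the bare integer printed last is the leaf's own node id.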
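(3) Custom method 2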
def try2(clf):
    # The decision estimator has an attribute called tree_ which stores the
    # entire tree structure and allows access to low-level attributes.
    # The binary tree tree_ is represented as a number of parallel arrays: the
    # i-th element of each array holds information about node i. Node 0 is the
    # tree's root.
    # NOTE: some of the arrays only apply to either leaves or split nodes; for
    # nodes of the other type the values are arbitrary!
    #
    # Among those arrays, we have:
    #   - children_left, id of the left child of the node
    #   - children_right, id of the right child of the node
    #   - feature, feature used for splitting the node
    #   - threshold, threshold value at the node
n_nodes = clf.tree_.node_count
children_left = clf.tree_.children_left
children_right = clf.tree_.children_right
feature = clf.tree_.feature
threshold = clf.tree_.threshold
    # The tree structure can be traversed to compute various properties, such
    # as the depth of each node and whether or not it is a leaf.
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, -1)]  # seed is the root node id and its parent depth
while len(stack) > 0:
node_id, parent_depth = stack.pop()
node_depth[node_id] = parent_depth + 1
        # If the left and right children differ, this is an internal test node
if children_left[node_id] != children_right[node_id]:
stack.append((children_left[node_id], parent_depth + 1))
stack.append((children_right[node_id], parent_depth + 1))
        # Otherwise this is a leaf
else:
is_leaves[node_id] = True
print(f"The binary tree structure has {n_nodes} nodes and has the following tree structure:")
for i in range(n_nodes):
if is_leaves[i]:
print("%snode=%s leaf node." % (node_depth[i] * "\t", i))
else:
print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
"node %s."
% (node_depth[i] * "\t",
i,
children_left[i],
feature[i],
threshold[i],
children_right[i],
))
print()
    # First let's retrieve the decision path of each sample. The decision_path
    # method returns a node-indicator matrix: a non-zero element at position
    # (i, j) indicates that sample i goes through node j.
node_indicator = clf.decision_path(test_X)
# Similarly, we can also have the leaves ids reached by each sample.
leave_id = clf.apply(test_X)
# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.
# HERE IS WHAT YOU WANT
sample_id = 0
node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
node_indicator.indptr[sample_id + 1]]
print('Rules used to predict sample %s: ' % sample_id)
for node_id in node_index:
if leave_id[sample_id] == node_id: # <-- changed != to ==
# continue # <-- comment out
print("leaf node {} reached, no decision here".format(leave_id[sample_id])) # <--
else: # < -- added else to iterate through decision nodes
if test_X[sample_id, feature[node_id]] <= threshold[node_id]:
threshold_sign = "<="
else:
threshold_sign = ">"
# print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
# "node %s."
# % (node_depth[i] * "\t",
# i,
# children_left[i],
# feature[i],
# threshold[i],
# children_right[i],
# ))
print(f"decision id node {node_id} : "
f"(X[{sample_id}, {feature[node_id]}] (= {test_X[sample_id, feature[node_id]]}) {threshold_sign} {threshold[node_id]})")
try2(clf)
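The same scikit-learn example continues by finding the nodes that a group of samples has in common. A standalone sketch of that follow-up (sample_ids = [0, 1] is an arbitrary illustrative choice):
node_indicator = clf.decision_path(test_X)  # sparse node-indicator matrix, recomputed outside try2
sample_ids = [0, 1]
common_nodes = node_indicator.toarray()[sample_ids].sum(axis=0) == len(sample_ids)  # True where every chosen sample passes
common_node_id = np.arange(clf.tree_.node_count)[common_nodes]
print(f"samples {sample_ids} share nodes {common_node_id} in the tree")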
(4) Custom method 3
from sklearn.tree import _tree
def try3(tree, feature_names):
tree_ = tree.tree_
feature_name = [feature_names[i]
if i != _tree.TREE_UNDEFINED else "undefined!"
for i in tree_.feature]
print("def tree({}):".format(", ".join(feature_names)))
def recurse(node, depth):
indent = " " * depth
if tree_.feature[node] != _tree.TREE_UNDEFINED:
name = feature_name[node]
threshold = tree_.threshold[node]
print("{}if {} <= {}:".format(indent, name, threshold))
recurse(tree_.children_left[node], depth + 1)
print("{}else: # if {} > {}".format(indent, name, threshold))
recurse(tree_.children_right[node], depth + 1)
else:
print("{}return {}".format(indent, np.argmax(tree_.value[node])))
recurse(0, 1)
# try3(clf, feature_names=list(data.drop("score", axis=1).columns))
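Note that the value try3 prints at a leaf is np.argmax over tree_.value, i.e. an index into clf.classes_, not necessarily the raw label. A quick way to see the mapping:
print(clf.classes_)  # position k is the label that a printed "return k" stands for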
References:
Error when importing export_text
How to extract the decision rules from scikit-learn decision-tree?
the try2 method