Building and visualizing a decision tree from a CSV data file with scikit-learn

This post walks through building, training, and evaluating a decision tree model with sklearn, then visualizing the tree either with the official API or with custom methods shared by experts on sites such as Stack Overflow.

1. Preprocess the data, then train and evaluate the model

from sklearn.model_selection import train_test_split, cross_val_score, KFold
import pandas as pd
import numpy as np

path = "你的csv.csv"  # 5分类的
data = pd.read_csv(path)


# Shuffle the dataset
from sklearn import utils

data = utils.shuffle(data)

Y = data["score"].values
X = data.drop("score", axis=1).values

# Split into training and test sets
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.3, random_state=0)

# Decision tree
from sklearn.tree import DecisionTreeClassifier

# Instantiate the decision tree with the Shannon-entropy criterion and fit it on the training set
dtc = DecisionTreeClassifier(criterion="entropy", min_samples_leaf=3, max_depth=15)
# clf is the fitted model
clf = dtc.fit(X=train_X, y=train_Y)

# Predict on the test-set feature matrix
predict_Y = dtc.predict(X=test_X)
print(f"predict_Y={predict_Y}")

# Model evaluation
from sklearn.metrics import classification_report, make_scorer, accuracy_score, f1_score

# print(dtc.score(test_X, test_Y.astype(int)))
print(accuracy_score(y_true=test_Y.astype(int), y_pred=predict_Y))
# print(classification_report(y_true=test_Y.astype(int), y_pred=predict_Y))
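
The cross-validation helpers imported above (cross_val_score, KFold, make_scorer, f1_score) go unused in the snippet; here is a minimal sketch of how they could be combined. The 5-fold setup and the weighted-F1 scoring are assumptions for illustration, not part of the original.

# Optional: 5-fold cross-validation (sketch; fold count and scoring are assumptions)
kf = KFold(n_splits=5, shuffle=True, random_state=0)
f1_scorer = make_scorer(f1_score, average="weighted")  # weighted F1 suits a 5-class target
cv_scores = cross_val_score(dtc, X, Y.astype(int), cv=kf, scoring=f1_scorer)
print(f"cross-validated F1: mean={cv_scores.mean():.3f}, std={cv_scores.std():.3f}")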

2. Draw the decision tree

from sklearn.tree import export_text, export_graphviz
# Add the Graphviz binaries to the PATH environment variable
import os
os.environ["PATH"] += os.pathsep + "G:/24_graphviz_msi/bin"

# Render the tree and export it
dot_data = export_graphviz(clf, out_file=None,
                           feature_names=list(data.drop("score", axis=1).columns))  # the first argument can be either dtc or clf
import pydotplus

graph = pydotplus.graph_from_dot_data(dot_data)
graph.get_nodes()[7].set_fillcolor("#FFF2DD")  # highlight one node (index 7 assumes the tree has at least 8 nodes)
if not os.path.exists("out.png"):
    graph.write_png("out.png")  # write out.png to the current directory
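
If installing Graphviz is inconvenient, scikit-learn 0.21+ also ships a pure-matplotlib alternative, sklearn.tree.plot_tree. A minimal sketch (the figure size and output filename here are arbitrary choices):

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Render the fitted tree without any external Graphviz installation
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(clf, feature_names=list(data.drop("score", axis=1).columns), filled=True, ax=ax)
fig.savefig("plot_tree_out.png", dpi=150)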

3. Print the decision path (official API and custom methods)

(1) The official API: export_text
# Print the decision rules as text
r = export_text(dtc, feature_names=list(data.drop("score", axis=1).columns))    # the first argument can be either dtc or clf
print(r)

# Print the tree as a DOT-format string
string_data = export_graphviz(clf, out_file=None, feature_names=list(data.drop("score", axis=1).columns))
print(f"string_data={string_data}")

If importing export_text raises an error (it was only added in scikit-learn 0.21), upgrade scikit-learn directly:

pip install scikit-learn --upgrade
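
To confirm whether your installation is new enough, check the version first:

import sklearn
print(sklearn.__version__)  # export_text requires scikit-learn >= 0.21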
(2) Custom method 1
def get_lineage(tree, feature_names):
    left = tree.tree_.children_left
    print(f"left={left}")
    right = tree.tree_.children_right
    print(f"right={right}")
    threshold = tree.tree_.threshold
    print(f"threshold={threshold}")
    features = [feature_names[i] for i in tree.tree_.feature]  # leaves store feature id -2, which harmlessly indexes from the end
    print(f"features={features}")

    # get ids of child nodes
    idx = np.argwhere(left == -1)[:, 0]
    print(f"idx={idx}")

    def recurse(left, right, child, lineage=None):
        if lineage is None:
            lineage = [child]
            print(f"当前lineage={lineage}")
        if child in left:
            parent = np.where(left == child)[0].item()
            print(f"当前左,parent={parent}")
            split = 'l'
        else:
            parent = np.where(right == child)[0].item()
            print(f"当前右,parent={parent}")
            split = 'r'

        lineage.append((parent, split, threshold[parent], features[parent]))

        if parent == 0:
            lineage.reverse()
            return lineage
        else:
            return recurse(left, right, parent, lineage)

    for child in idx:
        for node in recurse(left, right, child):
            print(node)

get_lineage(clf, list(data.drop("score", axis=1).columns))  # pass only the feature columns, not the target
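
Each printed path starts at the root and ends with the leaf's node id; every tuple has the form (parent_node_id, 'l' or 'r', threshold, feature_name), i.e. which parent node was passed, on which side, and under which split condition.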
(3) Custom method 2
def try2(clf):
    # Note: this function reads test_X from the enclosing module scope.

    # The decision estimator has an attribute called tree_ which stores the
    # entire tree structure and allows access to low-level attributes.
    # The binary tree tree_ is represented as a number of parallel arrays;
    # the i-th element of each array holds information about node i.
    # Node 0 is the tree's root.
    # NOTE: some of the arrays only apply to either leaves or split nodes;
    # for nodes of the other type the values are arbitrary!
    #
    # Among those arrays, we have:
    #   - children_left: id of the left child of the node
    #   - children_right: id of the right child of the node
    #   - feature: feature used for splitting the node
    #   - threshold: threshold value at the node

    n_nodes = clf.tree_.node_count
    children_left = clf.tree_.children_left
    children_right = clf.tree_.children_right
    feature = clf.tree_.feature
    threshold = clf.tree_.threshold

    # The tree structure can be traversed to compute various properties, such
    # as the depth of each node and whether or not it is a leaf.
    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1

        # If the left and right children differ, this is an internal (test) node
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        # Otherwise it is a leaf node
        else:
            is_leaves[node_id] = True

    print(f"The binary tree structure has {n_nodes} nodes and has the following tree structure:")
    for i in range(n_nodes):
        if is_leaves[i]:
            print("%snode=%s leaf node." % (node_depth[i] * "\t", i))
        else:
            print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
                  "node %s."
                  % (node_depth[i] * "\t",
                     i,
                     children_left[i],
                     feature[i],
                     threshold[i],
                     children_right[i],
                     ))
    print()

    # First, retrieve the decision path of each sample. The decision_path
    # method returns a node-indicator matrix: a non-zero element at position
    # (i, j) means that sample i goes through node j.

    node_indicator = clf.decision_path(test_X)

    # Similarly, we can also have the leaves ids reached by each sample.

    leave_id = clf.apply(test_X)

    # Now it is possible to get the tests that were used to predict a sample
    # or a group of samples. First, do it for a single sample.

    # HERE IS WHAT YOU WANT
    sample_id = 0
    node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                        node_indicator.indptr[sample_id + 1]]

    print('Rules used to predict sample %s: ' % sample_id)
    for node_id in node_index:

        if leave_id[sample_id] == node_id:
            # the sample ended in this leaf, so there is no decision to report
            print("leaf node {} reached, no decision here".format(leave_id[sample_id]))

        else:
            # otherwise this is an internal decision node
            if test_X[sample_id, feature[node_id]] <= threshold[node_id]:
                threshold_sign = "<="
            else:
                threshold_sign = ">"

            # print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
            #       "node %s."
            #       % (node_depth[i] * "\t",
            #          i,
            #          children_left[i],
            #          feature[i],
            #          threshold[i],
            #          children_right[i],
            #          ))
            print(f"decision id node {node_id} : "
                  f"(X[{sample_id}, {feature[node_id]}] (= {test_X[sample_id, feature[node_id]]}) {threshold_sign} {threshold[node_id]})")

try2(clf)
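
The scikit-learn example this function is adapted from goes one step further and finds the nodes shared by a group of samples. A minimal sketch (the sample ids [0, 1] are an arbitrary illustration):

# Which nodes do a group of test samples have in common?
sample_ids = [0, 1]  # arbitrary pair of test samples
node_indicator = clf.decision_path(test_X)
common_nodes = (node_indicator.toarray()[sample_ids].sum(axis=0) == len(sample_ids))
common_node_id = np.arange(clf.tree_.node_count)[common_nodes]
print(f"samples {sample_ids} share nodes {common_node_id} in the tree")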

(4) Custom method 3

from sklearn.tree import _tree
def try3(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [feature_names[i]
                    if i != _tree.TREE_UNDEFINED else "undefined!"
                    for i in tree_.feature]
    print("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        indent = "    " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            print("{}return {}".format(indent, np.argmax(tree_.value[node])))

    recurse(0, 1)

try3(clf, feature_names=list(data.drop("score", axis=1).columns))
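
The output reads like Python pseudocode: nested if/else statements on the split conditions, with each leaf rendered as return <class index>, where the index is the majority class at that leaf.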

References:
Error when importing export_text
How to extract the decision rules from scikit-learn decision-tree? (Stack Overflow)
Source of the try2 method

