决策树学习的目标: 根据给定的训练数据集构建一个决策树模型,使它能够对实例进行正确的分类。
决策树的主要内容:
- 数据集的准备
- 实现的主要内容
- 计算数据集的信息熵
- 计算特征值的信息增益
- 找到最佳的特征进行划分,根据信息熵和信息增益计算得到Gain值,选择最大的Gain值对应的特征
- 创建决策树
- 数据相关:
- 数据集:data_set,N*(M+1),最后一列表示分类结果
- 特征集:labels,M,对应数据集每一列的特征名称
- 属性:每个特征下会有多个属性
根据决策树相关的内容需要了解以下几点:
- 数据集的信息熵计算
- 对应特征信息增益的计算
- Gain值的计算
决策树生成的伪代码:(以字典类型保存Tree)
输入:数据集、特征集 if 当前是否只剩下一个分类结果了: 当前的结果就作为叶子结点 if 当前是否所有特征都分完了,只剩下分类结果了 选择当前结果作为最后的叶子结点 从当前数据集中选择最佳划分的特征 字典result对应的key为最佳特征,value for 特征值对应的属性 根据这个属性划分数据集 新的数据集和特征集进行递归,返回的结果作为result的value值 返回结果字典
以【周志华 机器学习】这本书的介绍,学习相关的内容。具体内容这里不做过多介绍,具体计算见书
data_set = [
["青绿", "蜷缩", "浊响", "清晰", "凹陷", "硬滑", "是"],
["乌黑", "蜷缩", "沉闷", "清晰", "凹陷", "硬滑", "是"],
["乌黑", "蜷缩", "浊响", "清晰", "凹陷", "硬滑", "是"],
["青绿", "蜷缩", "沉闷", "清晰", "凹陷", "硬滑", "是"],
["浅白", "蜷缩", "浊响", "清晰", "凹陷", "硬滑", "是"],
["青绿", "稍蜷", "浊响", "清晰", "稍凹", "软粘", "是"],
["乌黑", "稍蜷", "浊响", "稍糊", "稍凹", "软粘", "是"],
["乌黑", "稍蜷", "浊响", "清晰", "稍凹", "硬滑", "是"],
["乌黑", "稍蜷", "沉闷", "硝糊", "稍凹", "硬滑", "否"],
["青绿", "硬挺", "清脆", "清晰", "平坦", "软粘", "否"],
["洁白", "硬挺", "清脆", "模糊", "平坦", "硬滑", "否"],
["洁白", "蜷缩", "浊响", "模糊", "平坦", "软粘", "否"],
["青绿", "稍蜷", "浊响", "稍糊", "凹陷", "硬滑", "否"],
["浅白", "稍蜷", "沉闷", "稍糊", "凹陷", "硬滑", "否"],
["乌黑", "稍蜷", "浊响", "清晰", "稍凹", "软粘", "否"],
["践自", "蜷缩", "浊响", "模糊", "平坦", "硬滑", "否"],
["青绿", "蜷缩", "沉闷", "稍糊", "稍凹", "硬滑", "否"]
]
labels = ["色泽","根蒂","敲声","纹理","脐部","触感","好瓜"]
主要函数:
- cal_information_entropy:计算当前数据集的信息熵值
- cal_information_gain:计算数据集下某个特征的信息Gain
- get_major_class:当所有特征已经分完了,只剩下一列数据,即分类的结果的时候,用于获取最多的结果的分类
- get_best_feature:找出数据集中最佳划分的特征
- create_tree:创建树
- classify:预测结果
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :Python机器学习
@File :Test03.py
@IDE :PyCharm
@Author :羽丶千落
@Date :2023-04-01 15:24
@content:
简单的实现决策树
主要内容:
1-计算数据集的信息熵,以及对应特征的信息增益
2-当划分到只剩下一个特征,即当前数据集只剩下一个特征的时候,获取最多的属性类别作为结果
3-计算最佳划分特征--根据基尼值
4-根据最佳划分数据集
5-创建决策树,1-4位创建决策树的主要内容
数据类型:
数据集:data_set,N*(M+1),最后一列表示分类结果
特征集:labels,M,对应数据集每一列的
特征:包含有多个属性
"""
from collections import Counter
from math import log
def cal_information_entropy(data_set: list) -> float:
    """Return the Shannon entropy (base 2) of the class labels in *data_set*.

    :param data_set: rows whose last column is the class label
    :return: entropy in bits; 0.0 for an empty data set
    """
    total = len(data_set)
    entropy = 0.0
    # Tally each class label, then accumulate -p*log2(p) per class.
    for occurrences in Counter(row[-1] for row in data_set).values():
        proportion = float(occurrences) / total
        entropy -= proportion * log(proportion, 2)
    return entropy
def cal_information_gain(data_set, feature):
    """Return the information gain of splitting *data_set* on column *feature*.

    Gain(D, a) = Ent(D) - sum_v (|D_v| / |D|) * Ent(D_v), as in 周志华
    《机器学习》 chapter 4.

    :param data_set: rows whose last column is the class label
    :param feature: column index of the feature to evaluate
    :return: information gain value (float)
    """
    base_entropy = cal_information_entropy(data_set)  # Ent(D) over the whole set
    total = len(data_set)
    # Group the class labels by the attribute value taken in this column.
    labels_by_value = {}
    for row in data_set:
        labels_by_value.setdefault(row[feature], []).append(row[-1])
    # Conditional entropy: weighted average of each subset's entropy.
    conditional_entropy = 0.0
    for subset_labels in labels_by_value.values():
        subset_size = len(subset_labels)
        subset_entropy = 0.0
        for count in Counter(subset_labels).values():
            p = float(count) / subset_size
            subset_entropy -= p * log(p, 2)
        conditional_entropy += (subset_size / float(total)) * subset_entropy
    return base_entropy - conditional_entropy
def get_major_class(class_list: list):
    """Return the most frequent element of *class_list* (majority vote).

    Used once every feature has been consumed and only the class column
    remains.

    :param class_list: classification results, one per sample
    :return: the element occurring most often
    """
    (winner, _count), = Counter(class_list).most_common(1)
    return winner
def get_best_feature(data_set):
    """Return the column index of the feature with the largest information gain.

    :param data_set: rows whose last column is the class label
    :return: index of the best feature, or -1 if every gain is <= 0
    """
    feature_count = len(data_set[0]) - 1  # last column is the class label
    best_gain, best_feature = 0.0, -1
    for index in range(feature_count):
        gain = cal_information_gain(data_set, index)
        # Strict comparison keeps the earliest feature on ties.
        if gain > best_gain:
            best_gain, best_feature = gain, index
    return best_feature
def create_tree(data_set, labels):
    """Recursively build an ID3 decision tree stored as nested dicts.

    :param data_set: rows whose last column is the class label
    :param labels: feature names matching the data columns; left unmodified
                   (the original implementation destructively deleted from it)
    :return: a nested dict {feature_name: {attribute_value: subtree_or_label}},
             or a bare class-label string for a leaf node
    """
    class_list = [example[-1] for example in data_set]
    # All samples share one class: this branch is a pure leaf.
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]
    # Every feature consumed (only the class column is left): majority vote.
    # Bug fix: vote over the class labels — the original passed data_set,
    # whose rows are lists and therefore unhashable inside Counter.
    if len(data_set[0]) == 1:
        return get_major_class(class_list)
    best_feature = get_best_feature(data_set)
    feature_label = labels[best_feature]  # name of the chosen split feature
    my_tree = {feature_label: {}}
    # Remaining feature names with the used one removed; the caller's list
    # is not mutated.
    sub_labels = labels[:best_feature] + labels[best_feature + 1:]
    for value in set(example[best_feature] for example in data_set):
        # Rows taking this attribute value, with the used column dropped.
        new_data = [row[:best_feature] + row[best_feature + 1:]
                    for row in data_set if row[best_feature] == value]
        my_tree[feature_label][value] = create_tree(new_data, sub_labels[:])
    return my_tree
def classify(input_tree, feat_labels, test_vec):
    """Walk the decision tree and return the predicted class for *test_vec*.

    :param input_tree: nested dict tree as produced by create_tree
    :param feat_labels: feature names indexing the columns of test_vec
    :param test_vec: attribute values of the sample to classify
    :return: the predicted class label
    """
    root_feature = next(iter(input_tree))      # feature tested at this node
    branches = input_tree[root_feature]        # attribute value -> subtree/leaf
    sample_value = test_vec[feat_labels.index(root_feature)]
    subtree = branches[sample_value]
    # A dict means another decision node; anything else is a leaf label.
    if isinstance(subtree, dict):
        return classify(subtree, feat_labels, test_vec)
    return subtree
def creatDataSet():
    """Return the watermelon sample data and its feature names.

    NOTE(review): several attribute strings look like OCR typos relative to
    the textbook dataset (e.g. "蜡缩", "虫响", "浊日向") — kept verbatim here;
    confirm against 周志华《机器学习》 table 4.1 before relying on the output.

    :return: (data_set, labels) — data_set is 16 rows of attribute values
             ending with the class label; labels names the columns
    """
    labels = ["色泽","根蒂","敲声","纹理","脐部","触感","好瓜"]
    samples = [
        ["青绿", "蜷缩", "浊响", "清晰", "凹陷", "硬滑", "是"],
        ["乌黑", "蜡缩", "沉闷", "清晰", "凹陷", "硬滑", "是"],
        ["乌黑", "蜡缩", "虫响", "清晰", "凹陷", "硬滑", "是"],
        ["青绿", "蜷缩", "沉闷", "清晰", "凹陷", "硬滑", "是"],
        ["浅白", "蜷缩", "浊响", "清晰", "凹陷", "硬滑", "是"],
        ["青绿", "稍蜷", "浊响", "清晰", "稍凹", "软粘", "是"],
        ["乌黑", "稍蜷", "浊日向", "稍糊", "稍凹", "软粘", "是"],
        ["乌黑", "稍蜷", "独日向", "清晰", "稍凹", "硬滑", "是"],
        ["乌黑", "稍蜷", "祝闷", "硝糊", "稍凹", "硬滑", "否"],
        ["青绿", "硬挺", "清脆", "清晰", "平坦", "软粘", "否"],
        ["洁白", "硬挺", "清脆", "模糊", "平坦", "硬滑", "否"],
        ["洁白", "蜷缩", "浊响", "模糊", "平坦", "软粘", "否"],
        ["青绿", "稍蜷", "浊响", "稍糊", "凹陷", "硬滑", "否"],
        ["浅白", "稍蜷", "沉闷", "稍糊", "凹陷", "硬情", "否"],
        ["乌黑", "稍蜷", "浊响", "清晰", "稍凹", "软粘", "否"],
        ["践自", "蜷缩", "浊响", "模糊", "平坦", "硬滑", "否"]
    ]
    return samples, labels
if __name__ == '__main__':
    data, feature_names = creatDataSet()
    tree = create_tree(data, feature_names)
    print(tree)
    # Rebuild the feature names: create_tree may consume the list passed in.
    feature_names = ["色泽","根蒂","敲声","纹理","脐部","触感","好瓜"]  # expected prediction: 否
    sample = ["青绿", "蜡缩", "沉闷", "稍糊", "稍凹", "硬滑", "否"]
    print(classify(tree, feature_names, sample))