机器学习实战
决策树的python3实现代码:
"""
@Descripttion: 决策树
优点:计算复杂度不高,输出结果易于理解,对中间值的缺失不敏感,可以处理不相关特征数据
缺点:可能会产生过度匹配问题
适用数据类型:数值型和标称型
@version: 0.0.1
@Author: tqrs
@dev: python3 vscode
@Date: 2019-06-03 12:15:20
@LastEditors: tqrs
@LastEditTime: 2019-10-26 02:17:15
"""
from math import log
import operator
import pickle
def createDataSet():
"""
[summary]:创建数据集
Returns:
dataSet - 数据集
labels - 分类属性
"""
dataSet = [
[0, 0, 0, 0, "no"],
[0, 0, 0, 1, "no"],
[0, 1, 0, 1, "yes"],
[0, 1, 1, 0, "yes"],
[0, 0, 0, 0, "no"],
[1, 0, 0, 0, "no"],
[1, 0, 0, 1, "no"],
[1, 1, 1, 1, "yes"],
[1, 0, 1, 2, "yes"],
[1, 0, 1, 2, "yes"],
[2, 0, 1, 2, "yes"],
[2, 0, 1, 1, "yes"],
[2, 1, 0, 1, "yes"],
[2, 1, 0, 2, "yes"],
[2, 0, 0, 0, "no"],
]
labels = ["年龄", "有工作", "有自己的房子", "信贷情况"]
# 返回数据集和分类属性
return dataSet, labels
def calcShannonEnt(dataSet):
"""
[summary]:计算给定数据集的香农熵
Arguments:
dataSet -- 数据集
Returns:
shannonEnt - 经验熵(香农熵)
"""
# 返回数据集的行数
numEntries = len(dataSet)
labelCounts = {} # 为所有可能的分类创建字典
for featVec in dataSet: # 对每组特征向量进行统计
currentLabel = featVec[-1] # 提取标签(Label)信息
# Label计数
labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
shannonEnt = 0.0 # 经验熵(香农熵)
for key in labelCounts:
prob = float(labelCounts[key] / numEntries)
shannonEnt -= prob * log(prob, 2) # log base 2
return shannonEnt
def splitDataSet(dataSet, axis, value):
"""
[summary]: 按照给定特征划分数据集
Arguments:
dataSet {[type]} -- 待划分的数据集
axis {[type]} -- 划分数据集的特征
value {[type]} -- 需要返回的特征的值
Returns:
retDataSet -- 返回的数据集列表
"""
retDataSet = []
for featVec in dataSet: # 遍历数据集
if featVec[axis] == value:
reducedFeatVec = featVec[:axis] + featVec[axis + 1 :]
retDataSet.append(reducedFeatVec)
return retDataSet
def chooseBestFeatureToSplit(dataSet):
"""
[summary]:选择最好的数据集划分方式(信息增益最大)
Arguments:
dataSet {[type]} -- 数据集
Returns:
bestFeature -- 最优特征特征的索引值
"""
numFeatures = len(dataSet[0]) - 1 # 特征数量
baseENtropy = calcShannonEnt(dataSet) # 整个数据集的原始香农熵
bestInfoGain = 0.0 # 信息增益
bestFeature = -1 # 最优特征的索引值
for i in range(numFeatures):
featList = [example[i] for example in dataSet]
uniqueVals = set(featList) # 创建唯一的分类标签,set元素不可重复
newEntropy = 0.0 # 经验条件熵
# 计算每种划分方式的信息熵
for value in uniqueVals:
subDataSet = splitDataSet(dataSet, i, value) # subDataSet划分后的子集
prob = len(subDataSet) / float(len(dataSet)) # 计算子集的概率
newEntropy += prob * calcShannonEnt(subDataSet)
infoGain = baseENtropy - newEntropy # 信息增益
print("第%d个特征的增益为%.3f" % (i, infoGain))
if infoGain > bestInfoGain:
bestInfoGain = infoGain
bestFeature = i
return bestFeature
def majorityCnt(classList):
"""
[summary]:统计classList中出现次数最多的元素(类标签)
Arguments:
classList {[type]} -- 类标签列表
Returns:
sortedClassCount[0][0] -- 出现此处最多的元素(类标签)
"""
classCount = {}
for vote in classList: # 统计classList中每个元素出现的次数
classCount[vote] = classCount.get(vote, 0) + 1
sortedClassCount = sorted(
classCount.items(), key=operator.itemgetter(1), reverse=True
) # 根据字典的值降序排序
return sortedClassCount[0][0] # 返回classList中出现次数最多的元素
def createTree(dataSet, labels, featLabels):
"""
[summary]:创建决策树
Arguments:
dataSet -- 训练数据集
labels -- 分类属性标签
featLabels -- 存储选择的最优特征标签
Returns:
myTree - 决策树
"""
classList = [example[-1] for example in dataSet] # 取分类标签(是否放贷:yes or no)
if classList.count(classList[0]) == len(classList): # 如果类别完全相同则停止继续划分
return classList[0]
if len(dataSet[0]) == 1: # 遍历完所有特征时返回出现次数最多的类标签
return majorityCnt(classList)
bestFeat = chooseBestFeatureToSplit(dataSet) # 选择最优特征
bestFeatLabel = labels[bestFeat] # 最优特征的标签
featLabels.append(bestFeatLabel)
myTree = {bestFeatLabel: {}} # 根据最优特征的标签生成树
del labels[bestFeat] # 删除已经使用特征标签
featValues = [example[bestFeat] for example in dataSet] # 得到训练集中所有最优特征的属性值
uniqueVals = set(featValues) # 去掉重复的属性值
for value in uniqueVals: # 遍历特征,创建决策树。
myTree[bestFeatLabel][value] = createTree(
splitDataSet(dataSet, bestFeat, value), labels, featLabels
)
return myTree
def classify(inputTree, featLabels, testVec):
"""
[summary]:使用决策树分类
Arguments:
inputTree -- 已经生成的决策树
featLabels -- 存储选择的最优特征标签
testVec -- 测试数据列表,顺序对应最优特征标签
Returns:
classLabel - 分类结果
"""
firstStr = next(iter(inputTree)) # 获取决策树结点
secondDict = inputTree[firstStr] # 下一个字典
featIndex = featLabels.index(firstStr)
for key in secondDict.keys():
if testVec[featIndex] == key:
if type(secondDict[key]).__name__ == "dict":
classLabel = classify(secondDict[key], featLabels, testVec)
else:
classLabel = secondDict[key]
return classLabel
def storeTree(inputTree, filename):
"""
[summary]:存储决策树
Arguments:
inputTree {[type]} -- 已经生成的决策树
filename {[type]} -- 决策树的存储文件名
"""
with open(filename, "wb") as fw:
pickle.dump(inputTree, fw)
def grabTree(filename):
"""
[summary]:读取决策树
Arguments:
filename {[type]} -- 决策树的存储文件名
Returns:
pickle.load(fr) -- 决策树字典
"""
fr = open(filename, "rb")
return pickle.load(fr)
if __name__ == "__main__":
dataSet, labels = createDataSet()
featLabels = []
myTree = createTree(dataSet, labels, featLabels)
testVec = [0, 1] # 测试数据
result = classify(myTree, featLabels, testVec)
if result == "yes":
print("放贷")
if result == "no":
print("不放贷")
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.externals.six import StringIO
from sklearn import tree
import pandas as pd
import numpy as np
import pydotplus
import os
if __name__ == "__main__":
os.environ["PATH"] += os.pathsep + "D:/Program Files (x86)/graphviz/bin"
with open("./03-决策树/lenses.txt", "r") as fr: # 加载文件
lenses = [inst.strip().split("\t") for inst in fr.readlines()] # 处理文件
lenses_target = [] # 提取每组数据的类别,保存在列表里
for each in lenses:
lenses_target.append(each[-1])
print(lenses_target)
lensesLabels = ["age", "prescript", "astigmatic", "tearRate"] # 特征标签
lenses_list = [] # 保存lenses数据的临时列表
lenses_dict = {} # 保存lenses数据的字典,用于生成pandas
for each_label in lensesLabels: # 提取信息,生成字典
for each in lenses:
lenses_list.append(each[lensesLabels.index(each_label)])
lenses_dict[each_label] = lenses_list
lenses_list = []
# print(lenses_dict) #打印字典信息
lenses_pd = pd.DataFrame(lenses_dict) # 生成pandas.DataFrame
# print(lenses_pd) #打印pandas.DataFrame
le = LabelEncoder() # 创建LabelEncoder()对象,用于序列化
for col in lenses_pd.columns: # 序列化
lenses_pd[col] = le.fit_transform(lenses_pd[col])
# print(lenses_pd) #打印编码信息
clf = tree.DecisionTreeClassifier(max_depth=4) # 创建DecisionTreeClassifier()类
clf = clf.fit(lenses_pd.values.tolist(), lenses_target) # 使用数据,构建决策树
dot_data = StringIO()
tree.export_graphviz(
clf,
out_file=dot_data, # 绘制决策树
feature_names=lenses_pd.keys(),
class_names=clf.classes_,
filled=True,
rounded=True,
special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("./03-决策树/tree.pdf") # 保存绘制好的决策树,以PDF的形式存储。
print(clf.predict([[1, 1, 1, 0]]))