7、决策树
第2关:信息熵与信息增益
import numpy as np
def calcInfoEntropy(feature, label):
    '''
    Compute the Shannon entropy (base 2) of the label set.

    :param feature: feature matrix, ndarray-like (unused; kept for a uniform signature)
    :param label: label array, ndarray-like
    :return: entropy H(D), type float
    '''
    total = len(label)
    entropy = 0
    # Sum -p * log2(p) over each distinct class.
    for cls in set(label):
        freq = sum(1 for y in label if y == cls)
        p = freq / total
        entropy -= p * np.log2(p)
    return entropy
def calcHDA(feature, label, index, value):
    '''
    Compute one weighted conditional-entropy term: p(A=value) * H(D | A=value).

    :param feature: feature matrix, ndarray-like
    :param label: label array, ndarray-like
    :param index: feature column index to condition on, int
    :param value: value of that column selecting the subset, int
    :return: weighted entropy of the selected subset, float
    '''
    # Indices of the samples whose column `index` equals `value`.
    keep = [i for i in range(len(feature)) if feature[i][index] == value]
    sub_feature = [feature[i] for i in keep]
    sub_label = [label[i] for i in keep]
    # Weight by the fraction of samples that fall in this subset.
    weight = len(keep) / len(feature)
    return weight * calcInfoEntropy(sub_feature, sub_label)
def calcInfoGain(feature, label, index):
    '''
    Compute the information gain of feature column `index`: H(D) - H(D|A).

    :param feature: feature matrix from the test case
    :param label: label array from the test case
    :param index: column index of the feature being evaluated
    :return: information gain, type float
    '''
    base_entropy = calcInfoEntropy(feature, label)
    # Distinct values taken by the chosen feature column.
    column = np.array(feature)[:, index]
    # H(D|A): sum of p(A=v) * H(D|A=v) over every value v.
    conditional = sum(calcHDA(feature, label, index, v) for v in set(column))
    return base_entropy - conditional
第3关:使用ID3算法构造决策树
import numpy as np
class DecisionTree(object):
    # ID3 decision tree: splits on the feature with the largest information gain.
    # The learned tree is nested dicts: {feature_index: {feature_value: subtree_or_label}}.
    def __init__(self):
        self.tree = {}

    def calcInfoGain(self, feature, label, index):
        '''
        Compute the information gain of a feature column.
        :param feature: feature matrix from the test case, type ndarray
        :param label: label array from the test case, type ndarray
        :param index: column index of the feature being evaluated, e.g. index=0
                      means use the first feature to compute information gain.
        :return: information gain, type float
        '''
        def calcInfoEntropy(label):
            '''
            Compute the Shannon entropy (base 2) of the labels.
            :param label: label array, type ndarray
            :return: entropy, type float
            '''
            label_set = set(label)
            result = 0
            for l in label_set:
                count = 0
                for j in range(len(label)):
                    if label[j] == l:
                        count += 1
                # p = fraction of samples with class l
                p = count / len(label)
                result -= p * np.log2(p)
            return result

        def calcHDA(feature, label, index, value):
            '''
            Compute one conditional-entropy term: p(A=value) * H(D | A=value).
            :param feature: feature matrix, type ndarray
            :param label: label array, type ndarray
            :param index: feature column index to condition on, type int
            :param value: value of that column selecting the subset, type int
            :return: weighted subset entropy, type float
            '''
            count = 0
            sub_feature = []
            sub_label = []
            # Collect the samples whose column `index` equals `value`.
            for i in range(len(feature)):
                if feature[i][index] == value:
                    count += 1
                    sub_feature.append(feature[i])
                    sub_label.append(label[i])
            # pHA = fraction of samples in this subset.
            pHA = count / len(feature)
            e = calcInfoEntropy(sub_label)
            return pHA * e

        base_e = calcInfoEntropy(label)
        f = np.array(feature)
        f_set = set(f[:, index])
        # H(D|A): sum p(A=v) * H(D|A=v) over every value v of the column.
        sum_HDA = 0
        for value in f_set:
            sum_HDA += calcHDA(feature, label, index, value)
        # Gain = H(D) - H(D|A).
        return base_e - sum_HDA

    def getBestFeature(self, feature, label):
        # Return the column index with the largest information gain.
        # Strict '>' means ties keep the earlier index; if every gain is <= 0
        # the default index 0 is returned.
        max_infogain = 0
        best_feature = 0
        for i in range(len(feature[0])):
            infogain = self.calcInfoGain(feature, label, i)
            if infogain > max_infogain:
                max_infogain = infogain
                best_feature = i
        return best_feature

    def createTree(self, feature, label):
        # Recursively build the ID3 tree; returns either a class label (leaf)
        # or a dict {feature_index: {value: subtree}}.
        # Leaf: all samples share one class.
        if len(set(label)) == 1:
            return label[0]
        # Stop: a single feature column remains, or all rows are identical —
        # fall back to a majority vote over the labels.
        if len(feature[0]) == 1 or len(np.unique(feature, axis=0)) == 1:
            vote = {}
            for l in label:
                if l in vote.keys():
                    vote[l] += 1
                else:
                    vote[l] = 1
            max_count = 0
            vote_label = None
            for k, v in vote.items():
                if v > max_count:
                    max_count = v
                    vote_label = k
            return vote_label
        best_feature = self.getBestFeature(feature, label)
        tree = {best_feature: {}}
        f = np.array(feature)
        f_set = set(f[:, best_feature])
        # One child per observed value of the chosen column. NOTE: rows are kept
        # whole (the used column is not dropped), so indices in the tree always
        # refer to the original feature columns.
        for v in f_set:
            sub_feature = []
            sub_label = []
            for i in range(len(feature)):
                if feature[i][best_feature] == v:
                    sub_feature.append(feature[i])
                    sub_label.append(label[i])
            tree[best_feature][v] = self.createTree(sub_feature, sub_label)
        return tree

    def fit(self, feature, label):
        '''
        Train the tree on the given data.
        :param feature: training feature matrix, type ndarray
        :param label: training labels, type ndarray
        :return: None
        '''
        self.tree = self.createTree(feature, label)

    def predict(self, feature):
        '''
        Predict a label for each row of the test set.
        :param feature: test feature matrix, type ndarray
        :return: predictions, e.g. np.array([0, 1, 2, 2, 1, 0])
        '''
        result = []

        def classify(tree, feature):
            # Walk the nested-dict tree for a single sample; a non-dict node
            # is a leaf label.
            if not isinstance(tree, dict):
                return tree
            # Each internal node holds exactly one {feature_index: branches} pair.
            t_index, t_value = list(tree.items())[0]
            f_value = feature[t_index]
            # NOTE(review): a feature value never seen at this node during
            # training raises KeyError here — confirm test data shares the
            # training value domain.
            if isinstance(t_value, dict):
                classLabel = classify(tree[t_index][f_value], feature)
                return classLabel
            else:
                return t_value

        for f in feature:
            result.append(classify(self.tree, f))
        return np.array(result)
第5关:sklearn中的决策树
from sklearn.tree import DecisionTreeClassifier
def iris_predict(train_sample, train_label, test_sample):
    '''
    Fit a depth-limited decision tree and predict labels for the test set.

    :param train_sample: training sample set, type ndarray
    :param train_label: training label set, type ndarray
    :param test_sample: test sample set, type ndarray
    :return: predicted labels for test_sample
    '''
    # max_depth=3 limits tree growth to curb overfitting on this small dataset.
    classifier = DecisionTreeClassifier(max_depth=3)
    classifier.fit(train_sample, train_label)
    return classifier.predict(test_sample)