Based on the watermelon dataset 2.0 (extraction code: esa8), we build a decision tree, using information gain as the attribute-selection criterion.
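For reference, the standard definitions used throughout: with $p_k$ the proportion of class $k$ in sample set $D$, and $D^v$ the subset taking value $a^v$ on attribute $a$,

$$Ent(D) = -\sum_{k} p_k \log_2 p_k, \qquad Gain(D, a) = Ent(D) - \sum_{v=1}^{V} \frac{|D^v|}{|D|}\, Ent(D^v)$$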
The steps are as follows:
Define the value sets of the discrete attributes and the class labels, then read in the data.
import math
import pandas as pd

D_keys = {
    '色泽': ['青绿', '乌黑', '浅白'],
    '根蒂': ['蜷缩', '硬挺', '稍蜷'],
    '敲声': ['清脆', '沉闷', '浊响'],
    '纹理': ['稍糊', '模糊', '清晰'],
    '脐部': ['凹陷', '稍凹', '平坦'],
    '触感': ['软粘', '硬滑'],
}
# class labels
keys = ['是', '否']
Split the dataset into disjoint training and test sets with the hold-out method, at a 7:3 (train:test) ratio. To draw good and bad melons evenly, use stratified sampling, i.e. sample separately from the good-melon subset and the bad-melon subset.
# Hold-out split into training and test sets at 7:3, stratified by class
def traintest(dataSet):
    dataSet0 = dataSet[dataSet['好瓜'] == '是']
    dataSet1 = dataSet[dataSet['好瓜'] == '否']
    # sample 70% of each class for the training set
    # (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
    list0 = pd.concat([dataSet0.sample(frac=0.7), dataSet1.sample(frac=0.7)])
    # the remaining rows form the test set
    list1 = dataSet.drop(list0.index, axis=0)
    return list0, list1
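A quick sanity check on a synthetic frame (the 8/9 class split mirrors the full dataset; the frame itself is made up for illustration):

demo = pd.DataFrame({'好瓜': ['是'] * 8 + ['否'] * 9})
train, test = traintest(demo)
print(train.shape[0], test.shape[0])  # about 12 and 5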
Next, write the helper functions used when growing the tree:
# A leaf node is labelled with the majority class among the samples in D
def choose_largest_example(D):
    count = D['好瓜'].value_counts()
    # .get guards against a class that is absent from D
    return '是' if count.get('是', 0) > count.get('否', 0) else '否'
# Accuracy of a decision tree on the test set
def test_Tree(Tree, data_test):
    accuracy = 0
    for index, row in data_test.iterrows():
        result = dfs_Tree(Tree, row)
        if result == row['好瓜']:
            accuracy += 1
    return accuracy / data_test.shape[0]
# Check whether the samples in D take the same value on every attribute in A
def same_value(D, A):
    for key in A:
        if key in D_keys and len(D[key].value_counts()) > 1:
            return False
    return True
# Entropy of a dataset, weighted by the sample weights in the 'power' column
def calc_Ent(dataSet):
    numEntries = dataSet['power'].sum()
    Count = dataSet.groupby('好瓜')['power'].sum()
    Ent = 0.0
    for key in keys:
        # a class absent from the node contributes nothing to the entropy
        if key in Count:
            prob = Count[key] / numEntries
            Ent -= prob * math.log(prob, 2)
    return Ent
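As a quick check: the full watermelon 2.0 dataset has 8 positive and 9 negative samples, so its entropy should come out near 0.998:

toy = pd.DataFrame({'好瓜': ['是'] * 8 + ['否'] * 9, 'power': [1.0] * 17})
print(calc_Ent(toy))  # ≈ 0.998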
# Information gain for splitting on discrete attribute `key`, discounted by
# the fraction of sample weight whose value on `key` is not missing
def calc_Gain_D(D, D_no_nan, key, Ent_D):
    Ent = 0.0
    D_size = D['power'].sum()
    D_nan_size = D_no_nan['power'].sum()
    for value in D_keys[key]:
        Dv = D.loc[D[key] == value]
        Dv_size = Dv['power'].sum()
        Ent_Dv = calc_Ent(Dv)
        Ent += Dv_size / D_nan_size * Ent_Dv
    return D_nan_size / D_size * (Ent_D - Ent)
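The leading factor D_nan_size / D_size (despite the name, D_nan_size holds the weight of the samples with a *known* value, D̃) is the ρ of the standard missing-value treatment: the gain computed on D̃ is scaled by the fraction of total weight that D̃ carries,

$$\rho = \frac{\sum_{x \in \tilde{D}} w_x}{\sum_{x \in D} w_x}, \qquad Gain(D, a) = \rho \times Gain(\tilde{D}, a)$$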
# Candidate split points T for a continuous attribute: average each group of
# n consecutive sorted distinct values (values must be sorted, and NaN
# dropped, for the group averages to be meaningful split points)
def candidate_T(D, key, n):
    L = sorted(set(D[key].dropna()))
    T = []
    a, Sum = 0, 0
    for value in L:
        Sum += value
        a += 1
        if a == n:
            T.append(Sum / n)
            a, Sum = 0, 0
    if a > 0:
        T.append(Sum / a)
    return T
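For example, with n = 2 each adjacent pair of sorted values is averaged ('密度' is a hypothetical continuous column, not part of dataset 2.0):

demo = pd.DataFrame({'密度': [0.2, 0.4, 0.5, 0.7]})
print(candidate_T(demo, '密度', 2))  # [0.3, 0.6], up to float rounding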
# Information gain of a continuous attribute when D is split in two at point t
def calc_Gain_t(D, D_no_nan, key, t, Ent_D):
    Ent = 0.0
    D_size = D['power'].sum()
    D_nan_size = D_no_nan['power'].sum()
    # branch with values <= t
    Dv = D.loc[D[key] <= t]
    Ent += Dv['power'].sum() / D_nan_size * calc_Ent(Dv)
    # branch with values > t
    Dv = D.loc[D[key] > t]
    Ent += Dv['power'].sum() / D_nan_size * calc_Ent(Dv)
    return D_nan_size / D_size * (Ent_D - Ent)
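This is the bi-partition form of the gain, where $D_t^-$ and $D_t^+$ are the samples with value at most t and greater than t:

$$Gain(D, a, t) = Ent(D) - \sum_{\lambda \in \{-, +\}} \frac{|D_t^{\lambda}|}{|D|}\, Ent(D_t^{\lambda})$$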
# Try every candidate split point t and keep the one with the largest gain
def calc_Gain_C(D, D_no_nan, key, Ent_D):
    n = 2
    T = candidate_T(D, key, n)
    max_Gain, max_partition = -1, -1
    for t in T:
        Gain = calc_Gain_t(D, D_no_nan, key, t, Ent_D)
        if max_Gain < Gain:
            max_Gain = Gain
            max_partition = t
    return max_Gain, max_partition
# Choose the best splitting attribute from A; for a continuous attribute,
# also return the chosen split point
def choose_best_attribute(D, A):
    max_Gain, max_partition, partition, best_attr = -1, -1, -1, ''
    for key in A:
        # only samples with a known value take part in the gain computation
        D_no_nan = D.loc[pd.notna(D[key])]
        Ent_D = calc_Ent(D_no_nan)
        if key in D_keys:
            # discrete attribute
            Gain = calc_Gain_D(D, D_no_nan, key, Ent_D)
        else:
            # continuous attribute
            Gain, partition = calc_Gain_C(D, D_no_nan, key, Ent_D)
        if max_Gain < Gain:
            best_attr = key
            max_Gain = Gain
            max_partition = partition
    return best_attr, max_partition
Grow the tree, with pre-pruning:
# TreeGenerate grows the decision tree recursively; recursion returns when
# 1. all samples at the current node belong to one class
# 2. the attribute set is empty, or all samples agree on every attribute
# 3. the sample set at the current node is empty
def TreeGenerate(D, A):
    Count = D['好瓜'].value_counts()
    if len(Count) == 1:
        return D['好瓜'].values[0]
    if len(A) == 0 or same_value(D, A):
        return choose_largest_example(D)
    node = {}
    best_attr, partition = choose_best_attribute(D, A)
    D_size = D.shape[0]
    # samples missing a value on best_attr are sent down every branch,
    # with their weight scaled by that branch's sample proportion
    D_nan = D.loc[pd.isna(D[best_attr])]
    # pre-pruning baseline: accuracy of collapsing this node into a leaf
    # (data_test is a module-level global set up in the main program)
    temp1 = test_Tree(choose_largest_example(D), data_test)
    # best attribute is discrete
    if best_attr in D_keys:
        for value in D_keys[best_attr]:
            Dv = D.loc[D[best_attr] == value].copy()
            Dv_size = Dv.shape[0]
            if D_nan.shape[0] > 0:
                Dv_nan = D_nan.copy()
                Dv_nan['power'] *= Dv_size / D_size
                Dv = pd.concat([Dv, Dv_nan])
            if Dv_size == 0:
                node[value] = choose_largest_example(D)
            else:
                new_A = [key for key in A if key != best_attr]
                node[value] = TreeGenerate(Dv, new_A)
                # pre-pruning: keep the subtree only if it beats the leaf
                temp0 = test_Tree(node[value], data_test)
                if temp1 > temp0:
                    node[value] = choose_largest_example(D)
    # best attribute is continuous: bi-partition at the chosen split point
    else:
        for op in ('<= ', '> '):
            if op == '<= ':
                Dv = D.loc[D[best_attr] <= partition].copy()
            else:
                Dv = D.loc[D[best_attr] > partition].copy()
            Dv_size = Dv.shape[0]
            if D_nan.shape[0] > 0:
                Dv_nan = D_nan.copy()
                Dv_nan['power'] *= Dv_size / D_size
                Dv = pd.concat([Dv, Dv_nan])
            branch_key = op + str(partition)
            if Dv_size == 0:
                node[branch_key] = choose_largest_example(D)
            else:
                # a continuous attribute may be reused deeper in the tree,
                # so best_attr is not removed from A here
                node[branch_key] = TreeGenerate(Dv, A)
                temp0 = test_Tree(node[branch_key], data_test)
                if temp1 > temp0:
                    node[branch_key] = choose_largest_example(D)
    return {best_attr: node}
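The returned tree is a nested dict: each internal node maps an attribute name to a dict of branches, and each branch maps an attribute value (or a '<= t' / '> t' key) to a subtree or a leaf label. For instance, a trained tree might look like this (the values shown are illustrative, not actual output):

# {'纹理': {'清晰': '是',
#           '稍糊': {'触感': {'软粘': '是', '硬滑': '否'}},
#           '模糊': '否'}}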
# Get the child subtree matching the sample's value on this attribute
def get_next_Tree(Tree, key, value):
    # for a continuous attribute, recover the split point from a branch key
    if key not in D_keys:
        partition = float(list(Tree[key].keys())[0].split(' ')[1])
        if value <= partition:
            value = '<= ' + str(partition)
        else:
            value = '> ' + str(partition)
    return Tree[key][value]
Use the tree to classify:
# Depth-first traversal of the tree to predict a sample's class
def dfs_Tree(Tree, row):
    if isinstance(Tree, dict):
        key = list(Tree.keys())[0]
        value = row[key]
        # missing value: descend into every branch and take a majority vote
        if pd.isnull(value):
            result = {label: 0 for label in keys}
            for next_key in Tree[key]:
                next_Tree = Tree[key][next_key]
                temp = dfs_Tree(next_Tree, row)
                result[temp] += 1
            return '是' if result['是'] > result['否'] else '否'
        else:
            next_Tree = get_next_Tree(Tree, key, value)
            return dfs_Tree(next_Tree, row)
    else:
        # a leaf is just the class label
        return Tree
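The loadData helper called in the main program is not shown in the original post; a minimal sketch, assuming the data file is a comma-separated table with a header row (the separator is an assumption):

def loadData(filename):
    # read the raw table; '编号' etc. come in as ordinary columns
    return pd.read_csv(filename)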
Main program:
if __name__ == '__main__':
    # load the data
    filename = '/Users/haoranjiang/Documents/machine learning/111111111/tree.txt'
    dataSet = loadData(filename)
    dataSet.drop(columns=['编号'], inplace=True)
    # per-sample weight, used by the missing-value treatment
    dataSet['power'] = 1.0
    data_train, data_test = traintest(dataSet)
    # train the decision tree; 'power' is a weight column, not an
    # attribute, so it must be excluded from the attribute set
    A = [column for column in data_train.columns
         if column not in ('好瓜', 'power')]
    Tree = TreeGenerate(data_train, A)
    # evaluate on the test set
    print('Accuracy:', test_Tree(Tree, data_test) * 100, '%')
    print(Tree)