实验任务:
实现ID3决策树,并在给定的数据集上进行5折交叉验证,并观测训练所得到的决策树在训练集和测试集上的准确率,从而判断该决策树是否存在过拟合。在此基础上实现预剪枝和后剪枝,并比较预剪枝树与后剪枝树在训练集和测试集上的准确率。
编程语言:java, matlab, python, C++,C均可
数据集:
鸢尾花卉Iris数据集描述:
iris是鸢尾植物,这里存储了其萼片和花瓣的长宽,共4个属性,鸢尾植物分三类。所以该数据集一共包含4个特征变量,1个类别变量。共有150个样本,鸢尾有三个亚属,分别是山鸢尾 (Iris-setosa),变色鸢尾(Iris-versicolor)和维吉尼亚鸢尾(Iris-virginica)。
也就是说我们的数据集里每个样本含有四个属性,并且我们的任务是个三分类问题。三个类别分别为: Iris Setosa(山鸢尾),Iris Versicolour(杂色鸢尾),Iris Virginica(维吉尼亚鸢尾)。
例如:
样本一: 5.1, 3.5, 1.4, 0.2, Iris-setosa
其中“5.1,3.5,1.4,0.2”代表当前样本的四个属性的取值,“Iris-setosa”代表当前样本的类别。
实验完成要求:
完成上述实验的编码,并把实验流程,算法思想和在给定数据集上得到的指标记录到实验报告里。向助教老师演示所实现代码,并解释核心代码的思想。
|
import copy,math,random,numpy as np
class notLeaf():
    """Bookkeeping record for one internal tree node, used by post-pruning.

    Attributes (filled in later by ID3_create):
      layer          -- depth of the node in the tree (-1 marks an unused slot)
      feature        -- index of the split feature (-1 marks an unused slot)
      feature_value  -- split threshold of the node
      remain_list    -- per-class sample counts of the samples reaching the node
      treeindex_list -- path of list indices from the tree root to the node
    """
    def __init__(self):
        # Sentinel values: a freshly constructed record represents "no node".
        self.remain_list = [0, 0, 0]
        self.treeindex_list = []
        self.feature_value = -1.00
        self.feature = -1
        self.layer = -1
# Global post-pruning tables: one row per CV fold, up to 15 recorded nodes each.
node = [[notLeaf() for _ in range(15)] for _ in range(5)]
# Per-fold index of the last filled slot in `node`; -1 means nothing recorded.
node_value = [-1] * 5
'''实现数据文件读取,随机打乱,分层抽样'''
def inputData(file_address=r"C:\python\MyCode_py\iris.data"):
    """Read the iris data file, shuffle it, and build 5 stratified folds.

    file_address: path to the comma-separated iris data file; each valid line
        has 4 numeric attributes plus a class name.  (New optional parameter;
        the default preserves the original hard-coded path.)

    Returns (data_array, irisdata):
      data_array -- numpy array of all rows (strings), labels mapped to
                    '0'/'1'/'2'
      irisdata   -- list of 5 folds of 30 rows each, 10 per class

    Assumes exactly 50 samples per class, as in the standard Iris dataset.
    """
    label_map = {'Iris-setosa': '0', 'Iris-versicolor': '1', 'Iris-virginica': '2'}
    data_list = []
    # `with` guarantees the file handle is closed even if parsing fails.
    with open(file_address, "r") as iris_file:
        for line in iris_file:
            row = line.rstrip("\n").split(",")
            if len(row) == 5:  # skip blank/short trailing lines
                row[-1] = label_map.get(row[-1], row[-1])
                data_list.append(row)
    random.shuffle(data_list)
    data_array = np.array(data_list)
    # Group the shuffled rows by class for stratified sampling.
    subset = [[], [], []]
    for row in data_array:
        for cls in range(3):
            if row[-1] == str(cls):
                subset[cls].append(row)
    # 5 folds x 3 classes x 10 samples each.
    irisdata = [[], [], [], [], []]
    for fold in range(5):
        for cls in range(3):
            irisdata[fold][cls * 10: cls * 10 + 10] = subset[cls][10 * fold: 10 * fold + 10]
    return data_array, irisdata
'''数据划分'''
def dataDivide(index,irisdata):
    """Assemble the train/test split for fold `index` of the 5-fold CV.

    Fold `index` becomes the test set; the other four folds are concatenated
    in order into the training set.
    Returns (trainData, testData) as numpy arrays of string rows.
    """
    training_rows = []
    for fold_id, fold in enumerate(irisdata):
        if fold_id != index:
            training_rows.extend(fold)
    return np.array(training_rows), np.array(irisdata[index])
'''计算信息熵'''
def Ent(irisData):
    """Information entropy (base 2) of the class-label distribution.

    irisData: sequence of rows whose last element is the label '0'/'1'/'2'.
    Returns 0 for an empty or single-class set.
    """
    counts = [0, 0, 0]
    for sample in irisData:
        for cls in range(3):
            if sample[-1] == str(cls):
                counts[cls] += 1
    total = len(irisData)
    entropy = 0
    for cnt in counts:
        # Empty classes contribute nothing (lim p->0 of p*log p is 0).
        if cnt != 0:
            entropy -= cnt / total * math.log(cnt / total, 2)
    return entropy
def Gain(irisData, feature):
    """Best bi-partition information gain for one continuous attribute.

    irisData: sequence of rows (numeric strings); last column is the label.
    feature:  column index (0..3) of the attribute to split on.

    Candidate thresholds are the midpoints of consecutive sorted attribute
    values (bi-partition method for continuous attributes).
    Returns (best_threshold, best_gain).
    """
    # Sort numerically: the raw values are strings, and lexicographic order
    # disagrees with numeric order in general (e.g. '10.0' < '5.1'), so the
    # original string sort was a latent bug.
    sorted_values = sorted(float(iris[feature]) for iris in irisData)
    divide_values = [round(sorted_values[i] / 2 + sorted_values[i + 1] / 2, 3)
                     for i in range(len(sorted_values) - 1)]
    value = np.array(divide_values)
    if len(value) == 0:
        # Fewer than two samples: no split is possible; report zero gain so
        # the caller never selects this feature (original raised IndexError).
        return 0.0, 0.0
    # The parent set's entropy is identical for every candidate: hoist it out
    # of the loop instead of recomputing it per threshold.
    base_entropy = Ent(irisData)
    gain_list = []
    for threshold in value:
        less_than, more_than = [], []
        for iris in irisData:
            if float(iris[feature]) <= threshold:
                less_than.append(iris)
            else:
                more_than.append(iris)
        weighted_child_entropy = (len(less_than) / len(irisData) * Ent(np.array(less_than))
                                  + len(more_than) / len(irisData) * Ent(np.array(more_than)))
        gain_list.append(base_entropy - weighted_child_entropy)
    # Argmax scan; the first maximum wins on ties, as before.
    max_index = 0
    for i in range(1, len(value)):
        if gain_list[i] > gain_list[max_index]:
            max_index = i
    return value[max_index], gain_list[max_index]
'''计数返回最多的类别'''
def Most_class(class_count):
    """Return (class name, count) of the majority class.

    class_count: [count_setosa, count_versicolor, count_virginica].
    Ties are broken in favour of the lowest index, as in a first-maximum scan.
    """
    class_names = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
    best = 0
    for cls in (1, 2):
        if class_count[cls] > class_count[best]:
            best = cls
    return class_names[best], class_count[best]
'''生成决策树'''
def ID3_create(layer,usedFeatureValue ,irisData ,class_count ,father_class,index_list,index1):
    # Recursively grow an ID3 decision tree with binary splits on the four
    # continuous features.
    #   layer:            current depth (root = 0); stored so post-pruning can
    #                     visit nodes deepest-first
    #   usedFeatureValue: per-feature list of thresholds already used on this
    #                     path (shared alias, see NOTE below)
    #   irisData:         samples reaching this node; last column is '0'/'1'/'2'
    #   class_count:      per-class counts of irisData
    #   father_class:     majority class of the parent, returned when irisData
    #                     is empty
    #   index_list:       path from the root: appended 2 = "<=" child slot,
    #                     3 = ">" child slot of the tree list
    #   index1:           fold number, selects a row of the global `node` table
    # Returns a class-name string (leaf) or a 4-element list:
    #   [feature_index, split_threshold, subtree_le, subtree_gt].
    # Side effect: records each internal node into `node`/`node_value` for
    # later post-pruning.
    global node_value
    global node
    layer1 = layer
    # NOTE(review): plain assignment aliases the caller's list, so thresholds
    # appended below leak into sibling branches and back to the caller —
    # confirm whether a (deep) copy was intended.
    usedFeatureValue1 = usedFeatureValue
    '''若当前集合作为一个父节点,集合中数目最多的样本类别是 father_type1'''
    # Majority class of the current set, passed down as the children's parent class.
    father_class1,_ = Most_class(class_count)
    '''1、当前节点包含的样本集合 train_list 为空,应该标记为其父节点集合中样本数最多的类'''
    # Case 1: empty sample set -> leaf labelled with the parent's majority class.
    if len(irisData) == 0:
        return father_class
    '''2、如果D中样本全属于同一类别,return 类别'''
    # Case 2: all samples belong to one class -> pure leaf.
    for i in range(3):
        if(class_count[i] == np.sum(class_count[:])):
            Class, _ = Most_class(class_count)
            return Class
    '''3、当前属性集合为空(不可能)'''
    '''4、样本集合中在属性集合的取值完全一样'''
    # Case 4: all samples identical on every feature -> majority-class leaf
    # (no split can separate them).
    isAllTheSame = True
    for feature in range(4):
        for i in range(len(irisData) - 1):
            if irisData[i][feature] != irisData[i + 1][feature]:
                isAllTheSame = False
    if isAllTheSame :
        Class, _ = Most_class(class_count)
        return Class
    '''最大gain_dat'''
    # Choose the (feature, threshold) pair with the largest information gain.
    max_feature_gain , max_feature_gain_featurevalue , which_feature = 0 , 0 , 0
    train_array = np.array(irisData)
    for feature in range(4):
        feature_value, feature_gain = Gain(train_array, feature)
        if feature_gain > max_feature_gain:
            max_feature_gain = feature_gain
            max_feature_gain_featurevalue = feature_value
            which_feature = feature
    '''非使用过的'''
    # Intended guard: stop if this split was already used on the current path.
    # NOTE(review): this compares the *gain* against the stored threshold
    # values; `max_feature_gain_featurevalue == i` looks like the intended
    # test — confirm, as written the guard almost never fires.
    for i in usedFeatureValue1[which_feature]:
        if max_feature_gain == i:
            Class, _ = Most_class(class_count)
            return Class
    usedFeatureValue1[which_feature].append(max_feature_gain_featurevalue)
    # ************************************后剪枝
    # Record this internal node (depth, split, class counts, root path) into
    # the global table so post-pruning can later rebuild pruned candidates.
    node_value[index1] += 1
    # print(node_value[index])
    node[index1][node_value[index1]].layer = layer
    node[index1][node_value[index1]].feature = which_feature
    node[index1][node_value[index1]].feature_value = max_feature_gain_featurevalue
    node[index1][node_value[index1]].remain_list = class_count
    node[index1][node_value[index1]].treeindex_list = copy.deepcopy(index_list)
    layer1 = layer1 + 1
    # ************************************后剪枝
    index_list1, index_list2 = [], []
    index_list1 = copy.deepcopy(index_list)
    index_list2 = copy.deepcopy(index_list)
    # Child paths: list index 2 is the "<= threshold" branch, 3 is the ">" branch.
    index_list1.append(2)
    index_list2.append(3)
    '''左右子树生成,左小右大'''
    # Partition samples by the chosen threshold and recurse on both halves.
    less_than , more_than = [] , []
    class_count1 , class_count2 = [0, 0, 0] , [0, 0, 0]
    for iris in irisData: # for every data row reaching this node
        if float(iris[which_feature]) <= max_feature_gain_featurevalue:
            less_than.append(iris)
            class_count1[int(iris[-1])] += 1
        else:
            more_than.append(iris)
            class_count2[int(iris[-1])] += 1
    ID3tree = [ which_feature,
                max_feature_gain_featurevalue,
                ID3_create(layer1,usedFeatureValue1, less_than, class_count1, father_class1,index_list1,index1),
                ID3_create(layer1,usedFeatureValue1, more_than, class_count2, father_class1,index_list2,index1)]
    return ID3tree
'''生成预剪枝决策树'''
def ID3Prepruning_create(usedFeatureValue ,irisData ,class_count ,father_class):
    # Same recursion as ID3_create, without the global post-pruning
    # bookkeeping, plus a pre-pruning check before each split.
    # Returns a class-name string (leaf) or
    # [feature_index, split_threshold, subtree_le, subtree_gt].
    # NOTE(review): `usedFeatureValue` is aliased, not copied — thresholds
    # appended here are visible to the caller and sibling branches; confirm.
    usedFeatureValue1 = usedFeatureValue
    '''若当前集合作为一个父节点,集合中数目最多的样本类别是 father_type1'''
    # Majority class of the current set, passed down as the children's parent class.
    father_class1,_ = Most_class(class_count)
    '''1、当前节点包含的样本集合 train_list 为空,应该标记为其父节点集合中样本数最多的类'''
    # Case 1: empty sample set -> leaf labelled with the parent's majority class.
    if len(irisData) == 0:
        return father_class
    '''2、如果D中样本全属于同一类别,return 类别'''
    # Case 2: all samples belong to one class -> pure leaf.
    for i in range(3):
        if(class_count[i] == np.sum(class_count[:])):
            Class,_ = Most_class(class_count)
            return Class
    '''3、当前属性集合为空(不可能)'''
    '''4、样本集合中在属性集合的取值完全一样'''
    # Case 4: all samples identical on every feature -> majority-class leaf.
    isAllTheSame = True
    for feature in range(4):
        for i in range(len(irisData) - 1):
            if irisData[i][feature] != irisData[i + 1][feature]:
                isAllTheSame = False
    if isAllTheSame :
        Class, _ = Most_class(class_count)
        return Class
    '''最大gain_dat'''
    # Choose the (feature, threshold) pair with the largest information gain.
    max_feature_gain , max_feature_gain_featurevalue , which_feature = 0 , 0 , 0
    train_array = np.array(irisData)
    for feature in range(4):
        feature_value, feature_gain = Gain(train_array, feature)
        if feature_gain > max_feature_gain:
            max_feature_gain = feature_gain
            max_feature_gain_featurevalue = feature_value
            which_feature = feature
    '''非使用过的'''
    # Intended guard against reusing a split on this path.
    # NOTE(review): compares the *gain* to stored threshold values;
    # `max_feature_gain_featurevalue == i` looks intended — confirm
    # (same suspected bug as in ID3_create).
    for i in usedFeatureValue1[which_feature]:
        if max_feature_gain == i:
            Class, _ = Most_class(class_count)
            return Class
    usedFeatureValue1[which_feature].append(max_feature_gain_featurevalue)
    '''左右子树生成,左小右大'''
    # Partition samples by the chosen threshold.
    less_than , more_than = [] , []
    class_count1 , class_count2 = [0, 0, 0] , [0, 0, 0]
    for iris in irisData: # for every data row reaching this node
        if float(iris[which_feature]) <= max_feature_gain_featurevalue:
            less_than.append(iris)
            class_count1[int(iris[-1])] += 1
        else:
            more_than.append(iris)
            class_count2[int(iris[-1])] += 1
    '''是否预剪枝'''
    # Pre-pruning: keep the split only if it strictly increases the number of
    # correctly classified training samples at this node — i.e. the sum of the
    # two branches' majority counts must exceed the parent's majority count.
    Class,Class_num = Most_class(class_count)
    _,Class_num1 = Most_class(class_count1)
    _,Class_num2 = Most_class(class_count2)
    if Class_num >= Class_num1 + Class_num2:
        return Class
    ID3Prepruningtree = [ which_feature,
                          max_feature_gain_featurevalue,
                          ID3Prepruning_create(usedFeatureValue1, less_than, class_count1, father_class1),
                          ID3Prepruning_create(usedFeatureValue1, more_than, class_count2, father_class1)]
    return ID3Prepruningtree
'''生成后剪枝决策树'''
def ID3Postpruning_create(ID3tree,irisData,index):
    """Post-prune a trained ID3 tree bottom-up.

    Visits the internal nodes recorded for fold `index` in the global `node`
    table from the deepest layer (14) up to the root (0).  For each node a
    candidate tree with that subtree collapsed to its majority class is
    built; the candidate is kept only when it strictly improves accuracy on
    `irisData`.  The input tree is left unmodified.
    """
    pruned = copy.deepcopy(ID3tree)
    for depth in range(14, -1, -1):
        for slot in range(15):
            if node[index][slot].layer != depth:
                continue
            candidate = copy.deepcopy(Postpruning(node[index][slot], pruned))
            _, acc_kept = ID3_accuracy(pruned, irisData)
            _, acc_cut = ID3_accuracy(candidate, irisData)
            # Strict improvement required, otherwise the current tree stands.
            if acc_kept < acc_cut:
                pruned = copy.deepcopy(candidate)
    return pruned
'''是否进行后剪枝'''
def Postpruning(node, ID3tree):
    """Build the candidate tree obtained by collapsing one node to a leaf.

    node:    a notLeaf record; `treeindex_list` is the path of list indices
             from the root ([0] for the root itself, then 2/3 per level for
             the <=/> child) and `remain_list` holds the class counts of the
             samples that reached the node during training.
    ID3tree: the current tree; it is not modified — a deep copy is edited.

    Returns the candidate: a copy of ID3tree in which the subtree addressed
    by `treeindex_list` is replaced by the majority class of `remain_list`.

    Bug fix: the original descent assigned into a throwaway local (`temp`),
    so for every non-root node the unchanged copy was returned and only the
    root could ever be pruned.
    """
    pruned = copy.deepcopy(ID3tree)
    label, _ = Most_class(node.remain_list)
    path = node.treeindex_list
    if len(path) <= 1:
        # Pruning the root collapses the whole tree into a single leaf.
        return label
    # Walk down to the parent of the target subtree, then overwrite the child
    # slot in place (2 = "<=" branch, 3 = ">" branch).
    parent = pruned
    for step in path[1:-1]:
        parent = parent[step]
    parent[path[-1]] = label
    return pruned
'''测试训练得到的决策树的准确率'''
def ID3_test(ID3tree, iris):
root = "unknowen"
while (len(ID3tree) > 0):
if isinstance(ID3tree, str) and ID3tree in ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']:
return ID3tree
featureORclass = ID3tree[0]
if (isinstance(featureORclass, str)):
return featureORclass
if isinstance(featureORclass, int):
if float(iris[featureORclass]) <= ID3tree[1] and ID3tree[2] != []:
ID3tree = ID3tree[2]
else:
ID3tree = ID3tree[3]
return featureORclass
'''计算决策树的准确率'''
def ID3_accuracy(ID3tree, irisData):
correct = 0
for iris in irisData:
for i in range(3):
if iris[-1] == str(i):
s = [0,0,0]
s[i] = 1
Class,_ = Most_class(s)
result = ID3_test(ID3tree, iris)
if result == Class: correct += 1
return correct, correct/len(irisData)
'''主程序运行'''
def main():
    # Experiment driver: builds 5 stratified folds, then for each fold trains
    # the plain ID3 tree, a pre-pruned tree and a post-pruned tree, printing
    # per-fold and 5-fold-average train/test accuracies.
    print("201700301102 决策树实验")
    irisdata = [[],[],[],[],[]]
    data_array, irisdata = inputData()
    '''5次交叉验证的准确率'''
    # Per-fold accuracy slots: no suffix = plain tree, 1 = pre-pruned,
    # 2 = post-pruned.
    accuracyRate_train , accuracyRate_test = [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]
    accuracyRate_train1, accuracyRate_test1 = [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]
    accuracyRate_train2, accuracyRate_test2 = [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]
    for i in range(5):
        # Fold i is the test set; the other four folds form the training set.
        trainData, testData = dataDivide(i,irisdata)
        train_list = trainData.tolist()
        usedFeatureValue = [[], [], [], []]
        class_count = [np.sum(trainData[:, -1] == '0'), np.sum(trainData[:, -1] == '1'), np.sum(trainData[:, -1] == '2')]
        father_class = ''
        '''递归构建ID3决策树'''
        # Root call: depth 0, root path [0]; fold index i selects the row of
        # the global `node` table used for post-pruning bookkeeping.
        layer = 0
        index_list = []
        index_list.append(0)
        ID3tree = ID3_create(layer,usedFeatureValue, train_list, class_count, father_class,index_list,i)
        print("第{}次训练习得ID3决策树{}".format(i+1, ID3tree))
        '''准确率计算'''
        correctCount_train, accuracyRate_train[i] = ID3_accuracy(ID3tree,trainData)
        correctCount_test , accuracyRate_test [i] = ID3_accuracy(ID3tree,testData )
        print("训练集判断正确{}个,准确率为{:.2%};".format(correctCount_train, accuracyRate_train[i]))
        print("测试集判断正确{}个,准确率为{:.2%};".format(correctCount_test , accuracyRate_test [i]))
        '''递归构建预剪枝ID3决策树'''
        # NOTE(review): `usedFeatureValue` has already been filled in by
        # ID3_create above (it is aliased, not copied), so the pre-pruned tree
        # is built with thresholds pre-marked as used — a fresh
        # [[], [], [], []] looks intended here; confirm.
        ID3tree1 = ID3Prepruning_create(usedFeatureValue, train_list, class_count, father_class)
        print("第{}次训练习得ID3预剪枝决策树{}".format(i + 1, ID3tree1))
        '''准确率计算'''
        correctCount_train1, accuracyRate_train1[i] = ID3_accuracy(ID3tree1, trainData)
        correctCount_test1, accuracyRate_test1[i] = ID3_accuracy(ID3tree1, testData)
        print("训练集判断正确{}个,准确率为{:.2%};".format(correctCount_train1, accuracyRate_train1[i]))
        print("测试集判断正确{}个,准确率为{:.2%};".format(correctCount_test1, accuracyRate_test1[i]))
        '''遍历构建后剪枝ID3决策树'''
        # Post-pruning evaluates candidate prunings on the *training* data of
        # this fold, using the nodes recorded during ID3_create.
        ID3tree2 = ID3Postpruning_create(ID3tree,trainData,i)
        print("第{}次训练习得ID3后剪枝决策树{}".format(i + 1, ID3tree2))
        '''准确率计算'''
        correctCount_train2, accuracyRate_train2[i] = ID3_accuracy(ID3tree2, trainData)
        correctCount_test2, accuracyRate_test2[i] = ID3_accuracy(ID3tree2, testData)
        print("训练集判断正确{}个,准确率为{:.2%};".format(correctCount_train2, accuracyRate_train2[i]))
        print("测试集判断正确{}个,准确率为{:.2%};".format(correctCount_test2, accuracyRate_test2[i]))
        print("")
    # 5-fold averages for all three tree variants.
    print("ID3决策树5折交叉验证平均训练集准确率为{:.2%}".format(sum(accuracyRate_train) / len(accuracyRate_train)))
    print("ID3决策树5折交叉验证平均测试集准确率为{:.2%}".format(sum(accuracyRate_test) / len(accuracyRate_test)))
    print("ID3预剪枝决策树5折交叉验证平均训练集准确率为{:.2%}".format(sum(accuracyRate_train1) / len(accuracyRate_train1)))
    print("ID3预剪枝决策树5折交叉验证平均测试集准确率为{:.2%}".format(sum(accuracyRate_test1 ) / len(accuracyRate_test1 )))
    print("ID3后剪枝决策树5折交叉验证平均训练集准确率为{:.2%}".format(sum(accuracyRate_train2) / len(accuracyRate_train2)))
    print("ID3后剪枝决策树5折交叉验证平均测试集准确率为{:.2%}".format(sum(accuracyRate_test2) / len(accuracyRate_test2)))
main()