# ID3: selects the optimal split feature by maximum information gain.
import numpy as np
import pandas as pd
class Leaf_Node(object):
    """Terminal node of the decision tree.

    Stores the predicted class label for every sample that reaches it.
    """
    category = None  # predicted class label (set per instance)

    def __init__(self, category):
        self.category = category
class Branch_Node(object):
    """Internal (non-leaf) node: splits the data on one feature.

    `child_list` may hold both Branch_Node and Leaf_Node children.
    """
    category = None      # feature value labelling the edge from the parent
    character = None     # name of the feature this node splits on
    D = None             # dataset at this node (used while building the tree)
    A = None             # remaining candidate features (used while building)
    charac_index = None  # index of `character` within the feature list A
    child_list = None    # children of this node (branches or leaves)

    def __init__(self, character, D, A, charac_index):
        self.character = character
        self.D = D
        self.A = A
        self.charac_index = charac_index
        self.child_list = []
class Decision_Tree(object):
    """ID3 decision tree: split features are chosen by maximum information gain."""
    data = None           # full training data (label column '类别' included)
    chara = None          # list of feature (column) names
    threshold = None      # minimum information gain required to keep splitting
    decision_tree = None  # root node, set by Build_Tree

    def __init__(self):
        # Load the training data; 'ID' is a row identifier, not a feature.
        data = pd.read_excel("./data.xlsx")
        self.data = data.drop(labels=['ID'], axis=1)
        self.chara = self.character(self.data)
        self.threshold = 0

    def duishu(self, p):
        """Return p * log2(p), with the 0*log(0) := 0 convention.

        Note: the result is NOT negated; callers subtract it, so the
        entropies they accumulate come out with the conventional sign.
        """
        if p == 0:
            return 0
        return p * np.log2(p)

    def shang(self, data):
        """Return (index, gain) of the feature with the largest information gain.

        `data` holds the feature columns plus the label column '类别'; the
        returned index is relative to the feature columns only.
        """
        data_x, data_y = self.split_x_y(data)
        # Empirical entropy H(D) of the class labels.
        empirical_entropy = 0
        label_freq = data_y.value_counts(normalize=True)
        label_keys = list(label_freq.index)  # 1-tuples from the MultiIndex
        for p in label_freq:
            empirical_entropy -= self.duishu(p)
        # Conditional entropy H(D|A) for each feature A.
        conditional_entropies = []
        for i in range(data_x.shape[1]):
            cond = 0
            pair = pd.merge(data_x.iloc[:, [i]], data_y,
                            left_index=True, right_index=True)
            value_freq = pair.iloc[:, [0]].value_counts(normalize=True)
            for value_key in value_freq.index:
                # Rows where feature i takes this value.
                subset = pair[pair[pair.columns.values[0]] == value_key[0]]
                weight = subset.shape[0] / pair.shape[0]
                for label_key in label_keys:
                    matched = subset[subset[subset.columns.values[1]] == label_key[0]]
                    cond -= weight * self.duishu(matched.shape[0] / subset.shape[0])
            conditional_entropies.append(cond)
        # Information gain g(D, A) = H(D) - H(D|A).
        # (The previous gain-ratio computation was dead code and could divide
        # by zero for a single-valued feature; it has been removed.)
        gains = empirical_entropy - np.array(conditional_entropies)
        best = int(np.argmax(gains))
        return best, gains[best]

    def split_x_y(self, D):
        """Split dataset D into (features, labels); '类别' is the label column."""
        return D.drop(labels=['类别'], axis=1, inplace=False), D[['类别']]

    def character(self, D):
        """Return the list of feature names in D (all columns except '类别')."""
        return list(D.drop(labels=['类别'], axis=1, inplace=False).columns)

    def Build_Tree(self, D, A):
        """Build the decision tree iteratively (explicit stack, no recursion).

        D: training dataset (features + '类别'); A: feature-name list matching
        D's feature-column order. Stores the root in self.decision_tree.
        """
        stack = []
        root = self.Build_Node(D, A)
        stack.append(root)
        while len(stack) > 0:
            cur = stack.pop()
            if isinstance(cur, Branch_Node):
                D = cur.D
                A = cur.A
                charac_index = cur.charac_index
                charac_name = cur.character
                A_new = A.copy()
                del A_new[charac_index]  # the split feature is consumed here
                for value in D[charac_name].value_counts().index:
                    D_child = D[D[charac_name] == value]
                    D_child = D_child.drop(labels=[charac_name], axis=1)
                    child = self.Build_Node(D_child, A_new)
                    child.category = value  # edge label: feature value leading here
                    cur.child_list.append(child)
                    stack.append(child)
        self.decision_tree = root
        return

    def Build_Node(self, D, A):
        """Create a node for dataset D with candidate features A.

        Returns a Leaf_Node (majority label) when the labels are pure, no
        features remain, or the best gain falls below self.threshold;
        otherwise a Branch_Node on the best feature.
        """
        D_X, D_Y = self.split_x_y(D)
        label_counts = D_Y.value_counts(ascending=False)  # most frequent first
        # Check the cheap stopping conditions BEFORE computing gains: with an
        # empty feature set, shang() would call np.argmax on an empty array.
        if (len(label_counts.index) == 1) or (len(A) == 0):
            return Leaf_Node(label_counts.index[0][0])
        charac_index, character_zengyi = self.shang(D)
        if character_zengyi < self.threshold:
            return Leaf_Node(label_counts.index[0][0])
        return Branch_Node(A[charac_index], D, A, charac_index)

    def Traverse_Tree(self, test):
        """Classify one sample (single-row DataFrame) and return its label.

        Raises ValueError for a feature value never seen during training
        (the previous version looped forever in that case).
        """
        node = self.decision_tree
        while isinstance(node, Branch_Node):
            ans = test[node.character][0]
            for child in node.child_list:
                if child.category == ans:
                    node = child
                    break
            else:
                raise ValueError(
                    "unseen value %r for feature %r" % (ans, node.character))
        return node.category
def main():
    """Train the ID3 tree from ./data.xlsx and classify one demo sample."""
    d = Decision_Tree()
    d.Build_Tree(d.data, d.chara)
    test_data = [['青年', '否', '否', '一般']]
    test = pd.DataFrame(test_data, columns=['年龄', '有工作', '有自己的房子', '信贷情况'])
    print(test, "的预测结果是:")
    print(d.Traverse_Tree(test))


# Guard the entry point so importing this module does not trigger
# file I/O and training as a side effect.
if __name__ == "__main__":
    main()