首先来看一下我们今天计算的这个数据集的样子:
第一列是类别标签,不作为属性参与计算,信息增益主要根据后面四列属性来计算。注意这里一共分成了三类,所以计算公式中相应的n都是3
import math
import numpy as np
import random
# Load the data set: in every comma-separated line, column 0 is the class
# label and columns 1-4 are the four attributes.
dataMat = []
labelMat = []
# The context manager guarantees the file is closed, even if parsing fails.
with open('balance-scale.data') as file:
    for line in file:
        stripped = line.strip()
        if not stripped:
            # Skip blank lines so they do not become bogus samples.
            continue
        curLine = stripped.split(",")
        dataMat.append(curLine[1:5])
        labelMat.append(curLine[0])
print('dataMat:', dataMat)
print('labelMat:', labelMat)
print(np.shape(dataMat))
# Convert every attribute value from text to float. The conversion covers
# however many rows/columns were actually read instead of assuming 625 x 4.
dataMat = [[float(value) for value in row] for row in dataMat]
print('dataMat:', dataMat)
# Information gain, first step: tally the samples of each class.
# Every label that is neither 'B' nor 'R' is counted as 'L'.
labelB = labelMat.count('B')
labelR = labelMat.count('R')
labelL = len(labelMat) - labelB - labelR
labelNum = labelB + labelR + labelL
print(labelB, labelR, labelL, labelNum)
def log(x):
    """Return the base-2 logarithm of *x*.

    Raises ValueError when x is not positive.
    """
    # math.log2 is usually more accurate than math.log(x, 2).
    return math.log2(x)


# Entropy of a sample set S.
def shang(p, n, m):
    """Return the entropy (in bits) of a set split into three classes.

    p, n and m are the numbers of samples in each of the three classes.
    A class with zero samples contributes nothing to the sum (the usual
    0 * log(0) = 0 convention), and an empty set has entropy 0.0, so
    neither case raises.
    """
    Num = p + n + m
    if Num == 0:
        return 0.0
    I = 0.0
    for count in (p, n, m):
        if count:
            ratio = count / Num
            I -= ratio * log(ratio)
    return I
# Entropy of the whole sample set, computed from the three class counts.
SS = shang(labelB, labelR, labelL)

# suanzi is a 4 x 5 x 3 table of counts indexed as
# suanzi[attribute][attribute value][class].
suanzi = [[[0, 0, 0] for _ in range(5)] for _ in range(4)]

# Attribute values 1-4 go to slots 0-3; every other value goes to slot 4.
value_slot = {1: 0, 2: 1, 3: 2, 4: 3}
# 'B' and 'R' go to slots 0 and 1; every other label goes to slot 2.
class_slot = {'B': 0, 'R': 1}

# Walk over every loaded sample instead of assuming exactly 625 rows.
for row, label in zip(dataMat, labelMat):
    class_idx = class_slot.get(label, 2)
    for attr_idx, value in enumerate(row[:4]):
        suanzi[attr_idx][value_slot.get(value, 4)][class_idx] += 1
# For each of the four attributes, build the list of I(B, R, L) values, one
# per attribute value: `one` belongs to the first attribute, `two` to the
# second, and so on.
one, two, three, four = [
    [
        shang(suanzi[attr][value][0], suanzi[attr][value][1], suanzi[attr][value][2])
        for value in range(5)
    ]
    for attr in range(4)
]
# E[k] is the entropy of the data after splitting on attribute k: the
# per-value entropies weighted by the share of samples having that value.
E = [0, 0, 0, 0]
value_entropies = (one, two, three, four)
for i in range(4):
    # Number of samples observed for each of the five values of attribute i.
    a = [sum(suanzi[i][j]) for j in range(5)]
    # Derive the sample total from the counts rather than hard-coding 625.
    total = sum(a)
    if total:
        E[i] = sum(a[j] / total * value_entropies[i][j] for j in range(5))
    else:
        # No samples were counted for this attribute.
        E[i] = 0.0
# G holds the final information gain of attributes 1, 2, 3 and 4:
# the entropy of the whole set minus the entropy after the split.
G = [SS - E[k] for k in range(4)]
print(G)