# Compute the Shannon entropy of a data set.
from math import log
# Each row of dataSet holds feature values with the class label in the last
# column (featVec[-1]); labelCounts tallies how many rows carry each label.
# The Shannon entropy is then -sum(p * log(p, 2)) over the label probabilities p.
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
def createDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels
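# A quick check of calcShannonEnt on this sample data (an illustrative usage
# sketch, not part of the original listing): the set holds 2 'yes' and 3 'no'
# labels, so the entropy is -(2/5)*log(2/5, 2) - (3/5)*log(3/5, 2) ≈ 0.971.
#   myDat, labels = createDataSet()
#   print(calcShannonEnt(myDat))   # -> 0.9709505944546686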
# Split the data set: for every row in dataSet whose value in column `axis`
# equals `value`, strip that column out and collect the remaining row in
# retDataSet.
# Note that extend and append behave differently on lists:
#   a = [1, 2, 4]
#   b = [5, 6, 7]
#   a.extend(b)   ->  [1, 2, 4, 5, 6, 7]
#   a = [1, 2, 4]
#   a.append(b)   ->  [1, 2, 4, [5, 6, 7]]
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for item in dataSet:
        if item[axis] == value:
            reducedFeatVec = item[:axis]
            reducedFeatVec.extend(item[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
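# Illustrative calls on the sample data (a usage sketch, not in the original
# listing): splitting on column 0 removes that column from every matching row.
#   splitDataSet(myDat, 0, 1)  ->  [[1, 'yes'], [1, 'yes'], [0, 'no']]
#   splitDataSet(myDat, 0, 0)  ->  [[1, 'no'], [1, 'no']]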
# Choose the best way to split the data.
# First compute the base Shannon entropy of the whole data set. Then, for each
# of the first n-1 columns (the features), collect that column's values with a
# for loop, deduplicate them with set(), and split the data set once per unique
# value via splitDataSet. For the sample data the splits look like this:
#   set([0, 1])
#   [[1, 'no'], [1, 'no']]
#   [[1, 'yes'], [1, 'yes'], [0, 'no']]
#   set([0, 1])
#   [[1, 'no']]
#   [[1, 'yes'], [1, 'yes'], [0, 'no'], [0, 'no']]
# Finally compare against baseEntropy to obtain the information gain.
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    print(numFeatures)
    print(baseEntropy)
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        print(featList)
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        # If this step looks confusing, here it is in plain terms: column i is
        # split into one group per unique value, each group's Shannon entropy
        # is computed, and each is weighted by that group's share of the whole
        # data set (prob). Accumulating these in newEntropy gives the expected
        # entropy after splitting on column i.
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
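# Worked numbers for the sample data (an illustrative check, assuming the data
# set from createDataSet):
#   baseEntropy ≈ 0.971
#   feature 0: value 1 -> entropy 0.918, weight 3/5; value 0 -> entropy 0
#              newEntropy = 0.6 * 0.918 ≈ 0.551, infoGain ≈ 0.420
#   feature 1: value 1 -> entropy 1.0, weight 4/5;  value 0 -> entropy 0
#              newEntropy = 0.8 * 1.0 = 0.8,        infoGain ≈ 0.171
# So chooseBestFeatureToSplit returns 0: 'no surfacing' is the better split.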
def start():
    myDat, labels = createDataSet()
    value = calcShannonEnt(myDat)
    chooseBestFeatureToSplit(myDat)
    #result1 = splitDataSet(myDat, 0, 1)
    #result2 = splitDataSet(myDat, 0, 0)
    #return result1, result2
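# To run the walkthrough as a script (a hypothetical entry point, not part of
# the original listing):
#   if __name__ == '__main__':
#       start()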
Always keep in mind that chooseBestFeatureToSplit is what computes the optimal splitting strategy:
information gain = entropy of the class labels - weighted entropy of the partitions produced by the attribute
The larger the information gain, the purer the resulting partition, where purity means how many samples within a partition belong to the same class.
This is why information gain is biased toward attributes with many distinct values! Note the emphasis on distinct values: with 100 rows and an attribute that takes 100 distinct values, such as an ID number, every branch node holds exactly one sample, so the purity of those branches is already maximal, even though the split predicts nothing.
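One quick way to see this bias with the functions above (a minimal sketch; the ID column and the idBiasDemo helper are made up for illustration, not part of the original post):

def idBiasDemo():
    myDat, labels = createDataSet()
    # Prepend a unique row ID, turning it into an all-distinct "feature".
    datWithId = [[i] + row for i, row in enumerate(myDat)]
    # Every ID value isolates one sample, so each branch has entropy 0 and the
    # gain on column 0 equals baseEntropy itself, the maximum possible value.
    print(chooseBestFeatureToSplit(datWithId))  # -> 0 (the useless ID column)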