# -*- coding: utf-8 -*-
from numpy import *
import numpy as np
import pandas as pd
from math import log
import operator
import re
from collections import defaultdict
import itertools
def calGini(dataSet):
    """Return the Gini impurity of the class labels found in ``dataSet``.

    Args:
        dataSet: sequence of rows; the last element of each row is taken as
            that row's class label.

    Returns:
        ``1 - sum(p_k ** 2)`` as a float, where ``p_k`` is the fraction of
        rows carrying label ``k``.  An empty ``dataSet`` yields ``1.0``
        because no label subtracts anything from the initial value.
    """
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        # dict.get avoids the list-building ``in labelCounts.keys()`` scan.
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    gini = 1.0
    for count in labelCounts.values():
        prop = float(count) / numEntries
        gini -= prop * prop
    return gini
# Takes a list of feature values and returns every way of splitting them into two groups
def featuresplit(features):
    """Enumerate every way of dividing ``features`` into two non-empty groups.

    Args:
        features: list of distinct feature values.

    Returns:
        A list of ``(left, right)`` pairs, where ``left`` and ``right`` are
        tuples that together contain every value exactly once, e.g. for
        ``['a', 'b', 'c']``::

            [(('a',), ('b', 'c')), (('b',), ('a', 'c')), (('c',), ('a', 'b'))]

        When fewer than two values are given no split exists; a warning is
        printed and the one-element tuple ``(features,)`` is returned.
    """
    count = len(features)
    if count < 2:
        print("please check sample's features,only one feature value")
        return (features,)

    # Each side needs at least one value, so collect the combinations of
    # every size from 1 to count - 1.
    combinationsList = []
    for size in range(1, count):
        combinationsList.extend(itertools.combinations(features, size))

    # itertools.combinations emits in a fixed order in which the k-th entry
    # from the front and the k-th entry from the back are complements of each
    # other, so pairing the first half with the reversed second half yields
    # each two-way split exactly once.
    half = len(combinationsList) // 2
    return list(zip(combinationsList[:half], combinationsList[half:][::-1]))
#def splitDataSet(dataSet, axis, values):
# retDataSet = []
# for featVec in dataSet:
# for value in values:
# if featVec[axis] == value:
# reducedFeatVec = featVec[:axis] #剔除样本集
# reducedFeatVec.extend(featVec[axis+1:])
# retDataSet.append(reducedFeatVec)
# return retDataSet #把那些特征值等于value的都剔出来
#def splitDataSet(dataSet, axis, values): #实现了一些特征的重复利用 比如cover 特征复用
# retDataSet = []
# if len(values) < 2:
# for featVec in dataSet: #长度小于2即只有一个特征值
# if featVec[axis] == values[0]: #如果特征值只有一个,不抽取当选特征
# reducedFeatVec = featVec[:axis]
# reducedFeatVec.extend(featVec[axis+1:])
# retDataSet.append(reducedFeatVec)
# else:
# for featVec in dataSet:
# for value in values:
# if featVec[axis] == value: #如果特征值多于一个,选取当前特征
# retDataSet.append(featVec)
#
# return retDataSet
# Handle continuous feature values
def splitDataSet(dataSet, axis, value, threshold):
    """Select the rows of ``dataSet`` lying on one side of a threshold.

    Args:
        dataSet: sequence of rows (indexable sequences).
        axis: index of the column to compare.
        value: threshold the column is compared against.
        threshold: ``'lt'`` keeps rows with ``row[axis] <= value``; any other
            string keeps rows with ``row[axis] > value``.

    Returns:
        A new list holding the selected rows (the row objects themselves,
        not copies), in their original order.
    """
    if threshold == 'lt':
        return [row for row in dataSet if row[axis] <= value]
    return [row for row in dataSet if row[axis] > value]
# Return the best feature together with its binary split of the feature values
"""def chooseBestFeatureToSplit(dataSet):
numFeatures = len(dataSet[0]) - 1 #
bestGiniGain = 1.0
bestFeature = -1
bestBinarySplit=()
for i in range(numFeatures): #遍历特征
featList = [example[i] for example in dataSet] #得到特征列
uniqueVals = list(set(featList)) #去除重复值的特征列
# 三个特征值的二分结果:
# [(('young',), ('old', 'middle')), (('old',), ('young', 'middle')), (('middle',), ('young', 'old'))]
for split in featuresplit(uniqueVals): #featuresplit返回特征的所有二分情况
GiniGain = 0.0
if len(split)==1: #split是一个元组 特征值只有一个比如:cold_blood 只有一个特征值就没办法继续划分下去了 所以跳出循环继续下一循环
continue
(left,right)=split
# 对于每一个可能的二分结果计算gini增益
# 左增益
left_subDataSet = splitDataSet(dataSet, i, left)
left_prob = len(left_subDataSet)/float(len(dataSet))
GiniGain += left_prob * calGini(left_subDataSet)
# 右增益
right_subDataSet = splitDataSet(dataSet, i, right)
right_prob = len(right_subDataSet)/float(len(dataSet))
GiniGain += right_prob * calGini(right_subDataSet)
if (GiniGain <= bestGiniGain): #比较是否是最好的结果
bestGiniGain = GiniGain #记录最好的结果和最好的特征
bestFeature = i
bestBinarySplit=(left,right)
return bestFeature,bestBinarySplit
"""
# Handle continuous feature values
def chooseBestFeatureToSplit(dataSet):
    """Find the feature/threshold pair giving the lowest weighted Gini index.

    Every distinct value of every feature column is tried as a threshold; the
    rows are divided into ``<= value`` and ``> value`` and the Gini
    impurities of both parts are combined, weighted by their share of rows.

    Args:
        dataSet: sequence of rows; all elements but the last are feature
            values, the last one is the class label.

    Returns:
        ``(bestFeature, bestValue)`` - the column index and threshold of the
        best split.  ``(-1, None)`` is returned when no threshold separates
        the rows into two non-empty parts (empty data set, no feature
        columns, or every feature constant).
    """
    if not dataSet:
        return -1, None

    numFeatures = len(dataSet[0]) - 1
    total = float(len(dataSet))
    bestGiniGain = 1.0
    bestFeature = -1
    bestValue = None
    for i in range(numFeatures):
        uniqueVals = sorted(set(example[i] for example in dataSet))
        for value in uniqueVals:
            left_subDataSet = splitDataSet(dataSet, i, value, 'lt')
            right_subDataSet = splitDataSet(dataSet, i, value, 'gt')
            # A threshold that leaves one side empty (e.g. the column's
            # maximum) does not split anything; accepting it would make the
            # tree builder recurse on the unchanged data set.
            if not left_subDataSet or not right_subDataSet:
                continue
            GiniGain = (len(left_subDataSet) / total * calGini(left_subDataSet)
                        + len(right_subDataSet) / total * calGini(right_subDataSet))
            # Strict comparison: on ties the earliest candidate is kept.
            if GiniGain < bestGiniGain:
                bestGiniGain = GiniGain
                bestFeature = i
                bestValue = value
    return bestFeature, bestValue
def majorityCnt(classList):
    """Return the label that occurs most often in ``classList``.

    Args:
        classList: non-empty sequence of hashable class labels.

    Returns:
        The label with the highest count.  Among equally frequent labels the
        one that comes first in the counting dict's iteration order wins
        (the sort is stable).

    Raises:
        IndexError: if ``classList`` is empty.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # items() exists on both Python 2 and Python 3 dicts (iteritems() does not).
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
"""def createTree(dataSet,labels):
classList = [example[-1] for example in dataSet]
# 9/0
# print dataSet
if classList.count(classList[0]) == len(classList):
return classList[0] #所有的类别都一样,就不用再划分了
if len(dataSet) == 1: #如果没有继续可以划分的特征,就多数表决决定分支的类别
# print "here"
return majorityCnt(classList)
bestFeat,bestBinarySplit = chooseBestFeatureToSplit(dataSet)
# 9/0
# print bestFeat,bestBinarySplit,labels
bestFeatLabel = labels[bestFeat]
if bestFeat==-1:
return majorityCnt(classList)
myTree = {bestFeatLabel:{}}
featValues = [example[bestFeat] for example in dataSet]
uniqueVals = list(set(featValues))
# 9/0
for value in bestBinarySplit:
# 9/0
subLabels = labels[:] #拷贝防止其他地方修改 特征标签
if len(value)<2:
del(subLabels[bestFeat])
# 9/0
myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),subLabels)
# 9/0
return myTree
"""
# Handle continuous feature values; labels holds the feature names
def createTree(dataSet, labels):
    """Recursively build a binary decision tree over continuous features.

    Args:
        dataSet: non-empty sequence of rows; all elements but the last are
            feature values, the last one is the class label.
        labels: feature names, ``labels[i]`` naming column ``i``.

    Returns:
        Either a class label (leaf) or a nested dict of the form::

            {name: {name + '<=' + t: subtree, name + '>' + t: subtree}}

        where ``t`` is the chosen threshold rounded to three decimals and
        rendered with ``str``.
    """
    classList = [example[-1] for example in dataSet]
    # All rows share one class: nothing left to separate.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Rows consist of the label only, i.e. there is no feature to split on.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    bestFeat, bestValue = chooseBestFeatureToSplit(dataSet)
    # Check the "no split found" sentinel before using it as an index.
    if bestFeat == -1:
        return majorityCnt(classList)

    leftData = splitDataSet(dataSet, bestFeat, bestValue, 'lt')
    rightData = splitDataSet(dataSet, bestFeat, bestValue, 'gt')
    # A one-sided split makes no progress; recursing would either never
    # terminate or index into an empty data set, so fall back to a vote.
    if not leftData or not rightData:
        return majorityCnt(classList)

    bestFeatLabel = labels[bestFeat]
    thresholdText = str(round(float(bestValue), 3))
    subLabels = labels[:]
    return {
        bestFeatLabel: {
            bestFeatLabel + '<=' + thresholdText: createTree(leftData, subLabels),
            bestFeatLabel + '>' + thresholdText: createTree(rightData, subLabels),
        }
    }
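# (original author's note: works as intended)
#### Classification of test samples
# In the tree, continuous features are stored under keys of the form feature<=value,
# so such keys must be split (e.g. with a regular expression; other methods work too) to recover the feature name and the threshold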
def classify(inputTree, featLabels, testVec):
    """Predict the class label of one sample by walking a decision tree.

    Args:
        inputTree: a tree shaped ``{name: {name + '<=' + t: sub,
            name + '>' + t: sub}}`` where each ``sub`` is again such a dict
            or a class label.  A bare class label (tree consisting of a
            single leaf) is accepted and returned unchanged.
        featLabels: feature names; the position of a name is the index of
            that feature inside ``testVec``.
        testVec: the sample's feature values.

    Returns:
        The class label stored in the leaf that the sample reaches.

    Raises:
        ValueError: if a feature name from the tree is missing in
            ``featLabels`` or a branch key carries no parsable threshold.
    """
    if not isinstance(inputTree, dict):
        return inputTree

    # The tree has exactly one top-level entry: feature name -> branches.
    branches = next(iter(inputTree.values()))
    branchKeys = list(branches)
    # Make sure the '<=' branch comes first regardless of dict ordering.
    if '<=' not in branchKeys[0]:
        branchKeys.reverse()

    # Key layout is "<feature name><=<threshold>"; split at the last '<='.
    featkey, _, thresholdText = branchKeys[0].rpartition('<=')
    featvalue = float(thresholdText)
    featIndex = featLabels.index(featkey)

    if testVec[featIndex] <= featvalue:
        chosen = branches[branchKeys[0]]
    else:
        chosen = branches[branchKeys[1]]

    if isinstance(chosen, dict):
        return classify(chosen, featLabels, testVec)
    return chosen
def testing(myTree, data_test, labels):
    """Print the accuracy of ``myTree`` on a labelled test set.

    Args:
        myTree: decision tree as accepted by ``classify``.
        data_test: non-empty sequence of rows; the last element of each row
            is the true class label.
        labels: feature names, passed through to ``classify``.

    Returns:
        None.  The fraction of correctly classified rows is printed as
        ``myTree <accuracy>`` using ``%f`` formatting.

    Raises:
        ZeroDivisionError: if ``data_test`` is empty.
    """
    error = 0.0  # float so that the accuracy below is a true division
    for sample in data_test:
        if classify(myTree, labels, sample) != sample[-1]:
            error += 1
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print('myTree %f' % ((len(data_test) - error) / len(data_test)))
    return None
# Script section: load the CSV, build a tree from the first 280 rows, report
# the accuracy on the remaining rows and plot the tree.
# NOTE(review): the input path is hard-coded to a local Windows location.
df=pd.read_csv('C:/Users/test_5.csv')
data=df.values[:280,1:].tolist() # training rows: rows 0-279 with column 0 dropped; each row holds the feature values followed by the class value
data_full=data[:] # shallow copy of the training rows
data_test=df.values[280:,1:].tolist() # test rows: everything from row 280 onward, same columns
#test_length=len(data_test)
labels=df.columns.values[1:-1].tolist() # feature names: every column header except the first and the last (original note mentions 400 of them - unverified)
labels_full=labels[:] # copy of the feature names, passed to testing() below
myTree=createTree(data,labels)
testing(myTree,data_test,labels_full)
# treePlotter is a module not visible in this file; presumably it draws the tree - confirm.
import treePlotter
treePlotter.createPlot(myTree)