import math
import operator
import sklearn.datasets
import sklearn.tree
import sklearn.metrics
from sklearn.model_selection import train_test_split


# Compute the Shannon entropy of a dataset whose last column is the class label.
def cal_entropy(dataset):
    numInstances = len(dataset)
    labelCounts = {}  # label -> number of occurrences
    for featVec in dataset:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numInstances
        shannonEnt -= prob * math.log(prob, 2)
    return shannonEnt


# Extract the instances whose attribute `axis` equals `value`,
# dropping that attribute from each returned instance.
def draw_dataset(dataset, axis, value):
    resultDataSet = []
    for featVec in dataset:
        if featVec[axis] == value:
            # The current attribute is no longer needed.
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            resultDataSet.append(reducedFeatVec)
    return resultDataSet


# Choose the feature with the highest information gain.
def choose_best_feature(dataset):
    # The class label (last column) does not count as a feature.
    numFeatures = len(dataset[0]) - 1
    baseEntropy = cal_entropy(dataset)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        # Collect the values of the i-th attribute.
        featList = [example[i] for example in dataset]
        # Remove duplicates.
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = draw_dataset(dataset, i, value)
            prob = len(subDataSet) / float(len(dataset))
            newEntropy += prob * cal_entropy(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


# If no features remain, form a leaf node labeled with the majority class.
def build_leaf_node(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


# Build the decision tree recursively as nested dicts:
# {featureName: {featureValue: subtree-or-class-label, ...}}.
def create_decision_tree(dataset, paraFeatureName):
    featureName = paraFeatureName.copy()
    classList = [example[-1] for example in dataset]
    # Already pure: every instance has the same class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # No more attributes: fall back to a majority-vote leaf.
    if len(dataset[0]) == 1:
        return build_leaf_node(classList)
    bestFeat = choose_best_feature(dataset)
    bestFeatureName = featureName[bestFeat]
    myTree = {bestFeatureName: {}}
    del featureName[bestFeat]
    featValues = [example[bestFeat] for example in dataset]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subFeatureName = featureName[:]
        myTree[bestFeatureName][value] = create_decision_tree(
            draw_dataset(dataset, bestFeat, value), subFeatureName)
    return myTree


# Classify each instance of the testing set and return the accuracy.
def id3Classify(paraTree, paraTestingSet, featureNames, classValues):
    tempCorrect = 0.0
    tempTotal = len(paraTestingSet)
    tempPrediction = classValues[0]
    for featureVector in paraTestingSet:
        print("Instance: ", featureVector)
        tempTree = paraTree
        while True:
            # Find the feature this (sub)tree splits on.
            for feature in featureNames:
                if feature in tempTree:
                    splitFeature = feature
                    break
            attributeValue = featureVector[featureNames.index(splitFeature)]
            print(splitFeature, " = ", attributeValue)
            tempPrediction = tempTree[splitFeature][attributeValue]
            if tempPrediction in classValues:
                break  # Reached a leaf.
            else:
                tempTree = tempPrediction  # Descend into the subtree.
        print("Prediction = ", tempPrediction)
        if featureVector[-1] == tempPrediction:
            tempCorrect += 1
    return tempCorrect / tempTotal


def mfID3Test():
    # Step 1. Load the dataset.
    weatherData = [['Sunny', 'Hot', 'High', 'FALSE', 'N'],
                   ['Sunny', 'Hot', 'High', 'TRUE', 'N'],
                   ['Overcast', 'Hot', 'High', 'FALSE', 'P'],
                   ['Rain', 'Mild', 'High', 'FALSE', 'P'],
                   ['Rain', 'Cool', 'Normal', 'FALSE', 'P'],
                   ['Rain', 'Cool', 'Normal', 'TRUE', 'N'],
                   ['Overcast', 'Cool', 'Normal', 'TRUE', 'P'],
                   ['Sunny', 'Mild', 'High', 'FALSE', 'N'],
                   ['Sunny', 'Cool', 'Normal', 'FALSE', 'P'],
                   ['Rain', 'Mild', 'Normal', 'FALSE', 'P'],
                   ['Sunny', 'Mild', 'Normal', 'TRUE', 'P'],
                   ['Overcast', 'Mild', 'High', 'TRUE', 'P'],
                   ['Overcast', 'Hot', 'Normal', 'FALSE', 'P'],
                   ['Rain', 'Mild', 'High', 'TRUE', 'N']]
    featureName = ['Outlook', 'Temperature', 'Humidity', 'Windy']
    classValues = ['P', 'N']
    tempTree = create_decision_tree(weatherData, featureName)
    print(tempTree)
    print("Before classification, feature names = ", featureName)
    tempAccuracy = id3Classify(tempTree, weatherData, featureName, classValues)
    print("The accuracy of ID3 classifier is {}".format(tempAccuracy))


# Iris dataset test.
def iris_test():
    # Load the dataset.
    data_iris = sklearn.datasets.load_iris()
    x = data_iris.data
    y = data_iris.target
    print(x[1])
    feature_name = ['sepal length', 'sepal width', 'petal length', 'petal width']
    classValues = ['Setosa', 'Versicolour', 'Virginica']
    # Append the class name to each instance, since create_decision_tree
    # expects the label in the last column.
    dataset = [list(x[i]) + [classValues[y[i]]] for i in range(len(x))]
    # Note: ID3 treats each distinct continuous value as its own branch, so on
    # iris this tree memorizes the training data rather than generalizing.
    temp_tree = create_decision_tree(dataset, feature_name)
    print(temp_tree)
    print("Before classification, feature names = ", feature_name)
    tempAccuracy = id3Classify(temp_tree, dataset, feature_name, classValues)
    print("The accuracy of ID3 classifier is {}".format(tempAccuracy))


def main():
    # sklearnDecisionTreeTest()
    # mfID3Test()
    iris_test()


main()
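
# main() above references sklearnDecisionTreeTest(), which is not defined in
# this file. The sketch below is an assumption of what it may have looked like:
# a baseline that fits sklearn's DecisionTreeClassifier (entropy criterion,
# matching ID3) on a held-out split of the iris data. The split ratio and
# random_state are illustrative choices, not from the original code.
def sklearnDecisionTreeTest():
    data_iris = sklearn.datasets.load_iris()
    # Hold out 30% of the data for testing (assumed split, not original code).
    x_train, x_test, y_train, y_test = train_test_split(
        data_iris.data, data_iris.target, test_size=0.3, random_state=0)
    classifier = sklearn.tree.DecisionTreeClassifier(criterion='entropy')
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
    print("The accuracy of sklearn decision tree is {}".format(accuracy))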