本文小结最近python编程中解决的几个问题。
这些问题大部分是不同版本python的函数差异。
列表:
1、python的对象序列化模块是pickle.
2、读写文件过程中,主要读写的方式'w'、'wb','r'、'rb',不同版本python的函数对参数的要求不同。
3、函数isinstance(secondDict[key], dict) 作用等价于type(secondDict[key]).__name__=='dict'。都是判断是不是字典类型。
4、字典dictionary的函数keys()在python2.x返回列表,在python3.x返回视图对象(dict_keys),不是列表。注意甄别。
5、调用dictionary时,Pydev无法智能提示字典的函数keys()。不知什么原因,以后注意。
6、查看python函数的详细定义时,一般java工程中,按F3即可。但Pydev中,总是跳出一个选择列表,对于刚接触python,不懂各个模块功能的人来说,确实痛苦。
7、python开发文件.py中,加入中文字符就会乱码,编译运行都会失败。
解决办法:1、#!/usr/bin/env python。# coding=utf-8
2、# -*- coding: utf-8 -*-
8、import时,引入一个函数时,可以from os import listdir;引入某个包所有函数,可以from numpy import *;引入某个包或者.py文件,import operator.
9、with open() as file,经常用于打开文件。
利用香农熵构建决策树,并进行分类,以及存储调用决策树的python源码如下:
#!/usr/bin/env python
# coding=utf-8
'''
Created on Oct 12, 2010
Decision Tree Source Code for Machine Learning in Action Ch. 3
@author: Peter Harrington
'''
from math import log
import operator


def createDataSet():
    """Return a toy data set (is-it-a-fish classification) and its feature labels."""
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']  # change to discrete values
    return dataSet, labels


def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of the class labels (last column) of dataSet."""
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:  # count the occurrences of each class label
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt


def splitDataSet(dataSet, axis, value):
    """Return the rows whose feature `axis` equals `value`, with that feature removed."""
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]  # chop out the axis used for splitting
            reducedFeatVec.extend(featVec[axis + 1:])  # append the rest of the row
            retDataSet.append(reducedFeatVec)
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain, or -1."""
    numFeatures = len(dataSet[0]) - 1  # the last column is the class label
    baseEntropy = calcShannonEnt(dataSet)  # entropy before any split
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):  # iterate over all the features
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)  # unique values taken by feature i
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))  # weight of this branch
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy  # reduction in entropy
        if infoGain > bestInfoGain:  # better than current best -> remember it
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Return the class name that occurs most often in classList."""
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # BUGFIX: dict.iteritems() does not exist in Python 3 -- use items().
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    """Recursively build a decision tree represented as nested dicts.

    NOTE: `labels` is mutated (the chosen label is deleted) -- pass a copy
    if the caller still needs the full label list afterwards.
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # all classes equal -> leaf node
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # no features left -> majority vote
    bestFeat = chooseBestFeatureToSplit(dataSet)  # index of best feature
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}  # nested dict represents the tree
    del(labels[bestFeat])  # remove the label that was just used
    featValues = [example[bestFeat] for example in dataSet]
    for value in set(featValues):
        subLabels = labels[:]  # copy so recursion doesn't clobber siblings
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


def classify(inputTree, featLabels, testVec):
    """Classify testVec by walking the nested-dict decision tree."""
    firstStr = list(inputTree.keys())[0]  # Py3: keys() is a view, not a list
    secondDict = inputTree[firstStr]  # subtree under the root decision
    featIndex = featLabels.index(firstStr)  # column index of that feature
    key = testVec[featIndex]  # value of the feature in the test vector
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):  # internal node -> recurse
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:
        classLabel = valueOfFeat  # leaf -> class label
    return classLabel


def storeTree(inputTree, filename):
    """Serialize the tree to `filename` with pickle (binary mode required)."""
    import pickle
    # BUGFIX: use a context manager so the file is closed even on error;
    # 'wb' (not 'w') is required for pickle on Python 3.
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)


def grabTree(filename):
    """Load a pickled decision tree from `filename`."""
    import pickle
    # BUGFIX: the original leaked the file handle (open() without close()).
    with open(filename, 'rb') as fr:
        return pickle.load(fr)


dataSet, labels = createDataSet()
print('the dataSet is: %s' % dataSet)
print('the labels is: %s' % labels)
shannonEntropy = calcShannonEnt(dataSet)
print('the shannonEntropy is: %.10f' % shannonEntropy)
bestFeature = chooseBestFeatureToSplit(dataSet)
print('the best feature is: %d' % bestFeature)
import ch03_treePlotter
myTree = ch03_treePlotter.retrieveTree(0)
classLabel = classify(myTree, labels, [1, 0])
print('the classify result is: %s' % classLabel)
storeTree(myTree, 'decisiontree\\decisionTree.txt')
tree = grabTree('decisiontree\\decisionTree.txt')
ok = 1