机器学习实战-第四章贝叶斯分类-代码理解-读书笔记

#coding:utf-8

from numpy import *
import pdb

def load_data_set():
    """Return the toy message-board corpus used throughout this module.

    Returns:
        A ``(word_list, class_vec)`` pair. ``word_list`` holds six tokenised
        posts; ``class_vec`` holds the matching labels, where 0 stands for a
        normal post and 1 stands for an insulting one.
    """
    # Each entry pairs a label with the tokens of one post.
    labelled_posts = (
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ace', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    )
    word_list = [tokens for _, tokens in labelled_posts]
    class_vec = [label for label, _ in labelled_posts]
    return word_list, class_vec

def createVocabList(dataSet):
    """Collect every distinct token that appears in any document.

    Args:
        dataSet: iterable of documents, each an iterable of tokens.

    Returns:
        A list with one entry per distinct token. The list is built from a
        set, so its order is not meaningful.
    """
    unique_tokens = set()
    for doc in dataSet:
        unique_tokens.update(doc)
    return list(unique_tokens)

# Turn a tokenised document into a 0/1 presence vector over the vocabulary.
def setOfWords2Vec(vocabList, inputSet):
    """Build a set-of-words vector for one document.

    Args:
        vocabList: list of known tokens; position i of the result
            corresponds to ``vocabList[i]``. Tokens must be hashable.
        inputSet: iterable of tokens making up the document.

    Returns:
        A list of ints, as long as ``vocabList``, holding 1 where the token
        occurs in ``inputSet`` and 0 elsewhere. Tokens missing from the
        vocabulary are reported on stdout and otherwise ignored.
    """
    # Map each token to its first position once, instead of rescanning the
    # vocabulary list (``in`` + ``.index``) for every word of the document.
    position = {}
    for idx, token in enumerate(vocabList):
        position.setdefault(token, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is None:
            # Single-argument call form prints identically on Python 2 and 3.
            print("the word : %s is not in my vocabulary!" % word)
        else:
            returnVec[idx] = 1
    return returnVec
	
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters without smoothing.

    Args:
        trainMatrix: sequence of document vectors, all of the same length
            (one slot per vocabulary word).
        trainCategory: sequence of labels aligned with ``trainMatrix``;
            1 marks an insulting document, anything else a normal one.

    Returns:
        ``(p0Vect, p1Vect, pAbusice)`` where ``p0Vect`` / ``p1Vect`` give, per
        word, its count in normal / insulting documents divided by the total
        word count of that class, and ``pAbusice`` is the label sum divided by
        the number of documents.

    Raises:
        IndexError: if ``trainMatrix`` is empty.
    """
    # The leftover pdb.set_trace() breakpoint was removed: it stopped every
    # call in the debugger.
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusice = sum(trainCategory) / float(numTrainDocs)  # share of insulting docs
    # Per-word counts for each class.
    p0Num = zeros(numWords)
    p1Num = zeros(numWords)
    # Total number of words seen in each class.
    p0Demon = 0.0
    p1Demon = 0.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Demon += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Demon += sum(trainMatrix[i])
    # Share of each word within the insulting and the normal documents.
    p1Vect = p1Num / p1Demon
    p0Vect = p0Num / p0Demon
    return p0Vect, p1Vect, pAbusice

def trainNB1(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters in log space with add-one counts.

    Args:
        trainMatrix: sequence of document vectors, all of the same length.
        trainCategory: sequence of labels aligned with ``trainMatrix``;
            1 marks an insulting document, anything else a normal one.

    Returns:
        ``(p0Vect, p1Vect, pAbusice)``: the natural log of the per-word
        frequency ratio for normal and insulting documents, followed by the
        label sum divided by the number of documents.
    """
    doc_total = len(trainMatrix)
    vocab_size = len(trainMatrix[0])
    prior_abusive = sum(trainCategory) / float(doc_total)

    # Slot 0 accumulates normal documents, slot 1 insulting ones.
    # Counts start at 1 so that no word ends up with probability zero;
    # the denominators start at 2.0.
    word_counts = [ones(vocab_size), ones(vocab_size)]
    class_totals = [2.0, 2.0]

    for row, doc_vec in enumerate(trainMatrix):
        slot = 1 if trainCategory[row] == 1 else 0
        word_counts[slot] += doc_vec
        class_totals[slot] += sum(doc_vec)

    # Logs are returned so that later multiplication of many small
    # probabilities becomes addition and does not underflow.
    log_p_abusive = log(word_counts[1] / class_totals[1])
    log_p_normal = log(word_counts[0] / class_totals[0])
    return log_p_normal, log_p_abusive, prior_abusive

def classfyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Choose the class with the larger log score for one document vector.

    Args:
        vec2Classify: document vector, multiplied element-wise with the
            per-word log-probability vectors.
        p0Vec: per-word log probabilities for class 0.
        p1Vec: per-word log probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 when the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    def _log_score(word_log_probs, prior):
        # Sum of the selected word log-probabilities plus the log prior.
        return sum(vec2Classify * word_log_probs) + log(prior)

    score_class1 = _log_score(p1Vec, pClass1)
    score_class0 = _log_score(p0Vec, 1 - pClass1)
    return 1 if score_class1 > score_class0 else 0

def testingNB():
    """Train on the built-in toy corpus and classify one sample entry.

    Loads the posts from ``load_data_set``, vectorises them with
    ``setOfWords2Vec``, trains with ``trainNB1`` and prints the class that
    ``classfyNB`` assigns to ``['love', 'my', 'dalmation']``.
    """
    listOPosts, listClasses = load_data_set()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB1(array(trainMat), array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    # One pre-formatted string prints the same text on Python 2 and 3; the
    # old print statement is a SyntaxError on Python 3.
    print("%s classfied as %s" % (testEntry, classfyNB(thisDoc, p0V, p1V, pAb)))
# Treating only the presence or absence of a word as the feature is the
# set-of-words model. When a word may occur several times and each
# occurrence is counted, the model is called the bag-of-words model.

def badOfWords2VecMN(vocabList, inputSet):
    """Build a bag-of-words (occurrence count) vector for one document.

    Args:
        vocabList: list of known tokens; position i of the result
            corresponds to ``vocabList[i]``. Tokens must be hashable.
        inputSet: iterable of tokens making up the document.

    Returns:
        A list of ints, as long as ``vocabList``, where each slot holds how
        many times that token occurs in ``inputSet``. Tokens that are not in
        the vocabulary are silently skipped.
    """
    # Map each token to its first position once, instead of rescanning the
    # vocabulary list (``in`` + ``.index``) for every word of the document.
    position = {}
    for idx, token in enumerate(vocabList):
        position.setdefault(token, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is not None:
            returnVec[idx] += 1
    return returnVec

def textParse(bigString):
    """Split raw text into lower-cased tokens longer than two characters.

    Args:
        bigString: the text to tokenise.

    Returns:
        A list of lower-cased tokens, split on runs of non-word characters,
        keeping only tokens whose length exceeds 2.
    """
    import re
    # r'\W+' rather than r'\W*': a pattern that can match the empty string
    # makes re.split cut between every character on Python 3.7+, leaving only
    # one-character tokens that the length filter then discards.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
	
def spamTest():
    """Hold-out evaluation of the classifier on the e-mail corpus.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (label 1) and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (label 0), randomly moves ten
    documents into a test set, trains ``trainNB1`` on the remainder using
    set-of-words vectors and prints the error rate on the test set.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        for pattern, label in (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0)):
            # Context manager closes each file; the handles used to leak.
            with open(pattern % i) as handle:
                docList.append(textParse(handle.read()))
            classList.append(label)
    vocabList = createVocabList(docList)

    # A real list is required: items are removed below, which a Python 3
    # range object does not support.
    traingSet = list(range(len(docList)))
    testSet = []
    # Randomly pick ten documents for testing and take them out of training.
    for _ in range(10):
        readIndex = int(random.uniform(0, len(traingSet)))
        testSet.append(traingSet.pop(readIndex))

    trainMat = []
    trainingClasses = []
    for docIndex in traingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainingClasses.append(classList[docIndex])
    p0V, p1V, pAb = trainNB1(array(trainMat), array(trainingClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classfyNB(array(wordVector), p0V, p1V, pAb) != classList[docIndex]:
            errorCount += 1
    # One pre-formatted string prints the same text on Python 2 and 3.
    print('the error rate is : %s' % (float(errorCount) / len(testSet)))
		
# operator.itemgetter(n) returns a callable that fetches item n from its
# argument; its parameters are the positions of the wanted items.
def calcMostFreq(vocabList, fullText):
    """Return the 30 vocabulary tokens that occur most often in ``fullText``.

    Args:
        vocabList: iterable of hashable tokens to rank.
        fullText: iterable of hashable tokens (all documents concatenated).

    Returns:
        A list of at most 30 ``(token, count)`` tuples sorted by descending
        count. Vocabulary tokens absent from ``fullText`` get a count of 0.
    """
    import operator
    from collections import Counter
    # Count the text once instead of calling fullText.count() per token.
    occurrences = Counter(fullText)
    freqDict = {token: occurrences[token] for token in vocabList}
    # .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1),
                        reverse=True)
    # Keep the 30 highest-ranked entries.
    return sortedFreq[:30]

def localWords(feed1, feed0):
    """Train and evaluate a bag-of-words classifier on two parsed feeds.

    Args:
        feed1: parsed feed whose entries are labelled 1; it is read as
            ``feed1['entries'][i]['summary']``.
        feed0: parsed feed whose entries are labelled 0, read the same way.

    Returns:
        ``(vocabList, p0V, p1V)``: the vocabulary after removing the 30 most
        frequent tokens, and the per-word log-probability vectors from
        ``trainNB1`` for class 0 and class 1.

    Raises:
        IndexError: if the feeds provide fewer than 20 documents in total,
            because 20 documents are always drawn for the test set.
    """
    docList = []
    classList = []
    fullText = []
    # Use the same number of entries from both feeds.
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    vocabList = createVocabList(docList)
    # Drop the 30 most frequent tokens from the vocabulary.
    for word, _count in calcMostFreq(vocabList, fullText):
        if word in vocabList:
            vocabList.remove(word)

    # A real list is required: items are removed below, which a Python 3
    # range object does not support.
    trainingSet = list(range(2 * minLen))
    testSet = []
    for _ in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = []
    trainingClasses = []
    for docIndex in trainingSet:
        trainMat.append(badOfWords2VecMN(vocabList, docList[docIndex]))
        trainingClasses.append(classList[docIndex])
    p0V, p1V, pAb = trainNB1(array(trainMat), array(trainingClasses))

    errorCount = 0
    for docIndex in testSet:
        # Test vectors use the same bag-of-words encoding as the training
        # vectors. The set-of-words encoder used here before gave the model
        # differently scaled features and printed a message for every token
        # that had been removed from the vocabulary.
        wordVector = badOfWords2VecMN(vocabList, docList[docIndex])
        if classfyNB(array(wordVector), p0V, p1V, pAb) != classList[docIndex]:
            errorCount += 1
    # One pre-formatted string prints the same text on Python 2 and 3.
    print('the error rate is : %s' % (float(errorCount) / len(testSet)))
    return vocabList, p0V, p1V
	

# feedparser has to be imported at module scope for the two calls below;
# without it they raise NameError.
import feedparser

# Fetch the two Craigslist RSS feeds (network access at import time).
ny = feedparser.parse('http://newyork.craigslist.org/search/stp?format=rss')
sf = feedparser.parse('http://sfbay.craigslist.org/search/stp?format=rss')

	
	


你可能感兴趣的:(机器学习)