from numpy import *
def loadDataSet():
postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
['stop', 'posting', 'stupid', 'worthless', 'garbage'],
['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
classVec = [0,1,0,1,0,1] #1 is abusive, 0 not
return postingList,classVec
def createVocabList(dataSet):               # dataSet is the postingList returned by loadDataSet()
    vocabSet = set([])                      # start from an empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document) # '|' takes the union of the two sets
    return list(vocabSet)                   # convert the set back to a list and return it
def setOfWords2Vec(vocabList, inputSet):    # vocabList: vocabulary from createVocabList; inputSet: one document
    returnVec = [0]*len(vocabList)          # vector of zeros, one slot per vocabulary word
    for word in inputSet:                   # for every word in the document
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1   # mark that word's slot in the vocabulary as present
        else: print "the word: %s is not in my Vocabulary!" % word
    return returnVec
def trainNB0(trainMatrix, trainCategory):   # trainMatrix: matrix of 0/1 word vectors; trainCategory: label vector, one label per document
    numTrainDocs = len(trainMatrix)         # number of documents in the matrix
    numWords = len(trainMatrix[0])          # length of each document vector (vocabulary size)
    pAbusive = sum(trainCategory)/float(numTrainDocs)  # P(class=1): number of class-1 documents / total documents
    p0Num = ones(numWords); p1Num = ones(numWords)     # per-word numerators initialized to 1 (Laplace smoothing)
    p0Denom = 2.0; p1Denom = 2.0                       # denominators initialized to 2 (Laplace smoothing)
    for i in range(numTrainDocs):           # for every training document
        if trainCategory[i] == 1:           # if it belongs to class 1
            p1Num += trainMatrix[i]         # p1Num accumulates, per word, how often it appears in class-1 documents
            p1Denom += sum(trainMatrix[i])  # p1Denom accumulates the total number of words over all class-1 documents
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num/p1Denom)             # log of each word's frequency given class 1 (logs avoid underflow later)
    p0Vect = log(p0Num/p0Denom)
    return p0Vect, p1Vect, pAbusive
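# A minimal usage sketch (added for illustration, not from the original listing):
# build the vocabulary, convert every toy post into a 0/1 vector with
# setOfWords2Vec, and feed the result to trainNB0. _demoTrainNB0 is an
# illustrative name, not a function from the book.
def _demoTrainNB0():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, post) for post in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    print pAb       # 0.5 -- three of the six toy posts carry the abusive label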
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # vec2Classify: the word vector to classify
    # p0Vec, p1Vec: from trainNB0 -- the log probability of each word given class 0 / class 1
    # pClass1: pAbusive from trainNB0, i.e. the prior probability that a document is abusive (class 1)
    p1 = sum(vec2Classify*p1Vec) + log(pClass1)     # log P(words|1) summed over the words present, plus the log class-1 prior
    p0 = sum(vec2Classify*p0Vec) + log(1.0-pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
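# Added note (hedged): naive Bayes picks the class with the larger posterior
# P(words|c)*P(c); classifyNB works in log space, where the product over the
# words present becomes the sum above, avoiding floating-point underflow.
# Tiny check of that identity:
def _demoLogTrick():
    probs = array([0.05, 0.01, 0.2])
    print log(prod(probs))     # log of the product of the probabilities
    print sum(log(probs))      # same number: the sum of the logs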
def testingNB():
    listOPosts, listClasses = loadDataSet()         # each element of listOPosts is one tokenized post
    myVocabList = createVocabList(listOPosts)       # build the vocabulary over all posts
    trainMat = []
    for postinDoc in listOPosts:                    # each element of trainMat is a post converted to a 0/1 word-presence vector
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))  # training step
    testEntry = ['love', 'my', 'dalmation']         # test document
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))        # convert the test document into a word vector first
    print testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb)
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb)
def bagOfWords2VecMN(vocabList, inputSet):
returnVec = [0]*len(vocabList)
for word in inputSet:
if word in vocabList:
            returnVec[vocabList.index(word)] += 1   # bag-of-words model: count how many times each word occurs in the document
return returnVec
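# Hedged illustration (not in the original listing): with a repeated word,
# setOfWords2Vec only records presence while bagOfWords2VecMN records counts.
# The tiny vocabulary below is hand-made for the example.
def _demoBagVsSet():
    vocab = ['stupid', 'dog', 'my']
    doc = ['stupid', 'stupid', 'dog']
    print setOfWords2Vec(vocab, doc)      # [1, 1, 0]
    print bagOfWords2VecMN(vocab, doc)    # [2, 1, 0]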
def textParse(bigString):    # input: one big string; output: list of lower-case tokens longer than two characters
    import re
    listOfTokens = re.split(r'\W+', bigString)   # split on any run of non-word characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
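# Hedged example (added for illustration): textParse splits on non-word
# characters, lower-cases everything, and drops tokens of one or two characters.
def _demoTextParse():
    print textParse('This book is the best book on M.L. I have ever laid eyes upon.')
    # -> ['this', 'book', 'the', 'best', 'book', 'have', 'ever', 'laid', 'eyes', 'upon']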
def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        # read one spam file and split it into a list of word tokens
        docList.append(wordList)        # keep each tokenized document
        fullText.extend(wordList)       # also merge all of its words into one flat list
        classList.append(1)             # label this document as spam (1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)             # label this document as ham (0)
    vocabList = createVocabList(docList)    # build the vocabulary
    trainingSet = range(50); testSet = []
    for i in range(10):                 # randomly hold out 10 documents as the test set; the rest are training samples
        randIndex = int(random.uniform(0, len(trainingSet)))  # random index into the remaining training indices
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])     # remove the chosen index from the training set
    trainMat = []; trainClass = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClass.append(classList[docIndex])
    # at this point we have the training matrix and the training label vector
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClass))
    errorCount = 0
    for docIndex in testSet:            # classify each of the 10 held-out test documents
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print "classification error", docList[docIndex]
    print 'the error rate is: ', float(errorCount)/len(testSet)
    #return vocabList,fullText
def calcMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)   # count how often each vocabulary word occurs in fullText
    sortedFreq = sorted(freqDict.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]                        # return the 30 most frequent (word, count) pairs
def localWords(feed1, feed0):
    import feedparser
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))  # use the shorter of the two feeds' entry lists
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])    # take one entry from the first RSS feed
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])    # and one entry from the second RSS feed
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList, fullText)  # the 30 words that occur most often across both feeds
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])              # remove those 30 high-frequency words from the vocabulary
    trainingSet = range(2*minLen); testSet = []
    for i in range(20):                             # randomly hold out 20 entries as the test set
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:                    # build the training matrix and label vector
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:                        # classify the held-out entries and compute the error rate
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount)/len(testSet)
    return vocabList, p0V, p1V
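# Hedged usage sketch (not part of the original file): how these functions are
# typically driven. testingNB() runs on the toy posts; spamTest() expects the
# email/spam and email/ham text files on disk; localWords() expects two feeds
# already parsed with feedparser (the URLs below are placeholders, not real feeds).
if __name__ == '__main__':
    testingNB()
    # spamTest()
    # import feedparser
    # feed1 = feedparser.parse('http://example.com/a.rss')   # placeholder URL
    # feed0 = feedparser.parse('http://example.com/b.rss')   # placeholder URL
    # vocabList, p0V, p1V = localWords(feed1, feed0)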