1. 实验要求:
对垃圾邮件分类算法(书上P66)改进:
1、采用词袋模型
2、随机选择15个测试样本
3、去除长度小于3的字符
2. 垃圾邮件分类算法改进点
def bagOfWords2VecMN(vocabList, inputSet):
    """Return the bag-of-words count vector of a document.

    Args:
        vocabList: sequence of vocabulary words; position i of the result
            corresponds to vocabList[i].
        inputSet: iterable of words (tokens) of one document; repeated
            words are counted every time they occur.

    Returns:
        A list of ints with len(vocabList) entries, where entry i is the
        number of times vocabList[i] occurs in inputSet. Words that are not
        in vocabList are ignored.
    """
    # Build a word -> position table once instead of scanning vocabList
    # (``in`` + ``.index``) for every word. setdefault keeps the FIRST
    # position of a duplicated vocabulary word, exactly like list.index().
    wordIndex = {}
    for position, word in enumerate(vocabList):
        wordIndex.setdefault(word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec
def textParse(bigString):  # input is one big string, output is a word list
    """Split raw text into lower-cased tokens longer than three characters.

    Args:
        bigString: the text to tokenise.

    Returns:
        A list of the lower-cased tokens of bigString, in order of
        appearance, keeping only tokens whose length is greater than 3.
    """
    import re
    # Split on runs of non-word characters. The pattern must be r'\W+':
    # r'\W*' can match the empty string, and Python 3.7+ then splits
    # between every single character, leaving no token longer than 1.
    listOfTokens = re.split(r'\W+', bigString)
    # NOTE(review): the accompanying write-up asks to drop tokens shorter
    # than 3 characters, which would be ``len(tok) > 2``; the threshold
    # ``> 3`` is kept here unchanged -- confirm which one is intended.
    return [tok.lower() for tok in listOfTokens if len(tok) > 3]
def spamTest():
    """Train the naive Bayes spam filter and report its hold-out error rate.

    Reads email/spam/1.txt .. 25.txt (labelled 1) and email/ham/1.txt ..
    25.txt (labelled 0), tokenises each file with textParse, moves 15
    randomly chosen documents into a test set, trains on the remaining
    documents using bag-of-words vectors, then prints every misclassified
    test document followed by the error rate on the test set.

    Relies on names that are not defined in this function: createVocabList,
    trainNB0, classifyNB, array and random (presumably numpy's ``array`` and
    ``random`` via the module's imports -- verify against the file header).

    Returns:
        None; results are printed.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Spam is appended before ham for every i, so the documents
        # alternate spam/ham in docList and classList.
        for pathPattern, label in (('email/spam/%d.txt', 1),
                                   ('email/ham/%d.txt', 0)):
            # ``with`` closes the file handle as soon as it has been read.
            with open(pathPattern % i) as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)
    vocabList = createVocabList(docList)  # create vocabulary

    # Create the test set. A real list is required because entries are
    # deleted from it; a lazy ``range`` object does not support ``del``.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(15):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    # Train the classifier (get probabilities) with trainNB0.
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    # Classify the held-out documents and count the mistakes.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            # Single pre-formatted argument: prints the same text whether
            # ``print`` is a statement (Python 2) or a function (Python 3).
            print("classificationerror %s" % (docList[docIndex],))
    print('the error rate is: %s' % (float(errorCount) / len(testSet)))
3. 改进点说明
上述代码中将改进点高亮标注。
词袋模型算法说明
文档词袋模型是词集模型的改进算法。
词集模型仅仅将每个词出现与否作为一个特征;词袋模型则记录每个词出现的次数,从而保留了一个词多次出现所包含的信息。
bagOfWords2VecMN函数中,inputSet中的word每出现一次,就将其在词汇表中对应位置的计数加1;函数返回该计数向量。
4. 测试分类器
>>> import os
>>> os.getcwd()
'C:\\Python27\\MLIA\\Ch04'
>>> import bayes
>>> bayes.spamTest()
classificationerror ['adobe', 'microsoft', 'softwares', 'fast', 'order', 'download','microsoft', 'office', 'professional', 'plus', '2007', '2010', 'microsoft','windows', 'ultimate', 'adobe', 'photoshop', 'extended', 'adobe', 'acrobat','extended', 'windows', 'professional', 'thousand', 'more', 'titles']
the error rate is: 0.0666666666667
>>> bayes.spamTest()
classificationerror ['yeah', 'ready', 'here', 'because', 'plane', 'tickets', 'germany']
classificationerror ['benoit', 'mandelbrot', '1924', '2010', 'benoit', 'mandelbrot', '1924','2010', 'wilmott', 'team', 'benoit', 'mandelbrot', 'mathematician', 'father','fractal', 'mathematics', 'advocate', 'more', 'sophisticated', 'modelling','quantitative', 'finance', 'died', '14th', 'october', '2010', 'aged','wilmott', 'magazine', 'often', 'featured', 'mandelbrot', 'ideas', 'work','others', 'inspired', 'fundamental', 'insights', 'must', 'logged', 'view','these', 'articles', 'from', 'past', 'issues', 'wilmott', 'magazine']
the error rate is: 0.133333333333