I have recently been working through Peter Harrington's Machine Learning in Action.
The Craigslist personal-ads links used in the book can no longer be found, so I use the site's RSS feeds for community events and politics instead:
https://newyork.craigslist.org/search/eve?format=rss&sale_date=2018-06-11
https://losangeles.craigslist.org/search/eve?format=rss&sale_date=2018-06-11
https://newyork.craigslist.org/search/pol?format=rss
https://sfbay.craigslist.org/search/pol?format=rss
from numpy import *
import feedparser
import operator
def loadDataset():
postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
['stop', 'posting', 'stupid', 'worthless', 'garbage'],
['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
classVec = [0, 1, 0, 1, 0, 1]
return postingList, classVec
# Build the vocabulary: the list of unique words across all documents
def createVocabList(dataset):
vocabSet = set([])
for document in dataset:
vocabSet = vocabSet | set(document)
# the | operator takes the union of two sets
return list(vocabSet)
# Convert a document into a word vector over the vocabulary (set-of-words model)
def setOfWords2Vec(vocabList, inputSet):
returnVec = [0]*len(vocabList)
for word in inputSet:
if word in vocabList:
returnVec[vocabList.index(word)] = 1
# else:
# print('the word: %s is not in my Vocabulary!' % word)
return returnVec
# Quick test
# posts, classes = loadDataset()
# myVocabulary = createVocabList(posts)
# posting1 = setOfWords2Vec(myVocabulary, posts[1])
# print(posts)
# print(posts[1])
# print(myVocabulary)
# print(posting1)
# Naive Bayes training function
def trainNB0(trainMatrix, trainCategory):
    # Probability that a document belongs to the abusive class (abusive = class 1, normal = class 0)
numTrainDocs = len(trainMatrix)
pAbusive = sum(trainCategory)/float(numTrainDocs)
    # Estimate the probability of each word appearing in abusive vs. normal documents.
    # If any single word probability were 0, the whole product would be 0. To soften this
    # (Laplace smoothing), initialize every word count to 1 and each denominator to 2.
    # Multiplying many tiny numbers also underflows to 0, so we work with natural logs instead,
    # using ln(a*b) = ln(a) + ln(b). A small numeric illustration follows the test block below.
numWords = len(trainMatrix[0])
p1Num = ones(numWords)
p0Num = ones(numWords)
p1Demon = 2.0
p0Demon = 2.0
for i in range(numTrainDocs):
if trainCategory[i] == 1:
p1Num += trainMatrix[i]
p1Demon += sum(trainMatrix[i])
else:
p0Num += trainMatrix[i]
p0Demon += sum(trainMatrix[i])
    p1Vect = log(p1Num/p1Demon)  # log-probability of each word given the abusive class
    p0Vect = log(p0Num/p0Demon)  # log-probability of each word given the normal class
return p0Vect, p1Vect, pAbusive
# Test
# posts, classes = loadDataset()
# myVocabulary = createVocabList(posts)
# trainMat = []
# for post in posts:
# trainMat.append(setOfWords2Vec(myVocabulary, post))
# print(myVocabulary)
# print(trainMat)
# print(classes)
# p0Vect, p1Vect, pAbusive = trainNB0(trainMat, classes)
# print(p0Vect)
# print(p1Vect)
# print(pAbusive)
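# Why the smoothing and the logs in trainNB0: with a few dozen vocabulary words, the product
# of per-word probabilities quickly underflows to 0.0, while the sum of their logs stays usable.
# A minimal illustration (0.05 is just an arbitrary example probability):
# p = 0.05
# print(p ** 400)        # 0.0 -- the product underflows
# print(400 * log(p))    # about -1198.3 -- the log-sum is still a perfectly usable number
# Laplace smoothing (counts start at 1, denominators at 2) keeps a single unseen word from
# forcing the product -- equivalently the log-sum -- to minus infinity.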
# Naive Bayes classification: compare log P(w|c1) + log P(c1) against log P(w|c0) + log P(c0)
def classifyNB(vec2Classify, p0Vect, p1Vect, pClass1):
p1 = sum(vec2Classify * p1Vect) + log(pClass1)
p0 = sum(vec2Classify * p0Vect) + log(1 - pClass1)
if p1 > p0:
return 1
else:
return 0
# Train and test the naive Bayes classifier on the toy dataset
def trainNB():
posts, classes = loadDataset()
myVocabulary = createVocabList(posts)
trainMat = []
for post in posts:
trainMat.append(setOfWords2Vec(myVocabulary, post))
p0Vect, p1Vect, pAbusive = trainNB0(trainMat, classes)
testPosts = [['love', 'my', 'dalmation'], ['stupid', 'garbage']]
for testPost in testPosts:
testVec = setOfWords2Vec(myVocabulary, testPost)
print(testPost, 'is classified as', classifyNB(testVec, p0Vect, p1Vect, pAbusive))
# trainNB()
# Bag-of-words model (counts how many times each word occurs, not just whether it occurs)
def bagOfWords2Vec(vocabList, inputSet):
returnVec = [0]*len(vocabList)
for word in inputSet:
if word in vocabList:
returnVec[vocabList.index(word)] += 1
return returnVec
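# Quick comparison of the two representations: with a repeated word, setOfWords2Vec only records
# presence while bagOfWords2Vec records the count. (The tiny vocabulary below is just a made-up example.)
# vocab = ['dog', 'stupid', 'garbage']
# print(setOfWords2Vec(vocab, ['stupid', 'dog', 'stupid']))  # [1, 1, 0]
# print(bagOfWords2Vec(vocab, ['stupid', 'dog', 'stupid']))  # [1, 2, 0]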
# Example: filtering spam email with naive Bayes
# Split a long string into tokens and drop words that are too short
def textParse(bigString):
import re
wordList = []
    words = re.split(r'\W+', bigString)  # '\W' matches any non-word character; '+' matches one or more of them
for i in words:
if len(i) > 2:
i = i.lower()
wordList.append(i)
return wordList
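# Quick check of textParse: punctuation is dropped, everything is lower-cased, and tokens of
# 2 characters or fewer are discarded. (The sentence is just a made-up example.)
# print(textParse('Hi there -- my DOG has FLEAS, ok?'))
# # ['there', 'dog', 'has', 'fleas']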
# Build and evaluate the spam classifier
def spamTest():
import os
docList = []
classList = []
    fullText = []  # all words from all documents, used later to find the most frequent ones
spamfold = 'D:/2. 数据分析/机器学习实战/machinelearninginaction/Ch04/email/spam'
for i in os.listdir(spamfold):
        spamfile = open(spamfold + '/' + i, errors='ignore')  # a few of the book's files contain non-UTF-8 bytes
wordList = textParse(spamfile.read())
docList.append(wordList)
fullText.extend(wordList)
classList.append(1)
hamfold = 'D:/2. 数据分析/机器学习实战/machinelearninginaction/Ch04/email/ham'
for i in os.listdir(hamfold):
        hamfile = open(hamfold + '/' + i, errors='ignore')  # same guard against non-UTF-8 bytes
wordList = textParse(hamfile.read())
docList.append(wordList)
fullText.extend(wordList)
classList.append(0)
vocalList = createVocabList(docList)
    # There are 50 emails in total; randomly hold out 10 of them as the test set
    # trainSet and testSet hold the indices of the training and test emails
trainSet = list(range(50))
testSet = []
for i in range(10):
randIndex = int(random.uniform(0, len(trainSet)))
testSet.append(trainSet[randIndex])
del(trainSet[randIndex])
    # Build the training matrix and labels from the training indices
trainMat = []
trianClasses = []
for docIndex in trainSet:
trainMat.append(setOfWords2Vec(vocalList, docList[docIndex]))
trianClasses.append(classList[docIndex])
p0V, p1V, pSpam = trainNB0(trainMat, trianClasses)
errorCount = 0
for docIndex in testSet:
wordVector = setOfWords2Vec(vocalList, docList[docIndex])
if classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
errorCount += 1
# print('the error rate is', float(errorCount)/len(testSet))
errorRate = float(errorCount)/len(testSet)
return errorRate
# Repeat the random hold-out split 20 times and average the error rate
# errorRates = []
# for i in range(20):
# errorRate = spamTest()
# errorRates.append(errorRate)
# errorRates = array(errorRates)
# print(sum(errorRates)/len(errorRates))
# Average error rate came out around 0.02, i.e. 2%
# Example: using naive Bayes to reveal region-specific wording from personal ads
# Get the 30 most frequent words in the corpus
def calMostFreq(vocabList, fullText):
freqDict = {}
for word in vocabList:
freqDict[word] = fullText.count(word)
sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
return sortedFreq[:30]
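# An equivalent, shorter version using the standard library (just a sketch, not the book's code;
# calMostFreq2 is a hypothetical name):
# from collections import Counter
# def calMostFreq2(vocabList, fullText):
#     counts = Counter(word for word in fullText if word in vocabList)
#     return counts.most_common(30)  # list of (word, count) pairs, most frequent first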
# Build and evaluate a classifier from two RSS feeds
def localWords(feed1, feed0):
docList = []
classList = []
fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))  # use the smaller of the two feeds' entry counts
for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])  # parse the summary text of each entry
docList.append(wordList)
fullText.extend(wordList)
classList.append(1)
wordList = textParse(feed0['entries'][i]['summary'])
docList.append(wordList)
fullText.extend(wordList)
classList.append(0)
vocalList = createVocabList(docList)
top30words = calMostFreq(vocalList, fullText)
    # Remove the most frequent words; this tends to lower the error rate
for pairW in top30words:
if pairW[0] in vocalList:
vocalList.remove(pairW[0])
    # trainSet and testSet hold the indices of the training and test documents
trainSet = list(range(2*minLen))
testSet = []
for i in range(10):
randIndex = int(random.uniform(0, len(trainSet)))
testSet.append(trainSet[randIndex])
del(trainSet[randIndex])
    # Build the training matrix and labels from the training indices
trainMat = []
trianClasses = []
for docIndex in trainSet:
trainMat.append(bagOfWords2Vec(vocalList, docList[docIndex]))
trianClasses.append(classList[docIndex])
p0V, p1V, pSpam = trainNB0(trainMat, trianClasses)
errorCount = 0
for docIndex in testSet:
wordVector = bagOfWords2Vec(vocalList, docList[docIndex])
if classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
errorCount += 1
    # To average over repeated runs, return the error rate instead of printing it:
# errorRate = float(errorCount) / len(testSet)
# return errorRate
print('the error rate is', float(errorCount)/len(testSet))
return vocalList, p1V, p0V
# The Craigslist personal-ads links are gone, so use the site's event and politics RSS feeds instead
# ny = feedparser.parse('https://newyork.craigslist.org/search/eve?format=rss&sale_date=2018-06-11')
# la = feedparser.parse('https://losangeles.craigslist.org/search/eve?format=rss&sale_date=2018-06-11')
ny_p = feedparser.parse('https://newyork.craigslist.org/search/pol?format=rss')
sf_p = feedparser.parse('https://sfbay.craigslist.org/search/pol?format=rss')
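# The feed contents change over time (see the results below), so it helps to confirm that both
# feeds actually returned entries; localWords() pairs entries from the two feeds, so neither
# list may be empty.
# print(len(ny_p['entries']), len(sf_p['entries']))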
# Repeat the hold-out evaluation (100 runs here) and average the error rate
# errorRates = []
# for i in range(100):
# errorRate = localWords(ny_p, sf_p)
# errorRates.append(errorRate)
# errorRates = array(errorRates)
# print(sum(errorRates)/len(errorRates))
# New York vs Los Angeles events: error rate 0.29 with the top-30 words removed, 0.297 without removing them
# New York vs San Francisco politics: 0.292 with the top-30 words removed, 0.31 without
# Trying again on New York vs San Francisco politics: 0.425 with removal, 0.313 without
# The RSS feed contents keep changing, so the results vary from run to run
# Display the most frequently used words for each region
def getTopWords(ny_p, sf_p):
vocalList, p1V, p0V = localWords(ny_p, sf_p)
topNY = []
topSF = []
for i in range(len(p0V)):
if p1V[i] > -5.0:
topNY.append((vocalList[i], p1V[i]))
if p0V[i] > -5.0:
            topSF.append((vocalList[i], p0V[i]))  # keep the word together with its log-probability
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)  # sort by log-probability, highest first
print('NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**')
for i in sortedNY:
print(i[0])
sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
print('SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**')
for i in sortedSF:
print(i[0])
# getTopWords(ny_p, sf_p)