# coding: utf-8
from numpy import *
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    # Class labels: 1 = abusive post, 0 = normal post
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec
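
# A minimal usage sketch (an addition, not part of the original listing;
# demoLoadData is a hypothetical helper name): load the toy data and
# check its shape.
def demoLoadData():
    listOPosts, listClasses = loadDataSet()
    print len(listOPosts), 'posts'         # 6 posts
    print 'class labels:', listClasses     # [0, 1, 0, 1, 0, 1]
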
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        # Union of the running vocabulary with this document's words
        vocabSet = vocabSet | set(document)
    return list(vocabSet)
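
# Illustrative sketch (demoVocab is a hypothetical addition): the vocabulary
# built from the six toy posts holds 32 distinct words, and because it comes
# out of a set its order is arbitrary from run to run.
def demoVocab():
    listOPosts, _ = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    print len(myVocabList), 'distinct words'   # 32 distinct words
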
def bagOfWords2Vec(vocabList, inputSet):
    # Bag-of-words model: count how many times each vocabulary word
    # occurs in the input document
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
        else:
            print "The word: %s is not in my vocabulary!" % word
    return returnVec
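
# Illustrative sketch (demoWordVec is a hypothetical addition): turning the
# first toy post into its count vector over the vocabulary. The post has
# seven distinct words, so seven entries are 1 and the rest are 0.
def demoWordVec():
    listOPosts, _ = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    vec = bagOfWords2Vec(myVocabList, listOPosts[0])
    print sum(vec), 'words counted'   # 7 words counted
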
def trainNB0(trainMatrix, trainCategory):
    # Number of training documents (6 rows for the toy data)
    numTrainDocs = len(trainMatrix)
    # Number of entries per row, i.e. the vocabulary size (32 here)
    numWords = len(trainMatrix[0])
    # Fraction of documents that are abusive (class 1); 0.5 for the toy data
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Initialise the count arrays (one entry per vocabulary word) to 1 and
    # the denominators to 2.0, so a word unseen in one class never yields
    # a zero probability (Laplace smoothing)
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0; p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            # Class 1 (abusive): accumulate this document's word counts
            p1Num += trainMatrix[i]
            # Running total of all words seen in abusive documents
            p1Denom += sum(trainMatrix[i])
        else:
            # Class 0 (normal): accumulate this document's word counts
            p0Num += trainMatrix[i]
            # Running total of all words seen in normal documents
            p0Denom += sum(trainMatrix[i])
    # Take logs to avoid numerical underflow when many small probabilities
    # are combined during classification
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    # Return the (log) probability of each vocabulary word given the
    # document class, plus the prior probability of an abusive document
    return p0Vect, p1Vect, pAbusive
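
# Sketch of what trainNB0 produces on the toy data (demoTrain is a
# hypothetical addition): pAb comes out as 0.5 because three of the six
# posts are abusive, and the returned vectors hold log-probabilities,
# so every entry is negative.
def demoTrain():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [bagOfWords2Vec(myVocabList, post) for post in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    print 'P(abusive) =', pAb   # 0.5
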
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # Log-likelihood of the document under each class plus the log prior
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
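
# The comparison above is the Bayes decision rule in log space (an added
# note, not from the original listing): choose class 1 when
#     sum_i x_i * log P(w_i | c=1) + log P(c=1)
#   > sum_i x_i * log P(w_i | c=0) + log P(c=0)
# The shared evidence term P(w) cancels, so comparing these sums is
# equivalent to comparing the posteriors P(c=1 | w) and P(c=0 | w).
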
def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bagOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(bagOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb)
    testEntry = ['stupid', 'garbage']
    thisDoc = array(bagOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb)
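
# Expected output of testingNB() on the toy data (an added note):
#   ['love', 'my', 'dalmation'] classified as: 0
#   ['stupid', 'garbage'] classified as: 1
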
# Splitting text into tokens
def split_test():
    mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'
    # str.split() splits on whitespace by default and returns a list,
    # but it leaves punctuation attached to the words:
    # my = mySent.split()
    # print my
    import re
    # Use a regular expression to split the sentence so punctuation is
    # removed. Note the capital 'W': \W matches non-word characters,
    # whereas \w would split on the words themselves.
    regEx = re.compile(r'\W+')
    listOfTokens0 = regEx.split(mySent)
    # Drop the empty strings left over from the split (length > 0 check)
    listOfTokens1 = [tok for tok in listOfTokens0 if len(tok) > 0]
    # Convert every token to lower case with lower()
    listOfTokens2 = [tok.lower() for tok in listOfTokens0 if len(tok) > 0]
    # print listOfTokens2
    emailText = open('email/ham/6.txt').read()
    emailham0 = regEx.split(emailText)
    emailham1 = [tok.lower() for tok in emailham0 if len(tok) > 0]
    print emailham1
def textParse(bigString):
    import re
    # Split on runs of non-word characters, then keep only tokens longer
    # than two characters, lower-cased
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
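
# Quick illustration of textParse (added, not in the original listing):
# punctuation is stripped and tokens of one or two characters ('is', 'on',
# 'or', 'M', 'L') are discarded, so
#   textParse('This book is the best book on Python or M.L.')
# returns ['this', 'book', 'the', 'best', 'book', 'python'].
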
# File parsing and the complete spam-filter test
def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        # Read the i-th text file in email/spam (class 1)...
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        # ...and the i-th text file in email/ham (class 0)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    # Vocabulary: every distinct word across all 50 documents (unordered)
    vocabList = createVocabList(docList)
    # print vocabList
    # Indices 0-49 of the documents; hold 10 of them out as a test set
    trainingSet = range(50); testSet = []
    for i in range(10):
        # random.uniform(low, high) returns a float in [low, high);
        # truncating with int() gives a random index into trainingSet
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        # Remove the chosen index so it cannot be picked again
        del trainingSet[randIndex]
    trainMat = []; trainClasses = []
    # Training set: estimate p0V, p1V and pSpam
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    # Test set: measure the error rate
    for docIndex in testSet:
        wordVector = bagOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is:', float(errorCount) / len(testSet)
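
# Hedged entry point (an addition): testingNB only needs the in-memory toy
# data, while spamTest assumes email/spam and email/ham directories with
# 25 numbered .txt files each sit next to this script.
if __name__ == '__main__':
    testingNB()
    # spamTest()   # uncomment when the email/ data set is available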