# ---- Version 1: build a vocabulary and encode documents as 0/1 word vectors ----
def loaddataset():
    postinglist = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classvec = [0, 1, 0, 1, 0, 1]    # 1 = abusive post, 0 = not abusive
    return postinglist, classvec
def createvocablist(dataset):
    vocabset = set()
    for document in dataset:
        vocabset = vocabset | set(document)    # union of the two sets
    return list(vocabset)
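# A tiny illustration (not in the original): because the vocabulary comes from
# a set, its ordering is arbitrary, so vector indices change between runs;
# only the word -> index mapping within a single run matters.
#   >>> createvocablist([['dog', 'cat'], ['cat', 'fish']])
#   ['cat', 'dog', 'fish']    # 3 unique words, in some arbitrary order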
def setofwords2vec(vocablist, inputset):
    returnvec = [0] * len(vocablist)    # all-zeros vector, one slot per vocabulary word
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)] = 1    # mark the word as present
        else:
            print "the word: %s is not in my vocabulary" % word
    return returnvec
if __name__ == '__main__':
    listoposts, listclasses = loaddataset()    # unpack the (postinglist, classvec) tuple
    myvocablist = createvocablist(listoposts)
    wordvec = setofwords2vec(myvocablist, listoposts[3])
    print myvocablist
    print wordvec
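# ---- Version 2: estimate P(word | class) and the class prior with trainnb0 ----
# The loading and vectorizing helpers are repeated below so that each version
# runs as a standalone script.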
from numpy import *
def loaddataset():
    postinglist = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classvec = [0, 1, 0, 1, 0, 1]
    return postinglist, classvec
def createvocablist(dataset):
    vocabset = set()
    for document in dataset:
        vocabset = vocabset | set(document)    # union of the two sets
    return list(vocabset)
def setofwords2vec(vocablist, inputset):
    returnvec = [0] * len(vocablist)    # all-zeros vector, one slot per vocabulary word
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)] = 1
        else:
            print "the word: %s is not in my vocabulary" % word
    return returnvec
def trainnb0(trainmatrix, traincategory):
    numtraindocs = len(trainmatrix)    # number of training documents
    numwords = len(trainmatrix[0])     # vocabulary size (length of each document vector)
    pabusive = sum(traincategory) / float(numtraindocs)    # prior P(class = 1)
    p0num = zeros(numwords)
    p1num = zeros(numwords)
    p0denom = 0.0; p1denom = 0.0
    for i in range(numtraindocs):
        if traincategory[i] == 1:
            p1num += trainmatrix[i]           # per-word counts within class 1
            p1denom += sum(trainmatrix[i])    # total words seen in class 1
        else:
            p0num += trainmatrix[i]
            p0denom += sum(trainmatrix[i])
    p1vect = p1num / p1denom    # P(word | class 1) for every vocabulary word
    p0vect = p0num / p0denom    # P(word | class 0)
    return p0vect, p1vect, pabusive
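# Worked numbers for the toy dataset above: classvec = [0,1,0,1,0,1], so
# pabusive = 3/6 = 0.5. The three abusive posts contain 8 + 5 + 6 = 19 words
# in total, and 'stupid' occurs once in each of them, so its entry in p1vect
# is 3/19 ~ 0.158, the largest P(word | abusive) in the vocabulary.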
if __name__ == '__main__':
    listoposts, listclasses = loaddataset()    # unpack the (postinglist, classvec) tuple
    myvocablist = createvocablist(listoposts)  # unique words across all documents
    trainmat = []
    for postindoc in listoposts:
        trainmat.append(setofwords2vec(myvocablist, postindoc))    # 0/1 vector per document
    p0v, p1v, pab = trainnb0(trainmat, listclasses)
    print myvocablist
    print trainmat
    print pab
    print p0v
    print p1v
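# ---- Version 3: two numerical fixes plus a classifier ----
# Version 2 has two practical problems: any word with a zero count makes the
# whole product of P(word | class) terms zero, and multiplying many small
# probabilities underflows to 0.0. Version 3 fixes both with Laplace smoothing
# (counts start at 1, denominators at 2) and by taking logs.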
'''
Created in March 2017
@author: yang
'''
from numpy import *
def loaddataset():
    postinglist = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classvec = [0, 1, 0, 1, 0, 1]
    return postinglist, classvec
def createvocablist(dataset):
    vocabset = set()
    for document in dataset:
        vocabset = vocabset | set(document)    # union of the two sets
    return list(vocabset)
def setofwords2vec(vocablist, inputset):
    returnvec = [0] * len(vocablist)    # all-zeros vector, one slot per vocabulary word
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)] = 1
        else:
            print "the word: %s is not in my vocabulary" % word
    return returnvec
def trainnb0(trainmatrix, traincategory):
    numtraindocs = len(trainmatrix)    # number of training documents
    numwords = len(trainmatrix[0])     # vocabulary size
    pabusive = sum(traincategory) / float(numtraindocs)    # prior P(class = 1)
    p0num = ones(numwords)             # Laplace smoothing: every count starts at 1
    p1num = ones(numwords)
    p0denom = 2.0; p1denom = 2.0       # ...and every denominator at 2
    for i in range(numtraindocs):
        if traincategory[i] == 1:
            p1num += trainmatrix[i]
            p1denom += sum(trainmatrix[i])
        else:
            p0num += trainmatrix[i]
            p0denom += sum(trainmatrix[i])
    p1vect = log(p1num / p1denom)      # log probabilities avoid underflow
    p0vect = log(p0num / p0denom)
    return p0vect, p1vect, pabusive
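# Effect of the smoothing on the worked example above: P('stupid' | abusive)
# becomes (3 + 1) / (19 + 2) = 4/21 ~ 0.190 before the log, and a word never
# seen in the abusive class gets 1/21 rather than an impossible 0.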
def classifynb(vec2classify, p0vec, p1vec, pclass1):
    # log P(word | class) summed over the words present, plus the log prior
    p1 = sum(vec2classify * p1vec) + log(pclass1)
    p0 = sum(vec2classify * p0vec) + log(1 - pclass1)
    if p1 > p0:
        return 1
    else:
        return 0
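# Why adding logs is the right comparison: Naive Bayes picks the class c
# maximizing P(c) * prod_i P(w_i | c); in log space that product becomes
# log P(c) + sum_i log P(w_i | c), which is exactly what p0 and p1 compute
# (multiplying by vec2classify zeroes out the words that are absent).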
def testingnb():
    listoposts, listclasses = loaddataset()    # unpack the (postinglist, classvec) tuple
    myvocablist = createvocablist(listoposts)  # unique words across all documents
    trainmat = []
    for postindoc in listoposts:
        trainmat.append(setofwords2vec(myvocablist, postindoc))    # 0/1 vector per document
    p0v, p1v, pab = trainnb0(trainmat, listclasses)
    testentry = ['love', 'my', 'dalmation']
    thisdoc = array(setofwords2vec(myvocablist, testentry))
    print testentry, 'classified as:', classifynb(thisdoc, p0v, p1v, pab)
    testentry = ['stupid', 'garbage']
    thisdoc = array(setofwords2vec(myvocablist, testentry))
    print testentry, 'classified as:', classifynb(thisdoc, p0v, p1v, pab)
if __name__ == '__main__':
    testingnb()
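# ---- Version 4: bag-of-words vectors and a spam-filter test ----
# Two changes from Version 3: the word-vector function now counts occurrences
# instead of recording presence, and spamtest() evaluates the classifier on
# real emails using a randomly chosen hold-out set.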
from numpy import *
def loaddataset():
    postinglist = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classvec = [0, 1, 0, 1, 0, 1]
    return postinglist, classvec
def createvocablist(dataset):
    vocabset = set()
    for document in dataset:
        vocabset = vocabset | set(document)    # union of the two sets
    return list(vocabset)
def setofwords2vec(vocablist, inputset):
    returnvec = [0] * len(vocablist)    # all-zeros vector, one slot per vocabulary word
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)] += 1    # bag-of-words: count occurrences
        else:
            print "the word: %s is not in my vocabulary" % word
    return returnvec
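# The function keeps its old name so the callers below still work, but this is
# now the bag-of-words model: ['dog', 'dog'] puts a 2 in the 'dog' slot, where
# the set-of-words version in the earlier listings stored only a 1.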
def trainnb0(trainmatrix, traincategory):
    numtraindocs = len(trainmatrix)    # number of training documents
    numwords = len(trainmatrix[0])     # vocabulary size
    pabusive = sum(traincategory) / float(numtraindocs)    # prior P(class = 1)
    p0num = ones(numwords)             # Laplace smoothing: every count starts at 1
    p1num = ones(numwords)
    p0denom = 2.0; p1denom = 2.0       # ...and every denominator at 2
    for i in range(numtraindocs):
        if traincategory[i] == 1:
            p1num += trainmatrix[i]
            p1denom += sum(trainmatrix[i])
        else:
            p0num += trainmatrix[i]
            p0denom += sum(trainmatrix[i])
    p1vect = log(p1num / p1denom)      # log probabilities avoid underflow
    p0vect = log(p0num / p0denom)
    return p0vect, p1vect, pabusive
def classifynb(vec2classify, p0vec, p1vec, pclass1):
    # log P(word | class) summed over the words present, plus the log prior
    p1 = sum(vec2classify * p1vec) + log(pclass1)
    p0 = sum(vec2classify * p0vec) + log(1 - pclass1)
    if p1 > p0:
        return 1
    else:
        return 0
def testingnb():
    listoposts, listclasses = loaddataset()
    myvocablist = createvocablist(listoposts)    # unique words across all documents
    trainmat = []
    for postindoc in listoposts:
        trainmat.append(setofwords2vec(myvocablist, postindoc))    # word-count vector per document
    p0v, p1v, pab = trainnb0(trainmat, listclasses)
    testentry = ['love', 'my', 'dalmation']
    thisdoc = array(setofwords2vec(myvocablist, testentry))
    print testentry, 'classified as:', classifynb(thisdoc, p0v, p1v, pab)
    testentry = ['stupid', 'garbage']
    thisdoc = array(setofwords2vec(myvocablist, testentry))
    print testentry, 'classified as:', classifynb(thisdoc, p0v, p1v, pab)
def textparse(bigstring):
    import re
    listoftokens = re.split(r'\W+', bigstring)    # split on runs of non-word characters
    return [tok.lower() for tok in listoftokens if len(tok) > 2]    # lowercase, drop short tokens
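# Example of the tokenizer (illustrative):
#   >>> textparse('This book is the best book on Python!')
#   ['this', 'book', 'the', 'best', 'book', 'python']
# 'is' and 'on' are dropped by the len > 2 filter, and everything is lowercased.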
def spamtest():
    doclist = []
    classlist = []
    fulltext = []
    for i in range(1, 26):    # 25 spam and 25 ham emails
        wordlist = textparse(open('/Users/enniu/Desktop/jqxx/machinelearninginaction/Ch04/email/spam/%d.txt' % i).read())
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(1)    # 1 = spam
        wordlist = textparse(open('/Users/enniu/Desktop/jqxx/machinelearninginaction/Ch04/email/ham/%d.txt' % i).read())
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(0)    # 0 = ham
    vocablist = createvocablist(doclist)
    trainingset = range(50)    # document indices 0..49 (a list in Python 2)
    testset = []
    for i in range(10):        # hold out 10 random documents for testing
        randindex = int(random.uniform(0, len(trainingset)))
        testset.append(trainingset[randindex])
        del trainingset[randindex]
    trainmat = []
    trainclasses = []
    for docindex in trainingset:
        trainmat.append(setofwords2vec(vocablist, doclist[docindex]))
        trainclasses.append(classlist[docindex])
    p0v, p1v, pspam = trainnb0(array(trainmat), array(trainclasses))
    errorcount = 0
    for docindex in testset:
        wordvector = setofwords2vec(vocablist, doclist[docindex])
        if classifynb(array(wordvector), p0v, p1v, pspam) != classlist[docindex]:
            errorcount += 1
    print 'the error rate is: ', float(errorcount) / len(testset)
    return float(errorcount) / len(testset)    # returned as well (an addition) so runs can be averaged
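# A small sketch, not in the original: because the 10 test documents are drawn
# at random, the printed error rate varies between runs; averaging several runs
# gives a steadier estimate. This relies on the return statement added to
# spamtest() above.
def averagespamtest(numtrials=10):
    totalerror = 0.0
    for _ in range(numtrials):
        totalerror += spamtest()
    print 'average error rate over', numtrials, 'runs:', totalerror / numtrials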
if __name__ == '__main__':
    spamtest()