Machine Learning in Action, Chapter 4: Naive Bayes

  • Building word vectors from text
def loaddataset():
    postinglist=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],\
    ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],\
    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],\
    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],\
    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],\
    ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classvec=[0,1,0,1,0,1]   #1 = abusive, 0 = not abusive
    return postinglist,classvec

def createvocablist(dataset):
    vocabset=set([])
    for document in dataset:
        vocabset=vocabset|set(document)   #union of the two sets
    return list(vocabset)

def setofwords2vec(vocablist,inputset):
    returnvec=[0]*len(vocablist)   #create a vector of all zeros
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)]=1
        else:
            print "the word: %s is not in my vocabulary" % word
    return returnvec

if __name__=='__main__':
    listoposts,listclasses=loaddataset()  #unpack the posts and their class labels
    myvocablist=createvocablist(listoposts)
    a=setofwords2vec(myvocablist,listoposts[3])   #word vector for the fourth post
    print myvocablist
    print a
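
To see what setofwords2vec produces, here is a minimal sketch using a hand-built three-word vocabulary (the vocabulary and inputs below are made up for illustration; the order of the real vocabulary depends on Python's set ordering):

vocab=['dog','stupid','my']
print setofwords2vec(vocab,['my','dog'])   #prints [1, 0, 1]
print setofwords2vec(vocab,['stupid'])     #prints [0, 1, 0]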

  • Computing probabilities from word vectors
from numpy import *
def loaddataset():
    postinglist=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],\
    ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],\
    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],\
    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],\
    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],\
    ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classvec=[0,1,0,1,0,1]   #1 = abusive, 0 = not abusive
    return postinglist,classvec

def createvocablist(dataset):
    vocabset=set([])
    for document in dataset:
        vocabset=vocabset|set(document)   #union of the two sets
    return list(vocabset)

def setofwords2vec(vocablist,inputset):
    returnvec=[0]*len(vocablist)   #create a vector of all zeros
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)]=1
        else:
            print "the word: %s is not in my vocabulary" % word
    return returnvec

def trainnb0(trainmatrix,traincategory):
    numtraindocs=len(trainmatrix)   #number of training documents
    numwords=len(trainmatrix[0])    #vocabulary size (length of each word vector)
    pabusive=sum(traincategory)/float(numtraindocs)   #prior probability of the abusive class
    p0num=zeros(numwords)   #raw counts; refined with smoothing in the next section
    p1num=zeros(numwords)
    p0denom=0.0 ; p1denom=0.0
    for i in range(numtraindocs):
        if traincategory[i]==1:
            p1num+=trainmatrix[i]
            p1denom+=sum(trainmatrix[i])
        else:
            p0num+=trainmatrix[i]
            p0denom+=sum(trainmatrix[i])
    p1vect=p1num/p1denom
    p0vect=p0num/p0denom
    return p0vect,p1vect,pabusive

if __name__=='__main__':
    listoposts,listclasses=loaddataset()  #unpack the posts and their class labels
    myvocablist=createvocablist(listoposts)   #unique words across all documents
    trainmat=[]
    for postindoc in listoposts:
        trainmat.append(setofwords2vec(myvocablist,postindoc)) #build the 0/1 word vector for each document
    p0v,p1v,pab=trainnb0(trainmat,listclasses)
    print myvocablist
    print trainmat
    print pab
    print p0v
    print p1v
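
The quantities returned by trainnb0 are exactly the ingredients of Bayes' rule. For a word vector \(\mathbf{w}\) and class \(c_i\), classification rests on

\[ p(c_i \mid \mathbf{w}) = \frac{p(\mathbf{w} \mid c_i)\,p(c_i)}{p(\mathbf{w})} \]

and the naive conditional-independence assumption factorizes the likelihood:

\[ p(\mathbf{w} \mid c_i) = \prod_j p(w_j \mid c_i) \]

Here p0v and p1v hold the per-word estimates \(p(w_j \mid c_i)\) for each class, and pab is the prior \(p(c_1)\).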

  • Testing the algorithm: modifying the classifier for real-world conditions
'''
Created in March 2017

@author: yang
'''

from numpy import *
def loaddataset():
    postinglist=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],\
    ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],\
    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],\
    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],\
    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],\
    ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classvec=[0,1,0,1,0,1]   #1 = abusive, 0 = not abusive
    return postinglist,classvec

def createvocablist(dataset):
    vocabset=set([])
    for document in dataset:
        vocabset=vocabset|set(document)   #union of the two sets
    return list(vocabset)

def setofwords2vec(vocablist,inputset):
    returnvec=[0]*len(vocablist)   #create a vector of all zeros
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)]=1
        else:
            print "the word: %s is not in my vocabulary" % word
    return returnvec

def trainnb0(trainmatrix,traincategory):
    numtraindocs=len(trainmatrix)   #number of training documents
    numwords=len(trainmatrix[0])    #vocabulary size (length of each word vector)
    pabusive=sum(traincategory)/float(numtraindocs)   #prior probability of the abusive class
    p0num=ones(numwords)   #initialize counts to 1 (Laplace smoothing)...
    p1num=ones(numwords)
    p0denom=2.0 ; p1denom=2.0   #...and denominators to 2, so no conditional probability is 0
    for i in range(numtraindocs):
        if traincategory[i]==1:
            p1num+=trainmatrix[i]
            p1denom+=sum(trainmatrix[i])
        else:
            p0num+=trainmatrix[i]
            p0denom+=sum(trainmatrix[i])
    p1vect=log(p1num/p1denom)   #take logs so later products become sums, avoiding underflow
    p0vect=log(p0num/p0denom)
    return p0vect,p1vect,pabusive

def classifynb(vec2classify,p0vec,p1vec,pclass1):
    p1=sum(vec2classify*p1vec)+log(pclass1)     #log p(w|c1)+log p(c1)
    p0=sum(vec2classify*p0vec)+log(1-pclass1)   #log p(w|c0)+log p(c0)
    if p1>p0:
        return 1
    else:
        return 0

def testingnb():
    listoposts,listclasses=loaddataset()  #unpack the posts and their class labels
    myvocablist=createvocablist(listoposts)   #unique words across all documents
    trainmat=[]
    for postindoc in listoposts:
        trainmat.append(setofwords2vec(myvocablist,postindoc)) #build the 0/1 word vector for each document
    p0v,p1v,pab=trainnb0(trainmat,listclasses)  
    testentry=['love','my','dalmation']
    thisdoc=array(setofwords2vec(myvocablist,testentry))
    print testentry,'classified as:',classifynb(thisdoc,p0v,p1v,pab)
    testentry=['stupid','garbage']
    thisdoc=array(setofwords2vec(myvocablist,testentry))
    print testentry,'classified as:',classifynb(thisdoc,p0v,p1v,pab)

if __name__=='__main__':
    testingnb()
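
Both modifications in trainnb0 address real numerical problems. Initializing the counts to ones (with denominators of 2.0) is Laplace smoothing: it keeps a word that never appeared in one class from forcing a whole product to 0. Taking logs prevents underflow when many small probabilities are multiplied. A minimal standalone sketch of the underflow problem (not part of the classifier):

from math import log
product=1.0
for i in range(200):
    product*=0.01                             #0.01**200 underflows to 0.0
logsum=sum(log(0.01) for i in range(200))     #the equivalent log-sum stays representable
print product,logsum                          #prints: 0.0 -921.034037198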

  • Testing the algorithm: cross-validation with naive Bayes
from numpy import *
def loaddataset():
    postinglist=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],\
    ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],\
    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],\
    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],\
    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],\
    ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classvec=[0,1,0,1,0,1]   #1 = abusive, 0 = not abusive
    return postinglist,classvec

def createvocablist(dataset):
    vocabset=set([])
    for document in dataset:
        vocabset=vocabset|set(document)   #union of the two sets
    return list(vocabset)

def setofwords2vec(vocablist,inputset):
    returnvec=[0]*len(vocablist)   #create a vector of all zeros
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)]+=1   #bag-of-words model: count occurrences instead of flagging presence
        else:
            print "the word: %s is not in my vocabulary" % word
    return returnvec

def trainnb0(trainmatrix,traincategory):
    numtraindocs=len(trainmatrix)   #number of training documents
    numwords=len(trainmatrix[0])    #vocabulary size (length of each word vector)
    pabusive=sum(traincategory)/float(numtraindocs)   #prior probability of the abusive class
    p0num=ones(numwords)   #initialize counts to 1 (Laplace smoothing)...
    p1num=ones(numwords)
    p0denom=2.0 ; p1denom=2.0   #...and denominators to 2, so no conditional probability is 0
    for i in range(numtraindocs):
        if traincategory[i]==1:
            p1num+=trainmatrix[i]
            p1denom+=sum(trainmatrix[i])
        else:
            p0num+=trainmatrix[i]
            p0denom+=sum(trainmatrix[i])
    p1vect=log(p1num/p1denom)   #take logs so later products become sums, avoiding underflow
    p0vect=log(p0num/p0denom)
    return p0vect,p1vect,pabusive

def classifynb(vec2classify,p0vec,p1vec,pclass1):
    p1=sum(vec2classify*p1vec)+log(pclass1)     #log p(w|c1)+log p(c1)
    p0=sum(vec2classify*p0vec)+log(1-pclass1)   #log p(w|c0)+log p(c0)
    if p1>p0:
        return 1
    else:
        return 0

def testingnb():
    listoposts,listclasses=loaddataset()  #unpack the posts and their class labels
    myvocablist=createvocablist(listoposts)   #unique words across all documents
    trainmat=[]
    for postindoc in listoposts:
        trainmat.append(setofwords2vec(myvocablist,postindoc)) #build the 0/1 word vector for each document
    p0v,p1v,pab=trainnb0(trainmat,listclasses)  
    testentry=['love','my','dalmation']
    thisdoc=array(setofwords2vec(myvocablist,testentry))
    print testentry,'classified as:',classifynb(thisdoc,p0v,p1v,pab)
    testentry=['stupid','garbage']
    thisdoc=array(setofwords2vec(myvocablist,testentry))
    print testentry,'classified as:',classifynb(thisdoc,p0v,p1v,pab)

def textparse(bigstring):
    import re
    listoftokens=re.split(r'\W+',bigstring)   #split on runs of non-word characters
    return [tok.lower() for tok in listoftokens if len(tok)>2]   #lowercase, drop tokens of 2 characters or fewer

def spamtest():
    doclist=[]
    classlist=[]
    fulltext=[]
    for i in range(1,26):   #load 25 spam and 25 ham emails
        wordlist=textparse(open('/Users/enniu/Desktop/jqxx/machinelearninginaction/Ch04/email/spam/%d.txt' % i).read())
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(1)   #1 = spam
        wordlist=textparse(open('/Users/enniu/Desktop/jqxx/machinelearninginaction/Ch04/email/ham/%d.txt' % i).read())
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(0)   #0 = ham
    vocablist=createvocablist(doclist)
    trainingset=range(50)   #indices of all 50 documents
    testset=[]
    for i in range(10):     #randomly hold out 10 documents for testing
        randindex=int(random.uniform(0,len(trainingset)))
        testset.append(trainingset[randindex])
        del (trainingset[randindex])
    trainmat=[]
    trainclasses=[]
    for docindex in trainingset:
        trainmat.append(setofwords2vec(vocablist,doclist[docindex]))
        trainclasses.append(classlist[docindex])
    p0v,p1v,pspam=trainnb0(array(trainmat),array(trainclasses))
    errorcount=0
    for docindex in testset:
        wordvector=setofwords2vec(vocablist,doclist[docindex])
        if classifynb(array(wordvector),p0v,p1v,pspam)!=classlist[docindex]:
            errorcount+=1
    print 'the error rate is: ',float(errorcount)/len(testset) 

if __name__=='__main__':
    spamtest()
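
Because spamtest holds out its 10 test documents at random, the reported error rate varies from run to run. Below is a minimal sketch for estimating the average error rate, assuming spamtest is modified to return float(errorcount)/len(testset) instead of printing it (that return statement is the one assumption here):

numruns=10
total=0.0
for k in range(numruns):
    total+=spamtest()   #assumes spamtest() returns the error rate
print 'average error rate over %d runs: %f' % (numruns,total/numruns)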
