Naive Bayes Algorithm

Principle

  • Choose the decision (class) with the highest probability.
  • "Naive" refers to the assumption that the features are conditionally independent of one another given the class, so no feature influences any other; the decision rule this yields is sketched below.
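
Concretely, for a sample with features $w_1,\dots,w_n$ the classifier picks

$$\hat{c} = \arg\max_{c} \; P(c)\prod_{i=1}^{n} P(w_i \mid c)$$

where the product factorization is exactly what the conditional-independence assumption provides.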

Advantages:

  • Remains effective with little training data and can handle multi-class problems. The conditional-independence assumption shrinks the number of parameters the model must estimate from exponential to linear in the number of features (for n binary features, roughly 2^n joint-probability parameters per class collapse to n per-feature ones), which greatly reduces memory consumption and computation time.

Disadvantages:

  • Sensitive to how the input data is prepared. Because training cannot take the relationships between features into account, the model performs poorly on classification tasks where the features are strongly correlated.

Applicable data types:

  • Nominal (categorical) data.

Text Classification with Python

#Load the sample documents (word lists) and their labels
def loadDataSet():
    postingList = [['my','dog','has','flea','problems','help','please'],
        ['maybe','not','take','him','to','dog','park','stupid'],
        ['my','dalmation','is','so','cute','I','love','him'],
        ['stop','posting','stupid','worthless','garbage'],
        ['mr','licks','ate','my','steak','how','to','stop','him'],
        ['quit','buying','worthless','dog','food','stupid']]
    classVecList = [0,1,0,1,0,1] #1 = abusive, 0 = not abusive
    return postingList,classVecList

#Build the vocabulary set
def createVocabSet(dataSet): #dataSet is the list of all documents' word lists
    vocabSet = set()
    for vocabList in dataSet:
        vocabSet = vocabSet | set(vocabList) #union with each document's words
    return list(vocabSet) #set -> list: list.index is used later, and sets have no index method

#setOfWords2Vec: build a sample's word vector (set-of-words model)
def setOfWords2Vec(vocabList,inputSet):
    returnVecList = [0]*len(vocabList) #shape(returnVecList): 1 x 32
    for word in inputSet:
        if word in vocabList:
            indexOfWordInVocabList = vocabList.index(word) #find the word's index in the vocabulary
            returnVecList[indexOfWordInVocabList] = 1 #set the corresponding position to 1
        else:
            print('the word : %s is not in my vocabulary!' %word)
    return returnVecList

#Build the training matrix
def createTrainMatrix(dataWordsList,classVecList):
    myVocabList = createVocabSet(dataWordsList)
    trainMat = []
    for i in dataWordsList:
        returnVecList = setOfWords2Vec(myVocabList,i)
        trainMat.append(returnVecList)
    '''
    print('Vocabulary:','\n',myVocabList)
    print('Document samples:','\n',dataWordsList)
    Vocabulary: 
     ['to', 'has', 'problems', 'dalmation', 'food', 'maybe', 'steak', 'licks', 'help', 'how', 'dog', 'ate', 'him', 'love', 'worthless', 'stupid', 'so', 'garbage', 'my', 'not', 'mr', 'is', 'quit', 'flea', 'posting', 'buying', 'please', 'park', 'stop', 'take', 'cute', 'I']
    Document samples: 
     [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], ['stop', 'posting', 'stupid', 'worthless', 'garbage'], ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]

    '''
    return trainMat,classVecList,myVocabList
dataWordsList,classVecList = loadDataSet()
trainMat,classVecList,myVocabList = createTrainMatrix(dataWordsList,classVecList)
#For each vocabulary position, check whether the word appears in the sample; set it to 1 if it does
print(trainMat,'\n',classVecList)
[[0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0]] 
 [0, 1, 0, 1, 0, 1]
from numpy import * #ones(), log() and array() below come from this wildcard import

def trainNB0(trainMatrix,trainClass):
    numTrainDocs = len(trainMatrix) #number of training documents
#     print(numTrainDocs) #6
    numWords = len(trainMatrix[0]) #vocabulary size
    pClass1 = sum(trainClass) / float(numTrainDocs) #p(c1): prior probability of class 1
    p0Num = ones(numWords) #initialize every word's count to 1 (Laplace smoothing): array([1., 1., ..., 1.])
    p1Num = ones(numWords) #a 1-D array of length 32
    p0Denom = float(numWords) #running total of words seen in class 0; starts at 32 because p0Num already credits each of the 32 vocabulary words with one occurrence
    p1Denom = float(numWords)
    for i in range(numTrainDocs):
        '''
        print(trainMatrix[i],trainClass[i])
        [0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0] 0
        [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0] 1
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1] 0
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0] 1
        [1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0] 0
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0] 1
        '''
        if trainClass[i] == 1:
            p1Num += trainMatrix[i] #count, for class 1, how often each vocabulary word appears
            p1Denom += sum(trainMatrix[i]) #total number of words seen in class 1
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    '''
    print(p1Num) #per-word counts in class 1 (each entry includes the +1 smoothing initialization)
    print(p0Num) #per-word counts in class 0
    [2. 1. 1. 1. 2. 2. 1. 1. 1. 1. 3. 1. 2. 1. 3. 4. 1. 2. 1. 2. 1. 1. 2. 1. 2. 2. 1. 2. 2. 2. 1. 1.]
    [2. 2. 2. 2. 1. 1. 2. 2. 2. 2. 2. 2. 3. 2. 1. 1. 2. 1. 4. 1. 2. 2. 1. 2. 1. 1. 2. 1. 2. 1. 2. 2.]
    '''
    p1Vect = log(p1Num/p1Denom) #log conditional probability of each word given class 1 (logs prevent floating-point underflow; see the sketch below)
    p0Vect = log(p0Num/p0Denom) #log conditional probability of each word given class 0
    return p0Vect,p1Vect,pClass1
p0Vect,p1Vect,pClass1 = trainNB0(trainMat,classVecList)
print('p0Vect: ','\n',p0Vect)
print('p1Vect: ','\n',p1Vect)
print('pClass1: ','\n',pClass1)
p0Vect:  
 [-3.33220451 -3.33220451 -3.33220451 -3.33220451 -4.02535169 -4.02535169
 -3.33220451 -3.33220451 -3.33220451 -3.33220451 -3.33220451 -3.33220451
 -2.9267394  -3.33220451 -4.02535169 -4.02535169 -3.33220451 -4.02535169
 -2.63905733 -4.02535169 -3.33220451 -3.33220451 -4.02535169 -3.33220451
 -4.02535169 -4.02535169 -3.33220451 -4.02535169 -3.33220451 -4.02535169
 -3.33220451 -3.33220451]
p1Vect:  
 [-3.23867845 -3.93182563 -3.93182563 -3.93182563 -3.23867845 -3.23867845
 -3.93182563 -3.93182563 -3.93182563 -3.93182563 -2.83321334 -3.93182563
 -3.23867845 -3.93182563 -2.83321334 -2.54553127 -3.93182563 -3.23867845
 -3.93182563 -3.23867845 -3.93182563 -3.93182563 -3.23867845 -3.93182563
 -3.23867845 -3.23867845 -3.93182563 -3.23867845 -3.23867845 -3.23867845
 -3.93182563 -3.93182563]
pClass1:  
 0.5
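
Why take logs? Multiplying many per-word probabilities, each well below 1, quickly underflows to 0.0 in double precision, while summing their logarithms stays representable. A minimal standalone illustration (not part of the classifier itself):

#Demonstrate underflow: (1e-5)**100 = 1e-500 is below the smallest positive
#double, so the running product collapses to exactly 0.0, while the log sum survives.
import math
probs = [1e-5] * 100
product = 1.0
for p in probs:
    product *= p
print(product)                         #0.0 (underflow)
print(sum(math.log(p) for p in probs)) #about -1151.29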

#Decide by the highest (log) posterior probability
def classifyNB(diff,p0Vec,p1Vec,pClass1): #diff is a single sample's word vector, e.g. [0,1,1,1,0]
    p1 = sum(diff*p1Vec)+log(pClass1) #log P(c1) plus the summed log P(word|c1) of the words present
    p0 = sum(diff*p0Vec)+log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

#Classify a test sample
def testingNB(testdiff):
    dataWordsList,classVecList = loadDataSet()
    trainMat,classVecList,myVocabList = createTrainMatrix(dataWordsList,classVecList) #build the feature matrix
    p0Vec,p1Vec,pClass1 = trainNB0(trainMat,classVecList) #train on the feature matrix
    diffVecList = setOfWords2Vec(myVocabList,testdiff) #turn the document into a feature vector
    res = classifyNB(diffVecList,p0Vec,p1Vec,pClass1) #decide by the highest probability
    return res
test1 = ['love','my','dalmation']
test2 = ['stupid','garbage']
print('The result with %s is %s ' %(test1,testingNB(test1)))
print('The result with %s is %s ' %(test2,testingNB(test2)))
The result with ['love', 'my', 'dalmation'] is 0 
The result with ['stupid', 'garbage'] is 1 

Set-of-Words Model & Bag-of-Words Model

#bagOfWords2Vec (bag-of-words model)
def bagOfWords2Vec(vocabList,inputSet):
    returnVecList = [0]*len(vocabList) #shape(returnVecList): 1 x 32
    for word in inputSet:
        if word in vocabList:
            indexOfWordInVocabList = vocabList.index(word) #find the word's index in the vocabulary
            returnVecList[indexOfWordInVocabList] += 1 #count occurrences at the corresponding position
        else:
            print('the word : %s is not in my vocabulary!' %word)
    return returnVecList

#setOfWords2Vec (set-of-words model)
def setOfWords2Vec(vocabList,inputSet):
    returnVecList = [0]*len(vocabList) #shape(returnVecList): 1 x 32
    for word in inputSet:
        if word in vocabList:
            indexOfWordInVocabList = vocabList.index(word) #find the word's index in the vocabulary
            returnVecList[indexOfWordInVocabList] = 1 #set the corresponding position to 1
        else:
            print('the word : %s is not in my vocabulary!' %word)
    return returnVecList
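
The only difference is "= 1" versus "+= 1": the set-of-words model records whether a word occurs, the bag-of-words model records how many times. A quick illustrative check with a made-up vocabulary and document (toy inputs, not drawn from the data set above):

vocab = ['dog','stupid','my']
doc = ['my','dog','dog','stupid']
print(setOfWords2Vec(vocab,doc)) #[1, 1, 1]
print(bagOfWords2Vec(vocab,doc)) #[2, 1, 1]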

Filtering Spam Email with Naive Bayes

#Parsing text
mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'
import re
listOfTokens = re.split(r'\W',mySent)
#['This', 'book', 'is', 'the', 'best', 'book', 'on', 'Python', 'or', 'M', 'L', '', 'I', 'have', 'ever', 'laid', 'eyes', 'upon', '']
listOfTokens = [tok.lower() for tok in listOfTokens if len(tok)>0]
#['this', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or', 'm', 'l', 'i', 'have', 'ever', 'laid', 'eyes', 'upon']
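
Splitting on a single non-word character leaves empty strings behind (filtered out by the length check above); splitting on runs of non-word characters avoids producing them in the first place:

listOfTokens = re.split(r'\W+',mySent) #r'\W+' merges consecutive delimiters, so no empty tokens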

#Read a file and parse it
emailText = open('../../Reference Code/Ch04/email/ham/7.txt').read()
listOfTokens = re.split(r'\W',emailText)
listOfTokens = [tok.lower() for tok in listOfTokens if len(tok)>0]
print(listOfTokens)
['zach', 'hamm', 'commented', 'on', 'your', 'status', 'zach', 'wrote', 'doggy', 'style', 'enough', 'said', 'thank', 'you', 'good', 'night']
#Tokenize a string
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W',bigString)
    return [tok.lower() for tok in listOfTokens if len(tok)>2] #the len(tok)>2 check drops short leftovers such as 'py' from URLs and file names
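
A quick check of textParse on the sentence from earlier (tokens of two characters or fewer, such as 'is', 'on' and 'M', are dropped):

print(textParse(mySent))
#['this', 'book', 'the', 'best', 'book', 'python', 'have', 'ever', 'laid', 'eyes', 'upon']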

#Build the sample set
def createTrainSet():
    fullText = [] #all words across all documents
    classList = [] #list of class labels
    docList = [] #list of documents
    for i in range(1,26):
        #build the spam part of the training set
        wordsList = open('../../Reference Code/Ch04/email/spam/%d.txt' %i).read() #read a spam email
        sample = textParse(wordsList)
        docList.append(sample) #append each sample to docList
        fullText.extend(sample)
        classList.append(1)
        
        #build the ham (non-spam) part of the training set
        wordsList = open('../../Reference Code/Ch04/email/ham/%d.txt' %i).read() #read a ham email
        sample = textParse(wordsList)
        docList.append(sample) #append each sample to docList
        fullText.extend(sample)
        classList.append(0)
    return fullText,docList,classList

#Randomly hold out 10 samples as the test set; the remaining samples form the training set
import random
def splitTestTrainSet():
    fullText,docList,classList = createTrainSet() #build the sample set
    vocabList = createVocabSet(docList) #build the vocabulary
    
    #randomly pick 10 samples as the test set by building a list of test indices
    testSetIndex = []; trainSetIndex = list(range(len(docList)))
    randIndex = random.sample(range(len(docList)),10) #sampling without replacement: 10 distinct integers in [0, 50)
#   randIndex = int(random.uniform(0,len(docList))) #this draws with replacement; after del(index), drawing an already deleted index would raise an error
    for i in randIndex:
        testSetIndex.append(trainSetIndex[i]) #add the sample's index to the test set
    
    #the remaining samples form the training set
    trainMatrix = [];classTrain = []
    for i in trainSetIndex:
        if i not in testSetIndex: #skip the test-sample indices
            diffVec = setOfWords2Vec(vocabList,docList[i]) #sample vector
            trainMatrix.append(diffVec) #sample matrix
            classTrain.append(classList[i]) #sample label
    
    #train on the training samples
    p0Vec,p1Vec,pClass1 = trainNB0(trainMatrix,classTrain)
    
    #tally the results on the test set
    error = 0
    for testIndex in testSetIndex:
        diffVecList = setOfWords2Vec(vocabList,docList[testIndex]) #turn the document into a feature vector
        res = classifyNB(diffVecList,p0Vec,p1Vec,pClass1) #decide by the highest probability
        if res != classList[testIndex]: #compare against this test sample's true label
            error += 1
            print('classification error:',docList[testIndex])
            print('the classify result is:',res,',but the real result is:',classList[testIndex])
    print(error)
    print(len(testSetIndex))
    print('the error rate is:',error/len(testSetIndex))
    return error/len(testSetIndex) #returned so repeated runs can be averaged (see below)
splitTestTrainSet()
classification error: ['get', 'off', 'online', 'watchesstore', 'discount', 'watches', 'for', 'all', 'famous', 'brands', 'watches', 'arolexbvlgari', 'dior', 'hermes', 'oris', 'cartier', 'and', 'more', 'brands', 'louis', 'vuitton', 'bags', 'wallets', 'gucci', 'bags', 'tiffany', 'jewerly', 'enjoy', 'full', 'year', 'warranty', 'shipment', 'via', 'reputable', 'courier', 'fedex', 'ups', 'dhl', 'and', 'ems', 'speedpost', 'you', 'will', '100', 'recieve', 'your', 'order']
the classify result is: 1 ,but the real result is: 0
classification error: ['ordercializviagra', 'online', 'save', '0nline', 'pharmacy', 'noprescription', 'required', 'buy', 'canadian', 'drugs', 'wholesale', 'prices', 'and', 'save', 'fda', 'approved', 'drugs', 'superb', 'quality', 'drugs', 'only', 'accept', 'all', 'major', 'credit', 'cards']
the classify result is: 1 ,but the real result is: 0
classification error: ['get', 'off', 'online', 'watchesstore', 'discount', 'watches', 'for', 'all', 'famous', 'brands', 'watches', 'arolexbvlgari', 'dior', 'hermes', 'oris', 'cartier', 'and', 'more', 'brands', 'louis', 'vuitton', 'bags', 'wallets', 'gucci', 'bags', 'tiffany', 'jewerly', 'enjoy', 'full', 'year', 'warranty', 'shipment', 'via', 'reputable', 'courier', 'fedex', 'ups', 'dhl', 'and', 'ems', 'speedpost', 'you', 'will', '100', 'recieve', 'your', 'order']
the classify result is: 1 ,but the real result is: 0
3
10
the error rate is: 0.3
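
Because the 10 test samples are drawn at random, the hold-out error rate fluctuates from run to run. A small sketch for a steadier estimate, averaging the rate that splitTestTrainSet returns over repeated random splits:

#Average the hold-out error rate over several random splits
rates = [splitTestTrainSet() for _ in range(10)]
print('average error rate:',sum(rates)/len(rates))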
'''
Reference: Statistical Learning Methods (Li Hang), p. 50.
From the training data in the table below, learn a naive Bayes classifier and determine the class label y of x=(2,S). X1 and X2 are the features.
X1= [1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3]
X2=['S','L','M','M','S','L','S','S','L','L','M','M','L','S','M','M']
y=[-1,1,1,-1,-1,1,1,-1,1,-1,1,1,1,1,-1,1]
'''
X1= [1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3]
X2=['S','L','M','M','S','L','S','S','L','L','M','M','L','S','M','M']
y=[-1,1,1,-1,-1,1,1,-1,1,-1,1,1,1,1,-1,1]
import pandas as pd
df = pd.DataFrame(array([X1,X2,y]).T,columns=['X1','X2','y']) #array() upcasts the mixed columns to strings, hence the string comparisons ('1', '-1', 'S', ...) below

#split the rows by y=1 versus y=-1
y_1 = df[df['y']=='1']
y_0 = df[df['y']=='-1']

#rows with each X1 and X2 value, given y=1
x1_1_y1 = y_1[y_1['X1']=='1']
x1_2_y1 = y_1[y_1['X1']=='2']
x1_3_y1 = y_1[y_1['X1']=='3']
x2_M_y1 = y_1[y_1['X2']=='M']
x2_S_y1 = y_1[y_1['X2']=='S']
x2_L_y1 = y_1[y_1['X2']=='L']

#rows with each X1 and X2 value, given y=-1
x1_1_y0 = y_0[y_0['X1']=='1']
x1_2_y0 = y_0[y_0['X1']=='2']
x1_3_y0 = y_0[y_0['X1']=='3']
x2_M_y0 = y_0[y_0['X2']=='M']
x2_S_y0 = y_0[y_0['X2']=='S']
x2_L_y0 = y_0[y_0['X2']=='L']

#prior probabilities of y=1 and y=-1
p_y_1 = len(y_1)/len(df)
p_y_0 = len(y_0)/len(df)

#conditional probabilities of X1 and X2 given y=1
p_x1_1_y1 = len(x1_1_y1) / len(y_1)
p_x1_2_y1 = len(x1_2_y1) / len(y_1)
p_x1_3_y1 = len(x1_3_y1) / len(y_1)
p_x2_M_y1 = len(x2_M_y1) / len(y_1)
p_x2_S_y1 = len(x2_S_y1) / len(y_1)
p_x2_L_y1 = len(x2_L_y1) / len(y_1)

#conditional probabilities of X1 and X2 given y=-1
p_x1_1_y0 = len(x1_1_y0) / len(y_0)
p_x1_2_y0 = len(x1_2_y0) / len(y_0)
p_x1_3_y0 = len(x1_3_y0) / len(y_0)
p_x2_M_y0 = len(x2_M_y0) / len(y_0)
p_x2_S_y0 = len(x2_S_y0) / len(y_0)
p_x2_L_y0 = len(x2_L_y0) / len(y_0)

#x=(2,S)
#compute the score of x belonging to class 1
p_x_1 = p_y_1*p_x1_2_y1*p_x2_S_y1

#compute the score of x belonging to class -1
p_x_0 = p_y_0*p_x1_2_y0*p_x2_S_y0

print('the probability of 1:',p_x_1)
print('the probability of -1:',p_x_0)
if p_x_1>p_x_0:
    print('x=(2,S) belongs to 1')
else:
    print('x=(2,S) belongs to -1')
the probability of 1: 0.05
the probability of -1: 0.0625
x=(2,S) belongs to -1
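
The estimates above are maximum-likelihood, so a feature value never observed with a class would zero out the whole product. The textbook's follow-up uses Bayesian estimation (Laplace smoothing) with lambda=1; a sketch of the same query with smoothing, reusing the counts computed above (each feature takes 3 distinct values, and there are 2 classes):

lam = 1.0
p_y1_s = (len(y_1)+lam) / (len(df)+2*lam) #smoothed prior P(y=1)
p_y0_s = (len(y_0)+lam) / (len(df)+2*lam) #smoothed prior P(y=-1)
p_x1_2_y1_s = (len(x1_2_y1)+lam) / (len(y_1)+3*lam) #smoothed P(X1=2|y=1)
p_x1_2_y0_s = (len(x1_2_y0)+lam) / (len(y_0)+3*lam) #smoothed P(X1=2|y=-1)
p_x2_S_y1_s = (len(x2_S_y1)+lam) / (len(y_1)+3*lam) #smoothed P(X2=S|y=1)
p_x2_S_y0_s = (len(x2_S_y0)+lam) / (len(y_0)+3*lam) #smoothed P(X2=S|y=-1)
print('smoothed score of 1:',p_y1_s*p_x1_2_y1_s*p_x2_S_y1_s)
print('smoothed score of -1:',p_y0_s*p_x1_2_y0_s*p_x2_S_y0_s)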

Predicting News Article Categories with Naive Bayes

#Load the data
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all')
print(len(news.data))
print(news.data[0])
18846
From: Mamatha Devineni Ratnam 
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!
#Split the data
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test = train_test_split(news.data,news.target,test_size=.25,random_state=33)

#Turn the text into feature vectors
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer() #word-count features: the same bag-of-words idea as bagOfWords2Vec above
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)
#Naive Bayes model: MultinomialNB suits the word-count features produced above
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train,y_train)
y_predict = mnb.predict(X_test)

#Evaluate the model
from sklearn.metrics import classification_report
print('The accuracy of Naive Bayes Classifier is:',mnb.score(X_test,y_test))
print(classification_report(y_test,y_predict,target_names=news.target_names))
The accuracy of Naive Bayes Classifier is: 0.8397707979626485
                          precision    recall  f1-score   support

             alt.atheism       0.86      0.86      0.86       201
           comp.graphics       0.59      0.86      0.70       250
 comp.os.ms-windows.misc       0.89      0.10      0.17       248
comp.sys.ibm.pc.hardware       0.60      0.88      0.72       240
   comp.sys.mac.hardware       0.93      0.78      0.85       242
          comp.windows.x       0.82      0.84      0.83       263
            misc.forsale       0.91      0.70      0.79       257
               rec.autos       0.89      0.89      0.89       238
         rec.motorcycles       0.98      0.92      0.95       276
      rec.sport.baseball       0.98      0.91      0.95       251
        rec.sport.hockey       0.93      0.99      0.96       233
               sci.crypt       0.86      0.98      0.91       238
         sci.electronics       0.85      0.88      0.86       249
                 sci.med       0.92      0.94      0.93       245
               sci.space       0.89      0.96      0.92       221
  soc.religion.christian       0.78      0.96      0.86       232
      talk.politics.guns       0.88      0.96      0.92       251
   talk.politics.mideast       0.90      0.98      0.94       231
      talk.politics.misc       0.79      0.89      0.84       188
      talk.religion.misc       0.93      0.44      0.60       158

             avg / total       0.86      0.84      0.82      4712
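
A quick usage sketch with the fitted vectorizer and model from above (the input sentence is made up for illustration; the printed category is whatever the model predicts and is not verified here):

sample = ['The Pens beat the Devils in the playoffs last night']
predicted = mnb.predict(vec.transform(sample))[0]
print(news.target_names[predicted])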
