《机器学习实战》---朴素贝叶斯分类器进行文本分类

朴素贝叶斯分类器:假设特征之间相互独立

另外,文档转词向量有两种模型:词集模型(set-of-words)只记录每个词是否在文档中出现,不统计出现次数,词向量的每个元素只取 0 或 1,对应伯努利模型;词袋模型(bag-of-words)记录每个词在文档中出现的次数,对应多项式模型。

code:https://github.com/apachecn/AiLearning/blob/master/src/py2.x/ml/4.NaiveBayes/bayes.py

import numpy as np

def loadDataSet():
    """Return the built-in toy corpus used by the demo.

    :return: ``(postingList, classVec)`` where ``postingList`` is a list of
             six tokenised posts (each a list of words) and ``classVec`` is
             the list of their 0/1 class labels, in the same order.
             NOTE(review): presumably 1 marks an abusive post, as in the
             book's example -- confirm.
    """
    # Keep every post next to its label so the pairing is obvious.
    labelled_posts = [
        (['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 0),
        (['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 1),
        (['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 0),
        (['stop', 'posting', 'stupid', 'worthless', 'garbage'], 1),
        (['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 0),
        (['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'], 1),
    ]
    postingList = [tokens for tokens, _ in labelled_posts]
    classVec = [label for _, label in labelled_posts]
    return postingList, classVec


def createVocabList(dataSet):
    """Collect every distinct token that occurs in any document.

    :param dataSet: iterable of documents, each an iterable of hashable tokens
    :return: list holding each distinct token exactly once; the order comes
             from set iteration and is therefore unspecified
    """
    unique_words = set()
    for document in dataSet:
        # Grow the one set in place; duplicates are dropped by the set.
        unique_words.update(document)
    return list(unique_words)

def setOfWords2Vec(vocabList,inputSet):
    """Encode a document as a set-of-words (presence/absence) vector.

    :param vocabList: sequence of vocabulary words; element ``i`` of the
                      result corresponds to ``vocabList[i]``
    :param inputSet: iterable of hashable tokens making up the document
    :return: list of ``len(vocabList)`` ints, 1 where the vocabulary word
             occurs at least once in ``inputSet`` and 0 elsewhere; tokens
             that are not in the vocabulary are ignored
    """
    # Build the word -> position lookup once, so each token costs one dict
    # lookup instead of two linear scans of the vocabulary.  setdefault keeps
    # the first position of a repeated word, matching list.index().
    position_of = {}
    for position, word in enumerate(vocabList):
        position_of.setdefault(word, position)

    returnVec = [0]*len(vocabList)
    for word in inputSet:
        position = position_of.get(word)
        if position is not None:
            returnVec[position] = 1
    return returnVec

def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a bag-of-words (occurrence count) vector.

    :param vocabList: sequence of vocabulary words; element ``i`` of the
                      result corresponds to ``vocabList[i]``
    :param inputSet: iterable of hashable tokens making up the document
    :return: list of ``len(vocabList)`` ints giving how many times each
             vocabulary word occurs in ``inputSet``; tokens that are not in
             the vocabulary are ignored
    """
    # Build the word -> position lookup once, so each token costs one dict
    # lookup instead of two linear scans of the vocabulary.  setdefault keeps
    # the first position of a repeated word, matching list.index().
    position_of = {}
    for position, word in enumerate(vocabList):
        position_of.setdefault(word, position)

    returnVec = [0]*len(vocabList)
    for word in inputSet:
        position = position_of.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec

def trainNB(trainMatirx,trainCategory):
    """Estimate the naive Bayes parameters from word vectors and labels.

    :param trainMatirx: the documents converted to word vectors, one
                        equal-length row per document
    :param trainCategory: the class of each document; a label equal to 0
                          goes to class 0, any other value to class 1
    :return: ``(p0Vec, p1Vec, pA)`` -- the per-word log conditional
             probabilities for class 0 and class 1, and
             ``sum(trainCategory) / number of documents`` as the prior of
             class 1
    """
    doc_count = len(trainMatirx)
    vocab_size = len(trainMatirx[0])   # every row has one slot per vocabulary word

    # (1) prior of class 1
    pA = sum(trainCategory) / float(doc_count)

    # (2) per-class accumulators, indexed by class (0 or 1).
    # Word counts start at 1 and the totals at 2.0, so a word never seen in a
    # class still has a non-zero ratio and np.log() stays finite.
    word_counts = [np.ones(vocab_size), np.ones(vocab_size)]
    word_totals = [2.0, 2.0]

    for row, doc_vector in enumerate(trainMatirx):
        bucket = 0 if trainCategory[row] == 0 else 1
        word_counts[bucket] += doc_vector
        word_totals[bucket] += sum(doc_vector)

    # Return logs so the classifier can add terms instead of multiplying them.
    p0Vec = np.log(word_counts[0] / word_totals[0])
    p1Vec = np.log(word_counts[1] / word_totals[1])

    return p0Vec, p1Vec, pA


def classifyNB(vec2Classify,p0Vec,p1Vec,pA):
    """Pick the class with the larger log-posterior score.

    :param vec2Classify: word vector such as ``[1, 0, 0, 1, ...]``; it is
                         multiplied element-wise with ``p1Vec`` and ``p0Vec``,
                         so only words present in the sample contribute
    :param p0Vec: per-word log conditional probabilities for class 0
    :param p1Vec: per-word log conditional probabilities for class 1
    :param pA: prior probability of class 1
    :return: 1 when the class-1 score is strictly greater than the class-0
             score, otherwise 0 (a tie yields 0)
    """
    # Probabilities are in log space, so the product of terms becomes a sum.
    evidence_class1 = np.sum(vec2Classify*p1Vec)
    score_class1 = evidence_class1 + np.log(pA)

    evidence_class0 = np.sum(vec2Classify*p0Vec)
    score_class0 = evidence_class0 + np.log(1-pA)

    return 1 if score_class1 > score_class0 else 0

def testingNB():
    """Train on the built-in corpus and print the class of one sample post.

    Prints the vocabulary, the trained log-probability vectors and prior,
    and finally the predicted class for ``['stupid', 'garbage']``.
    """
    documents, categories = loadDataSet()
    vocabulary = createVocabList(documents)

    print("set: ")
    print(vocabulary)
    print("----------------")

    # One set-of-words vector per training document.
    training_vectors = [setOfWords2Vec(vocabulary, doc) for doc in documents]
    p0_log, p1_log, prior = trainNB(training_vectors, categories)

    print("template: ")
    print("p0: ",p0_log)
    print("p1: ",p1_log)
    print("pa: ",prior)
    print("----------------------------------------------------")

    # Another sample to try: ['love', 'my', 'dalmation']
    sample = ['stupid', 'garbage']
    sample_vector = np.array(setOfWords2Vec(vocabulary, sample))
    predicted = classifyNB(sample_vector, p0_log, p1_log, prior)
    print(sample,"classified as: ",predicted)




# Run the demo only when this file is executed as a script, not on import.
if __name__ =="__main__":
    testingNB()





你可能感兴趣的:(“机器学习实战”)