朴素贝叶斯——完整代码

# -*- coding: utf-8 -*-

from numpy import *

def loadDataSet():
    """Return the built-in toy corpus of message-board posts with labels.

    Returns:
        A ``(postingList, classVec)`` tuple. ``postingList`` is a list of
        six posts, each already split into a list of word strings.
        ``classVec`` holds one integer label per post, in the same order
        (1 is the class the training code treats as abusive, 0 is normal).
    """
    rawPosts = (
        'my dog has flea problems help please',
        'maybe not take him to dog park stupid',
        'my dalmation is so cute I love him',
        'stop posting stupid worthless garbage',
        'mr licks ate my steak how to stop him',
        'quit buying worthless dog food stupid',
    )
    postingList = [post.split() for post in rawPosts]
    # Labels alternate: normal, abusive, normal, abusive, ...
    classVec = [0, 1] * 3
    return postingList, classVec

def createVocaList(dataSet):
    """Collect every distinct token that occurs anywhere in ``dataSet``.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing each distinct token exactly once. The order is
        whatever set iteration yields, so callers must not rely on it.
    """
    vocabulary = set()
    for tokens in dataSet:
        vocabulary.update(tokens)
    return list(vocabulary)

def setOfWordds2Vec(vocabList,inputSet):
    """Convert a tokenised document into a word-count vector over ``vocabList``.

    Args:
        vocabList: list of vocabulary words (hashable); position ``i`` of the
            result corresponds to ``vocabList[i]``.
        inputSet: iterable of words making up one document.

    Returns:
        A list of ints the same length as ``vocabList``. Each entry is the
        number of times that vocabulary word occurs in ``inputSet``. Words
        missing from the vocabulary are reported on stdout and skipped.
    """
    # Build the word -> position map once so each lookup is O(1) instead of
    # scanning the vocabulary twice per word. setdefault keeps the FIRST
    # position of a repeated word, matching list.index().
    firstIndex = {}
    for position, vocabWord in enumerate(vocabList):
        firstIndex.setdefault(vocabWord, position)

    returnVec = [0]*len(vocabList)
    for word in inputSet:
        position = firstIndex.get(word)
        if position is None:
            # Single-argument call form prints the same text on Python 2 and 3.
            print("The word:%s is not in my Vocabulary!" % word)
        else:
            returnVec[position] += 1
    return returnVec

def trainNB0(trainMatrix,trainCategory):
    """Estimate naive Bayes parameters from word-count vectors.

    Args:
        trainMatrix: sequence of equal-length numeric word vectors, one row
            per training document.
        trainCategory: sequence of labels, one per row. A label equal to 1
            selects class 1; any other value is counted as class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the per-word log conditional
        probabilities for class 0 and class 1, and the prior of class 1
        (sum of the labels divided by the number of documents).
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    # Share of class-1 (abusive) documents among all training documents.
    pAbusive = sum(trainCategory)/float(docCount)

    # Slot 0 belongs to class 0, slot 1 to class 1. Word counts start at 1
    # and the denominators at 2.0, so a word never seen in a class does not
    # produce a zero probability.
    wordCounts = [ones(vocabSize), ones(vocabSize)]
    tokenTotals = [2.0, 2.0]
    for rowIndex, wordVector in enumerate(trainMatrix):
        slot = 1 if trainCategory[rowIndex] == 1 else 0
        # Accumulate the per-word counts for this document's class ...
        wordCounts[slot] += wordVector
        # ... and the total number of words seen in that class.
        tokenTotals[slot] += sum(wordVector)

    # Logs of the conditional probabilities; classifyNB adds these up.
    p1Vect = log(wordCounts[1]/tokenTotals[1])
    p0Vect = log(wordCounts[0]/tokenTotals[0])
    return p0Vect,p1Vect,pAbusive

def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    """Pick the more likely class for one word vector.

    Args:
        vec2Classify: numeric word vector of the document to classify.
        p0Vec: per-word log conditional probabilities for class 0.
        p1Vec: per-word log conditional probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 when the class-1 log score is strictly greater than the class-0
        log score, otherwise 0 (ties go to class 0).
    """
    # Each score is the log prior plus the summed log likelihoods of the
    # words present in the document.
    scoreClass1 = sum(vec2Classify * p1Vec) + log(pClass1)
    scoreClass0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if scoreClass1 > scoreClass0 else 0

def testingNB():
    """Train on the toy corpus and print the predicted class of two sample posts.

    Builds the vocabulary and training matrix from ``loadDataSet()``, fits
    the model with ``trainNB0`` and prints the classification of two
    hard-coded test entries. Returns nothing.
    """
    listOPosts,listClasses = loadDataSet()
    myVocabList = createVocaList(listOPosts)
    trainMat = [setOfWordds2Vec(myVocabList,postinDoc) for postinDoc in listOPosts]
    p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses))

    # The prints below use a single pre-formatted argument so the output is
    # the same on Python 2 and Python 3 (the print statement form is a
    # syntax error on Python 3).
    testEntry = ['love','my','dalmation']
    thisDoc = array(setOfWordds2Vec(myVocabList,testEntry))
    print(thisDoc)
    print("%s classified as: %s" % (testEntry, classifyNB(thisDoc,p0V,p1V,pAb)))

    testEntry = ['stupid','garbage']
    thisDoc = array(setOfWordds2Vec(myVocabList,testEntry))
    print("%s classified as: %s" % (testEntry, classifyNB(thisDoc,p0V,p1V,pAb)))

#数据切分
def split_test():
    """Demonstrate regex tokenisation on a sample sentence and one ham e-mail.

    Reads ``email/ham/6.txt`` (relative to the working directory), splits it
    on runs of non-word characters, lower-cases the tokens and prints the
    resulting list. Returns nothing.
    """
    import re

    mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'
    # str.split() would cut on whitespace only and leave punctuation attached
    # to the words; a regular expression strips it.
    # Note the upper-case W (non-word characters). The quantifier is '+', not
    # '*': a pattern that can match the empty string makes re.split cut
    # between every character on Python 3.7+.
    regEx = re.compile(r'\W+')
    listOfTokens0 = regEx.split(mySent)
    # Splitting can leave empty strings at the ends; drop them.
    listOfTokens1 = [tok for tok in listOfTokens0 if len(tok) > 0]
    # Same tokens, lower-cased. These sentence tokens are kept for
    # illustration only; nothing below uses them.
    listOfTokens2 = [tok.lower() for tok in listOfTokens1]

    # Close the file deterministically instead of leaking the handle.
    with open('email/ham/6.txt') as emailFile:
        emailText = emailFile.read()
    emailham0 = regEx.split(emailText)
    emailham1 = [tok.lower() for tok in emailham0 if len(tok) > 0]
    print(emailham1)

def textParse(bigString):
    """Split text into lower-cased word tokens longer than two characters.

    Args:
        bigString: the text to tokenise.

    Returns:
        A list of lower-cased tokens, in order of appearance, obtained by
        splitting on runs of non-word characters and discarding tokens of
        length two or less.
    """
    import re
    # Use '\W+' rather than '\W*': a pattern that can match the empty string
    # makes re.split cut between every character on Python 3.7+, so every
    # token would be one character long and get filtered out below.
    listOfTokens = re.split(r'\W+',bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

#文件解析及完整的垃圾邮件测试函数
def spamTest():
    """Run one random hold-out evaluation of the spam classifier.

    Reads ``email/spam/1.txt`` .. ``25.txt`` and ``email/ham/1.txt`` ..
    ``25.txt`` (relative to the working directory), holds out 10 randomly
    chosen documents as the test set, trains on the rest with ``trainNB0``
    and prints the fraction of test documents that were misclassified.
    Returns nothing.
    """
    docList = []; classList = []
    # (path template, label) pairs: spam is class 1, ham is class 0.
    sources = (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0))
    for i in range(1,26):
        for pathTemplate, label in sources:
            # 'with' closes each file promptly instead of leaking 50 handles.
            with open(pathTemplate % i) as emailFile:
                wordList = textParse(emailFile.read())
            docList.append(wordList)
            classList.append(label)

    # List of the distinct words across all documents (order is arbitrary).
    vocabList = createVocaList(docList)

    # Indices of all documents. This must be a real list: Python 3's range
    # object is immutable, so entries could not be removed from it.
    trainingSet = list(range(len(docList))); testSet = []
    for _ in range(10):
        # Random position in the remaining training indices. 'random' is
        # expected to be numpy.random, brought in by the file's
        # 'from numpy import *'.
        randIndex = int(random.uniform(0,len(trainingSet)))
        # Move the chosen index from the training set to the test set.
        testSet.append(trainingSet.pop(randIndex))

    # Training set: fit p0V, p1V, pSpam.
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWordds2Vec(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))

    # Test set: count misclassified documents.
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWordds2Vec(vocabList,docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
    # Single pre-formatted argument: same output on Python 2 and Python 3.
    print('the error rate is: %s' % (float(errorCount)/len(testSet)))

你可能感兴趣的:(朴素贝叶斯)