ML基础-朴素贝叶斯-3-使用朴素贝叶斯进行交叉验证

交叉验证

#
# 接受一个大字符串并将其解析为字符串列表。
# 该函数去掉长度不超过两个字符的字符串,并将所有字符串转换为小写。
#  

def textParse(bigString):  # input is big string, #output is word list
    """Split a large string into a list of lowercase word tokens.

    The text is split on runs of non-word characters (anything other than
    letters, digits and underscore). Tokens of two characters or fewer are
    dropped and the remaining tokens are lowercased.

    Args:
        bigString: The raw text to tokenize.

    Returns:
        A list of lowercase tokens, each longer than two characters.
    """
    import re
    # Use `+`, not `*`: since Python 3.7 re.split also splits on empty
    # matches, so r'\W*' would cut between every character and leave only
    # single-character tokens, all of which the length filter discards.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


def spamTest():
    """Run one random hold-out evaluation of the naive Bayes spam classifier.

    Reads 25 spam messages (``email/spam/1.txt`` .. ``email/spam/25.txt``,
    labelled 1) and 25 ham messages (``email/ham/1.txt`` ..
    ``email/ham/25.txt``, labelled 0). Ten of the 50 documents are drawn at
    random as the test set and the classifier is trained on the remaining 40.
    Every misclassified document is printed, followed by the error rate on
    the held-out documents.

    The function takes no arguments and returns nothing; results are printed.
    It relies on ``textParse``, ``createVocabList``, ``bagOfWords2VecMN``,
    ``trainNB0``, ``classifyNB``, ``array`` and ``random`` being available in
    the enclosing module.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Spam message -> class label 1.
        print('111111111   %d ' % i)
        # `with` guarantees the file handle is closed after reading.
        with open('email/spam/%d.txt' % i, 'r', encoding='utf-8') as spamFile:
            wordList = textParse(spamFile.read())
        docList.append(wordList)
        classList.append(1)
        # Ham (legitimate) message -> class label 0.
        print('00000000   %d ' % i)
        with open('email/ham/%d.txt' % i, 'r', encoding='utf-8') as hamFile:
            wordList = textParse(hamFile.read())
        docList.append(wordList)
        classList.append(0)

    # Build the vocabulary from all documents.
    vocabList = createVocabList(docList)

    # Randomly move 10 document indices from the training set to the test set.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        # randrange yields an integer index directly, with no float truncation.
        randIndex = random.randrange(len(trainingSet))
        # pop removes the index from the training set so it cannot be drawn twice.
        testSet.append(trainingSet.pop(randIndex))

    # Train the classifier on the word vectors of the remaining documents.
    trainMat = [bagOfWords2VecMN(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    # Classify each held-out document and count the mismatches.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))

在textParse函数中,\W 表示非单词字符,即除字母、数字和下划线之外的字符;* 表示0个或多个。re.split(r'\W*', bigString) 的意思是在所有由连续非单词字符组成的子串处把 bigString“切分开”,其间不论有多少空格、换行符等,统统一刀切。(注意:在 Python 3.7 及以上版本中,\W* 可以匹配空串,re.split 会在每个字符之间都进行切分,此时应改用 \W+。)然后过滤掉长度不超过2的字符串(包括空串),再把所有字母变成小写。

python 函数

  • del
# `del` removes an item from a list in place. It does not hand back the
# removed value, so read the element first if it is still needed.
li = [1, 2, 3, 4, 5]
first = li[0]  # keep the head element before it is deleted
del li[0]  # the list now starts at the former second element

print(li)  # [2, 3, 4, 5]
print(first)  # 1
  • range()
    参考文章:https://www.cnblogs.com/c-x-m/articles/7779301.html

你可能感兴趣的:(ML基础)