Machine Learning in Action: Naive Bayes Notes

  • Python 3 string encoding and decoding: encode() and decode()
  • the 'r' vs 'rb' file-mode issue in Python 3, a consequence of encode/decode
  • range(50) must be wrapped as list(range(50)) to get an actual list
  • the | union operation on set()
  • Python regular expressions: re patterns and the match/search methods (short demos of several of these points follow below)
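A few minimal sketches of these points (the file name in the commented-out line is made up for illustration):
# encode(): str -> bytes; decode(): bytes -> str
s = '贝叶斯'
b = s.encode('utf-8')
print(b.decode('utf-8') == s)               # True
# 'r' opens in text mode (Python decodes to str for you); 'rb' returns raw bytes
# data = open('some.txt', 'rb').read()      # bytes: decode yourself, e.g. data.decode('gbk')
import re
print(re.match(r'\d+', 'abc123'))           # None: match anchors at the start of the string
print(re.search(r'\d+', 'abc123').group())  # '123': search scans the whole string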
import numpy as np

def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]    #1 is abusive, 0 not
    return postingList,classVec
# build the vocabulary: the set of all unique words across documents
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet |= set(document)
    return list(vocabSet)
listOfPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOfPosts)
print(type(myVocabList))
print(myVocabList)

<class 'list'>
['posting', 'so', 'my', 'take', 'ate', 'not', 'help', 'mr', 'cute', 'please', 'quit', 'I', 'licks', 'has', 'park', 'love', 'buying', 'is', 'dalmation', 'him', 'how', 'steak', 'stupid', 'stop', 'dog', 'flea', 'problems', 'worthless', 'food', 'maybe', 'garbage', 'to']
a = set([1,2,3,1])
a
{1, 2, 3}
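The |= in createVocabList relies on the set union operator mentioned above; a quick check:
{1, 2} | {2, 3}
{1, 2, 3}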
# convert a document into a 0/1 word vector over the vocabulary, for computation
def setOfWord2Vec(vocablist, inputSet):
    returnVec = [0] * len(vocablist)
    for word in inputSet:
        if word in vocablist:
            # list's index() returns the position of the first match, similar to the string find() method
            returnVec[vocablist.index(word)] = 1
        else: 
            print("the word: {0} is not in vocablist".format(word))
    return returnVec
print(setOfWord2Vec(myVocabList, listOfPosts[3]))
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0]
str = 'abca'    # note: this shadows the built-in str; acceptable only in a throwaway demo
lis = ['a','b','a']
print(str.index('a'))
print(str.index('a',1))
print(str.find('a'))
print(str.find('a',2))
#rfind()
print(str.rfind('a'))
#rindex
print(str.rindex('a'))
# list has no find() method
print(lis.find('a'))
0
3
0
3
3
3



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input> in <module>()
      8 print(str.rindex('a'))
      9 # list has no find() method
---> 10 print(lis.find('a'))

AttributeError: 'list' object has no attribute 'find'
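To get find()-style behavior on a list (an index, or -1 instead of an exception), a common idiom is:
lis.index('a') if 'a' in lis else -1
0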
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    # use len() rather than np.shape: trainMatrix may be a plain list of lists, not an np.array
    numWords = len(trainMatrix[0])
    # compute p(c1), the prior probability that a document is abusive
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # accumulators for per-word counts in each class
    # p0Num = np.zeros(numWords)
    # p1Num = np.zeros(numWords)
    # p0Denom = 0.0
    # p1Denom = 0.0
    # zeros would let a single unseen word zero out the whole product,
    # so apply Laplace smoothing: counts start at 1, denominators at 2
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            # accumulate word counts over abusive documents
            p1Num += trainMatrix[i]
            # running total of words in abusive documents
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # per-word conditional probabilities, e.g. p(w1|c1) = count of w1 in class-1 docs / total words in class 1
    p1Vect = p1Num/p1Denom
    p0Vect = p0Num/p0Denom
    return p0Vect, p1Vect, pAbusive
trainMat = []
for postinDoc in listOfPosts:
    trainMat.append(setOfWord2Vec(myVocabList, postinDoc))
p0V,p1V,pAb = trainNB0(trainMat, listClasses)
print(trainMat)
print(p0V)
print(p1V)
print(pAb)
[[0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1], [0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0], [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0]]
[ 0.03846154  0.07692308  0.15384615  0.03846154  0.07692308  0.03846154
  0.07692308  0.07692308  0.07692308  0.07692308  0.03846154  0.07692308
  0.07692308  0.07692308  0.03846154  0.07692308  0.03846154  0.07692308
  0.07692308  0.11538462  0.07692308  0.07692308  0.03846154  0.07692308
  0.07692308  0.07692308  0.07692308  0.03846154  0.03846154  0.03846154
  0.03846154  0.07692308]
[ 0.0952381   0.04761905  0.04761905  0.0952381   0.04761905  0.0952381
  0.04761905  0.04761905  0.04761905  0.04761905  0.0952381   0.04761905
  0.04761905  0.04761905  0.0952381   0.04761905  0.0952381   0.04761905
  0.04761905  0.0952381   0.04761905  0.04761905  0.19047619  0.0952381
  0.14285714  0.04761905  0.04761905  0.14285714  0.0952381   0.0952381
  0.0952381   0.0952381 ]
0.5
pAb
0.5
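A quick sanity check on the smoothed estimates: 'stupid' occurs once in each of the three abusive posts, and those posts contain 19 words in total, so p('stupid'|c1) = (3 + 1) / (19 + 2) = 4/21, matching the 0.19047619 entry in p1V above.
(3 + 1) / (19 + 2)
0.19047619047619047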
# build the Bayes classification function
def classifyNB(vec2Classify, p0Vect, p1Vect, pClass1):
    # work in log space: products of many small probabilities underflow, and logs turn the product into a sum
    p1Vec = np.log(p1Vect)
    p0Vec = np.log(p0Vect)
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + np.log(1 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
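A two-line illustration of the underflow that the log trick avoids:
print(1e-200 * 1e-200)                   # 0.0: the true value 1e-400 underflows
print(np.log(1e-200) + np.log(1e-200))   # about -921: finite in log space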
def testingNB():
    listOfPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOfPosts)
    trainMat = []
    for postInDoc in listOfPosts:
        trainMat.append(setOfWord2Vec(myVocabList, postInDoc))
    p0V,p1V,pAb = trainNB0(trainMat, listClasses)
    testEntry1 = ['love', 'my', 'dalmation']
    thisDoc1 = np.array(setOfWord2Vec(myVocabList, testEntry1))
    print('{0} classified as : {1}'.format(testEntry1,classifyNB(thisDoc1, p0V, p1V, pAb)))
    testEntry2 = ['stupid', 'garbage']
    thisDoc2 = np.array(setOfWord2Vec(myVocabList, testEntry2))
    print('{0} classified as : {1}'.format(testEntry2,classifyNB(thisDoc2, p0V, p1V, pAb)))
testingNB()
['love', 'my', 'dalmation'] classified as : 0
['stupid', 'garbage'] classified as : 1
def textParse(bigString):
    import re
    if bigString is None:
        return []
    # \W+ rather than \W*: re.split needs a pattern that cannot match the empty string
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
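A quick check of textParse on a made-up sentence; tokens of two characters or fewer are dropped and everything is lowercased:
textParse('This book is the best book on Python I have ever laid eyes upon.')
['this', 'book', 'the', 'best', 'book', 'python', 'have', 'ever', 'laid', 'eyes', 'upon']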
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(26):
        # read() loads the whole file as one string
        # print(i)  # uncomment to see which file fails
        # if opened with 'rb', the raw bytes would need explicit decoding (e.g. 'gbk')
        # a 0.txt was added to each folder so the file names match range(26)
        wordList = textParse(open('email\\spam\\%d.txt' % i, 'r').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        # ham\6.txt originally contained a garbled 'are' in the middle, which broke the read; it has been fixed
        wordList = textParse(open('email\\ham\\%d.txt' % i, 'r').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)

    vocabList = createVocabList(docList)
    # wrap in list(): in Python 3, range() returns a range object, not a list
    # note: docList now holds 52 documents (26 spam + 26 ham), so range(50) leaves the last two unused
    trainingSet = list(range(50))
    testSet = []

    for i in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        # testSet stores document indices only
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainingMat = []
    trainingClasses = []
    for docIndex in trainingSet:
        trainingMat.append(setOfWord2Vec(vocabList, docList[docIndex]))
        trainingClasses.append(classList[docIndex])
    p0V, p1V, pAb = trainNB0(np.array(trainingMat), np.array(trainingClasses))
    errorCount = 0.0
    for docIndex in testSet:
        wordVector = setOfWord2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pAb) != classList[docIndex]:
            errorCount += 1
    print('the error rate is : ', float(errorCount) / len(testSet))
spamTest()
the error rate is :  0.1
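Because the 10-document test set is sampled at random, the error rate changes from run to run; repeating the hold-out split a few times gives a feel for the variance:
for _ in range(5):    # repeat the random hold-out evaluation
    spamTest()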
'ab'.encode('gbk')
'\0xab'.encode('gbk')
b'\x00xab'
# before the fix, chardet reported ham\6.txt with only 0.73 confidence (windows-1252 encoding?); the garbled character in the middle caused the read error
import chardet
f = open('email\\ham\\6.txt','rb')
chardet.detect(f.read())
{'confidence': 1.0, 'encoding': 'ascii', 'language': ''}
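When a file's encoding is uncertain, a defensive pattern is to read raw bytes and decode with an explicit fallback (a sketch; the windows-1252 fallback is an assumption, not from the original notes):
raw = open('email\\ham\\6.txt', 'rb').read()
try:
    text = raw.decode('ascii')
except UnicodeDecodeError:
    text = raw.decode('windows-1252', errors='replace')    # assumed fallback encoding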
print(list(range(50)))
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
