- 选择具有最高概率的决策
- 朴素的意思是每个特征之间是独立同分布的,互不影响。
- 在数据较少的情况下仍然有效,可以处理多类别问题。特征条件独立假设,使得模型预测所需要估计的参数规模从冥指数量级向线性量级减少,极大地节约了内存消耗和计算时间。
- 对于输入数据的准备方式较为敏感。模型训练时无法将各个特征之间的联系考量在内,该模型在特征关联性强的分类任务上性能表现不佳。
def loadDataSet():
postingList = [['my','dog','has','flea','probles','help','please'],
classVecList = [0,1,0,1,0,1]
return postingList,classVecList
def createVocabSet(dataSet): #dataSet是全部单词
vocabSet = set()
for vocabList in dataSet:
vocabSet = vocabSet | set(vocabList)
vocabList = list(vocabSet) #set2list,后面用到list.index,set没有index属性
return vocabList
def setOfWords2Vec(vocabList,inputSet):
returnVecList = [0]*len(vocabList) #shape(returnVecList):1*32
for word in inputSet:
if word in vocabList:
indexOfWordInVocabList = vocabList.index(word) #找到word在词集的index
returnVecList[indexOfWordInVocabList] = 1 #在词向量对应位置赋值为1
print('the word : %s is not in my vocabulary!' %word)
return returnVecList
def creatTrainMatrix(dataWordsList,classVecList):
myVocabList = createVocabSet(dataWordsList)
trainMat = []
for i in dataWordsList:
returnVecList = setOfWords2Vec(myVocabList,i)
['to', 'has', 'probles', 'dalmation', 'food', 'maybe', 'steak', 'licks', 'help', 'how', 'dog', 'ate', 'him', 'love', 'worthless', 'stupid', 'so', 'garbage', 'my', 'not', 'mr', 'is', 'qiut', 'flea', 'posting', 'buying', 'please', 'park', 'stop', 'take', 'cute', 'I']
[['my', 'dog', 'has', 'flea', 'probles', 'help', 'please'], ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], ['stop', 'posting', 'stupid', 'worthless', 'garbage'], ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], ['qiut', 'buying', 'worthless', 'dog', 'food', 'stupid']]
return trainMat,classVecList,myVocabList
dataWordsList,classVecList = loadDataSet()
trainMat,classVecList,myVocabList = creatTrainMatrix(dataWordsList,classVecList)
[[0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0]]
[0, 1, 0, 1, 0, 1]
from numpy import *
def trainNB0(trainMatrix,trainClass):
numTrainDocs = len(trainMatrix) #样本数
# print(numTrainDocs) #32
numWords = len(trainMatrix[0]) #词集的大小
pClass1 = sum(trainClass) / float(numTrainDocs) #p(c1),类别1的概率
p0Num = ones(numWords) #初始化为1,array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
p1Num = ones(numWords) #shape(p1Num) : 32*1,32行一列
p0Denom = float(numWords) #初始化在类别0中,出现的单词的总数:因为p0Num初始化为每个单词出现1次,所以总数为32个单词
p1Denom = float(numWords)
for i in range(numTrainDocs):
[0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0] 0
[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0] 1
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1] 0
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0] 1
[1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0] 0
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0] 1
if trainClass[i] == 1:
p1Num += trainMatrix[i] #统计类别1,词集中每个词出现的次数
p1Denom += sum(trainMatrix[i]) #统计类别1中,出现的单词的总数
p0Num += trainMatrix[i]
p0Denom += sum(trainMatrix[i])
print(p1Num) #类别1,词集中每个词出现的次数
print(p0Num) #类别0,词集中每个词出现的次数
[1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 2. 0. 1. 0. 2. 3. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 1. 1. 0. 0.]
[1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 2. 1. 0. 0. 1. 0. 3. 0. 1. 1. 0. 1. 0. 0. 1. 0. 1. 0. 1. 1.]
p1Vect = log(p1Num/p1Denom) #条件概率:类别1中,每个单词出现的概率
p0Vect = log(p0Num/p0Denom) #条件概率:类别0中,每个单词出现的概率
return p0Vect,p1Vect,pClass1
p0Vect,p1Vect,pClass1 = trainNB0(trainMat,classVecList)
print('p0Vect: ','\n',p0Vect)
print('p1Vect: ','\n',p1Vect)
print('pClass1: ','\n',pClass1)
[-3.33220451 -3.33220451 -3.33220451 -3.33220451 -4.02535169 -4.02535169
-3.33220451 -3.33220451 -3.33220451 -3.33220451 -3.33220451 -3.33220451
-2.9267394 -3.33220451 -4.02535169 -4.02535169 -3.33220451 -4.02535169
-2.63905733 -4.02535169 -3.33220451 -3.33220451 -4.02535169 -3.33220451
-4.02535169 -4.02535169 -3.33220451 -4.02535169 -3.33220451 -4.02535169
-3.33220451 -3.33220451]
[-3.23867845 -3.93182563 -3.93182563 -3.93182563 -3.23867845 -3.23867845
-3.93182563 -3.93182563 -3.93182563 -3.93182563 -2.83321334 -3.93182563
-3.23867845 -3.93182563 -2.83321334 -2.54553127 -3.93182563 -3.23867845
-3.93182563 -3.23867845 -3.93182563 -3.93182563 -3.23867845 -3.93182563
-3.23867845 -3.23867845 -3.93182563 -3.23867845 -3.23867845 -3.23867845
-3.93182563 -3.93182563]
def classifyNB(diff,p0Vec,p1Vec,pClass1): #diff=[0,1,1,1,0],一个样本向量
p1 = sum(diff*p1Vec)+log(pClass1)
p0 = sum(diff*p0Vec)+log(1.0 - pClass1)
if p1 > p0:
return 1
return 0
def testingNB(testdiff):
dataWordsList,classVecList = loadDataSet()
trainMat,classVecList,myVocabList = creatTrainMatrix(dataWordsList,classVecList) #创建特征矩阵
p0Vec,p1Vec,pClass1 = trainNB0(trainMat,classVecList) #训练特征矩阵
diffVecList = setOfWords2Vec(myVocabList,testdiff) #文档变成特征向量
res = classifyNB(diffVecList,p0Vec,p1Vec,pClass1) #最高概率决策
return res
test1 = ['love','my','dalmation']
test2 = ['stupid','garbage']
print('The result with %s is %s ' %(test1,testingNB(test1)))
print('The result with %s is %s ' %(test2,testingNB(test2)))
The result with ['love', 'my', 'dalmation'] is 0
The result with ['stupid', 'garbage'] is 1
def bagOfWords2Vec(vocabList,inputSet):
returnVecList = [0]*len(vocabList) #shape(returnVecList):1*32
for word in inputSet:
if word in vocabList:
indexOfWordInVocabList = vocabList.index(word) #找到word在词集的index
returnVecList[indexOfWordInVocabList] += 1 #在词向量对应位置统计数量
print('the word : %s is not in my vocabulary!' %word)
return returnVecList
def setOfWords2Vec(vocabList,inputSet):
returnVecList = [0]*len(vocabList) #shape(returnVecList):1*32
for word in inputSet:
if word in vocabList:
indexOfWordInVocabList = vocabList.index(word) #找到word在词集的index
returnVecList[indexOfWordInVocabList] = 1 #在词向量对应位置赋值为1
print('the word : %s is not in my vocabulary!' %word)
return returnVecList
mySent = 'This booke is the best booke on Python or M.L. I have ever laid eyes upon.'
import re
listOfTokens = re.split('\W',mySent)
#['This', 'booke', 'is', 'the', 'best', 'booke', 'on', 'Python', 'or', 'M', 'L', '', 'I', 'have', 'ever', 'laid', 'eyes', 'upon', '']
listOfTokens = [tok.lower() for tok in listOfTokens if len(tok)>0]
#['this', 'booke', 'is', 'the', 'best', 'booke', 'on', 'python', 'or', 'm', 'l', 'i', 'have', 'ever', 'laid', 'eyes', 'upon']
emailText = open('../../Reference Code/Ch04/email/ham/7.txt').read()
listOfTokens = re.split('\W',emailText)
listOfTokens = [tok.lower() for tok in listOfTokens if len(tok)>0]
['zach', 'hamm', 'commented', 'on', 'your', 'status', 'zach', 'wrote', 'doggy', 'style', 'enough', 'said', 'thank', 'you', 'good', 'night']
def textParse(bigString):
import re
listOfTokens = re.split('\W',bigString)
return [tok.lower() for tok in listOfTokens if len(tok)>2] #去掉一些后缀py等,设置了len(tok)>2的条件限制
def creatTrainSet():
fullText = [] #全部单词
classList = [] #类别List
docList = [] #文档List
for i in range(1,26):
wordsList = open('../../Reference Code/Ch04/email/spam/%d.txt' %i).read() #读取垃圾邮件文本
sample = textParse(wordsList)
docList.append(sample) #每个样本放到docList中
wordsList = open('../../Reference Code/Ch04/email/ham/%d.txt' %i).read() #读取非垃圾邮件文本
sample = textParse(wordsList)
docList.append(sample) #每个样本放到docList中
return fullText,docList,classList
import random
def splitTestTrainSet():
fullText,docList,classList = creatTrainSet() #构建样本集
vocabList = createVocabSet(docList) #构建词集
testSetIndex = []; trainSetIndex = list(range(len(docList)))
randIndex = random.sample(range(0,50),10) #不重复抽取,在0-50之间随机返回10个整数
# randIndex = int(random.uniform(0,len(docList))) #这种方法会重复抽取,这样在del(index)之后,再抽到del掉的index就会报错
for i in randIndex:
testSetIndex.append(trainSetIndex[i]) #添加样本到测试集
trainMatrix = [];classTrain = []
for i in trainSetIndex:
if not i in testSetIndex: #排除测试样本的index
diffVec = setOfWords2Vec(vocabList,docList[i]) #样本向量
trainMatrix.append(diffVec) #样本矩阵
classTrain.append(classList[i]) #样本类别
p0Vec,p1Vec,pClass1 = trainNB0(trainMatrix,classTrain)
error = 0
for testIndex in testSetIndex:
diffVecList = setOfWords2Vec(vocabList,docList[testIndex]) #文档变成特征向量
res = classifyNB(diffVecList,p0Vec,p1Vec,pClass1) #最高概率决策
if res != classList[i]:
error += 1
print('classfification error:',docList[testIndex])
print('the classify result is:',res,',but the real result is:',classList[i])
print('the error rate is:',error/len(testSetIndex))
classfification error: ['get', 'off', 'online', 'watchesstore', 'discount', 'watches', 'for', 'all', 'famous', 'brands', 'watches', 'arolexbvlgari', 'dior', 'hermes', 'oris', 'cartier', 'and', 'more', 'brands', 'louis', 'vuitton', 'bags', 'wallets', 'gucci', 'bags', 'tiffany', 'jewerly', 'enjoy', 'full', 'year', 'warranty', 'shipment', 'via', 'reputable', 'courier', 'fedex', 'ups', 'dhl', 'and', 'ems', 'speedpost', 'you', 'will', '100', 'recieve', 'your', 'order']
the classify result is: 1 ,but the real result is: 0
classfification error: ['ordercializviagra', 'online', 'save', '0nline', 'pharmacy', 'noprescription', 'required', 'buy', 'canadian', 'drugs', 'wholesale', 'prices', 'and', 'save', 'fda', 'approved', 'drugs', 'superb', 'quality', 'drugs', 'only', 'accept', 'all', 'major', 'credit', 'cards']
the classify result is: 1 ,but the real result is: 0
classfification error: ['get', 'off', 'online', 'watchesstore', 'discount', 'watches', 'for', 'all', 'famous', 'brands', 'watches', 'arolexbvlgari', 'dior', 'hermes', 'oris', 'cartier', 'and', 'more', 'brands', 'louis', 'vuitton', 'bags', 'wallets', 'gucci', 'bags', 'tiffany', 'jewerly', 'enjoy', 'full', 'year', 'warranty', 'shipment', 'via', 'reputable', 'courier', 'fedex', 'ups', 'dhl', 'and', 'ems', 'speedpost', 'you', 'will', '100', 'recieve', 'your', 'order']
the classify result is: 1 ,but the real result is: 0
the error rate is: 0.3
X1= [1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3]
X1= [1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3]
import pandas as pd
df = pd.DataFrame(array([X1,X2,y]).T,columns=['X1','X2','y'])
y_1 = df[df['y']=='1']
y_0 = df[df['y']=='-1']
x1_1_y1 = y_1[y_1['X1']=='1']
x1_2_y1 = y_1[y_1['X1']=='2']
x1_3_y1 = y_1[y_1['X1']=='3']
x2_M_y1 = y_1[y_1['X2']=='M']
x2_S_y1 = y_1[y_1['X2']=='S']
x2_L_y1 = y_1[y_1['X2']=='L']
x1_1_y0 = y_0[y_0['X1']=='1']
x1_2_y0 = y_0[y_0['X1']=='2']
x1_3_y0 = y_0[y_0['X1']=='3']
x2_M_y0 = y_0[y_0['X2']=='M']
x2_S_y0 = y_0[y_0['X2']=='S']
x2_L_y0 = y_0[y_0['X2']=='L']
p_y_1 = len(y_1)/len(df)
p_y_0 = len(y_0)/len(df)
p_x1_1_y1 = len(x1_1_y1) / len(y_1)
p_x1_2_y1 = len(x1_2_y1) / len(y_1)
p_x1_3_y1 = len(x1_3_y1) / len(y_1)
p_x2_M_y1 = len(x2_M_y1) / len(y_1)
p_x2_S_y1 = len(x2_S_y1) / len(y_1)
p_x2_L_y1 = len(x2_L_y1) / len(y_1)
p_x1_1_y0 = len(x1_1_y0) / len(y_0)
p_x1_2_y0 = len(x1_2_y0) / len(y_0)
p_x1_3_y0 = len(x1_3_y0) / len(y_0)
p_x2_M_y0 = len(x2_M_y0) / len(y_0)
p_x2_S_y0 = len(x2_S_y0) / len(y_0)
p_x2_L_y0 = len(x2_L_y0) / len(y_0)
p_x_1 = p_y_1*p_x1_2_y1*p_x2_S_y1
p_x_0 = p_y_0*p_x1_2_y0*p_x2_S_y0
print('the probality of 1:',p_x_1)
print('the probality of 0:',p_x_0)
if p_x_1>p_x_0:
print('x=(2,S) belong to 1')
print('x=(2,S) belong to -1')
the probality of 1: 0.05
the probality of 0: 0.0625
x=(2,S) belong to -1
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all')
From: Mamatha Devineni Ratnam
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu
I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game. PENS RULE!!!
from sklearn.cross_validation import train_test_split
X_train, X_test,y_train, y_test = train_test_split(news.data,news.target,test_size=.25,random_state=33)
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
y_predict = mnb.predict(X_test)
from sklearn.metrics import classification_report
print('The accuracy of Naive Bayes Classifier is:',mnb.score(X_test,y_test))
The accuracy of Naive Bayes Classifier is: 0.8397707979626485
precision recall f1-score support
alt.atheism 0.86 0.86 0.86 201
comp.graphics 0.59 0.86 0.70 250
comp.os.ms-windows.misc 0.89 0.10 0.17 248
comp.sys.ibm.pc.hardware 0.60 0.88 0.72 240
comp.sys.mac.hardware 0.93 0.78 0.85 242
comp.windows.x 0.82 0.84 0.83 263
misc.forsale 0.91 0.70 0.79 257
rec.autos 0.89 0.89 0.89 238
rec.motorcycles 0.98 0.92 0.95 276
rec.sport.baseball 0.98 0.91 0.95 251
rec.sport.hockey 0.93 0.99 0.96 233
sci.crypt 0.86 0.98 0.91 238
sci.electronics 0.85 0.88 0.86 249
sci.med 0.92 0.94 0.93 245
sci.space 0.89 0.96 0.92 221
soc.religion.christian 0.78 0.96 0.86 232
talk.politics.guns 0.88 0.96 0.92 251
talk.politics.mideast 0.90 0.98 0.94 231
talk.politics.misc 0.79 0.89 0.84 188
talk.religion.misc 0.93 0.44 0.60 158
avg / total 0.86 0.84 0.82 4712