1、朴素贝叶斯算法之朴素的含义:
朴素是指假定数据的特征变量之间是相互独立的。
2、朴素贝叶斯算法分类:
将实例分类到后验概率最大的类别当中。
假设实例数据有3个特征:{A1,A2,A3},Cj为分类当中的第j个类,根据贝叶斯公式,后验概率的计算公式为:
$$P(C_j \mid A_1A_2A_3) = \frac{P(A_1A_2A_3 \mid C_j)\,P(C_j)}{P(A_1A_2A_3)}$$
假设j的取值为0~n,那么计算出所有的P(Cj | A1A2A3),结果最大的所对应的分类即为实例的分类。
由于分母P(A1A2A3)对所有类别都相同,比较后验概率大小的问题,也可以简化为比较P(A1A2A3 | Cj) P(Cj)的问题;再根据特征之间相互独立的"朴素"假设,有P(A1A2A3 | Cj) = P(A1 | Cj) P(A2 | Cj) P(A3 | Cj)。
3、朴素贝叶斯分类是计算实例属于各个分类的概率,最后给出最优的猜测分类结果,所以是一种软分类,而决策树要求分类器作出一个分类的明确答案,属于硬分类。
4、在实现朴素贝叶斯算法时,多个很小的概率相乘会造成下溢出,这个问题可以用取对数(把连乘变成求和)的方法解决;另外,词袋模型(统计每个词出现的次数)通常也优于词集模型(只记录词是否出现)。
5、朴素贝叶斯算法实现:
from numpy import *
from math import log
def loadDataset():
    """Return the toy message-board corpus used by the examples.

    Returns:
        tuple: ``(postingList, classVec)`` where ``postingList`` is a list of
        six tokenized posts and ``classVec`` is the parallel list of labels.
        Label 1 is the class whose prior ``trainNB`` reports as ``pAbusive``;
        label 0 is the other class.
    """
    # Experiment samples: one token list per post.
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec
def createVocablist(dataset):
    """Build the vocabulary: every distinct token in ``dataset``, listed once.

    Args:
        dataset: iterable of documents, each an iterable of hashable tokens.

    Returns:
        list: the distinct tokens in the order they are first seen. A plain
        ``list(set(...))`` would give an order that can change between runs
        (string hashing is randomized), which makes the column order of the
        document vectors irreproducible; first-seen order is stable.
    """
    # dict keys keep insertion order, so this de-duplicates deterministically.
    return list(dict.fromkeys(word for document in dataset for word in document))
def wordsVec(vocablist, inputset):
    """Encode a document as a set-of-words (presence/absence) vector.

    Args:
        vocablist: list of vocabulary words; position i corresponds to
            element i of the returned vector.
        inputset: iterable of tokens making up the document.

    Returns:
        list[int]: a vector of ``len(vocablist)`` entries, 1 where the
        vocabulary word occurs in ``inputset`` and 0 elsewhere. Words that are
        not in the vocabulary are reported on stdout and otherwise ignored.
    """
    # Map each word to the index of its FIRST occurrence (what list.index
    # returns), built once so every lookup is O(1) instead of a list scan.
    position = {}
    for idx, word in enumerate(vocablist):
        position.setdefault(word, idx)

    vec = [0] * len(vocablist)
    for word in inputset:
        idx = position.get(word)
        if idx is None:
            print('The word: %s is not in my Vocabulary' % word)
        else:
            vec[idx] = 1
    return vec
def trainNB(trainset, traincategories):
    """Train a two-class naive Bayes classifier on document vectors.

    Args:
        trainset: sequence of equal-length numeric document vectors (one row
            per document, one column per vocabulary word).
        traincategories: sequence of labels parallel to ``trainset``; rows
            labelled 1 form class 1, every other row is counted as class 0.

    Returns:
        tuple: ``(p1vec, p0vec, pAbusive)`` where ``p1vec`` / ``p0vec`` are
        NumPy arrays of log conditional word probabilities for class 1 and
        class 0, and ``pAbusive`` is the fraction of documents labelled 1.

    Raises:
        ValueError: if ``trainset`` is empty or its length differs from
            ``traincategories``.
    """
    # Local import on purpose: at module level ``from math import log`` comes
    # after ``from numpy import *`` and shadows NumPy's log, and math.log
    # raises TypeError when handed an array.
    import numpy as np

    if len(trainset) == 0:
        raise ValueError('trainset must contain at least one document')
    if len(trainset) != len(traincategories):
        raise ValueError('trainset and traincategories must have the same length')

    docs = np.asarray(trainset, dtype=float)
    in_class1 = np.asarray(traincategories) == 1

    # Smoothing: word counts start at 1 and denominators at 2.0 so a word
    # never seen in a class does not produce log(0).
    p1num = 1.0 + docs[in_class1].sum(axis=0)
    p0num = 1.0 + docs[~in_class1].sum(axis=0)
    p1denom = 2.0 + docs[in_class1].sum()
    p0denom = 2.0 + docs[~in_class1].sum()

    # Log probabilities avoid underflow when many small factors are combined.
    p1vec = np.log(p1num / p1denom)
    p0vec = np.log(p0num / p0denom)
    pAbusive = int(np.count_nonzero(in_class1)) / len(trainset)
    return p1vec, p0vec, pAbusive
def classifyNB(vecClassiify, p0vec, p1vec, pclass):
    """Classify one document vector with a trained naive Bayes model.

    Args:
        vecClassiify: numeric document vector (list or array).
        p0vec: log conditional word probabilities for class 0.
        p1vec: log conditional word probabilities for class 1.
        pclass: prior probability of class 1, in [0, 1].

    Returns:
        int: 1 if the class-1 log score is strictly greater than the class-0
        log score, otherwise 0.
    """
    import math

    import numpy as np

    def _log_prior(p):
        # math.log(0) raises ValueError; a zero prior (training data with a
        # single class) should simply rule that class out.
        return math.log(p) if p > 0 else float('-inf')

    features = np.asarray(vecClassiify, dtype=float)
    # Sum of log-likelihoods of the words present, plus the log prior.
    p1 = float(np.sum(features * np.asarray(p1vec, dtype=float))) + _log_prior(pclass)
    p0 = float(np.sum(features * np.asarray(p0vec, dtype=float))) + _log_prior(1.0 - pclass)
    return 1 if p1 > p0 else 0
def testingNB():
    """Train on the toy corpus and print the predicted class of a sample post.

    Builds the vocabulary and document vectors from ``loadDataset()``, trains
    with ``trainNB`` and prints the test tokens followed by the label returned
    by ``classifyNB``. Returns None.
    """
    inputset, classes = loadDataset()
    myvolcablist = createVocablist(inputset)
    trainmat = [wordsVec(myvolcablist, doc) for doc in inputset]
    # trainNB returns the class-1 vector first, while classifyNB expects the
    # class-0 vector first; the names keep the two orders straight.
    p1V, p0V, pA = trainNB(trainmat, classes)
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = wordsVec(myvolcablist, testEntry)
    print(testEntry, 'classified as :', classifyNB(thisDoc, p0V, p1V, pA))
最后运行代码:
# testingNB() prints its own result and returns None, so call it directly
# instead of wrapping it in print().
testingNB()
['love', 'my', 'dalmation'] classified as : 0
6、用sklearn工具包实现文档分类:
import os
import jieba
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
warnings.filterwarnings('ignore')
def cut_words(file_path):
    """Segment a text file into words with jieba.

    Args:
        file_path: path of a text file encoded as GB18030.

    Returns:
        str: the tokens produced by ``jieba.cut``, each followed by a single
        space (so a non-empty result ends with a space).
    """
    # Context manager so the file handle is closed even if reading fails.
    with open(file_path, 'r', encoding='gb18030') as text_file:
        text = text_file.read()
    # join() builds the string in one pass instead of repeated concatenation.
    return ''.join(word + ' ' for word in jieba.cut(text))
def loadfile(file_dir, label):
    """Load and segment every file directly inside ``file_dir``.

    Args:
        file_dir: directory holding the documents of one category.
        label: label assigned to every document loaded from ``file_dir``.

    Returns:
        tuple: ``(words_list, labels_list)``; ``words_list`` holds the
        space-separated tokens of each file (see ``cut_words``) and
        ``labels_list`` holds ``label`` once per file.
    """
    words_list = []
    labels_list = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for file_name in sorted(os.listdir(file_dir)):
        file_path = os.path.join(file_dir, file_name)
        # Sub-directories cannot be opened as text files, so skip them.
        if not os.path.isfile(file_path):
            continue
        words_list.append(cut_words(file_path))
        labels_list.append(label)
    return words_list, labels_list
# Corpus layout: <DATA_ROOT>/{train,test}/<category>/<file>, plus a stop-word
# list at <DATA_ROOT>/stop/stopword.txt.
DATA_ROOT = 'text_classification-master/text classification'
CATEGORIES = ['女性', '体育', '文学', '校园']


def _load_split(split):
    """Load all categories of one split ('train' or 'test') via loadfile."""
    words_list, labels_list = [], []
    for category in CATEGORIES:
        words, labels = loadfile(os.path.join(DATA_ROOT, split, category), category)
        words_list += words
        labels_list += labels
    return words_list, labels_list


# Training data
train_words_list, train_labels = _load_split('train')
# Test data
test_words_list, test_labels = _load_split('test')

# Stop words: 'utf-8-sig' drops a leading BOM (\ufeff) if the file has one;
# stripping each line removes stray '\r' and blank entries.
with open(os.path.join(DATA_ROOT, 'stop', 'stopword.txt'), 'r', encoding='utf-8-sig') as stop_file:
    stop_words = [line.strip() for line in stop_file if line.strip()]

# Compute TF-IDF term weights; the vocabulary is fitted on the training set
# only and then reused to transform the test set.
tf = TfidfVectorizer(stop_words=stop_words, max_df=0.5)
train_features = tf.fit_transform(train_words_list)
test_features = tf.transform(test_words_list)

# Multinomial naive Bayes classifier
clf = MultinomialNB(alpha=0.001).fit(train_features, train_labels)
predicted_labels = clf.predict(test_features)

# Compute and report accuracy on the test set
print('准确率为:{}'.format(metrics.accuracy_score(test_labels, predicted_labels)))
输出结果为:
准确率为: 0.9104477611940298