首先给出贝叶斯概率公式:
$$P(Y=c_k \mid X=x)=\frac{P(X=x \mid Y=c_k)\,P(Y=c_k)}{\sum_{k} P(X=x \mid Y=c_k)\,P(Y=c_k)}$$
朴素贝叶斯原理:
对给定的训练数据集,先学习输入与输出的联合概率分布(即先验概率 $P(Y)$ 和条件概率 $P(X \mid Y)$);模型训练好后,对于给定的输入,利用贝叶斯定理求出后验概率最大的输出。朴素贝叶斯最主要的假设是:在类别确定的条件下,各特征之间相互独立(条件独立性假设),且各特征同等重要。
算法
代码:
- 数据处理
def creatdataset():
    """Return the toy training corpus together with its class labels.

    Returns:
        A ``(dataset, labels)`` pair. ``dataset`` is a list of six tokenized
        posts (each a list of words); ``labels`` holds one int per post,
        1 for an abusive post and 0 for a non-abusive one.
    """
    # Each entry pairs a label with its post so the two stay aligned.
    labelled_posts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    dataset = [words for _, words in labelled_posts]
    labels = [label for label, _ in labelled_posts]
    return dataset, labels
def creatlist(dataset):
    """Build the vocabulary: every distinct word, in first-seen order.

    Args:
        dataset: Iterable of documents, each an iterable of hashable words.

    Returns:
        A list of the unique words across all documents, ordered by their
        first appearance. An empty dataset yields an empty list.
    """
    # dict keys keep insertion order (Python 3.7+), so this de-duplicates in
    # O(n) while preserving order; sorting a set by list.index was O(n^2).
    return list(dict.fromkeys(word for data in dataset for word in data))
# Vocabulary produced for the sample corpus:
# ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please', 'maybe', 'not',
#  'take', 'him', 'to', 'park', 'stupid', 'dalmation', 'is', 'so', 'cute',
#  'I', 'love', 'stop', 'posting', 'worthless', 'garbage', 'mr', 'licks',
#  'ate', 'steak', 'how', 'quit', 'buying', 'food']
def word2vec(list,sequence):
    """Encode a document as a binary presence vector over the vocabulary.

    Args:
        list: The vocabulary, a list of words (the name shadows the builtin
            but is kept so existing callers keep working).
        sequence: The tokenized document to encode.

    Returns:
        A list of 0/1 ints as long as the vocabulary; position ``i`` is 1
        when vocabulary word ``i`` occurs in ``sequence``. Words missing
        from the vocabulary are reported on stdout and otherwise ignored.
    """
    presence = [0] * len(list)  # start from an all-zero vector
    for word in sequence:
        try:
            position = list.index(word)
        except ValueError:
            # Out-of-vocabulary word: report it and move on.
            print('{} is not in list'.format(word))
        else:
            presence[position] = 1
    return presence
- 计算概率
def dataarray():
    """Estimate the naive Bayes parameters from the sample corpus.

    Builds one binary word-presence row per document, then estimates the
    class prior and the per-word conditional probabilities with add-one
    (Laplace) smoothing so that no probability is exactly 0.

    Returns:
        A tuple ``(p1, J1p0, J1p1)``:
        ``p1`` is the fraction of training documents labelled 1;
        ``J1p0[j]`` is the smoothed P(word j present | label 0);
        ``J1p1[j]`` is the smoothed P(word j present | label 1).
        Both arrays are NumPy float arrays indexed like the vocabulary.
    """
    dataset, labels = creatdataset()
    list_ = creatlist(dataset)
    # One row per document, one column per vocabulary word.
    datasets = np.array([word2vec(list_, doc) for doc in dataset], dtype=float)

    is_abusive = np.array(labels) == 1
    n_docs = len(labels)
    n_abusive = int(is_abusive.sum())
    n_clean = n_docs - n_abusive
    p1 = n_abusive / float(n_docs)

    # Smoothed estimate: (count + lambda) / (class size + S_j * lambda) with
    # lambda = 1 and S_j = 2 (a word is either present or absent). Each
    # class's counts are divided by that class's own document count; the
    # earlier version had the two denominators swapped, which only went
    # unnoticed because the sample corpus is perfectly balanced.
    J1p0 = (datasets[~is_abusive].sum(axis=0) + 1) / (n_clean + 2.)
    J1p1 = (datasets[is_abusive].sum(axis=0) + 1) / (n_abusive + 2.)
    return p1, J1p0, J1p1
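为了防止出现概率为0的状况,采用贝叶斯估计,条件概率的贝叶斯估计如下:
$$P_\lambda\left(X^{(j)}=a_{jl} \mid Y=c_k\right)=\frac{\sum_{i=1}^{N} I\left(x_i^{(j)}=a_{jl},\, y_i=c_k\right)+\lambda}{\sum_{i=1}^{N} I\left(y_i=c_k\right)+S_j\lambda}$$
其中 $\lambda \ge 0$,$S_j$ 为第 $j$ 个特征可能取值的个数。在本实验中取 λ=1(即拉普拉斯平滑);由于每个特征只有"出现/未出现"两种取值,故 Sj=2。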
- 对新数据进行分类
def testclassify(sequence):
    """Classify a tokenized post and print whether it is abusive.

    Retrains the model on the sample corpus, encodes ``sequence`` as a
    binary presence vector, and compares the two class scores. Every
    vocabulary word contributes: P(present | class) when the word occurs in
    the post, and 1 - P(present | class) when it does not.

    Args:
        sequence: The post to classify, as a list of words. Words outside
            the vocabulary are reported by ``word2vec`` and ignored.

    Returns:
        None. The verdict is printed to stdout.
    """
    p1, J1p0, J1p1 = dataarray()
    dataset, _ = creatdataset()
    list_ = creatlist(dataset)
    present = np.array(word2vec(list_, sequence)) == 1

    # Sum log-probabilities instead of multiplying probabilities: a product
    # with one factor per vocabulary word underflows to 0.0 once the
    # vocabulary is large, which would make both scores equal. The log is
    # monotonic, so the comparison below is otherwise unchanged.
    log_p1 = np.log(p1) + np.where(present, np.log(J1p1), np.log(1 - J1p1)).sum()
    log_p0 = np.log(1 - p1) + np.where(present, np.log(J1p0), np.log(1 - J1p0)).sum()

    if log_p1 > log_p0:
        print(sequence,' :abusive')
    else:
        print(sequence,' :not abusive')
- 测试
# Smoke test: classify one abusive and one non-abusive sample post.
# Verdicts reported by the original author, in order:
#   ['stupid', 'garbage'] :abusive
#   ['love', 'my', 'dalmation'] :not abusive
for sample_post in (['stupid','garbage'], ['love','my','dalmation']):
    testclassify(sample_post)