我使用伯努利朴素贝叶斯实现中文垃圾短信分类。中文分词用的是 jieba,但没有设置停用词,这会降低垃圾短信的召回率,建议最好加上停用词。
数据集用的常见的80w条带标签的中文短信,下载链接https://github.com/hrwhisper/SpamMessage/blob/master/data/%E5%B8%A6%E6%A0%87%E7%AD%BE%E7%9F%AD%E4%BF%A1.txt
下面是源码
#!/usr/bin/env python
# coding: utf-8
# In[2]:
import pandas as pd
import codecs
import numpy as np
import math
import jieba
# In[3]:
#2.计算对数先验概率
def getLogPrior(train):
    """Class priors (and their logs) for the Bernoulli spam model.

    train: DataFrame with column '1' holding the label code (0 or 1) and
    column '2' holding the token list.  Label 0 is bucketed under key 'y'
    and label 1 under key 'n' — the convention used throughout this file.

    Returns (docSum, samples, logPrior, prior):
      docSum   -- total number of training documents
      samples  -- {'y': count of label-0 docs, 'n': count of label-1 docs}
      logPrior -- {'y': log P(label 0), 'n': log P(label 1)}
      prior    -- {'y': P(label 0), 'n': P(label 1)}
    """
    ham = train[train['1'] == 0]
    spam = train[train['1'] == 1]
    samples = {'y': len(ham), 'n': len(spam)}
    docSum = samples['y'] + samples['n']
    prior = {cls: count / docSum for cls, count in samples.items()}
    logPrior = {cls: math.log(p) for cls, p in prior.items()}
    return (docSum, samples, logPrior, prior)
# In[4]:
def getConditionPro(train):
    """Per-word Bernoulli conditional probabilities for each class.

    train: DataFrame with columns '1' (label code: 0 -> key 'y', 1 -> key 'n')
    and '2' (token list per message).

    Returns (conditionPro, logConditionPro), both shaped
    {'y': {word: p}, 'n': {word: p}} where
        p = (#docs of the class containing word + 1) / (#docs of the class + 2)
    i.e. Laplace-smoothed DOCUMENT frequency, the Bernoulli event model.

    BUG FIX: the original iterated over the raw message and counted token
    OCCURRENCES (the per-message `settemp = set(messge)` was built but never
    used), so a repeated word could push p above 1 — invalid as a probability
    and fatal downstream where (1 - p) is multiplied in.  We now count each
    word at most once per document, matching the samples-based denominator.
    """
    conditionPro = {'y': {}, 'n': {}}
    logConditionPro = {'y': {}, 'n': {}}
    ham = train[train['1'] == 0]
    spam = train[train['1'] == 1]
    # Only the per-class document counts are needed here; computing them
    # directly avoids a redundant getLogPrior() call.
    samples = {'y': len(ham), 'n': len(spam)}
    classNum = 2  # Laplace smoothing: one pseudo-doc with and one without the word
    # 1. Vocabulary over the whole training set.
    vocab = set()
    for tokens in train['2']:
        vocab.update(tokens)
    # 2. Per class, count the number of DOCUMENTS containing each word.
    words = {'y': {}, 'n': {}}
    for category, subset in (('y', ham), ('n', spam)):
        docFreq = words[category]
        for tokens in subset['2']:
            for word in set(tokens):  # set(): presence, not frequency
                docFreq[word] = docFreq.get(word, 0) + 1
    # 3. Make sure every vocabulary word exists in both class dictionaries.
    for word in vocab:
        for docFreq in words.values():
            docFreq.setdefault(word, 0)
    # 4. Smoothed probabilities and their logs.
    for category, docFreq in words.items():
        denom = samples[category] + classNum
        for word, docCount in docFreq.items():
            p = (docCount + 1) / denom
            conditionPro[category][word] = p
            logConditionPro[category][word] = math.log(p)
    return (conditionPro, logConditionPro)
# In[5]:
#训练函数,得到模型
def myFit(train):
    """Train the Bernoulli NB model on a prepared frame.

    Returns the full model state expected by getPostProb/testModel:
    (docSum, samples, logPrior, prior, conditionPro, logConditionPro).
    """
    doc_total, class_counts, log_prior, class_prior = getLogPrior(train)
    cond, log_cond = getConditionPro(train)
    return (doc_total, class_counts, log_prior, class_prior, cond, log_cond)
# In[6]:
#计算后验概率
def getPostProb(testData, docSum, samples, logPrior, prior, conditionPro, logConditionPro):
    """Bernoulli-NB posterior scores for one tokenized message.

    testData : iterable of tokens of the message to classify
    docSum   : total training documents (kept for interface compatibility)
    samples  : {'y': #docs, 'n': #docs} per class
    logPrior, prior : class (log-)priors from getLogPrior
    conditionPro, logConditionPro : per-class word probabilities

    Returns (postProb, postProbLog): raw-product and log-sum posterior per
    class key.  The log version is the numerically safe one for long texts.
    """
    postProb = {'y': 1, 'n': 1}
    postProbLog = {'y': 0, 'n': 0}
    set_X_test = set(testData)
    for category, pri in prior.items():
        postProb[category] = pri
        postProbLog[category] = logPrior[category]
        # Bernoulli model: every vocabulary word contributes, whether present or absent.
        for ch, p in conditionPro[category].items():
            if ch in set_X_test:
                postProb[category] *= p
                postProbLog[category] += logConditionPro[category][ch]
            else:
                postProb[category] *= (1 - p)
                # BUG FIX: was `+ (1 - logConditionPro[...])`; the log-domain
                # counterpart of multiplying by (1 - p) is adding log(1 - p).
                # Laplace smoothing upstream guarantees p < 1 here.
                postProbLog[category] += math.log(1 - p)
        # Words in the message never seen in training: apply the same smoothing.
        for ch in set_X_test:
            if ch not in conditionPro[category]:
                prob = 1 / (samples[category] + len(prior))
                postProb[category] *= prob
                postProbLog[category] += math.log(prob)
    return (postProb, postProbLog)
# In[7]:
#求出最大后验概率,及类别
def getGender(postProb):
    """Pick the winning class from the posterior dict.

    postProb: {'y': score for ham (label 0), 'n': score for spam (label 1)}.
    Returns (is_spam, support, postProb) where is_spam is True when the 'n'
    bucket dominates and support is the normalised weight of the 'n' bucket.
    """
    is_spam = postProb['y'] < postProb['n']
    total = postProb['n'] + postProb['y']
    support = postProb['n'] / total
    return (is_spam, support, postProb)
# In[8]:
#测试:读取一个测试文件,返回一个混淆矩阵
from numpy import *  # kept from the original file; testModel itself no longer needs it


def testModel(test, docSum, samples, logPrior, prior, conditionPro, logConditionPro):
    """Score a test frame and build a confusion matrix.

    test: DataFrame whose first column is the true label code and whose
    second column is the token list (columns '1' and '2' from toFormat).
    Remaining arguments are the model state produced by myFit.

    Returns (results, testTotals):
      results    -- nested dict {real_class: {predicted_class: count}}
      testTotals -- number of rows scored
    """
    testTotals = len(test)
    results = {}
    for index, row in test.iterrows():
        # .iloc for positional access: `row[0]` / `row[1]` on a Series with
        # string labels relies on a deprecated positional fallback.
        tokens = row.iloc[1]
        realClass = row.iloc[0]
        postProb, postProbLog = getPostProb(tokens, docSum, samples, logPrior,
                                            prior, conditionPro, logConditionPro)
        isSpam, support, _ = getGender(postProb)
        predictClass = 1 if isSpam else 0
        results.setdefault(realClass, {})
        results[realClass].setdefault(predictClass, 0)
        results[realClass][predictClass] += 1
    return (results, testTotals)
# In[9]:
# Build the training set from the first 30000 labelled lines.
# Each line looks like "<label>\t<label>\t<text>" after tab-splitting;
# slice [1:3] keeps [label, text] exactly as the original did.
# BUG FIX: the file handle was opened and never closed — use `with`.
with codecs.open('1.txt', encoding='utf8') as fr:
    train = [line.strip().split('\t')[1:3] for line in fr.readlines()[0:30000]]
print(train[0:5])
# In[10]:
#分词
def toFormat(data):
    """Turn raw [label, text] rows into the model's training frame.

    Returns a DataFrame with columns '1' (integer label code from
    pd.Categorical, so labels sorted lexicographically map to 0, 1, ...)
    and '2' (list of jieba tokens for the message).

    BUG FIX / simplification: the original round-tripped through numpy
    (`np.array` + `np.c_` + `np.delete`); stacking ragged token lists that
    way is rejected by modern numpy, so the frame is built with pandas only.
    """
    df = pd.DataFrame(data)
    df = df.rename(columns={0: '1', 1: '2'})
    # jieba.cut returns a generator; materialise it per message.
    df['2'] = df['2'].map(lambda line: list(jieba.cut(line)))
    df['1'] = pd.Categorical(df['1']).codes
    return df
# In[11]:
# Tokenize the raw training rows and label-encode them.
train_m = toFormat(train)
train_m.head()
# In[12]:
# Fit the Bernoulli model: class priors plus per-word conditional probabilities.
docSum,samples,logPrior,prior,conditionPro,logConditionPro = myFit(train_m)
# In[13]:
# Hold out the last 10000 labelled lines for testing.
# BUG FIX: the file handle was opened and never closed — use `with`.
with codecs.open('1.txt', encoding='utf8') as fr:
    tail = fr.readlines()[-10000:]
totals = len(tail)  # number of held-out lines, as the original counter computed
test = [line.strip().split('\t')[1:3] for line in tail]
print(test[:5])
# In[14]:
# Tokenize the held-out rows the same way as the training data.
test_m = toFormat(test)
test_m.head()
# In[15]:
# Score the test set: nested-dict confusion matrix plus row count.
metrics,total = testModel(test_m,docSum,samples,logPrior,prior,conditionPro,logConditionPro)
print(metrics) # confusion matrix {real: {predicted: count}}
print(total) # number of test rows
# In[16]:
def showAccuracy(metrics):
    """Overall accuracy from a nested confusion matrix {real: {pred: count}}.

    BUG FIX: `metrics[category][category]` raised KeyError for a class with
    zero correct predictions (no diagonal entry); use .get with default 0.
    """
    preRight = 0
    total = 0
    for category, preDict in metrics.items():
        preRight += preDict.get(category, 0)
        total += sum(preDict.values())
    return preRight / total
# Report overall accuracy on the held-out set.
print("准确率为:",showAccuracy(metrics))
# In[17]:
def showPrecision(metrics):
    """Per-class precision from a nested confusion matrix {real: {pred: count}}.

    Assumes class labels are the integers 0..n-1 (as produced by the
    Categorical codes upstream).  Returns {class: precision}.

    Robustness fixes: missing rows/diagonal entries no longer KeyError, and a
    class that was never predicted gets precision 0.0 instead of dividing by 0.
    """
    n = len(metrics)
    precision = {}
    for i in range(n):
        # column sum: everything predicted as class i
        predicted_i = sum(metrics.get(row, {}).get(i, 0) for row in range(n))
        correct_i = metrics.get(i, {}).get(i, 0)
        precision[i] = correct_i / predicted_i if predicted_i else 0.0
    return precision
# Report per-class precision (dict keyed by label code).
print('精准率:',showPrecision(metrics))
# In[18]:
def showRecall(metrics):
    """Per-class recall from a nested confusion matrix {real: {pred: count}}.

    Assumes class labels are the integers 0..n-1.  Returns {class: recall}.

    Robustness fixes: a class absent from the matrix or with an empty row no
    longer KeyErrors / divides by zero; it gets recall 0.0.
    """
    n = len(metrics)
    recall = {}
    for i in range(n):
        row = metrics.get(i, {})
        actual_i = sum(row.values())  # row sum: all true members of class i
        recall[i] = row.get(i, 0) / actual_i if actual_i else 0.0
    return recall
# Report per-class recall (dict keyed by label code).
print('召回率:',showRecall(metrics))
# In[19]:
def showKappa(metrics):
    """Cohen's kappa from a nested confusion matrix {real: {pred: count}}.

    kappa = (p0 - pe) / (1 - pe), where p0 is the observed agreement
    (diagonal / total) and pe the agreement expected by chance
    (sum of row_total * column_total over classes, divided by total^2).
    Prints the same diagnostics as the original implementation.
    """
    # Observed agreement p0.
    total = 0
    diagonal = 0
    for category, preDict in metrics.items():
        diagonal += preDict[category]
        total += sum(preDict.values())
    p0 = diagonal / total
    print('准确数:', diagonal, '总样本数:', total)
    # Chance agreement pe.
    peNumerator = 0
    for category, preDict in metrics.items():
        rowSum = sum(preDict.values())
        colSum = sum(metrics[k][category] for k in preDict)
        print(rowSum, ' ', colSum)
        peNumerator += rowSum * colSum
    pe = peNumerator / (total * total)
    return (p0 - pe) / (1 - pe)
# Cohen's kappa for the held-out confusion matrix (value shown by the notebook cell).
showKappa(metrics)
# In[ ]: