机器学习 朴素贝叶斯实现中文垃圾短信分类

我使用伯努利朴素贝叶斯模型来实现中文垃圾短信的分类。中文分词使用的是 jieba,但没有设置停用词,因此垃圾短信的召回率会偏低,建议最好加上停用词。

数据集使用常见的 80 万条带标签的中文短信(下面的代码假设该文件已保存为 1.txt),下载链接:https://github.com/hrwhisper/SpamMessage/blob/master/data/%E5%B8%A6%E6%A0%87%E7%AD%BE%E7%9F%AD%E4%BF%A1.txt

下面是源码

#!/usr/bin/env python
# coding: utf-8

# In[2]:


import pandas as pd
import codecs
import numpy as np
import math
import jieba


# In[3]:


# Step 2: class priors and their logarithms
def getLogPrior(train):
    '''
    Compute the class priors of a labelled training frame.

    Rows whose column '1' equals 0 are counted under key 'y' and rows whose
    column '1' equals 1 under key 'n'.

    Returns a 4-tuple (docSum, samples, logPrior, prior):
        docSum:   number of rows labelled 0 or 1
        samples:  {'y': row count, 'n': row count}
        logPrior: {'y': log prior, 'n': log prior}
        prior:    {'y': prior, 'n': prior}

    Raises ZeroDivisionError when no row is labelled 0 or 1, and ValueError
    (from math.log) when one of the two classes has no rows.
    '''
    labels = train['1']
    samples = {
        'y': len(train[labels == 0]),
        'n': len(train[labels == 1]),
    }
    docSum = samples['y'] + samples['n']

    prior = {}
    for category in ('y', 'n'):
        prior[category] = samples[category] / docSum

    logPrior = {}
    for category in ('y', 'n'):
        logPrior[category] = math.log(prior[category])

    return (docSum, samples, logPrior, prior)


# In[4]:


def getConditionPro(train):
    '''
    Estimate, per class, the smoothed probability that a word is present
    in a message.

    train is a frame whose column '1' holds the label (0 -> 'y', 1 -> 'n')
    and whose column '2' holds each message as an iterable of tokens.

    Returns (conditionPro, logConditionPro), both shaped
    {'y': {word: value}, 'n': {word: value}}. Every word seen anywhere in
    train['2'] gets an entry in both classes.
    '''
    docsByClass = {
        'y': train[train['1'] == 0]['2'],
        'n': train[train['1'] == 1]['2'],
    }
    # Per-class message counts; these are the denominators of the estimate.
    samples = {category: len(docs) for category, docs in docsByClass.items()}
    # Smoothing constant added to the denominator (name kept from the
    # original code).
    classNum = 2

    # Vocabulary over the whole training frame, so that a word absent from
    # one class still receives a (zero-count, smoothed) entry there.
    wordSet = set()
    for tokens in train['2']:
        wordSet.update(tokens)

    # 1. Per class, count the number of messages that contain each word.
    #    Each word is counted at most once per message: counting every
    #    occurrence lets (count + 1) / (samples + 2) reach or exceed 1, and
    #    the "word absent" factor 1 - p then becomes zero or negative.
    words = {}
    for category, docs in docsByClass.items():
        counts = dict.fromkeys(wordSet, 0)
        for message in docs:
            for ch in set(message):
                counts[ch] += 1
        words[category] = counts

    # 2. Turn the counts into add-one-smoothed probabilities and their logs.
    conditionPro = {'y': {}, 'n': {}}
    logConditionPro = {'y': {}, 'n': {}}
    for category, chDict in words.items():
        denominator = samples[category] + classNum
        for ch, chCount in chDict.items():
            p = (chCount + 1) / denominator
            conditionPro[category][ch] = p
            logConditionPro[category][ch] = math.log(p)

    return (conditionPro, logConditionPro)


# In[5]:


# Training entry point: returns everything that makes up the model
def myFit(train):
    '''
    Fit the model on a labelled training frame.

    Returns the four values of getLogPrior(train) followed by the two values
    of getConditionPro(train):
    (docSum, samples, logPrior, prior, conditionPro, logConditionPro).
    '''
    priorParts = getLogPrior(train)
    docSum, samples, logPrior, prior = priorParts
    likelihoodParts = getConditionPro(train)
    conditionPro, logConditionPro = likelihoodParts
    return (docSum, samples, logPrior, prior, conditionPro, logConditionPro)


# In[6]:


# Compute the (unnormalised) posterior of one message for every class
def getPostProb(testData,docSum,samples,logPrior,prior,conditionPro,logConditionPro):
    '''
    Score one tokenised message against every class.

    Parameters:
        testData:        iterable of tokens of the message
        docSum:          not used here; kept so the signature matches the
                         tuple returned by myFit
        samples:         {category: training message count}
        logPrior/prior:  {category: log prior} / {category: prior}
        conditionPro:    {category: {word: P(word present | category)}}
        logConditionPro: {category: {word: log P(word present | category)}}

    Returns (postProb, postProbLog): the product of the prior with one
    factor per vocabulary word (p if the word is in the message, 1 - p
    otherwise), and the same quantity accumulated in log space.
    '''
    postProb = {'y':1,'n':1}
    postProbLog = {'y':0,'n':0}

    # Only presence matters, so collapse the message to its distinct tokens.
    set_X_test = set(testData)

    for (category,pri) in prior.items():
        prob = pri
        logProb = logPrior[category]
        classLogCond = logConditionPro[category]

        # 1. One factor per vocabulary word.
        for ch,condition in conditionPro[category].items():
            if ch in set_X_test:
                prob *= condition
                logProb += classLogCond[ch]
            else:
                absent = 1 - condition
                prob *= absent
                # The log of the absent-word factor is log(1 - p), not
                # 1 - log(p). If the supplied p is >= 1 the factor is not a
                # valid probability and its log is undefined; treat it as a
                # zero-probability event instead of raising.
                if absent > 0:
                    logProb += math.log(absent)
                else:
                    logProb += float('-inf')

        # 2. Tokens that never occurred in training get the smoothed
        #    zero-count probability.
        unseenProb = 1 / (samples[category] + len(prior))
        for ch in set_X_test:
            if ch not in conditionPro[category]:
                prob *= unseenProb
                logProb += math.log(unseenProb)

        postProb[category] = prob
        postProbLog[category] = logProb

    return (postProb,postProbLog)


# In[7]:


# Pick the class with the larger posterior
def getGender(postProb):
    '''
    Decide between the two classes from their posterior scores.

    Returns (sex, support, postProb):
        sex:      True when the 'n' score is strictly greater than the 'y' score
        support:  the 'n' score divided by the sum of both scores
        postProb: the input dict, returned unchanged
    '''
    nScore = postProb['n']
    yScore = postProb['y']
    sex = nScore > yScore
    support = nScore / (nScore + yScore)
    return (sex, support, postProb)


# In[8]:


#测试:读取一个测试文件,返回一个混淆矩阵
from numpy import *
def testModel(test,docSum,samples,logPrior,prior,conditionPro,logConditionPro):
    '''
    Classify every row of a test frame and tally a confusion matrix.

    test is a frame whose first column is the true label and whose second
    column is the tokenised message; the remaining arguments are the model
    values returned by myFit and are passed through to getPostProb.

    Returns:
        results:    confusion matrix as {real label: {predicted label: count}};
                    only label pairs that actually occurred have an entry
        testTotals: number of rows in test
    '''
    # len() gives the row count without relying on a star-imported numpy
    # `shape` being in scope.
    testTotals = len(test)
    results = {}
    for _, data in test.iterrows():
        # Columns are read by position with .iloc: the row is labelled with
        # the strings '1'/'2', so a plain integer key would rely on the
        # deprecated positional fallback of Series.__getitem__.
        realClass = data.iloc[0]
        message = data.iloc[1]

        postProb, _postProbLog = getPostProb(message,docSum,samples,logPrior,prior,conditionPro,logConditionPro)
        sex, _support, _postProb = getGender(postProb)

        # getGender reports True when the 'n' class wins, which is label 1.
        predictClass = 1 if sex else 0

        row = results.setdefault(realClass, {})
        row[predictClass] = row.get(predictClass, 0) + 1
    return (results,testTotals)


# In[9]:


# Build the model from a small part of the data: the first 30000 lines.
# The context manager closes the file as soon as the lines have been read.
with codecs.open('1.txt',encoding='utf8') as fr:
    train=[]
    for line in fr.readlines()[0:30000]:
        # Keep tab-separated fields 1 and 2 of each line; toFormat later
        # treats the first kept field as the label and the second as the text.
        arr=line.strip().split('\t')[1:3]
        train.append(arr)
print(train[0:5])


# In[10]:


# Tokenise the message texts
def toFormat(data):
    '''
    Convert raw [label, text] pairs into the frame used by the model.

    data is a sequence of two-element rows: the label first, the message
    text second.

    Returns a DataFrame with two columns:
        '1': integer category code of the label (pd.Categorical codes, so
             the codes follow the sorted distinct labels of *this* call)
        '2': the message text cut into a list of tokens by jieba
    '''
    frame = pd.DataFrame(data)
    tokens = [list(jieba.cut(line)) for line in frame[1].tolist()]
    # Build the frame column by column. Routing the token lists through
    # np.array fails for lists of different lengths on recent NumPy
    # releases and would silently become a 2-D array when every list has
    # the same length; an object Series keeps one list per row either way.
    df_data = pd.DataFrame({
        '1': pd.Categorical(frame[0]).codes,
        '2': pd.Series(tokens, dtype=object),
    })
    return df_data


# In[11]:


# Tokenise the training rows and encode their labels, then preview the frame
# (as the last expression of a notebook cell, head() is displayed).
train_m = toFormat(train)
train_m.head()


# In[12]:


# Fit the model; the six returned values are kept as module-level names and
# are passed to testModel further down.
docSum,samples,logPrior,prior,conditionPro,logConditionPro = myFit(train_m)


# In[13]:


# Use the last 10000 lines of the file as the test set.
# The context manager closes the file as soon as the lines have been read.
with codecs.open('1.txt',encoding='utf8') as fr:
    test=[]
    totals=0
    for line in fr.readlines()[-10000:]:
        totals+=1
        # Keep tab-separated fields 1 and 2, the same fields as for training.
        arr=line.strip().split('\t')[1:3]
        test.append(arr)
print(test[:5])


# In[14]:


# Tokenise the test rows and preview the frame.
# NOTE(review): toFormat derives the label codes from the labels present in
# this call, independently of the training call - confirm that both slices
# contain the same set of labels.
test_m = toFormat(test)
test_m.head()


# In[15]:


# Classify every test row with the fitted model.
metrics,total = testModel(test_m,docSum,samples,logPrior,prior,conditionPro,logConditionPro)
print(metrics)  # confusion matrix
print(total)    # number of test rows


# In[16]:


def showAccuracy(metrics):
    '''
    Return the accuracy of a confusion matrix.

    metrics is {real label: {predicted label: count}}. Cells that are
    missing from the nested dicts count as zero, so a class without a
    single correct prediction does not raise KeyError.

    Raises ZeroDivisionError when the matrix holds no counts at all.
    '''
    preRight=0
    total=0
    for category,preDict in metrics.items():
        # Diagonal cell; absent when this class was never predicted correctly.
        preRight += preDict.get(category,0)
        for result in preDict.values():
            total += result
    return preRight/total

# Report the overall accuracy of the confusion matrix computed above.
print("准确率为:",showAccuracy(metrics))


# In[17]:


def showPrecision(metrics):
    '''
    Return the per-class precision of a confusion matrix.

    metrics is {real label: {predicted label: count}}. The result maps each
    real label to (correct predictions of that label) / (all predictions of
    that label). Missing cells count as zero, and a label that was never
    predicted gets precision 0.0 instead of raising.
    '''
    precision={}
    # Iterate the labels that are actually present rather than assuming
    # they are exactly 0..n-1; sorting keeps the 0, 1, ... order for
    # integer labels.
    for label in sorted(metrics):
        # Column sum: how often `label` was predicted, over every real class.
        # Accumulated explicitly because the file star-imports numpy, which
        # shadows the builtin `sum`.
        ySum=0
        for row in metrics.values():
            ySum += row.get(label,0)
        correct = metrics[label].get(label,0)
        precision[label] = correct/ySum if ySum else 0.0
    return precision

# Report the per-class precision of the confusion matrix computed above.
print('精准率:',showPrecision(metrics))


# In[18]:


def showRecall(metrics):
    '''
    Return the per-class recall of a confusion matrix.

    metrics is {real label: {predicted label: count}}. The result maps each
    real label to (correct predictions of that label) / (all rows whose real
    label it is). A missing diagonal cell counts as zero, and a label whose
    row holds no counts gets recall 0.0 instead of raising.
    '''
    recall={}
    # Iterate the labels that are actually present rather than assuming
    # they are exactly 0..n-1; sorting keeps the 0, 1, ... order for
    # integer labels.
    for label in sorted(metrics):
        row = metrics[label]
        # Row sum, accumulated explicitly because the file star-imports
        # numpy, which shadows the builtin `sum`.
        xSum=0
        for value in row.values():
            xSum += value
        recall[label] = row.get(label,0)/xSum if xSum else 0.0
    return recall
# Report the per-class recall of the confusion matrix computed above.
print('召回率:',showRecall(metrics))


# In[19]:


def showKappa(metrics):
    '''
    Return the kappa coefficient (p0 - pe) / (1 - pe) of a confusion matrix.

    metrics is {real label: {predicted label: count}}; missing cells count
    as zero. p0 is the share of the diagonal, pe is the sum over the real
    labels of (row sum * column sum) divided by the squared total.

    Prints the number of correct predictions and the total, then one line
    with the row sum and column sum of every real label.

    Returns NaN when the coefficient is undefined: an empty matrix, or
    pe == 1 (for example when every row has the same real and predicted
    label).
    '''
    total=0
    totalForP0=0
    for (category,preDict) in metrics.items():
        # Diagonal cell; absent when this class was never predicted correctly.
        totalForP0+=preDict.get(category,0)
        for result in preDict.values():
            total+=result
    if total==0:
        return float('nan')
    p0=totalForP0/total
    print('准确数:',totalForP0,'总样本数:',total)

    totalForPe=0
    for (category,preDict) in metrics.items():
        # Row sum: rows whose real label is `category`.
        xsum=0
        for v in preDict.values():
            xsum+=v
        # Column sum: rows predicted as `category`, taken over every real
        # label so that cells missing from this row are not skipped.
        ysum=0
        for row in metrics.values():
            ysum+=row.get(category,0)
        print(xsum,' ',ysum)
        totalForPe+=xsum*ysum
    pe=totalForPe/(total*total)
    if pe==1:
        return float('nan')
    k=(p0-pe)/(1-pe)
    return k

# Compute the kappa coefficient; the function prints its intermediate sums,
# and the returned value is only shown when this is the last expression of
# a notebook cell.
showKappa(metrics)


# In[ ]:




 

你可能感兴趣的:(机器学习)