Python sklearn 实现中文垃圾短信分类

  1. 数据读取
import pandas as pd
import jieba 
# Load the labelled SMS corpus into a DataFrame.
# header=0 combined with names= tells pandas to discard the file's own header
# row and use the three column names given here instead.
# NOTE(review): the path is machine-specific and the file is read as GBK;
# confirm both before running on another machine.
data = pd.read_csv(
    r"E:\数据\实验data\messages.csv",
    encoding="gbk",
    header=0,
    names=["ID", "label", "text"],
)
# print(data.head())  # uncomment to inspect the first rows
  2. 短信分词
def _segment(message):
    """Split one message into words with jieba and join them with spaces."""
    return ' '.join(jieba.cut(message))


# Keep the space-delimited segmentation in its own column next to the raw text.
data['cut_message'] = data["text"].apply(_segment)
# print(data.head())  # uncomment to inspect the segmented rows

# Segmented texts (features) and their labels as plain arrays.
x, y = data['cut_message'].values, data['label'].values
  3. 训练集、测试集划分
# train_test_split lives in sklearn.model_selection; the older
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for testing (test : train = 1 : 9).
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.1)
  4. 模型训练与预测
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

# Training features: raw term counts first, then TF-IDF weighting on top.
vectorizer = CountVectorizer()
tfidf_transformer = TfidfTransformer()
x_train_termcounts = vectorizer.fit_transform(train_x)
x_train_tfidf = tfidf_transformer.fit_transform(x_train_termcounts)

# Fit a multinomial naive Bayes model on the weighted training matrix.
classifier = MultinomialNB()
classifier.fit(x_train_tfidf, train_y)

# Test features go through the already-fitted vectorizer and transformer
# (transform only, no refitting on the test data).
x_input_termcounts = vectorizer.transform(test_x)
x_input_tfidf = tfidf_transformer.transform(x_input_termcounts)

# Predicted class for every held-out message.
predicted_categories = classifier.predict(x_input_tfidf)
  5. 准确率、召回率
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score

# Accuracy of the predictions against the true test labels.
accuracy_s = accuracy_score(y_true=test_y, y_pred=predicted_categories)
# Recall of the predictions, computed with recall_score's default settings.
recall_s = recall_score(y_true=test_y, y_pred=predicted_categories)
  6. 混淆矩阵
from sklearn.metrics import confusion_matrix

# Keep the matrix in a variable and print it: a bare call discards its result
# when this file is run as a script, so nothing would be shown.
# Per the scikit-learn documentation, rows are true labels and columns are
# predicted labels.
conf_matrix = confusion_matrix(test_y, predicted_categories)
print(conf_matrix)
  7. 输出一部分实例
# Readable names for the numeric labels: 0 is a normal message, 1 is spam.
category_map = {0: 'normal', 1: 'spam'}

# Print the first ten test messages with their predicted and true classes.
for text, predicted, actual in zip(test_x[:10], predicted_categories[:10], test_y[:10]):
    predicted_name = category_map[predicted]
    actual_name = category_map[actual]
    print('\nmessage_content:', text, '\npredicted_type:', predicted_name, 'real_values:', actual_name)

你可能感兴趣的:(python代码,商业分析,数据挖掘,python,自然语言处理,机器学习)