改编自博客:
http://blog.csdn.net/github_36326955/article/details/54891204
做个笔记
代码按照 1、2、3、4 的顺序依次运行即可:
1.py(corpus_segment.py)
3.py(TFIDF_space.py)
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@version: python2.7.8
@author: XiangguoSun
@contact: [email protected]
@file: TFIDF_space.py
@time: 2017/2/8 11:39
@software: PyCharm
"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from sklearn.datasets.base import Bunch
import cPickle as pickle
from sklearn.feature_extraction.text import TfidfVectorizer
def _readfile(path):
with open(path, "rb") as fp:
content = fp.read()
return content
def _readbunchobj(path):
with open(path, "rb") as file_obj:
bunch = pickle.load(file_obj)
return bunch
def _writebunchobj(path, bunchobj):
with open(path, "wb") as file_obj:
pickle.dump(bunchobj, file_obj)
def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    """Build a TF-IDF term-document matrix for a pickled corpus and save it.

    Args:
        stopword_path: File whose lines are the stop words.
        bunch_path: Pickled Bunch to vectorize; its ``target_name``,
            ``label``, ``filenames`` and ``contents`` attributes are read.
        space_path: Where the resulting Bunch (same ``target_name``,
            ``label`` and ``filenames`` plus ``tdm`` and ``vocabulary``)
            is pickled.
        train_tfidf_path: Optional path to a Bunch previously written by
            this function. When given, its ``vocabulary`` is reused so the
            new matrix has the same columns; otherwise the vocabulary is
            learned from ``bunch.contents``.
    """
    stop_words = _readfile(stopword_path).splitlines()
    corpus = _readbunchobj(bunch_path)
    tfidfspace = Bunch(
        target_name=corpus.target_name,
        label=corpus.label,
        filenames=corpus.filenames,
        tdm=[],
        vocabulary={},
    )

    # A fixed vocabulary is only available when a training space was given.
    train_vocabulary = None
    if train_tfidf_path is not None:
        train_vocabulary = _readbunchobj(train_tfidf_path).vocabulary

    # vocabulary=None is the TfidfVectorizer default, i.e. "learn it".
    vectorizer = TfidfVectorizer(
        stop_words=stop_words,
        sublinear_tf=True,
        max_df=0.5,
        vocabulary=train_vocabulary,
    )
    # NOTE(review): fit_transform is also used when a training vocabulary is
    # supplied, so the IDF weights are presumably re-estimated from this
    # corpus rather than taken from the training set -- confirm this is
    # intended before relying on train/test feature comparability.
    tfidfspace.tdm = vectorizer.fit_transform(corpus.contents)

    if train_vocabulary is None:
        tfidfspace.vocabulary = vectorizer.vocabulary_
    else:
        tfidfspace.vocabulary = train_vocabulary

    _writebunchobj(space_path, tfidfspace)
    # Message previously read "if-idf"; corrected to "tf-idf".
    print("tf-idf词向量空间实例创建成功!!!")
if __name__ == '__main__':
    # Root of the corpus checkout; every input and output file lives below it.
    base_dir = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master"

    # Step 1: build the TF-IDF space for the training set.
    stopword_path = base_dir + "/train_word_bag/hlt_stop_words.txt"  # input
    train_bunch_path = base_dir + "/train_word_bag/train_set.dat"  # input
    space_path = base_dir + "/train_word_bag/tfidfspace.dat"  # output
    vector_space(stopword_path, train_bunch_path, space_path)

    # Step 2: build the test-set space using the vocabulary from step 1.
    train_tfidf_path = space_path  # input, produced by step 1
    test_bunch_path = base_dir + "/test_word_bag/test_set.dat"  # input
    test_space_path = base_dir + "/test_word_bag/testspace.dat"  # output
    vector_space(stopword_path, test_bunch_path, test_space_path, train_tfidf_path)
4.py(NBayes_Predict.py)
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@version: python2.7.8
@author: XiangguoSun
@contact: [email protected]
@file: NBayes_Predict.py
@time: 2017/2/8 12:21
@software: PyCharm
"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import cPickle as pickle
from sklearn.naive_bayes import MultinomialNB # 导入多项式贝叶斯算法
# 读取bunch对象
def _readbunchobj(path):
with open(path, "rb") as file_obj:
bunch = pickle.load(file_obj)
return bunch
# 导入训练集
trainpath = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/train_word_bag/tfidfspace.dat"
train_set = _readbunchobj(trainpath)
# 导入测试集
testpath = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_word_bag/testspace.dat"
test_set = _readbunchobj(testpath)
######################################################
# SVM Classifier
from sklearn.svm import SVC
print '*************************\nSVM\n*************************'
clf = SVC(kernel='linear') # default with 'rbf'
clf.fit(train_set.tdm, train_set.label)
# 预测分类结果
predicted = clf.predict(test_set.tdm)
for flabel,file_name,expct_cate in zip(test_set.label,test_set.filenames,predicted):
if flabel != expct_cate:
print file_name,": 实际类别:",flabel," -->预测类别:",expct_cate
print "预测完毕!!!"
# Compute classification quality metrics.
from sklearn import metrics


def metrics_result(actual, predict):
    """Print weighted-average precision, recall and F1 to three decimals.

    Args:
        actual: True labels.
        predict: Predicted labels, in the same order as *actual*.
    """
    precision = metrics.precision_score(actual, predict, average='weighted')
    recall = metrics.recall_score(actual, predict, average='weighted')
    f1 = metrics.f1_score(actual, predict, average='weighted')
    print('精度:{0:.3f}'.format(precision))
    print('召回:{0:0.3f}'.format(recall))
    print('f1-score:{0:.3f}'.format(f1))


metrics_result(test_set.label, predicted)
运行结果(这里复制一部分):
/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_corpus_seg/C16-Electronics/C16-Electronics13.txt : 实际类别: C16-Electronics -->预测类别: C19-Computer
/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_corpus_seg/C16-Electronics/C16-Electronics48.txt : 实际类别: C16-Electronics -->预测类别: C34-Economy
预测完毕!!!
精度:0.928
召回:0.927
f1-score:0.921
Process finished with exit code 0
用SVM时间会比较长,请耐心等待。