KNN for Chinese text classification

Adapted from this blog post:

http://blog.csdn.net/github_36326955/article/details/54891204


Some notes for my own reference.

Run the scripts in the order 1, 2, 3, 4:

1.py (corpus_segment.py)

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@version: python2.7.8
@author: XiangguoSun
@contact: [email protected]
@file: corpus_segment.py
@time: 2017/2/5 15:28
@software: PyCharm
"""
import sys
import os
import jieba
# Set up a UTF-8 environment
reload(sys)
sys.setdefaultencoding('utf-8')

# Save content to a file
def savefile(savepath, content):
    with open(savepath, "wb") as fp:
        fp.write(content)
    '''
    The two lines above use the with-statement, available since Python 2.6,
    which spares us the tedious close/try handling; Python 2.5 needs
    "from __future__ import with_statement". A beginner-friendly reference:
    http://zhoutall.com/archives/325
    '''

# Read a file
def readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def corpus_segment(corpus_path, seg_path):
    '''
    corpus_path: path of the unsegmented corpus
    seg_path: path where the segmented corpus will be stored
    '''
    catelist = os.listdir(corpus_path)  # all subdirectories under corpus_path
    '''
    The subdirectory names are the category names. For example, in
    train_corpus/art/21.txt, 'train_corpus/' is corpus_path and 'art' is
    one member of catelist.
    '''

    # Process every file in each category directory
    for mydir in catelist:
        # mydir is the 'art' part of train_corpus/art/21.txt, i.e. one category in catelist
        class_path = corpus_path + mydir + "/"  # category directory, e.g. train_corpus/art/
        seg_dir = seg_path + mydir + "/"  # matching output directory, e.g. train_corpus_seg/art/

        if not os.path.exists(seg_dir):  # create the output directory if it does not exist
            os.makedirs(seg_dir)

        file_list = os.listdir(class_path)  # all texts of this category
        '''
        For train_corpus/art/ containing 21.txt, 22.txt, 23.txt ...
        file_list = ['21.txt', '22.txt', ...]
        '''
        for file_path in file_list:  # iterate over all files in the category directory
            fullname = class_path + file_path  # full path, e.g. train_corpus/art/21.txt
            content = readfile(fullname)  # read the file content
            '''
            At this point content holds every character of the raw text,
            including extra spaces, blank lines, line breaks and so on.
            Next we strip this irrelevant whitespace, leaving compact text
            separated only by punctuation.
            '''
            content = content.replace("\r\n", "")  # remove line breaks
            content = content.replace(" ", "")  # remove blank lines and extra spaces
            content_seg = jieba.cut(content)  # segment the text
            savefile(seg_dir + file_path, " ".join(content_seg))  # save the segmented file to the output corpus

    print "Finished segmenting the Chinese corpus!"

'''
If you are unsure what if __name__ == "__main__": means, see
http://imoyao.lofter.com/post/3492bc_bd0c4ce
In short: when another Python file calls this file's functions, or when this
file is imported as a module, the code below does not run; it runs only when
the file is executed directly from the command line or from an IDE such as
PyCharm. This block therefore acts as a quick functional test.
'''
if __name__ == "__main__":
    # Segment the training set
    corpus_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train/"  # unsegmented corpus, input
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_corpus_seg/"  # segmented corpus, output of this script
    corpus_segment(corpus_path, seg_path)

    # Segment the test set
    corpus_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/answer/"  # unsegmented corpus, input
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_corpus_seg/"  # segmented corpus, output of this script
    corpus_segment(corpus_path, seg_path)
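
To see what the segmentation step produces, here is a minimal, self-contained sketch of the same jieba.cut call; the sample sentence is my own illustration, not taken from the corpus:

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import jieba

# Segment one sentence in jieba's default (precise) mode and join the
# tokens with spaces, exactly as corpus_segment does for each file.
text = u"小明硕士毕业于中国科学院计算所"
print " ".join(jieba.cut(text))
# Typically prints: 小明 硕士 毕业 于 中国科学院 计算所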

2.py (corpus2Bunch.py)

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@version: python2.7.8
@author: XiangguoSun
@contact: [email protected]
@file: corpus2Bunch.py
@time: 2017/2/7 7:41
@software: PyCharm
"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import os  # built-in module for file and directory operations; we use os.listdir below
import cPickle as pickle  # import cPickle under the alias pickle
'''
Python also ships a module literally named pickle; the name clash is
harmless here. For more on cPickle vs. pickle, see the author's post
"python核心模块之pickle和cPickle讲解":
http://blog.csdn.net/github_36326955/article/details/54882506
Below we use cPickle's dump function.
'''
from sklearn.datasets.base import Bunch
# No need to dig into this for now; just remember that this is how the
# Bunch data structure is imported. Later posts cover sklearn in more depth.


def _readfile(path):
    '''Read a file.'''
    # The leading underscore marks the function as private. It is purely
    # a naming convention - outside code can still call it - but it
    # improves readability.
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def corpus2Bunch(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)  # subdirectories of seg_path, i.e. the category names
    # Create a Bunch instance
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)
    '''
    extend(addlist) is a Python list method that extends the original
    list with the elements of another list (addlist).
    '''
    # Collect every file in each category directory
    for mydir in catelist:
        class_path = seg_path + mydir + "/"  # category directory path
        file_list = os.listdir(class_path)  # all files under class_path
        for file_path in file_list:  # iterate over the files of this category
            fullname = class_path + file_path  # full file path
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(_readfile(fullname))  # read the file content
            '''append(element) adds a single element to a list; note the difference from extend()'''
    # Serialize the bunch to wordbag_path
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print "Finished building the text objects!"

if __name__ == "__main__":
    # Build the Bunch for the training set:
    wordbag_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/train_set.dat"  # where the Bunch is stored, output
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_corpus_seg/"  # segmented corpus, input
    corpus2Bunch(wordbag_path, seg_path)

    # Build the Bunch for the test set:
    wordbag_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/test_set.dat"  # where the Bunch is stored, output
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_corpus_seg/"  # segmented corpus, input
    corpus2Bunch(wordbag_path, seg_path)
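
To confirm what ended up inside the serialized Bunch, a quick inspection sketch like the one below can help. It assumes you run it from the chinese_text_classification-master directory so the relative path resolves:

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import cPickle as pickle

# Load the Bunch written by corpus2Bunch and print a short summary.
with open("train_word_bag/train_set.dat", "rb") as f:
    bunch = pickle.load(f)

print "categories:", bunch.target_name
print "number of documents:", len(bunch.contents)
print "first file:", bunch.filenames[0], "->", bunch.label[0]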


3.py (TFIDF_space.py)

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@version: python2.7.8
@author: XiangguoSun
@contact: [email protected]
@file: TFIDF_space.py
@time: 2017/2/8 11:39
@software: PyCharm
"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from sklearn.datasets.base import Bunch
import cPickle as pickle
from sklearn.feature_extraction.text import TfidfVectorizer

def _readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

def _writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)

def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):

    stpwrdlst = _readfile(stopword_path).splitlines()  # stop-word list, one word per line
    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[], vocabulary={})

    if train_tfidf_path is not None:
        # Test set: pin the vocabulary learned on the training set so the
        # test matrix has the same feature columns as the training matrix.
        trainbunch = _readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, vocabulary=trainbunch.vocabulary)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)

    else:
        # Training set: learn the vocabulary from the corpus itself.
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_

    _writebunchobj(space_path, tfidfspace)
    print "Created the tf-idf word vector space instance!"

if __name__ == '__main__':

    # Earlier variant using relative paths:
    # stopword_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/train_word_bag/hlt_stop_words.txt"  # input
    # bunch_path = "train_word_bag/train_set.dat"  # input
    # space_path = "train_word_bag/tfdifspace.dat"  # output
    # vector_space(stopword_path, bunch_path, space_path)
    #
    # bunch_path = "test_word_bag/test_set.dat"  # input
    # space_path = "test_word_bag/testspace.dat"
    # train_tfidf_path = "train_word_bag/tfdifspace.dat"
    # vector_space(stopword_path, bunch_path, space_path, train_tfidf_path)

    stopword_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/hlt_stop_words.txt"  # input

    train_bunch_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/train_set.dat"  # input
    space_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/tfidfspace.dat"  # output
    vector_space(stopword_path, train_bunch_path, space_path)

    train_tfidf_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/tfidfspace.dat"  # input, produced just above
    test_bunch_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/test_set.dat"  # input
    test_space_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/testspace.dat"  # output

    vector_space(stopword_path, test_bunch_path, test_space_path, train_tfidf_path)
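
The subtle point in vector_space is the train_tfidf_path branch: the test matrix must be built over the training vocabulary, otherwise its columns would not line up with the training matrix and the classifier in step 4 could not use it. A toy sketch of the same idea, with invented documents purely for illustration:

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from sklearn.feature_extraction.text import TfidfVectorizer

train_docs = ["economy market trade", "space rocket launch"]
test_docs = ["market crash", "rocket engine"]  # "crash" and "engine" never occur in training

# Fit on the training corpus to learn the vocabulary.
train_vec = TfidfVectorizer(sublinear_tf=True)
train_tdm = train_vec.fit_transform(train_docs)

# Pin that vocabulary when vectorizing the test corpus: unseen words are
# simply dropped, and the column layout matches the training matrix.
test_vec = TfidfVectorizer(sublinear_tf=True, vocabulary=train_vec.vocabulary_)
test_tdm = test_vec.fit_transform(test_docs)

print train_tdm.shape, test_tdm.shape  # same number of columns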

4.py (NBayes_Predict.py)

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@version: python2.7.8 
@author: XiangguoSun
@contact: [email protected]
@file: NBayes_Predict.py
@time: 2017/2/8 12:21
@software: PyCharm
"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import cPickle as pickle
from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes


# Read a bunch object
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

# Load the training set
trainpath = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/train_word_bag/tfidfspace.dat"
train_set = _readbunchobj(trainpath)

# Load the test set
testpath = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_word_bag/testspace.dat"
test_set = _readbunchobj(testpath)

# Train a classifier on the bag-of-words vectors and class labels.
# alpha is the additive smoothing parameter of naive Bayes; smaller
# values smooth less and fit the training data more closely.
# clf = MultinomialNB(alpha=0.1).fit(train_set.tdm, train_set.label)

######################################################
# KNN classifier
from sklearn.neighbors import KNeighborsClassifier
print '*************************\nKNN\n*************************'
clf = KNeighborsClassifier()  # default is k=5 (n_neighbors=5)
clf.fit(train_set.tdm, train_set.label)

# Predict the categories of the test set
predicted = clf.predict(test_set.tdm)

for flabel, file_name, expct_cate in zip(test_set.label, test_set.filenames, predicted):
    if flabel != expct_cate:
        print file_name, ": actual category:", flabel, " --> predicted category:", expct_cate

print "Prediction finished!"

# Compute classification metrics:
from sklearn import metrics
def metrics_result(actual, predict):
    print 'precision: {0:.3f}'.format(metrics.precision_score(actual, predict, average='weighted'))
    print 'recall: {0:.3f}'.format(metrics.recall_score(actual, predict, average='weighted'))
    print 'f1-score: {0:.3f}'.format(metrics.f1_score(actual, predict, average='weighted'))

metrics_result(test_set.label, predicted)
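
KNeighborsClassifier is used with its defaults above, i.e. k=5 with uniform weights. k is worth tuning; below is a minimal sketch of a cross-validated grid search over k, assuming the same tfidfspace.dat produced in step 3 and a run from the project root (in newer scikit-learn versions the import path is sklearn.model_selection instead of sklearn.grid_search):

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import cPickle as pickle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in newer versions

with open("train_word_bag/tfidfspace.dat", "rb") as f:
    train_set = pickle.load(f)

# Cross-validate k over a small range; n_jobs=-1 uses all CPU cores.
search = GridSearchCV(KNeighborsClassifier(),
                      param_grid={"n_neighbors": [1, 3, 5, 7, 9, 15]},
                      cv=5, n_jobs=-1)
search.fit(train_set.tdm, train_set.label)
print "best k:", search.best_params_["n_neighbors"], "cv accuracy:", search.best_score_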

As before, the Fudan University Chinese news corpus is used.

Run results (a partial copy):

/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_corpus_seg/C16-Electronics/C16-Electronics37.txt : actual category: C16-Electronics  --> predicted category: C11-Space
/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_corpus_seg/C16-Electronics/C16-Electronics19.txt : actual category: C16-Electronics  --> predicted category: C34-Economy
/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_corpus_seg/C16-Electronics/C16-Electronics35.txt : actual category: C16-Electronics  --> predicted category: C39-Sports
/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_corpus_seg/C16-Electronics/C16-Electronics31.txt : actual category: C16-Electronics  --> predicted category: C11-Space
/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_corpus_seg/C16-Electronics/C16-Electronics52.txt : actual category: C16-Electronics  --> predicted category: C17-Communication
/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_corpus_seg/C16-Electronics/C16-Electronics07.txt : actual category: C16-Electronics  --> predicted category: C17-Communication
/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_corpus_seg/C16-Electronics/C16-Electronics02.txt : actual category: C16-Electronics  --> predicted category: C34-Economy
/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204/chinese_text_classification-master/test_corpus_seg/C16-Electronics/C16-Electronics48.txt : actual category: C16-Electronics  --> predicted category: C34-Economy
Prediction finished!
precision: 0.890
recall: 0.893
f1-score: 0.886

