Problem description:
tmt.txt, food.txt and eng.txt are the training sets; each line is one sample, namely a patent from the TMT, food and construction industries respectively. test.txt is the test set and contains patents from all three industries. Write a Python program that computes the proportion of each of the three industries among the patents in test.txt, e.g. {'tmt': 0.333, 'food': 0.333, 'eng': 0.333}.
Dataset download: https://pan.baidu.com/s/1P1_w97Gc1kCedcnsC1kc9A  password: o4g0
Approach: split each txt file into many small txt files and put them into a folder per class, so that the folder name serves as the label; a Bunch object can then be used to carry the category information (see the sketch below).
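The splitting step itself is not shown in this write-up; the following is a minimal sketch of it. The input file names (tmt.txt, food.txt, eng.txt, test.txt) come from the problem statement, while the output layout (train_corpus/&lt;label&gt;/, test_corpus/test/) and the one-sample-per-line handling are my own assumptions and may need adjusting to the actual data.
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Sketch: split each raw txt file (one sample per line) into one small file per
# sample, grouped into per-label folders. The directory layout is an assumption.
import os

def split_file(src_path, dst_dir, prefix):
    os.makedirs(dst_dir, exist_ok=True)
    with open(src_path, "r", encoding="utf-8") as fp:
        for i, line in enumerate(fp):
            line = line.strip()
            if not line:  # skip empty lines
                continue
            out_path = os.path.join(dst_dir, "%s_%d.txt" % (prefix, i))
            with open(out_path, "w", encoding="utf-8") as out:
                out.write(line)

if __name__ == "__main__":
    # Training files: the folder name is the label
    for label in ("tmt", "food", "eng"):
        split_file(label + ".txt", "train_corpus/" + label, label)
    # Test file: all samples go into a single placeholder folder
    split_file("test.txt", "test_corpus/test", "test")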
1. Preprocessing: word segmentation
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import os
import jieba

# Save the segmentation result to savepath
def savefile(savepath, content):
    with open(savepath, "w", encoding='utf-8') as fp:
        fp.write(content)

# Read the file to be segmented
def readfile(path):
    with open(path, "r", encoding='utf-8') as fp:
        content = fp.read()
    return content

def corpus_segment(corpus_path, seg_path):
    '''
    corpus_path: path of the raw corpus
    seg_path: path where the segmented corpus is written
    '''
    catelist = os.listdir(corpus_path)  # all subdirectories under corpus_path
    '''
    The subdirectory names are the category names, e.g. in
    train_corpus/tmt/tmt_21.txt, 'train_corpus/' is corpus_path and 'tmt' is one member of catelist
    '''
    # Process every file in each category directory
    for mydir in catelist:
        '''
        mydir is the 'tmt' part of train_corpus/tmt/tmt_21.txt (one category in catelist)
        '''
        class_path = corpus_path + mydir + "/"  # category subdirectory, e.g. train_corpus/tmt/
        seg_dir = seg_path + mydir + "/"        # output directory for the segmented files, e.g. train_corpus_seg/tmt/
        if not os.path.exists(seg_dir):         # create the output directory if it does not exist
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)      # all raw files of this category
        '''
        e.g. train_corpus/tmt/ contains
        tmt_21.txt, tmt_22.txt, tmt_23.txt, ...
        so file_list = [..., 'tmt_21.txt', 'tmt_22.txt', ...]
        '''
        for file_path in file_list:                # iterate over the files of this category
            fullname = class_path + file_path      # full path, e.g. train_corpus/tmt/tmt_21.txt
            content = readfile(fullname)           # read the file content
            content = content.replace("\r\n", "")  # remove line breaks
            content = content.replace(" ", "")     # remove blank lines and extra spaces
            content_seg = jieba.cut(content, HMM=True)  # segment the content with jieba
            savefile(seg_dir + file_path, " ".join(content_seg))  # write the segmented text to the output directory

if __name__ == "__main__":
    # Segment the training set
    corpus_path = "./train_corpus/"
    seg_path = "./train_corpus_seg/"
    corpus_segment(corpus_path, seg_path)
    print("Training set segmentation done!")
    # Segment the test set
    corpus_path = "./test_corpus/"
    seg_path = "./test_corpus_seg/"
    corpus_segment(corpus_path, seg_path)
    print("Test set segmentation done!")
2. Labeling
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import os        # built-in module for file and directory operations (os.listdir is used below)
import pickle    # used to serialize the Bunch object
from sklearn.utils import Bunch

def _readfile(path):
    '''Read a file as bytes'''
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def corpus2Bunch(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)  # subdirectories of seg_path, i.e. the category names
    # Create a Bunch instance
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)
    '''
    extend(addlist) is a Python list method that appends
    every element of addlist to the original list
    '''
    # Collect every file of every category
    for mydir in catelist:
        class_path = seg_path + mydir + "/"   # category subdirectory
        file_list = os.listdir(class_path)    # all files under class_path
        for file_path in file_list:           # iterate over the files of this category
            fullname = class_path + file_path # full file path
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(_readfile(fullname))  # read the file content
    # Make sure the output directory exists, then serialize the bunch to wordbag_path
    os.makedirs(os.path.dirname(wordbag_path), exist_ok=True)
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)

if __name__ == "__main__":
    # Build the Bunch for the training set
    wordbag_path = "train_word_bag/train_set.dat"  # where the Bunch is stored
    seg_path = "train_corpus_seg/"
    corpus2Bunch(wordbag_path, seg_path)
    print("train Bunch created!")
    # Build the Bunch for the test set
    wordbag_path = "test_word_bag/test_set.dat"    # where the Bunch is stored
    seg_path = "test_corpus_seg/"
    corpus2Bunch(wordbag_path, seg_path)
    print("test Bunch created!")
3. Vectorization and feature extraction (TF-IDF)
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import pickle
from sklearn.utils import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer

# Read a file as bytes
def _readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content

# Load a pickled Bunch object
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

# Write a Bunch object with pickle
def _writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)

# Build the TF-IDF vector space
def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    stpwrdlst = _readfile(stopword_path).decode('utf-8').splitlines()  # stop-word list, decoded so it matches the unicode tokens
    bunch = _readbunchobj(bunch_path)  # load the Bunch object
    # Bunch that holds the TF-IDF vector space
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames,
                       tdm=[], vocabulary={})
    # tdm is the TF-IDF weight matrix
    # vocabulary maps each term to its column index
    if train_tfidf_path is not None:
        # Test set: reuse the training vocabulary so the columns line up with the training matrix
        trainbunch = _readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5,
                                     vocabulary=trainbunch.vocabulary)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    else:
        # Training set: build the vector space model with TfidfVectorizer
        # sublinear_tf=True: use sublinear tf scaling, i.e. 1 + log(tf) instead of the raw term frequency
        # max_df=0.5: treat terms that appear in more than 50% of the documents as stop words;
        #             a term that occurs in most documents has little discriminative power
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)  # tdm holds the weight matrix
        tfidfspace.vocabulary = vectorizer.vocabulary_
    _writebunchobj(space_path, tfidfspace)

if __name__ == '__main__':
    stopword_path = "train_word_bag/hlt_stop_words.txt"
    bunch_path = "train_word_bag/train_set.dat"
    space_path = "train_word_bag/tfdifspace.dat"
    vector_space(stopword_path, bunch_path, space_path)
    print("train TF-IDF created")

    bunch_path = "test_word_bag/test_set.dat"
    space_path = "test_word_bag/testspace.dat"
    train_tfidf_path = "train_word_bag/tfdifspace.dat"
    vector_space(stopword_path, bunch_path, space_path, train_tfidf_path)
    print("test TF-IDF created")
4. Classification with Naive Bayes
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import pickle
from sklearn.naive_bayes import MultinomialNB  # multinomial Naive Bayes

# Load a pickled Bunch object
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

# Load the training set
trainpath = "train_word_bag/tfdifspace.dat"  # tfdifspace is a Bunch holding the tdm matrix and the vocabulary
train_set = _readbunchobj(trainpath)

# Load the test set
testpath = "test_word_bag/testspace.dat"
test_set = _readbunchobj(testpath)

# Train the classifier on the tdm matrix (document vectors) and the class labels.
# alpha is the additive (Laplace) smoothing parameter; a smaller alpha means less smoothing.
clf = MultinomialNB(alpha=0.01).fit(train_set.tdm, train_set.label)

# Predict the category of every test document
predicted = clf.predict(test_set.tdm)
# Count how many test documents are predicted to belong to each category
tmt_count = 0
food_count = 0
eng_count = 0
for flabel, file_name, expct_cate in zip(test_set.label, test_set.filenames, predicted):
    # print(file_name, ": folder label:", flabel, " --> predicted category:", expct_cate)
    if expct_cate == "tmt":
        tmt_count += 1
    elif expct_cate == "food":
        food_count += 1
    elif expct_cate == "eng":
        eng_count += 1

total = eng_count + food_count + tmt_count
tmt_ratio = tmt_count / total
food_ratio = food_count / total
eng_ratio = eng_count / total
print("predict result: ")
print("tmt_ratio = %r" % (tmt_ratio))
print("food_ratio = %r" % (food_ratio))
print("eng_ratio = %r" % (eng_ratio))
References
https://blog.csdn.net/pangtouyu_qy/article/details/79838681
https://github.com/YasinQiu/Chinese-Text-Classification-NBayes