Patent Text Classification

Task description:
The files tmt.txt, food.txt, and eng.txt form the training set; each line is one sample, a patent from the TMT, food, or construction (eng) industry respectively. test.txt is the test set and contains patents from all three industries. Write a Python program that computes the proportion of each industry's patents in test.txt, e.g. {'tmt': 0.333, 'food': 0.333, 'eng': 0.333}.

Dataset download: https://pan.baidu.com/s/1P1_w97Gc1kCedcnsC1kc9A (password: o4g0)

Approach: split each txt file into many small txt files and put them into one folder per class, so that the folder name serves as the class label; a Bunch object can then be used to record the categories (a minimal splitting sketch is given below)...
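
A minimal sketch of this splitting step, assuming the three training files and test.txt sit in the working directory and each non-empty line is one patent. The output layout (train_corpus/<label>/<label>_<n>.txt, plus a single placeholder folder test_corpus/test/ for the unlabeled test samples) is an illustrative choice, not prescribed by the task:

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import os

def split_corpus(src_file, label, out_root="./train_corpus/"):
    """Write every non-empty line of src_file into its own small txt file under out_root/label/."""
    out_dir = os.path.join(out_root, label)
    os.makedirs(out_dir, exist_ok=True)
    with open(src_file, "r", encoding="utf-8") as fp:
        for i, line in enumerate(fp):
            line = line.strip()
            if not line:  # skip blank lines
                continue
            out_path = os.path.join(out_dir, "%s_%d.txt" % (label, i))
            with open(out_path, "w", encoding="utf-8") as out:
                out.write(line)

if __name__ == "__main__":
    split_corpus("tmt.txt", "tmt")
    split_corpus("food.txt", "food")
    split_corpus("eng.txt", "eng")
    # test.txt goes into a single placeholder folder because its labels are unknown
    split_corpus("test.txt", "test", out_root="./test_corpus/")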

1. Preprocessing and word segmentation

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import os
import jieba

# Write the segmentation result of one document to savepath
def savefile(savepath, content):
    with open(savepath, "w", encoding='utf-8') as fp:
        fp.write(content)

# Read a document that has not been segmented yet
def readfile(path):
    with open(path, "r", encoding='utf-8') as fp:
        content = fp.read()
    return content

def corpus_segment(corpus_path, seg_path):
    '''
    corpus_path: path of the raw corpus
    seg_path: path where the segmented corpus is written
    '''
    catelist = os.listdir(corpus_path)  # list all sub-directories of corpus_path
    '''
    Each sub-directory name is a class name, e.g. for
    train_corpus/tmt/tmt_21.txt, 'train_corpus/' is corpus_path and 'tmt' is one member of catelist.
    '''

    # Walk through every class directory
    for mydir in catelist:
        '''
        Here mydir is the 'tmt' in train_corpus/tmt/tmt_21.txt, i.e. one class name from catelist.
        '''
        class_path = corpus_path + mydir + "/"  # class sub-directory, e.g. train_corpus/tmt/
        seg_dir = seg_path + mydir + "/"  # matching output directory, e.g. train_corpus_seg/tmt/

        if not os.path.exists(seg_dir):  # create the output directory if it does not exist yet
            os.makedirs(seg_dir)

        file_list = os.listdir(class_path)  # all raw documents of this class
        '''
        For example, train_corpus/tmt/ contains
        tmt_21.txt,
        tmt_22.txt,
        tmt_23.txt
        ...
        so file_list = [..., 'tmt_21.txt', 'tmt_22.txt', ...]
        '''
        for file_path in file_list:  # iterate over the documents of this class
            fullname = class_path + file_path  # full path, e.g. train_corpus/tmt/tmt_21.txt
            content = readfile(fullname)  # read the document

            content = content.replace("\r\n", "")  # remove line breaks
            content = content.replace(" ", "")  # remove extra whitespace
            content_seg = jieba.cut(content, HMM=True)  # segment the document with jieba
            savefile(seg_dir + file_path, " ".join(content_seg))  # save the space-joined tokens to the segmented corpus


if __name__=="__main__":
    # 对训练集进行分词
    corpus_path = "./train_corpus/"
    seg_path = "./train_corpus_seg/"
    corpus_segment(corpus_path, seg_path)
    print("训练集分词完成!")

    corpus_path = "./test_corpus/"
    seg_path = "./test_corpus_seg/"
    corpus_segment(corpus_path, seg_path)
    print("训练集分词完成!")

2. Labeling: packing the corpus into Bunch objects

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

import os  # built-in module for file and directory operations (os.listdir is used below)
import pickle  # in Python 3, pickle already uses the fast C implementation (no separate cPickle)

from sklearn.utils import Bunch  # sklearn.datasets.base.Bunch was removed in recent scikit-learn releases

def _readfile(path):
    '''Read a file as raw bytes'''
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def corpus2Bunch(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)  # sub-directories of seg_path, i.e. the class names
    # Create a Bunch instance
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)
    '''
    extend(addlist) is a Python list method:
    it appends every element of addlist to the original list.
    '''

    # Collect every file of every class
    for mydir in catelist:
        class_path = seg_path + mydir + "/"  # class sub-directory
        file_list = os.listdir(class_path)  # all files under class_path
        for file_path in file_list:  # iterate over the documents of this class
            fullname = class_path + file_path  # full path of the document
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(_readfile(fullname))  # read the document content

    # Persist the bunch to wordbag_path (creating the target directory first if needed)
    os.makedirs(os.path.dirname(wordbag_path) or ".", exist_ok=True)
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)

if __name__ == "__main__":
    # 对训练集进行Bunch化操作
    wordbag_path = "train_word_bag/train_set.dat"  # Bunch存储路径
    seg_path = "train_corpus_seg/"
    corpus2Bunch(wordbag_path, seg_path)
    print("train Bunch created!")

    # 对测试集进行Bunch化操作
    wordbag_path = "test_word_bag/test_set.dat"  # Bunch存储路径
    seg_path = "test_corpus_seg/"
    corpus2Bunch(wordbag_path, seg_path)
    print("test Bunch created")

3. Vectorization and feature extraction (TF-IDF)

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

import pickle
from sklearn.utils import Bunch  # sklearn.datasets.base.Bunch was removed in recent scikit-learn releases
from sklearn.feature_extraction.text import TfidfVectorizer

# Read a file as raw bytes
def _readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content

# Load a pickled Bunch object
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

# Pickle a Bunch object to disk
def _writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)

# Build the TF-IDF term vector space
def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):

    stpwrdlst = _readfile(stopword_path).decode('utf-8').splitlines()  # read the stop-word list (decode the raw bytes, assuming UTF-8)
    bunch = _readbunchobj(bunch_path)  # load the Bunch built in step 2
    # Bunch that will hold the TF-IDF vector space
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[], vocabulary={})
    # tdm is the term-document weight matrix
    # vocabulary maps each term to its column index

    if train_tfidf_path is not None:
        # Test set: reuse the vocabulary learned on the training set so that both sets
        # share the same feature space (only the vocabulary is shared; the weights are
        # still computed from the test documents themselves)
        trainbunch = _readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, vocabulary=trainbunch.vocabulary)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)

    else:
        # Training set: learn the vocabulary from scratch
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
        # sublinear_tf=True: use 1 + log(tf) instead of the raw term frequency
        # max_df=0.5: ignore terms that appear in more than 50% of the documents;
        # a term that occurs in most documents carries little class information.
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)  # tdm holds the weight matrix
        tfidfspace.vocabulary = vectorizer.vocabulary_

    _writebunchobj(space_path, tfidfspace)

if __name__ == '__main__':
    # TF-IDF space for the training set
    stopword_path = "train_word_bag/hlt_stop_words.txt"
    bunch_path = "train_word_bag/train_set.dat"
    space_path = "train_word_bag/tfdifspace.dat"
    vector_space(stopword_path, bunch_path, space_path)
    print("train TF-IDF created")

    # TF-IDF space for the test set, reusing the training vocabulary
    bunch_path = "test_word_bag/test_set.dat"
    space_path = "test_word_bag/testspace.dat"
    train_tfidf_path = "train_word_bag/tfdifspace.dat"
    vector_space(stopword_path, bunch_path, space_path, train_tfidf_path)
    print("test TF-IDF created")

4. Classification with multinomial Naive Bayes

#!/usr/bin/env python
# -*- coding: UTF-8 -*-


import pickle
from sklearn.naive_bayes import MultinomialNB  # multinomial Naive Bayes classifier


# Load a pickled Bunch object
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

# Load the training set
trainpath = "train_word_bag/tfdifspace.dat"  # a Bunch holding the tdm matrix and the vocabulary
train_set = _readbunchobj(trainpath)

# Load the test set
testpath = "test_word_bag/testspace.dat"
test_set = _readbunchobj(testpath)

# Train the classifier on the tdm matrix and the class labels.
# alpha is the additive (Laplace) smoothing parameter; a smaller value means less smoothing.
clf = MultinomialNB(alpha=0.01).fit(train_set.tdm, train_set.label)

# Predict the class of every test document
predicted = clf.predict(test_set.tdm)
tmt_count = 0
food_count = 0
eng_count = 0

# Count every prediction; the folder labels in test_set.label are only placeholders,
# so the ratios are computed over the predicted classes.
for file_name, expct_cate in zip(test_set.filenames, predicted):
    # print(file_name, "--> predicted class:", expct_cate)
    if expct_cate == "tmt":
        tmt_count += 1
    elif expct_cate == "food":
        food_count += 1
    elif expct_cate == "eng":
        eng_count += 1

total = tmt_count + food_count + eng_count
tmt_ratio = tmt_count / total
food_ratio = food_count / total
eng_ratio = eng_count / total

print("Predicted industry ratios:")
print({'tmt': round(tmt_ratio, 3), 'food': round(food_ratio, 3), 'eng': round(eng_ratio, 3)})

