An exercise with fastText text classification on the THUCNews dataset was carried out in this directory.
import fasttext
import jieba
import numpy as np
from sklearn import metrics
import random
def read_file(filename, output):
    sentences = []
    out = open(output, 'a+', encoding='utf-8')
    with open(filename, encoding='utf-8') as ft:
        for line in ft:
            label, content = line.strip().split('\t')
            segs = jieba.cut(content)
            segs = filter(lambda x: len(x) > 1, segs)   # drop single-character tokens
            sentences.append("__label__" + str(label) + "\t" + " ".join(segs))
    random.shuffle(sentences)
    for sentence in sentences:
        out.write(sentence + "\n")
    out.close()
# read_file('../data/cnews.train.txt', '../data/fast_train.txt')
classifier = fasttext.train_supervised('../data/fast_train.txt')
classifier.save_model('new_fasttext.model.bin')
classifier = fasttext.load_model('new_fasttext.model.bin')
categories = ['__label__Sports', '__label__Shares','__label__Finance','__label__Furnishing','__label__Education', '__label__Technology', '__label__Property', '__label__Affairs', '__label__Game', '__label__Entertainment']
# read_file('../data/cnews.test.txt', '../data/fast_test.txt')
result = classifier.test('../data/fast_test.txt')
print(result)  # overall result as (number of test samples, precision, recall)
# print("Precision: %f" % result.precision)  # the result no longer has this attribute in the current fasttext API
# print("Recall: %f" % result.recall)        # so these two lines would fail
# with open('../data/cnews.test.txt', encoding='utf-8') as fw:
#     contents, labels = [], []
#     for line in fw:
#         label, content = line.strip().split('\t')
#         segs = jieba.cut(content)
#         segs = filter(lambda x: len(x) > 1, segs)
#         contents.append(" ".join(segs))
#         labels.append('__label__' + label)
# label_predict = [e[0] for e in classifier.predict(contents)]
# print(len(contents))
content =' 2011 年 北京 高考 报名 76007 人 去年 再 降 5 3 本报讯 记者 周逸梅 继 去年 高考 报名 人数 大幅 降低 高考 报名 人数 76007 人 去年 再 降 5 3 这是 北京市 考试院 高招办 主任 高福勤 昨天晚上 高招 教育 广播 中 透露 采用 小 平行 志愿 填报 方式 文科生 占 比例 增加 高福勤 介绍 全市 高考 报名 总数 76007 人 去年 80241 人 减少 4234 人 下降 5 3 统考 报名 数为 70857 人 去年 72008 人 减少 3151 人 下降 4 3 高职 单考 单招 报名 人数 5150 去年 减少 1083 下降 幅度 17 几年 全国 大部分 省市 高考 报名 人数 都 下降 北京 连续 第三年 下降 预计 会 7 万多 数字 稳定 一段时间 高招 计划 还 做 高考 形势 还 分析 高福勤 称前 两年 录取 都 超过 80 低于 比例 文理科 分开 来看 文科 报名 人数 25418 人 占 全体 报名 人数 35 9 去年 减少 225 人 人数 去年 下降'
label_predict = classifier.predict(content)  # predict on content; returns the label(s) and the prediction score(s)
print(label_predict)
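As a side note (my addition, not in the original script), predict() also accepts a k argument to return the top-k labels together with their probabilities:

top_labels, top_probs = classifier.predict(content, k=3)
for lab, p in zip(top_labels, top_probs):
    print(lab, p)  # each of the three most likely labels with its probability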
# print("Precision,Recall and F1-Score....")
# print(label_predict.)
# list_shape = np.array(labels).shape
# print(list_shape) #list没有shape属性 所以将其转换为array之后显示shape属性 (labels输出(10000,0))
#
#
# list_shape2 = np.array(label_predict).shape
# print(list_shape2) # (label_predict输出(2,1))
# print(metrics.classification_report(labels,label_predict,target_names=categories))
Here content is a single corpus sample chosen for predict, and its predicted label is printed.
The last few (commented-out) lines were meant to compute precision, recall, and F1, but they are not computed here because that code is broken; a working implementation appears in the other script further below.
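For completeness, a minimal sketch of the evaluation the commented-out block above was aiming for, assuming ../data/cnews.test.txt and the classifier trained above; the key fix is that predict() on a list returns a (labels, probabilities) tuple, so the predicted labels must be taken from its first element:

contents, labels = [], []
with open('../data/cnews.test.txt', encoding='utf-8') as fw:
    for line in fw:
        label, content = line.strip().split('\t')
        segs = [w for w in jieba.cut(content) if len(w) > 1]
        contents.append(' '.join(segs))
        labels.append('__label__' + label)
pred_labels, _ = classifier.predict(contents)    # (list of label tuples, list of probability arrays)
label_predict = [p[0] for p in pred_labels]      # keep the top-1 label per document
print(metrics.classification_report(labels, label_predict))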
A similar script: text segmentation and fastText classification.
import os
import re
from types import MethodType, FunctionType
import jieba

def clean_txt(raw):
    # keep only digits, ASCII letters and Chinese characters
    fil = re.compile(r"[^0-9a-zA-Z\u4e00-\u9fa5]+")
    return fil.sub(' ', raw)

def seg(sentence, sw, apply=None):
    if isinstance(apply, FunctionType) or isinstance(apply, MethodType):
        sentence = apply(sentence)
    return ' '.join([i for i in jieba.cut(sentence) if i.strip() and i not in sw])

def stop_words():
    with open('stop_words.txt', 'r', encoding='utf-8') as swf:
        return [line.strip() for line in swf]

# Process a single sentence:
content = '上海天然橡胶期价周三再创年内新高,主力合约突破21000元/吨重要关口。'
res = seg(content.lower().replace('\n', ''), stop_words(), apply=clean_txt)
mapper_tag = {
    '财经': 'Finance',
    '彩票': 'Lottery',
    '房产': 'Property',
    '股票': 'Shares',
    '家居': 'Furnishing',
    '教育': 'Education',
    '科技': 'Technology',
    '社会': 'Sociology',
    '时尚': 'Fashion',
    '时政': 'Affairs',
    '体育': 'Sports',
    '星座': 'Constellation',
    '游戏': 'Game',
    '娱乐': 'Entertainment'
}
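mapper_tag is not used in the snippets shown here; presumably it maps the Chinese THUCNews folder names to the English labels used elsewhere when data.txt is assembled. A purely hypothetical sketch of that step (the ' , ' separator matches what to_csv and split_train_test below expect):

label_line = '__label__' + mapper_tag['财经'] + ' , ' + res   # hypothetical: build one data.txt line
print(label_line)  # __label__Finance , 上海 天然橡胶 期价 周三 ...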
from random import shuffle
import numpy as np
import pandas as pd
class _MD(object):
    mapper = {
        str: '',
        int: 0,
        list: list,
        dict: dict,
        set: set,
        bool: False,
        float: .0
    }

    def __init__(self, obj, default=None):
        self.dict = {}
        assert obj in self.mapper, \
            'got an error type'
        self.t = obj
        if default is None:
            return
        assert isinstance(default, obj), \
            f'default ({default}) must be {obj}'
        self.v = default

    def __setitem__(self, key, value):
        self.dict[key] = value

    def __getitem__(self, item):
        if item not in self.dict and hasattr(self, 'v'):
            self.dict[item] = self.v
            return self.v
        elif item not in self.dict:
            if callable(self.mapper[self.t]):
                self.dict[item] = self.mapper[self.t]()
            else:
                self.dict[item] = self.mapper[self.t]
            return self.dict[item]
        return self.dict[item]


def defaultdict(obj, default=None):
    # lightweight stand-in for collections.defaultdict that also supports a fixed default value
    return _MD(obj, default)
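A quick illustration (my own usage example, not part of the original script) of how this helper behaves; missing keys fall back to the type's default, which is what to_csv and cal_precision_and_recall below rely on:

dd = defaultdict(list)
dd['Sports'].append('some title')
print(dd['Sports'])       # ['some title']
print(dd['Finance'])      # [] -- an unseen key yields a fresh empty list
counter = defaultdict(int, 1)
counter['Sports'] += 1
print(counter['Sports'])  # 2 -- counting starts from the explicit default 1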
class TransformData(object):
    def to_csv(self, handler, output, index=False):
        dd = defaultdict(list)
        for line in handler:
            label, content = line.split(',', 1)  # split on ',' only once: label, then the rest of the line
            dd[label.strip('__label__').strip()].append(content.strip())
        df = pd.DataFrame()
        for key in dd.dict:
            col = pd.Series(dd[key], name=key)   # one column per category, named by the label
            df = pd.concat([df, col], axis=1)
        return df.to_csv(output, index=index, encoding='utf-8')
def split_train_test(source, auth_data=False):
    if not auth_data:
        train_proportion = 0.8
    else:
        train_proportion = 0.98
    basename = source.rsplit('.', 1)[0]
    train_file = basename + '_train.txt'
    test_file = basename + '_test.txt'
    handel = pd.read_csv(source, index_col=False, low_memory=False)
    train_data_set = []
    test_data_set = []
    for head in list(handel.head()):
        train_num = int(handel[head].dropna().__len__() * train_proportion)
        sub_list = [f'__label__{head} , {item.strip()}\n' for item in handel[head].dropna().tolist()]
        train_data_set.extend(sub_list[:train_num])
        test_data_set.extend(sub_list[train_num:])
    shuffle(train_data_set)
    shuffle(test_data_set)
    with open(train_file, 'w', encoding='utf-8') as trainf, \
            open(test_file, 'w', encoding='utf-8') as testf:
        for tds in train_data_set:
            trainf.write(tds)
        for i in test_data_set:
            testf.write(i)
    return train_file, test_file
# Convert the raw corpus to csv
td = TransformData()
handler = open('data.txt', encoding='utf-8')
td.to_csv(handler, 'data.csv')
handler.close()

# Split the csv file; this produces two files (data_train.txt and data_test.txt)
train_file, test_file = split_train_test('data.csv', auth_data=True)
# Train the model
import fasttext
# from gensim.models import FastText as fasttext
# from fastText import train_supervised, load_model
def train_model(ipt=None, opt=None, model='', dim=100, epoch=5, lr=0.1, loss='softmax'):
    """
    Train a supervised model and return the model object.
    @param input: path to the training data file
    @param lr: learning rate
    @param dim: dimension of the word vectors
    @param ws: size of the context window (used by the cbow model)
    @param epoch: number of epochs
    @param minCount: minimum word frequency; rarer words are filtered out at initialisation
    @param minCountLabel: minimum label frequency; rarer labels are filtered out at initialisation
    @param minn: minimum length of char n-grams used to build subwords
    @param maxn: maximum length of char n-grams used to build subwords
    @param neg: number of negatives sampled
    @param wordNgrams: maximum length of word n-grams
    @param loss: loss function: softmax, ns (negative sampling) or hs (hierarchical softmax)
    @param bucket: number of buckets (hash table size for word and char n-grams)
    @param thread: number of threads; each thread handles one chunk of the input, thread 0 reports the loss
    @param lrUpdateRate: rate of updates for the learning rate
    @param t: sampling threshold
    @param label: label prefix
    @param verbose: verbosity level
    @param pretrainedVectors: path to pre-trained word vectors; words found there are not randomly initialised
    @return model object
    """
    np.set_printoptions(suppress=True)
    if os.path.isfile(model):
        classifier = fasttext.load_model(model)
    else:
        classifier = fasttext.train_supervised(ipt, label='__label__', dim=dim, epoch=epoch,
                                               lr=lr, wordNgrams=2, loss=loss)
        classifier.save_model(opt)
    return classifier
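As an aside (my addition, not part of the original pipeline), a minimal sketch showing how a few more of the parameters documented above would be passed to train_supervised; the values are arbitrary examples:

demo_classifier = fasttext.train_supervised('data_train.txt', label='__label__',
                                            dim=100, epoch=10, lr=0.5, wordNgrams=2,
                                            minCount=2, bucket=2000000,
                                            loss='hs', thread=4)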
dim = 100
lr = 5
epoch = 5
model = f'data_dim{str(dim)}_lr0{str(lr)}_iter{str(epoch)}.model'
classifier = train_model(ipt='data_train.txt',
                         opt=model,
                         model=model,
                         dim=dim, epoch=epoch, lr=0.5)

result = classifier.test('data_test.txt')
print(result)
# overall result as (number of test samples, precision, recall), e.g.:
# (9885, 0.9740010116337886, 0.9740010116337886)
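If the installed fasttext build also exposes test_label (recent official Python bindings do; treat this as an assumption about your version), per-label precision/recall/F1 can be read directly, which is a handy cross-check for the manual computation below:

per_label = classifier.test_label('data_test.txt')
for label, scores in per_label.items():
    print(label, scores)  # e.g. {'precision': ..., 'recall': ..., 'f1score': ...}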
def cal_precision_and_recall(file='data_test.txt'):
    precision = defaultdict(int, 1)   # correct predictions per class (starts at 1 to avoid division by zero)
    recall = defaultdict(int, 1)      # number of times each class was predicted
    total = defaultdict(int, 1)       # number of true samples per class
    with open(file, encoding='utf-8') as f:
        for line in f:
            label, content = line.split(',', 1)
            total[label.strip().strip('__label__')] += 1
            labels2 = classifier.predict([seg(sentence=content.strip(), sw='', apply=clean_txt)])
            pre_label, sim = labels2[0][0][0], labels2[1][0][0]
            recall[pre_label.strip().strip('__label__')] += 1
            if label.strip() == pre_label.strip():
                precision[label.strip().strip('__label__')] += 1
    print('precision', precision.dict)
    print('recall', recall.dict)
    print('total', total.dict)
    for sub in precision.dict:
        pre = precision[sub] / total[sub]     # correct / true count
        rec = precision[sub] / recall[sub]    # correct / predicted count
        F1 = (2 * pre * rec) / (pre + rec)
        print(f"{sub.strip('__label__')} precision: {str(pre)} recall: {str(rec)} F1: {str(F1)}")
def main(source):
    basename = source.rsplit('.', 1)[0]
    csv_file = basename + '.csv'
    td = TransformData()
    handler = open(source, encoding='utf-8')
    td.to_csv(handler, csv_file)
    handler.close()
    train_file, test_file = split_train_test(csv_file)
    dim = 100
    lr = 5
    epoch = 5
    model = f'data/data_dim{str(dim)}_lr0{str(lr)}_iter{str(epoch)}.model'
    classifier = train_model(ipt=train_file,
                             opt=model,
                             model=model,
                             dim=dim, epoch=epoch, lr=0.5)
    result = classifier.test(test_file)
    print(result)
    cal_precision_and_recall(test_file)


if __name__ == '__main__':
    main('data.txt')
This script has detailed parameter settings and shows how to segment with jieba and how to remove stop words. If the raw corpus files still need to be merged into a single data file, the following snippet does that:
import os

fileName = "娱乐"
path = "THUCNews/" + fileName        # folder for one category
files = os.listdir(path)             # all file names inside that folder
text = ""
num = 0
for file in files:                   # iterate over the folder
    if not os.path.isdir(file):      # only open regular files, skip sub-directories
        num = num + 1
        print("Opening file number", num)
        with open(path + "/" + file, encoding='utf-8') as f:
            i = 0
            for line in f:           # read the file line by line
                if i == 0:           # keep only the first line (the title)
                    LinePack = fileName + " " + line
                    text = text + LinePack
                i = i + 1

extractFile = "extract/" + fileName + ".txt"
with open(extractFile, "w", encoding='utf-8') as f:
    f.write(text)
print("Done reading!")
print("Total items:", num)
The title-extraction step: the check i == 0 keeps only the first line of each article (its title), prefixes it with the category name, and appends it to the accumulated string, so all titles of one category end up merged into a single file.
Next comes splitting the dataset: 1/10 of the lines go to the test/validation file and the rest to the training file.
fileName="娱乐"
path = "extract/"+ fileName + ".txt" #文件夹目录
count = len(open(path,'rU',encoding='utf-8').readlines())
print(count)
valNum = (int)(count/10)
print(valNum)
trainNum = count - valNum
print(trainNum)
fileValNamePath="extract/"+ fileName + "Val.txt"
fileTrainNamePath="extract/"+ fileName + "Train.txt"
fileAll = open(path,encoding='utf-8')
#写入两个文件
i=0
for line in fileAll.readlines():
i=i+1
if(i == valNum):
print("即将写入train文件,现在是:",i)
if(i <= valNum):
with open(fileValNamePath,"a",encoding='utf-8') as one:
one.write(line)
else:
with open(fileTrainNamePath,"a",encoding='utf-8') as two:
two.write(line)
print("写入完毕!")
The next step is to merge all the per-category Val and Train files into one overall validation file and one overall training file by iterating over every category name in fileNameList:
fileNameList=["财经","彩票","房产","股票","家居","教育","科技","社会","时尚","时政","体育","星座","游戏","娱乐"]
strVal=""
strTrain=""
i=0
for fileName in fileNameList:
fileValNamePath="extract/"+ fileName + "Val.txt"
fileTrainNamePath="extract/"+ fileName + "Train.txt"
f = open(fileValNamePath); # 打开文件
iter_f = iter(f); # 创建迭代器
for line in iter_f: # 遍历文件,一行行遍历,读取文本
i=i+1
strVal = strVal + line
print(i)
f1 = open(fileTrainNamePath); # 打开文件
iter_f = iter(f1); # 创建迭代器
for line in iter_f: # 遍历文件,一行行遍历,读取文本
strTrain = strTrain + line
extractAllVal="extract/0extractAllVal.txt"
extractAllTrain="extract/0extractAllTrain.txt"
with open(extractAllVal,"w") as f:
f.write(strVal)
with open(extractAllTrain,"w") as f:
f.write(strTrain)
print("合并完毕!")
Finally, a standalone demonstration of the preprocessing pipeline (clean_txt + jieba segmentation + stop-word removal) with its intermediate output shown in the comments:

import re
from types import FunctionType, MethodType
import jieba

def clean_txt(raw):
    fil = re.compile(r"[^0-9a-zA-Z\u4e00-\u9fa5]+")
    return fil.sub(' ', raw)

def seg(sentence, sw, apply=None):
    if isinstance(apply, FunctionType) or isinstance(apply, MethodType):
        sentence = apply(sentence)
    # a = [i for i in jieba.cut(sentence) if i.strip() and i not in sw]
    # print(a)  # ['上海', '天然橡胶', '期价', '周三', '再创', '年内', '新高', '主力', '合约', '突破', '21000', '元', '吨', '关口']
    # return ' '.join(a)  # 上海 天然橡胶 期价 周三 再创 年内 新高 主力 合约 突破 21000 元 吨 关口
    return ' '.join([i for i in jieba.cut(sentence) if i.strip() and i not in sw])

def stop_words():
    with open('stop_words.txt', 'r', encoding='utf-8') as swf:
        return [line.strip() for line in swf]

# Process a single sentence:
content = ' 上海天然橡胶期价周三再创年内新高, 主力合约突破21000元/吨重要关口。'
res = seg(content.lower().replace('\n', ''), stop_words(), apply=clean_txt)
print(res)