This post is a hands-on walkthrough of text classification with fastText. If you are not familiar with how the model works, start with FastText原理解析, which explains the underlying principles.
The training data is stored in a CSV file with two columns, labels and text, where labels takes one of three values: 0, 1, or 2.
labels,text
0,大华技术:超高精度人体热成像测温系统经信发布测温系统采
1,A股3月迎来艳阳天牛市布局正当时!这类股成主力新宠儿涨停战机
2,泰格医药—公司动态点评:业绩符合预期,三大业务板块值得期待
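Before preprocessing it is worth a quick sanity check of the class balance. A minimal sketch with pandas, assuming the file layout shown above:

import pandas as pd

# Load the raw CSV and check how many examples each of the three classes has
df = pd.read_csv('data/train.csv', encoding='utf-8')
print(df['labels'].value_counts())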
Before training, the text must be segmented and stripped of stop words. During segmentation you can supply a domain dictionary so that domain terms are split correctly. The result is then converted into the input format fastText expects; the processed text looks like this:
国网 江苏 无锡 供电 保电 战疫 有数 服务 有心 中国 电力 报 02 2817 43 关注 __label__0
外汇 新三板 行情 行情 股 新闻 外汇 新三板 工程机械 板块 走强 山河 智能 涨幅 居前 工程机械 板块 走强 山河 智能 涨幅 居前 __label__2
广西 科技 特派员 指导 春耕 广西 农 __label__1
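The effect of the domain dictionary on segmentation can be checked on a single headline. A minimal sketch; the entry 热成像测温 is a made-up dictionary term used only for illustration:

import jieba

# Default segmentation may split an unseen domain term into fragments
print(jieba.lcut('超高精度人体热成像测温系统'))
# Registering the term (equivalent to one line in userdict_all.txt) keeps it intact
jieba.add_word('热成像测温')
print(jieba.lcut('超高精度人体热成像测温系统'))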
The preprocessed data is then loaded directly for training; the training code follows.
import pandas as pd
import jieba
import codecs
import fasttext
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
import numpy as np
'''
Train the model, measure its performance on a held-out split, and save it.
The validation set is a random 10% of the training data (adjust via test_size below).
Label meanings -- 机遇 (opportunity): 0, 风险 (risk): 1, 不确定 (uncertain): 2
'''
stop_data_dir = 'data/stop_words.txt'    # path to the stop-word list
user_dict_dir = "data/userdict_all.txt"  # path to the user-defined domain dictionary
train_data_dir = 'data/train.csv'
jieba.load_userdict(user_dict_dir)
# Load and segment the training data
data = pd.read_csv(train_data_dir, encoding='utf-8')
data['segment'] = data['text'].apply(lambda x: jieba.lcut(x))
print('---------- removing stop words ----------')
# Load the stop words into a set for fast membership tests
stop_set = set()
with codecs.open(stop_data_dir, encoding='utf-8') as f:
    for line in f.readlines():
        stop_set.add(line.strip('\r\n'))
# Drop stop words from every segmented document
data['segment'] = data['segment'].apply(lambda words: [w for w in words if w not in stop_set])
# Hold out 10% of the data for validation; note the CSV column is named 'labels'
train_data, test_data, train_label, test_label = train_test_split(
    data['segment'], data['labels'], test_size=0.1, random_state=42)
train_data = train_data.reset_index(drop=True)
train_label = train_label.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)
test_label = test_label.reset_index(drop=True)
# Write the segmented, stop-word-filtered train/validation text in the format
# fastText expects: space-separated tokens plus a __label__<class> tag
with open('./data/train_semantic.txt', 'w', encoding='utf-8') as f:
    for i in range(len(train_data)):
        f.write(" ".join(train_data[i]) + "\t" + "__label__" + str(train_label[i]) + '\n')
with open('./data/test_semantic.txt', 'w', encoding='utf-8') as f:
    for i in range(len(test_data)):
        f.write(" ".join(test_data[i]) + "\t" + "__label__" + str(test_label[i]) + '\n')
# Map a predict() result back to an integer class id.
# predict(..., k=2) returns (labels, probabilities); label strings look like
# '__label__0', so the class id is the string's last character.
def get_label(pred):
    index = np.argmax(pred[1])
    label = int(pred[0][index][-1])
    return label
print('------------------ training the model ------------------')
model = fasttext.train_supervised(input="./data/train_semantic.txt", lr=0.1, epoch=100,
                                  wordNgrams=3, dim=300, loss='softmax')
print('------------------ training finished ------------------')
# Save the model
model_path = './model/model.bin'
model.save_model(model_path)
# Predict on the validation set, logging each raw prediction for inspection
test_pred = []
with open('./data/yuce.txt', 'w', encoding='utf-8') as fout:
    for i in range(len(test_data)):
        r = model.predict(" ".join(test_data[i]), k=2)
        test_pred.append(get_label(r))
        fout.write(str(r) + '**' + str(get_label(r)) + '\n')
# sklearn metrics take (y_true, y_pred); note that micro-averaged
# precision/recall/F1 all equal accuracy for single-label multi-class data
acc = accuracy_score(test_label, test_pred)
precision = precision_score(test_label, test_pred, average='micro')
recall = recall_score(test_label, test_pred, average='micro')
f1 = f1_score(test_label, test_pred, average='micro')
print("Accuracy: " + str(acc))
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F1: " + str(f1))
To run prediction, first load the saved model, then segment the new data and remove stop words exactly as during training, and finally call predict.
import pandas as pd
import jieba
import codecs
import fasttext
import numpy as np
'''
fastText news text classification:
load the trained model and predict on new data
'''
stop_data_dir = 'data/stop_words.txt'    # path to the stop-word list
user_dict_dir = "data/userdict_all.txt"  # path to the user-defined domain dictionary
test_data_dir = 'data/file.csv'
output_dir = 'output.xlsx'
jieba.load_userdict(user_dict_dir)
real = pd.read_csv(test_data_dir, encoding='utf-8')
print('---------- removing stop words ----------')
# Load the stop words into a set for fast membership tests
stop_set = set()
with codecs.open(stop_data_dir, encoding='utf-8') as f:
    for line in f.readlines():
        stop_set.add(line.strip('\r\n'))
# Segment the new text, then drop stop words from every document
real['segment'] = real['text'].apply(lambda x: jieba.lcut(x))
real['segment'] = real['segment'].apply(lambda words: [w for w in words if w not in stop_set])
# Map a predict() result back to an integer class id (same helper as in training)
def get_label(pred):
    index = np.argmax(pred[1])
    label = int(pred[0][index][-1])
    return label

# Map a class id to its human-readable name; the keys must be ints,
# since get_label returns an int
def get_origin_label(number):
    label_dict = {0: '机遇', 1: '风险', 2: '不确定'}
    return label_dict[number]
# Load the trained model
model_path = './model/model.bin'
model = fasttext.load_model(model_path)
print('------------------ predicting ------------------')
pred_labels = []
type_list = []
for u in real['segment'].values:
    res = model.predict(" ".join(u), k=2)
    pred_labels.append(get_label(res))
    type_list.append(get_origin_label(get_label(res)))
real['label'] = pred_labels
real['type'] = type_list
print('------------------ writing predictions ------------------')
real[['text', 'label', 'type']].to_excel(output_dir, index=False)
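With dim=300 the saved model.bin can be quite large. fastText's quantize can shrink it considerably at a small accuracy cost; a hedged sketch, where the cutoff value is illustrative rather than tuned:

# Prune the vocabulary and quantize the weights; retrain=True fine-tunes after pruning
model.quantize(input='./data/train_semantic.txt', qnorm=True, retrain=True, cutoff=100000)
model.save_model('./model/model.ftz')  # quantized models conventionally use the .ftz suffix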
Reference:
https://blog.csdn.net/ymaini/article/details/81489599