2020年3月参加了“百度架构师手把手教深度学习”课程,在NLP的文本分类比赛中有一些心得体会:
1.使用预训练的模型结果肯定是好于自建网络,毕竟站在巨人的肩膀上,不过可能并不利于真正的学习掌握基础知识。
2.单个模型比不上模型融合或者结果的再投票,但是在现实应用场景中不大可能有这么大容量的环境支撑。
数据预处理部分,我花了不少精力
先统一全角和半角,然后把一些爬取失败的、不可识别的字符去掉。不同的预训练模型对中文、英文和其它字符有不同的处理方式,但是预先做一次统一处理还是有必要的。
def DBC2SBC(ustring):
    """Convert full-width (DBC) characters in *ustring* to half-width (SBC).

    The ideographic space U+3000 becomes an ASCII space, and the full-width
    forms U+FF01..U+FF5E are shifted down by 0xFEE0 onto ASCII 0x21..0x7E.
    Every other character is copied through unchanged.

    Args:
        ustring: The text to normalise.

    Returns:
        A new string with full-width characters replaced by their
        half-width equivalents.
    """
    converted = []
    for uchar in ustring:
        code = ord(uchar)
        if code == 0x3000:
            # The previous version mapped U+3000 to 0x20 and then rejected it
            # with the 0x21..0x7E range check, so the full-width space was
            # never actually converted.
            converted.append(" ")
        elif 0xFF01 <= code <= 0xFF5E:
            converted.append(chr(code - 0xFEE0))
        else:
            converted.append(uchar)
    # Join once instead of growing a string with += inside the loop.
    return "".join(converted)
def formatText(temptext):
    """Clean one raw text string and return the cleaned copy.

    Steps, in order: trim, remove junk characters, fold full-width characters
    to half-width through DBC2SBC, turn a few separator symbols into a space,
    lower-case, and trim again.
    """
    cleaned = temptext.strip()
    # Substitutions that run before the full-width -> half-width conversion.
    for pattern, replacement in (
        ("�", ""),     # invalid / unrecognisable data marker
        ("\x7f", ""),  # DEL control character
        (" ", " "),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = DBC2SBC(cleaned)  # full-width to half-width
    # Substitutions that run after the conversion: each symbol becomes a space.
    for pattern in ("\u3000", "・", "…"):
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.lower().strip()
按字对训练集、测试集建立字典,然后排序。验证后发现,排在后面的都是一些生僻字,这些字词我认为是没有训练意义的。
# Count how often each character (and each pre-registered key) occurs in the
# training file.
# NOTE(review): `train_dic` and `en_dic_key` are not defined in this snippet;
# presumably `train_dic` is a dict that already holds a counter for every
# entry of `en_dic_key` -- confirm before running.
with open('./Train.txt', 'r', encoding='utf-8') as train_file:
    # Read inside a context manager so the file handle is always closed.
    f_train = train_file.read()

for line in f_train.split("\n"):
    if not line:
        continue
    content = line.split('\t')
    # The third tab-separated column is the text that gets counted.
    temptext = formatText(content[2])
    # Each key is used as a regular expression and is counted at most once
    # per line, no matter how many times it matches.
    for key in en_dic_key:
        if re.search(key, temptext):
            train_dic[key] += 1
    # Per-character frequency over the cleaned text.
    for c in temptext:
        train_dic[c] = train_dic.get(c, 0) + 1

# Same keys as train_dic, with every count reset to zero.
all_dic = dict.fromkeys(train_dic, 0)
由于新闻标题文本很少,携带信息量也很少,我尝试做语义强化,把符号,数字做统一化处理。效果并不理想。
# Semantic "strengthening" of `temptext`: unify look-alike symbols and spell a
# few operators out as word tokens. The rules are applied strictly in the order
# listed, because later rules consume the output of earlier ones (curly quotes
# first become '"', which is then rewritten to " quote ").
for _pattern, _replacement in (
    ("【", "["),
    ("】", "]"),
    ("、", " "),
    # NOTE(review): pattern and replacement look identical here; the pattern
    # was presumably a full-width colon -- confirm against the original file.
    (":", ":"),
    ("―", "-"),
    ("‘", "\""),
    ("’", "\""),
    ("'", " "),
    ("/", " "),
    ("□", " "),
    ("⑿", " "),
    ("∷", " "),
    ("丶", " "),
    ("丨", " "),
    ("○", " "),
    # A rule mapping "-" to " signa " was left disabled by the author.
    ("~", " to "),
    # Raw strings: "\+", "\$" and "\?" are invalid escape sequences in normal
    # string literals and trigger warnings on current Python versions.
    (r"\+", " plus "),
    ("=", " equ "),
    (r"\$", " doll "),
    ("!", " surp "),
    (r"\?", " doubt "),
    ("\"", " quote "),
    (":", " signb "),
    ("@", " signc "),
    (">", " signd "),
    ("<", " signe "),
    # The class used to be written with commas between the numerals; inside
    # [...] a comma is a literal member, so every "," in the text was also
    # replaced by "romad".
    ("[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]", "romad"),
    # The trailing space is optional so that a per-mille sign directly
    # followed by another character is converted as well.
    ("‰ ?", " perm "),
    ("%", " perc "),
):
    temptext = re.sub(_pattern, _replacement, temptext)
由于训练集分类很不均衡,所以我手动进行了数据集的划分。避免出现验证集中有分类太少,完全被忽略的问题。
# Build a per-class (stratified) train/dev split of Train.txt so that small
# classes are still represented in the dev set.
with open('./Train.txt', 'r', encoding='utf-8') as source_file:
    # Context manager guarantees the input handle is closed.
    data = source_file.read()

lable_list = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13"]
# One bucket of formatted "text<TAB>label\n" rows per class id (0..13).
ts = [[] for _ in range(14)]
maxlen = 0
for line in data.split("\n"):
    if not line:
        continue
    content = line.split('\t')
    # Column 0 is the class id, column 2 is the text.
    temptext = formatText(content[2])
    maxlen = max(maxlen, len(temptext))
    if len(temptext) > 64:
        # Print rows longer than 64 characters for manual inspection.
        print(temptext)
    class_id = int(content[0])
    ts[class_id].append(temptext + '\t' + content[0] + '\n')
print("最大数据长度:", maxlen)

with open('./train.tsv', 'w+', encoding='utf-8') as ftrain, \
        open('./dev.tsv', 'w+', encoding='utf-8') as fdev:
    # enumerate() gives the real class id; ts.index(clss) returned the first
    # *equal* bucket, which is wrong when two buckets have the same content
    # (for example two empty classes).
    for class_index, clss in enumerate(ts):
        len_cl = len(clss)
        # Larger classes keep a larger share for training.
        if len_cl > 100000:
            train_ratio = 0.97
        elif len_cl > 50000:
            train_ratio = 0.95
        elif len_cl > 10000:
            train_ratio = 0.90
        else:
            train_ratio = 0.85
        id_dev = int(len_cl * train_ratio)
        train_rows = clss[:id_dev]
        # Slice to the end of the bucket: the old upper bound `len_cl-1`
        # silently dropped the last sample of every class.
        dev_rows = clss[id_dev:]
        ftrain.writelines(train_rows)
        fdev.writelines(dev_rows)
        print(class_index, "训练集长度", len(train_rows), "测试集长度", len(dev_rows))
使用paddlehub还是很方便进行训练和调试的
import os
from multiprocessing import cpu_count
import numpy as np
import paddle
import paddle.fluid as fluid
import paddlehub as hub
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
# Tag that is embedded in the dataset file names and the checkpoint directory.
flag = "all"

# Load the "ernie" module from PaddleHub and request its trainable context
# with a maximum sequence length of 64.
module = hub.Module(name="ernie")
inputs, outputs, program = module.context(max_seq_len=64, trainable=True)
class Mydataset(BaseNLPDataset):
    """Header-less TSV dataset with 14 string labels ("0".."13").

    File names are derived from the module-level `flag` variable:
    ``train_<flag>.tsv`` for training and ``dev_<flag>.tsv`` for both the
    dev and the test split.
    """

    def __init__(self):
        # Directory that holds the dataset files.
        self.dataset_dir = "/home/aistudio/work"
        # The original call was followed by a second, unmatched ")" which
        # made the whole snippet a SyntaxError; the call is closed once here.
        super().__init__(
            base_path=self.dataset_dir,
            train_file="train_" + flag + ".tsv",
            dev_file="dev_" + flag + ".tsv",
            # The dev file is reused as the test file.
            test_file="dev_" + flag + ".tsv",
            train_file_with_header=False,
            dev_file_with_header=False,
            test_file_with_header=False,
            # Full set of class labels.
            label_list=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13"],
        )
# Dataset and the reader that tokenises it with the module's vocabulary.
dataset = Mydataset()
reader = hub.reader.ClassifyReader(
    max_seq_len=64,
    vocab_path=module.get_vocab_path(),
    dataset=dataset,
)

# Optimisation strategy (an `lr_scheduler='noam_decay'` option was left
# disabled by the author).
strategy = hub.AdamWeightDecayStrategy(
    learning_rate=1e-4,
    warmup_proportion=0.1,
    weight_decay=0.01,
)

# Run configuration; a batch size of 175 was also tried and left disabled.
config = hub.RunConfig(
    strategy=strategy,
    checkpoint_dir="/home/aistudio/work/module_" + flag,
    use_cuda=True,
    num_epoch=1,
    batch_size=108,
    log_interval=10,
    eval_interval=50,
)

# Sentence-level feature used as classifier input.
pooled_output = outputs["pooled_output"]

# Names of the input variables, in the order the task feeds them.
feed_list = [
    inputs[input_key].name
    for input_key in ("input_ids", "position_ids", "segment_ids", "input_mask")
]

cls_task = hub.TextClassifierTask(
    data_reader=reader,
    feature=pooled_output,
    feed_list=feed_list,
    num_classes=dataset.num_labels,
    config=config,
    metrics_choices=["acc", "f1"],
)

# Fine-tune and evaluate in one call.
run_states = cls_task.finetune_and_eval()
完成后进行预测和输出
import re
# Predict a label for every line of the test file and write one label name
# per line to the result file.
flag = "all"
test_data_path = "/home/aistudio/work/test_purge.txt"

# Each line is wrapped in its own one-element list before being passed to
# predict().
with open(test_data_path, 'r', encoding='utf-8') as test_data:
    data = [[line] for line in test_data]
print('数据加载完毕,数据长度:', len(data))

run_states = cls_task.predict(data=data)

# Output names, indexed by predicted class id.
# NOTE(review): presumably ordered to match labels "0".."13" -- confirm.
lable_name = ["财经", "彩票", "房产", "股票", "家居", "教育", "科技",
              "社会", "时尚", "时政", "体育", "星座", "游戏", "娱乐"]

results = [run_state.run_results for run_state in run_states]

# The context manager closes the result file even if writing fails part-way.
with open('work/result_' + flag + '.txt', 'w', encoding='utf-8') as file_result:
    for batch_result in results:
        # Index of the highest score along axis 2 = predicted class id.
        batch_result_idx = np.argmax(batch_result, axis=2)[0]
        # The per-sample score and source-text lookups fed only disabled debug
        # output and were never used, so they are dropped here.
        for result in batch_result_idx:
            file_result.write(lable_name[result] + "\n")
print("result ok")
对于自然语言的处理,过多的人工语义强化是不是反而会对神经网络有不利的影响?因为我一直在使用单个模型进行试验,发现强化后相比强化前并没有明显改善。
对于训练数据极其不平衡的问题,到底是让各类别的数据量均衡好,还是让数据尽量多好,这个我还有疑问。