Notes and Takeaways from a Text Classification Competition on the THUCNews Dataset

In March 2020 I took Baidu's "百度架构师手把手教深度学习" (hands-on deep learning) course, and the NLP text classification competition left me with a few takeaways:
1. A pretrained model is almost certainly better than a network built from scratch; you are standing on the shoulders of giants, although it may not help you truly learn the fundamentals.
2. A single model cannot match model ensembling or re-voting over the results (a minimal voting sketch follows this list), but real-world deployments rarely have the capacity to host that many models.
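
For reference, here is a minimal sketch of result-level majority voting. It assumes several prediction files (one predicted label per line, all in the same row order) produced by different models; the file names are hypothetical.

from collections import Counter

# Hypothetical prediction files, one label per line, identical row order.
result_files = ["result_ernie.txt", "result_bert.txt", "result_cnn.txt"]

columns = []
for path in result_files:
    with open(path, "r", encoding="utf-8") as f:
        columns.append([line.strip() for line in f if line.strip()])

# Majority vote per sample; if no two models agree, fall back to the first model's prediction.
with open("result_vote.txt", "w", encoding="utf-8") as out:
    for preds in zip(*columns):
        label, count = Counter(preds).most_common(1)[0]
        if count == 1:
            label = preds[0]
        out.write(label + "\n")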

Data Preprocessing

I spent a fair amount of effort on data preprocessing.

1. Removing abnormal data

Normalize full-width and half-width characters, then strip characters that failed to crawl or cannot be recognized. Different pretrained models handle Chinese, English, and other characters differently, but doing this cleanup up front is still worthwhile.

import re

def DBC2SBC(ustring):
    # Convert full-width characters to their half-width equivalents.
    rstring = ""
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 0x3000:      # full-width (ideographic) space
            inside_code = 0x0020
        else:
            inside_code -= 0xfee0      # offset between full-width and half-width forms
        if not (0x0020 <= inside_code <= 0x7e):
            rstring += uchar           # not a full-width ASCII variant, keep unchanged
            continue
        rstring += chr(inside_code)
    return rstring

def formatText(temptext):
    temptext = temptext.strip()

    temptext = re.sub("�", "", temptext)        # undecodable / invalid data
    temptext = re.sub("\x7f", "", temptext)     # stray DEL control character
    temptext = re.sub("\xa0", " ", temptext)    # assumption: originally replaced a non-breaking space lost in copying
    temptext = DBC2SBC(temptext)                # full-width -> half-width
    temptext = re.sub("\u3000", " ", temptext)  # ideographic space -> normal space
    temptext = re.sub("・", " ", temptext)
    temptext = re.sub("…", " ", temptext)

    temptext = temptext.lower()
    temptext = temptext.strip()

    return temptext

2. Removing stop characters

Build a character-level dictionary over the training and test sets, then sort it by frequency. After checking, the characters at the tail of the sorted list are all rare characters, which I believe add no value for training.

# Assumption: en_dic_key is a predefined collection of English keyword patterns
# (its construction is not shown here); the frequency dictionary is pre-seeded with it.
train_dic = dict.fromkeys(en_dic_key, 0)

f_train = open('./Train.txt', 'r', encoding='utf-8').read()
for i, line in enumerate(f_train.split("\n")):
    if len(line) > 0:
        content = line.split('\t')
        temptext = content[2]

        temptext = formatText(temptext)

        # Count occurrences of the predefined English keywords.
        for key in en_dic_key:
            match = re.search(key, temptext)
            if match:
                train_dic[key] += 1

        # Count every individual character.
        textl = list(temptext)
        for c in textl:
            if c not in train_dic:
                train_dic[c] = 1
            else:
                train_dic[c] += 1

all_dic = dict.fromkeys(train_dic.keys(), 0)
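
The snippet above only builds the frequency dictionary. A minimal sketch of the sorting and rare-character filtering described earlier might look like the following; the frequency threshold is an arbitrary assumption.

# Sort characters by frequency, most frequent first.
sorted_chars = sorted(train_dic.items(), key=lambda kv: kv[1], reverse=True)

# Assumption: characters seen fewer than MIN_FREQ times are treated as "stop" characters.
MIN_FREQ = 3
rare_chars = {c for c, freq in sorted_chars if freq < MIN_FREQ}

def removeRareChars(text):
    # Drop rare characters, keep everything else unchanged.
    return "".join(c for c in text if c not in rare_chars)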

3. Semantic reinforcement

Because news headlines are short and carry little information, I tried semantic reinforcement by normalizing symbols and digits into unified tokens. The results were not promising.

temptext=re.sub("【","[",temptext)
    temptext=re.sub("】","]",temptext)    
    temptext=re.sub("、"," ",temptext) 
    temptext=re.sub(":",":",temptext)
    temptext=re.sub("―","-",temptext)#字符语义合并
    temptext=re.sub("‘","\"",temptext)
    temptext=re.sub("’","\"",temptext)
    temptext=re.sub("'"," ",temptext)
    temptext=re.sub("/"," ",temptext)
    temptext=re.sub("□"," ",temptext)
    temptext=re.sub("⑿"," ",temptext)
    temptext=re.sub("∷"," ",temptext)
    temptext=re.sub("丶"," ",temptext)
    temptext=re.sub("丨"," ",temptext)
    temptext=re.sub("○"," ",temptext)
             
    #temptext=re.sub("-"," signa ",temptext)#字符语义合并
    temptext=re.sub("~"," to ",temptext)#字符语义合并
    temptext=re.sub("\+"," plus ",temptext)#字符语义合并
    temptext=re.sub("="," equ ",temptext)#字符语义合并
    temptext=re.sub("\$"," doll ",temptext)#字符语义合并
    temptext=re.sub("!"," surp ",temptext)#字符语义合并
    temptext=re.sub("\?"," doubt ",temptext)#字符语义合并
    temptext=re.sub("\""," quote ",temptext)#字符语义合并
    temptext=re.sub(":"," signb ",temptext)#字符语义合并
    temptext=re.sub("@"," signc ",temptext)#字符语义合并
    temptext=re.sub(">"," signd ",temptext)#字符语义合并
    temptext=re.sub("<"," signe ",temptext)#字符语义合并
    temptext=re.sub("[Ⅰ,Ⅱ,Ⅲ,Ⅳ,Ⅴ,Ⅵ,Ⅶ,Ⅷ,Ⅸ,Ⅹ,Ⅺ,Ⅻ,]","romad",temptext)
    
    
    
    temptext=re.sub("‰ "," perm ",temptext)
    temptext=re.sub("%"," perc ",temptext)

4. Dataset organization

The class distribution in the training set is very imbalanced, so I split the train/dev sets by hand to avoid under-represented classes being almost absent from the validation set.

data=open('./Train.txt','r',encoding='utf-8').read()

lable_list=["0", "1","2","3","4","5","6","7","8","9","10","11","12","13"]

ts = [[] for i in range(14)]

maxlen=0
for i,line in enumerate(data.split("\n")): 
    
    if len(line)>0:
        content = line.split('\t')
        temptext=content[2]
        
        temptext=formatText(temptext)
        
        
        if len(temptext)>maxlen:
            maxlen=len(temptext)
        
        if len(temptext)>64:
            print(temptext)
        
        class_id=int(content[0])
        ts[class_id].append(temptext+'\t'+content[0]+'\n')

print("最大数据长度:",maxlen)

ftrain=open('./train.tsv','w+',encoding='utf-8')
fdev=open('./dev.tsv','w+',encoding='utf-8')

for clss in ts:
    len_cl=len(clss)
    lentrain,lendev=0,0
    if len_cl>100000:
        id_dev=int(len_cl*0.97)    
    elif len_cl>50000:
        id_dev=int(len_cl*0.95)       
    elif len_cl>10000:
        id_dev=int(len_cl*0.90)        
    else:
        id_dev=int(len_cl*0.85)
        
    
    for line in clss[0:id_dev]:
        
        if len(line)>0:
            ftrain.write(line)
            lentrain+=1
    
    for line in clss[id_dev:]:
                
        if len(line)>0:
            fdev.write(line)
            lendev+=1
    print(ts.index(clss), "train size", lentrain, "dev size", lendev)
    
        
ftrain.close()
fdev.close()
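
One caveat: the split above slices each class list in file order, so if Train.txt is sorted in any way the dev set may not be representative. A minimal tweak, assuming a fixed random seed for reproducibility, is to shuffle each class before slicing:

import random

random.seed(2020)           # fixed seed so the split is reproducible
for clss in ts:
    random.shuffle(clss)    # shuffle within each class before taking the head as train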

Training Process

PaddleHub makes training and tuning quite convenient:

import os
from multiprocessing import cpu_count
import numpy as np
import paddle
import paddle.fluid as fluid
import paddlehub as hub
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

module = hub.Module(name="ernie")

flag="all"

inputs, outputs, program = module.context(trainable=True, max_seq_len=64)
class Mydataset(BaseNLPDataset):
    def __init__(self):
        # Where the dataset files are stored
        self.dataset_dir = "/home/aistudio/work"
        super(Mydataset, self).__init__(
            base_path=self.dataset_dir,
            train_file="train_" + flag + ".tsv",
            dev_file="dev_" + flag + ".tsv",
            test_file="dev_" + flag + ".tsv",
            train_file_with_header=False,
            dev_file_with_header=False,
            test_file_with_header=False,
            # Label set of the dataset
            label_list=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13"])
           
dataset = Mydataset()

reader = hub.reader.ClassifyReader(
    dataset=dataset,
    vocab_path=module.get_vocab_path(),
    max_seq_len=64)

strategy = hub.AdamWeightDecayStrategy(
    weight_decay=0.01,
    #lr_scheduler='noam_decay',
    warmup_proportion=0.1,
    learning_rate=1e-4)

config = hub.RunConfig(
    use_cuda=True,
    num_epoch=1,
    checkpoint_dir="/home/aistudio/work/module_"+flag,
    #batch_size=175,
    batch_size=108,
    log_interval=10,
    eval_interval=50,
    strategy=strategy)

pooled_output = outputs["pooled_output"]

feed_list = [
    inputs["input_ids"].name,
    inputs["position_ids"].name,
    inputs["segment_ids"].name,
    inputs["input_mask"].name,
]

cls_task = hub.TextClassifierTask(
        data_reader=reader,
        feature=pooled_output,
        feed_list=feed_list,
        num_classes=dataset.num_labels,
        config=config,
        metrics_choices=["acc","f1"])

run_states = cls_task.finetune_and_eval()

When training is done, run prediction and write out the results:

import re

flag="all"
test_data_path = "/home/aistudio/work/test_purge.txt"

data=[]
with open(test_data_path, 'r', encoding='utf-8') as test_data:

    lines = test_data.readlines()
    for line in lines:
        temptext=line         
        templine=[]
        templine.append(temptext)
        data.append(templine)
print('Data loaded, number of samples:', len(data))

index = 0

run_states = cls_task.predict(data=data)

#lable_name=[flag,"A"+flag]

lable_name= ["财经", "彩票", "房产", "股票", "家居", "教育", "科技",
                    "社会", "时尚", "时政", "体育", "星座", "游戏", "娱乐"]


file_result=open('work/result_'+flag+'.txt','w',encoding='utf-8') 
results = [run_state.run_results for run_state in run_states]

for batch_result in results:
    # get predict index    
    batch_result_idx = np.argmax(batch_result, axis=2)[0]
    
    idx_inbatch=0
    for result in batch_result_idx:
        AccR=batch_result[0][idx_inbatch][result]
        #print("%s\tpredict=%s\t%s\tindex=%s" % (data[index][0], lable_name[result],AccR,index))  
                
        temptext=data[index][0]
        temptext=re.sub("\n","",temptext)

                
        #file_result.write(lable_name[result]+"\t"+str(AccR)+"\n")
        file_result.write(lable_name[result]+"\n")
        index += 1
        idx_inbatch+=1
file_result.close()

print("result ok")

Reflections

For natural language processing, I wonder whether too much manual semantic reinforcement actually hurts the neural network: since I experimented with a single model throughout, I saw no clear improvement after reinforcement compared with before.
For extremely imbalanced training data, I am still unsure whether it is better to balance the classes or simply to use as much data as possible.
