The Most Detailed NER Practical Tutorial - BiLSTM+CRF (6): Data Augmentation

Why do data augmentation? It improves the model's ability to learn from sentences of varying lengths.

import pandas as pd
import pickle
import numpy as np
from tqdm import tqdm
import os
import math
import random




# First merge all the data together, converting every character (and each
# extra feature) into its corresponding index.
def get_data_with_windows(name='train'):
    with open('data/prepare/dict.pkl', 'rb') as f:
        map_dict = pickle.load(f)

    def item2id(data, w2i):
        # Map each token to its id, falling back to the 'UNK' id.
        return [w2i[x] if x in w2i else w2i['UNK'] for x in data]

    results = []
    root = os.path.join('data/prepare', name)
    files = list(os.listdir(root))

    for file in tqdm(files):
        result = []
        path = os.path.join(root, file)
        samples = pd.read_csv(path, sep=',')
        num_samples = len(samples)
        # Indices of the separator rows, e.g. 20, 40, 50. They do not
        # include 0 or the last row, so pad with -1 at the front and
        # num_samples at the end to give every sentence both boundaries.
        sep_index = [-1] + samples[samples['word'] == 'sep'].index.tolist() + [num_samples]

        # Extract each sentence and convert it entirely to ids.
        for i in range(len(sep_index) - 1):  # -1: each sentence needs a start and an end boundary
            start = sep_index[i] + 1
            end = sep_index[i + 1]
            data = []
            for feature in samples.columns:  # visit every feature column
                data.append(item2id(list(samples[feature])[start:end], map_dict[feature][1]))
            result.append(data)   # one sentence
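
To make the slicing concrete, here is a sketch using the hypothetical separator positions from the comment above (sep rows at indices 20 and 40 in a 50-row file):

sep_index = [-1, 20, 40, 50]
for i in range(len(sep_index) - 1):
    start = sep_index[i] + 1
    end = sep_index[i + 1]
    print(start, end)   # 0 20, then 21 40, then 41 50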
          
            

How do we do the data augmentation, i.e. how exactly do we concatenate?

Concatenating two sentences:

# Data augmentation: concatenate adjacent sentences pairwise; every
# feature of the two sentences has to be concatenated together.
two = []
for i in range(len(result) - 1):  # -1: 10 sentences can only yield 9 pairs
    first = result[i]
    second = result[i + 1]
    two.append([first[k] + second[k] for k in range(len(first))])
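
To see what this produces, here is a toy sketch (the ids are hypothetical; each sentence carries two feature sequences):

result = [
    [[1, 2], [0, 1]],        # sentence A: char ids, label ids
    [[3, 4, 5], [1, 1, 0]],  # sentence B: char ids, label ids
]
two = [[result[i][k] + result[i + 1][k] for k in range(len(result[i]))]
       for i in range(len(result) - 1)]
print(two)   # [[[1, 2, 3, 4, 5], [0, 1, 1, 1, 0]]]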
    

Concatenating three sentences works the same way:

three = []
for i in range(len(result) - 2):
    first = result[i]
    second = result[i + 1]
    third = result[i + 2]
    three.append([first[k] + second[k] + third[k] for k in range(len(first))])

Then extend results with all of result, two, and three.

This effectively puts both short and long sentences into results.

        results.extend(result + two + three)
    return results

results now contains all of our sentences, original and concatenated.

Instead of returning here, we can write the results straight to a file; this avoids the long loading time on every run, since afterwards we just read the file.

So change it to:

with open(f'data/prepare/{name}.pkl', 'wb') as f:
    pickle.dump(results, f)
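
As a quick sanity check (a sketch; assumes the dump above has already run for 'train'):

with open('data/prepare/train.pkl', 'rb') as f:
    results = pickle.load(f)
print(len(results))      # number of original + concatenated sentences
print(len(results[0]))   # number of feature sequences per sentence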

Then test it:

if __name__ == '__main__':
    print(get_data_with_windows('train'))

Next, define the BatchManager:

class BatchManager(object):
    def __init__(self, batch_size, name='train'):
        with open(f'data/prepare/{name}.pkl', 'rb') as f:
            data = pickle.load(f)
        self.batch_data = self.sort_and_pad(data, batch_size)
        self.len_data = len(self.batch_data)

    def sort_and_pad(self, data, batch_size):
        num_batch = int(math.ceil(len(data) / batch_size))  # total number of batches
        # Sort by sentence length so each batch holds similar-length
        # sentences and needs as little padding as possible.
        sorted_data = sorted(data, key=lambda x: len(x[0]))
        batch_data = list()
        for i in range(num_batch):
            batch_data.append(self.pad_data(sorted_data[i * batch_size: (i + 1) * batch_size]))
        return batch_data

    @staticmethod
    def pad_data(data):
        chars = []
        bounds = []
        flags = []
        radicals = []
        pinyins = []
        targets = []
        max_length = max([len(sentence[0]) for sentence in data])  # longest sentence in the batch
        for line in data:
            char, bound, flag, target, radical, pinyin = line
            padding = [0] * (max_length - len(char))  # pad every feature to max_length with 0s
            chars.append(char + padding)
            bounds.append(bound + padding)
            flags.append(flag + padding)
            targets.append(target + padding)
            radicals.append(radical + padding)
            pinyins.append(pinyin + padding)
        return [chars, bounds, flags, radicals, pinyins, targets]

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for idx in range(self.len_data):
            yield self.batch_data[idx]   # yield one batch at a time
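
A minimal usage sketch (the batch size of 32 is an arbitrary choice; assumes train.pkl already exists):

train_manager = BatchManager(32, 'train')
for chars, bounds, flags, radicals, pinyins, targets in train_manager.iter_batch(shuffle=True):
    pass   # feed the padded feature matrices into the BiLSTM+CRF model here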

if __name__ == '__main__':
    # First write the data out to the pickle file
    get_data_with_windows('train')
    # train_data = BatchManager(10, 'train')

Once train.pkl has been generated, run the BatchManager.

Then process the test set the same way: just change 'train' to 'test', as shown below. With that, all of the data preparation is done; from here on we only need to build the model and train it.
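
A minimal sketch of that last step (the batch size is again arbitrary):

get_data_with_windows('test')
test_data = BatchManager(32, 'test')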
