Why do data augmentation? To improve the model's ability to learn from sentences of varying lengths.
import pandas as pd
import pickle
import numpy as np
from tqdm import tqdm
import os
import math
import random
# First merge all the data together and convert every character to its corresponding index
def get_data_with_windows(name='train'):
    # Load the feature -> id mapping dict built in the previous step
    with open('data/prepare/dict.pkl', 'rb') as f:
        map_dict = pickle.load(f)

    def item2id(data, w2i):
        # Unknown items fall back to the 'UNK' id
        return [w2i[x] if x in w2i else w2i['UNK'] for x in data]

    results = []
    root = os.path.join('data/prepare', name)
    files = list(os.listdir(root))
    for file in tqdm(files):
        result = []
        path = os.path.join(root, file)
        samples = pd.read_csv(path, sep=',')
        num_samples = len(samples)
        # Row indices of the 'sep' separator rows, e.g. 20, 40, 50; prepend -1 and
        # append num_samples so the first and last sentences are also covered
        sep_index = [-1] + samples[samples['word'] == 'sep'].index.tolist() + [num_samples]

        # -------- Extract each sentence and convert it to ids --------
        for i in range(len(sep_index) - 1):
            start = sep_index[i] + 1
            end = sep_index[i + 1]
            data = []
            for feature in samples.columns:  # visit every feature column
                data.append(item2id(list(samples[feature])[start:end], map_dict[feature][1]))
            result.append(data)  # one sentence
How do we do the augmentation, i.e. how do we concatenate the sentences? Start with pairwise concatenation:

        # -------- Data augmentation --------
        # Concatenate sentences two at a time: join every feature of the two sentences
        two = []
        for i in range(len(result) - 1):  # -1: 10 sentences only yield 9 pairs
            first = result[i]
            second = result[i + 1]
            two.append([first[k] + second[k] for k in range(len(first))])
Concatenating three sentences works the same way:

        three = []
        for i in range(len(result) - 2):
            first = result[i]
            second = result[i + 1]
            third = result[i + 2]
            three.append([first[k] + second[k] + third[k] for k in range(len(first))])
Then extend results with result, two, and three, so that sentences of all lengths end up in results:

        results.extend(result + two + three)
    return results
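To see what the feature-wise concatenation produces, here is a standalone toy check (hypothetical id values, only two feature rows per sentence):

# Hypothetical example: each sentence is a list of per-feature id lists
sent_a = [[1, 2, 3], [10, 20, 30]]  # e.g. char ids and bound ids of sentence A
sent_b = [[4, 5], [40, 50]]         # the same two features for sentence B

# Feature-wise concatenation, exactly like the two/three loops above
merged = [sent_a[k] + sent_b[k] for k in range(len(sent_a))]
print(merged)  # [[1, 2, 3, 4, 5], [10, 20, 30, 40, 50]]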
results now contains all of our original and concatenated sentences.
Instead of returning it, we can write it straight to a file, which avoids the long rebuild on every run; later we just read the file back. So change the return to:

    with open('data/prepare/' + name + '.pkl', 'wb') as f:
        pickle.dump(results, f)
Then test it:

if __name__ == '__main__':
    print(get_data_with_windows('train'))
Next, define a BatchManager:

class BatchManager(object):

    def __init__(self, batch_size, name='train'):
        with open('data/prepare/' + name + '.pkl', 'rb') as f:
            data = pickle.load(f)
        self.batch_data = self.sort_and_pad(data, batch_size)
        self.len_data = len(self.batch_data)

    def sort_and_pad(self, data, batch_size):
        num_batch = int(math.ceil(len(data) / batch_size))  # total number of batches
        # Sort by sentence length so each batch holds sentences of similar length,
        # which keeps the amount of padding per batch small
        sorted_data = sorted(data, key=lambda x: len(x[0]))
        batch_data = list()
        for i in range(num_batch):
            batch_data.append(self.pad_data(sorted_data[i * int(batch_size): (i + 1) * int(batch_size)]))
        return batch_data
    @staticmethod
    def pad_data(data):
        chars = []
        bounds = []
        flags = []
        radicals = []
        pinyins = []
        targets = []
        # Pad every sentence in the batch to the length of the longest one
        max_length = max([len(sentence[0]) for sentence in data])
        for line in data:
            char, bound, flag, target, radical, pinyin = line
            padding = [0] * (max_length - len(char))
            chars.append(char + padding)
            bounds.append(bound + padding)
            flags.append(flag + padding)
            targets.append(target + padding)
            radicals.append(radical + padding)
            pinyins.append(pinyin + padding)
        return [chars, bounds, flags, radicals, pinyins, targets]
    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for idx in range(self.len_data):
            yield self.batch_data[idx]  # yield one batch at a time
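A quick standalone check of what pad_data does, using a hypothetical two-sentence batch (each line ordered char, bound, flag, target, radical, pinyin, matching the unpacking above):

# Hypothetical mini-batch of two sentences (lengths 3 and 1)
batch = [
    [[1, 2, 3], [1, 1, 2], [4, 4, 4], [0, 5, 5], [7, 7, 7], [9, 9, 9]],
    [[6],       [1],       [4],       [0],       [7],       [9]],
]
padded = BatchManager.pad_data(batch)
# Every field is padded with zeros to the longest sentence (length 3):
# padded[0]  == [[1, 2, 3], [6, 0, 0]]   -> chars
# padded[-1] == [[0, 5, 5], [0, 0, 0]]   -> targets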
if __name__ == '__main__':
    # Write the prepared data to disk first
    get_data_with_windows('train')
    # train_data = BatchManager(10, 'train')
Once train.pkl has been generated, the BatchManager can be run on it.
The test set is processed the same way: just change 'train' to 'test'. With that, all of the data preparation is done; all that is left is to build the model and train it.
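Putting it together, a minimal sketch of the final driver (assuming both splits live under data/prepare/ and a batch size of 10, as in the commented-out line above):

if __name__ == '__main__':
    # Build and cache the id-converted, augmented sentences for both splits
    get_data_with_windows('train')
    get_data_with_windows('test')

    # Wrap the cached pickle in a BatchManager and iterate over padded batches
    train_manager = BatchManager(10, 'train')
    for chars, bounds, flags, radicals, pinyins, targets in train_manager.iter_batch(shuffle=True):
        pass  # feed the batch to the model here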