I have been working on entity extraction lately. An article runs to a few thousand characters, and splitting it into 300-character segments yields a varying number of sentences. A few sentences are fine, but when there are many, the computation becomes a heavy burden, because only a small fraction of the paragraphs in an article actually contain the target entities. To cut the cost, the best option is to have the entity-extraction model predict only on paragraphs that contain entities. My first idea was to keep just the first two and last two paragraphs, but in some articles the entities appear right in the middle, so filtering paragraphs became unavoidable. Keyword matching tends to keep too many redundant sentences and still does not solve the cost problem, so a model-based filter is the best choice.
There are plenty of models to choose from for text classification. A BERT-family model fine-tuned for classification would certainly reach good accuracy, but it would also add to the computational burden. Fortunately, deciding whether a paragraph contains an entity is only a coarse judgment, so a CNN is enough for the job, which is how this post came about. I used to train and deploy models with Keras; recently I switched to training with torch and deploying via ONNX, and with torch 2 likely to arrive next year with a big speed-up for training and deployment, I am writing down this torch-based text classification model.
If words are the smallest unit, you build a word vocabulary; if characters are, you build a character vocabulary. For Chinese, a character vocabulary performs about as well as a word vocabulary and its embedding table is much smaller, so it is a good choice; on the other hand, pretrained vectors such as word2vec are usually word vectors, so when pretrained vectors are available a word vocabulary is also a fine option. First, build the character vocabulary from the training text; its main job is to map characters to integer ids:
import json
import re
import glob
import torch
import numpy as np
np.random.seed(42)
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from torch import nn
from tqdm import tqdm
def get_vocab():
counter = Counter()
with open(r'D:\open_data\ner\bid_data\labeled_data_for_mrc/labeled_data_for_mrc.jsonl', 'r', encoding='utf8') as f:
for i, line in enumerate(f):
if i == 1000:
break
line = json.loads(line)
text = re.sub('\s+', '', line['text'])
counter.update(text)
updated_files = glob.glob(r'D:\open_data\ner\bid_data\labeled_data_for_mrc/updated*')
for file in updated_files:
with open(file, 'r', encoding='utf8') as f:
for i, line in enumerate(f):
line = json.loads(line)
text = re.sub('\s+', '', line['text'])
counter.update(text)
vocabs = list(zip(*counter.most_common()))[0]
with open('vocab.txt', 'w', encoding='utf8') as f:
f.write('[PAD]\n')
f.write('[UNK]\n')
f.write('\n'.join(vocabs))
get_vocab()
Because my data is spread across several files, the code looks more complicated than it is; the core is simply updating a Counter with all the text and then writing the characters to the vocab file in descending frequency order. Position 0 holds the padding token [PAD] and position 1 holds [UNK] for characters that never appear in the vocabulary; ordering by frequency clusters the common characters together, which I believe helps the model learn. With the vocab file in hand, the next step is the mapping between tokens and ids:
def load_vocab(vocab_file):
vocab = open(vocab_file, encoding='utf8').read().splitlines()
vocab2id = {x:i for i,x in enumerate(vocab)}
return vocab2id
vocab2id = load_vocab('vocab.txt')
Next, load the data. I turn it into a list whose elements are (text, whether an entity exists):
def exists(obj):
"""目标是否存在"""
if obj:
return True
else:
return False
def load_data():
"""加载数据"""
out = []
with open(r'D:\open_data\ner\bid_data\labeled_data_for_mrc/labeled_data_for_mrc.jsonl', 'r', encoding='utf8') as f:
for i, line in enumerate(f):
if i == 1000:
break
line = json.loads(line)
text = re.sub('\s+', '', line['text'])
out.append([text, int(exists(line['labels']))])
updated_files = glob.glob(r'D:\open_data\ner\bid_data\labeled_data_for_mrc/updated*')
for file in updated_files:
with open(file, 'r', encoding='utf8') as f:
for i, line in enumerate(f):
line = json.loads(line)
text = re.sub('\s+', '', line['text'])
out.append([text, int(exists(line['label']))])
print(f"total samples: {len(out)} has labels: {sum(list(zip(*out))[1])}")
return out
def split_data(data):
"""切分数据集"""
np.random.shuffle(data)
length = len(data)
train_data, valid_data, test_data = data[:int(length * 0.8)], data[int(length * 0.8):int(length * 0.9)], data[int(length * 0.9):]
return train_data, valid_data, test_data
data = load_data()
train_data, valid_data, test_data = split_data(data)
#total samples: 1122 has labels: 492
My dataset contains 1122 samples, 492 of which have labels. Next, build the data pipeline with torch's Dataset and DataLoader: tokenize maps text to a list of ids, sequence_padding pads sequences to a common length, and collate_fn assembles individual samples into a batch:
def tokenize(text):
"""将文本映射成数字向量"""
return [vocab2id[x] if x in vocab2id else vocab2id[1] for x in text]
def sequence_padding(inputs, length=None, value=0, seq_dims=1, mode='post'):
"""Numpy函数,将序列padding到同一长度
"""
if length is None:
length = np.max([np.shape(x)[:seq_dims] for x in inputs], axis=0)
elif not hasattr(length, '__getitem__'):
length = [length]
slices = [np.s_[:length[i]] for i in range(seq_dims)]
slices = tuple(slices) if len(slices) > 1 else slices[0]
pad_width = [(0, 0) for _ in np.shape(inputs[0])]
outputs = []
for x in inputs:
x = x[slices]
for i in range(seq_dims):
if mode == 'post':
pad_width[i] = (0, length[i] - np.shape(x)[i])
elif mode == 'pre':
pad_width[i] = (length[i] - np.shape(x)[i], 0)
else:
raise ValueError('"mode" argument must be "post" or "pre".')
x = np.pad(x, pad_width, 'constant', constant_values=value)
outputs.append(x)
return np.array(outputs)
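For intuition, here is a quick toy check of tokenize and sequence_padding (the ids shown are only illustrative; they depend on your vocab.txt):
# Toy usage: tokenize two sentences and pad them to a common length.
batch = [tokenize('会议地点'), tokenize('中标人')]
print(sequence_padding(batch))
# The shorter row is right-padded with 0, i.e. [PAD], e.g.:
# [[ 5 17  9 23]
#  [12  3 41  0]]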
class BidDataset(Dataset):
def __init__(self, data):
super(BidDataset, self).__init__()
self.data = data
def __getitem__(self, index):
        d = self.data[index]  # (text, whether an entity exists)
input_ids = tokenize(d[0])
        labels = [d[1]]
mask = [1] * len(input_ids)
return input_ids, labels, mask
def __len__(self):
return len(self.data)
def collate_fn(batch):
input_ids, labels, mask = list(zip(*batch))
input_ids = torch.LongTensor(sequence_padding(input_ids))
    labels = torch.FloatTensor(sequence_padding(labels))
mask = torch.LongTensor(sequence_padding(mask))
return input_ids, labels, mask
def get_dataloader(dataset):
return DataLoader(dataset, batch_size=8, collate_fn=collate_fn)
train_dataset, valid_dataset, test_dataset = BidDataset(train_data), BidDataset(valid_data), BidDataset(test_data)
train_dataloader, valid_dataloader, test_dataloader = get_dataloader(train_dataset), get_dataloader(valid_dataset), get_dataloader(test_dataset)
With the data ready we can define the model. We use DGCNN (dilated gated convolutions), which combines a large receptive field with fast inference:
class ResidualGatedConv1D(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, dilation_rate):
super(ResidualGatedConv1D, self).__init__()
self.out_channels = out_channels
self.conv1d = nn.Conv1d(
in_channels=in_channels,
out_channels=out_channels * 2,
kernel_size=kernel_size,
dilation=dilation_rate,
padding=dilation_rate
)
self.layernorm = nn.LayerNorm([out_channels])
self.alpha = nn.Parameter(torch.zeros(1))
def forward(self, x):
x = x.transpose(2,1)
outputs = self.conv1d(x)
gate = torch.sigmoid(outputs[:, self.out_channels:])
outputs = outputs[:, :self.out_channels] * gate
outputs = self.layernorm(outputs.transpose(2,1))
x = x.transpose(2,1) + self.alpha * outputs
return x
class GlobalAveragePopl1D(nn.Module):
"""对某一维进行平均"""
def __init__(self):
super(GlobalAveragePopl1D, self).__init__()
def forward(self, x):
return torch.mean(x, dim=1)
class DGCNN(nn.Module):
def __init__(self):
super(DGCNN, self).__init__()
self.dgcnn = nn.Sequential(
nn.Embedding(len(vocab2id), 256, padding_idx=0),
ResidualGatedConv1D(256, 256, 3, 1),
nn.Dropout(0.1),
ResidualGatedConv1D(256, 256, 3, 2),
nn.Dropout(0.1),
ResidualGatedConv1D(256, 256, 3, 4),
nn.Dropout(0.1),
ResidualGatedConv1D(256, 256, 3, 8),
nn.Dropout(0.1),
ResidualGatedConv1D(256, 256, 3, 1),
nn.Dropout(0.1),
ResidualGatedConv1D(256, 256, 3, 1),
nn.Dropout(0.1),
GlobalAveragePopl1D(),
nn.Linear(256, 256),
nn.Dropout(0.1),
nn.Linear(256, 1),
nn.Sigmoid()
)
def forward(self, x):
return self.dgcnn(x)
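To make the "large receptive field" claim concrete: each kernel-3 convolution with dilation d widens the receptive field by 2·d, so the stacked dilations 1, 2, 4, 8, 1, 1 above let every position see 1 + 2·(1 + 2 + 4 + 8 + 1 + 1) = 35 characters before the global average pooling, at the cost of only six cheap convolution layers.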
With the model defined, we can get on with training:
def loss_fn(y_true, y_pred):
loss = nn.BCELoss()(y_pred, y_true)
return loss
def acc_metric(y_true, y_pred):
y_pred = y_pred > 0.5
correct = torch.sum(y_true == y_pred)
return correct / y_true.shape[0]
def train():
model = DGCNN()
model.cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
best_acc = 0
for _ in range(40):
model.train()
total_loss = 0
total_acc = 0
pbar = tqdm(enumerate(train_dataloader, 1), desc='train')
for batch_id, batch in pbar:
input_ids, label, mask = batch
input_ids, label = input_ids.cuda(), label.cuda()
logits = model(input_ids)
loss = loss_fn(y_true=label, y_pred=logits)
acc = acc_metric(y_true=label, y_pred=logits)
total_loss += loss.item()
total_acc += acc.item()
pbar.set_postfix(loss=total_loss / batch_id, acc=total_acc / batch_id)
optimizer.zero_grad()
loss.backward()
optimizer.step()
pbar = tqdm(enumerate(valid_dataloader, 1), desc='dev')
model.eval()
total_acc = 0
for batch_id, batch in pbar:
input_ids, label, mask = batch
input_ids, label = input_ids.cuda(), label.cuda()
with torch.no_grad():
logits = model(input_ids)
acc = acc_metric(y_true=label, y_pred=logits)
total_acc += acc.item()
pbar.set_postfix(acc=total_acc / batch_id)
if total_acc / batch_id > best_acc:
best_acc = total_acc / batch_id
torch.save(model.state_dict(), 'best_model.pt')
print(f'best model saved at epoch {_} with best acc {best_acc}')
def evaluate():
model = DGCNN()
model.load_state_dict(torch.load('best_model.pt'))
model.cuda()
    pbar = tqdm(enumerate(test_dataloader, 1), desc='test')
model.eval()
total_acc = 0
for batch_id, batch in pbar:
input_ids, label, mask = batch
input_ids, label = input_ids.cuda(), label.cuda()
with torch.no_grad():
logits = model(input_ids)
acc = acc_metric(y_true=label, y_pred=logits)
total_acc += acc.item()
pbar.set_postfix(acc=total_acc / batch_id)
if __name__ == '__main__':
train()
evaluate()
This model does reasonably well, reaching about 90% accuracy.
That is still not enough for real use, so I made a few improvements. The main changes, all visible in the code below, are: training on the larger predicted_labeled_data_for_mrc.jsonl dataset, switching from DGCNN to a TextCNN with kernel sizes 3/4/5, fixing the [UNK] lookup in tokenize, treating the label as a float of shape (batch, 1) so it matches BCELoss, using AdamW with a warmup plus polynomial-decay learning-rate schedule, adding gradient clipping, and fixing the random seeds.
These changes raised accuracy to 92%, which meets the practical requirement. The full improved code:
import json
import re
import glob
import torch
import numpy as np
import torch.nn.functional as F
np.random.seed(42)
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from torch import nn
from tqdm import tqdm
from transformers import AdamW, get_polynomial_decay_schedule_with_warmup
torch.manual_seed(3407)
torch.cuda.manual_seed(3407)
torch.cuda.manual_seed_all(3407)
HIDDEN_SIZE = 300
EPOCHS = 10
def load_vocab(vocab_file):
vocab = open(vocab_file, encoding='utf8').read().splitlines()
vocab2id = {x:i for i,x in enumerate(vocab)}
return vocab, vocab2id
vocab, vocab2id = load_vocab('vocab.txt')
def exists(obj):
"""目标是否存在"""
if obj:
return True
else:
return False
def load_data():
"""加载数据"""
out = []
# with open(r'D:\open_data\ner\bid_data\labeled_data_for_mrc/labeled_data_for_mrc.jsonl', 'r', encoding='utf8') as f:
# for i, line in enumerate(f):
# if i == 1000:
# break
# line = json.loads(line)
# text = re.sub('\s+', '', line['text'])
# out.append([text, int(exists(line['labels']))])
#
# updated_files = glob.glob(r'D:\open_data\ner\bid_data\labeled_data_for_mrc/updated*')
# for file in updated_files:
# with open(file, 'r', encoding='utf8') as f:
# for i, line in enumerate(f):
# line = json.loads(line)
# text = re.sub('\s+', '', line['text'])
# out.append([text, int(exists(line['label']))])
with open(r'D:\PekingInfoOtherSearch\bert-mrc-pytorch\predicted_labeled_data_for_mrc.jsonl', 'r', encoding='utf8') as f:
for i, line in enumerate(f):
line = json.loads(line)
text = re.sub('\s+', '', line['text'])
out.append([text, int(exists(line['labels']))])
print(f"total samples: {len(out)} has labels: {sum(list(zip(*out))[1])}")
return out
def split_data(data):
"""切分数据集"""
np.random.shuffle(data)
length = len(data)
train_data, valid_data, test_data = data[:int(length * 0.8)], data[int(length * 0.8):int(length * 0.9)], data[int(length * 0.9):]
return train_data, valid_data, test_data
data = load_data()
train_data, valid_data, test_data = split_data(data)
def tokenize(text):
"""将文本映射成数字向量"""
return [vocab2id[x] if x in vocab2id else vocab2id['[UNK]'] for x in text]
def sequence_padding(inputs, length=None, value=0, seq_dims=1, mode='post'):
"""Numpy函数,将序列padding到同一长度
"""
if length is None:
length = np.max([np.shape(x)[:seq_dims] for x in inputs], axis=0)
elif not hasattr(length, '__getitem__'):
length = [length]
slices = [np.s_[:length[i]] for i in range(seq_dims)]
slices = tuple(slices) if len(slices) > 1 else slices[0]
pad_width = [(0, 0) for _ in np.shape(inputs[0])]
outputs = []
for x in inputs:
x = x[slices]
for i in range(seq_dims):
if mode == 'post':
pad_width[i] = (0, length[i] - np.shape(x)[i])
elif mode == 'pre':
pad_width[i] = (length[i] - np.shape(x)[i], 0)
else:
raise ValueError('"mode" argument must be "post" or "pre".')
x = np.pad(x, pad_width, 'constant', constant_values=value)
outputs.append(x)
return np.array(outputs)
class BidDataset(Dataset):
def __init__(self, data):
super(BidDataset, self).__init__()
self.data = data
def __getitem__(self, index):
        d = self.data[index]  # (text, whether an entity exists)
input_ids = tokenize(d[0])
labels = [d[1]]
mask = [1] * len(input_ids)
return input_ids, labels, mask
def __len__(self):
return len(self.data)
def collate_fn(batch):
input_ids, labels, mask = list(zip(*batch))
input_ids = torch.LongTensor(sequence_padding(input_ids))
labels = torch.FloatTensor(sequence_padding(labels))
mask = torch.LongTensor(sequence_padding(mask))
return input_ids, labels, mask
def get_dataloader(dataset):
return DataLoader(dataset, batch_size=8, collate_fn=collate_fn)
train_dataset, valid_dataset, test_dataset = BidDataset(train_data), BidDataset(valid_data), BidDataset(test_data)
train_dataloader, valid_dataloader, test_dataloader = get_dataloader(train_dataset), get_dataloader(valid_dataset), get_dataloader(test_dataset)
class ResidualGatedConv1D(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, dilation_rate):
super(ResidualGatedConv1D, self).__init__()
self.out_channels = out_channels
self.conv1d = nn.Conv1d(
in_channels=in_channels,
out_channels=out_channels * 2,
kernel_size=kernel_size,
dilation=dilation_rate,
padding=dilation_rate
)
self.layernorm = nn.LayerNorm([out_channels])
self.alpha = nn.Parameter(torch.zeros(1))
def forward(self, x):
x = x.transpose(2,1)
outputs = self.conv1d(x)
gate = torch.sigmoid(outputs[:, self.out_channels:])
outputs = outputs[:, :self.out_channels] * gate
outputs = self.layernorm(outputs.transpose(2,1))
x = x.transpose(2,1) + self.alpha * outputs
return x
class GlobalAveragePopl1D(nn.Module):
"""对某一维进行平均"""
def __init__(self):
super(GlobalAveragePopl1D, self).__init__()
def forward(self, x):
return torch.mean(x, dim=1)
class Embedding(nn.Module):
def __init__(self):
super(Embedding, self).__init__()
self.embed = nn.Embedding(len(vocab2id), HIDDEN_SIZE, padding_idx=0)
# self.embed.weight.data.copy_(torch.tensor(embedding).float())
def forward(self, x):
return self.embed(x)
class DGCNN(nn.Module):
def __init__(self):
super(DGCNN, self).__init__()
drop_rate = 0.1
hidden_size = 300
self.dgcnn = nn.Sequential(
Embedding(),
ResidualGatedConv1D(hidden_size, hidden_size, 3, 1),
nn.Dropout(drop_rate),
ResidualGatedConv1D(hidden_size, hidden_size, 3, 2),
nn.Dropout(drop_rate),
ResidualGatedConv1D(hidden_size, hidden_size, 3, 4),
nn.Dropout(drop_rate),
ResidualGatedConv1D(hidden_size, hidden_size, 3, 8),
nn.Dropout(drop_rate),
ResidualGatedConv1D(hidden_size, hidden_size, 3, 1),
nn.Dropout(drop_rate),
ResidualGatedConv1D(hidden_size, hidden_size, 3, 1),
nn.Dropout(drop_rate),
GlobalAveragePopl1D(),
nn.Linear(hidden_size, hidden_size),
nn.Dropout(drop_rate),
nn.Linear(hidden_size, 1),
nn.Sigmoid()
)
def forward(self, x):
return self.dgcnn(x)
class textCNN(nn.Module):
def __init__(self):
super(textCNN, self).__init__()
self.embed = Embedding()
kernel_wins = [3,4,5]
dim_channel = 100
# Convolutional Layers with different window size kernels
self.convs = nn.ModuleList([nn.Conv2d(1, dim_channel, (w, HIDDEN_SIZE)) for w in kernel_wins])
# Dropout layer
self.dropout = nn.Dropout(0.1)
# FC layer
self.fc = nn.Linear(len(kernel_wins) * dim_channel, 1)
def forward(self, x):
emb_x = self.embed(x)
emb_x = emb_x.unsqueeze(1)
con_x = [conv(emb_x) for conv in self.convs]
pool_x = [F.max_pool1d(x.squeeze(-1), x.size()[2]) for x in con_x]
fc_x = torch.cat(pool_x, dim=1)
fc_x = fc_x.squeeze(-1)
fc_x = self.dropout(fc_x)
logit = torch.sigmoid(self.fc(fc_x))
return logit
def loss_fn(y_true, y_pred):
loss = nn.BCELoss()(y_pred, y_true)
return loss
def acc_metric(y_true, y_pred):
y_pred = (y_pred > 0.5).float()
correct = torch.sum(y_true == y_pred)
acc = correct / y_true.shape[0]
recall = torch.sum(y_true * y_pred) / torch.sum(y_true).clamp(1e-9)
precision = torch.sum(y_true * y_pred) / torch.sum(y_pred).clamp(1e-9)
return acc, recall, precision
def build_optimizer_and_scheduler(model, warmup_proportion, total_steps):
module = (model.module if hasattr(model, "module") else model)
model_param = module.parameters()
warmup_steps = int(warmup_proportion * total_steps)
optimizer = AdamW(model_param, lr=0.0005, eps=1e-8)
scheduler = get_polynomial_decay_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps, lr_end=1e-5)
return optimizer, scheduler
def train():
model = textCNN()
model.cuda()
optimizer, scheduler = build_optimizer_and_scheduler(model, 0.1, len(train_dataloader)*EPOCHS)
best_acc = 0
for _ in range(EPOCHS):
model.train()
total_loss = 0
total_acc = 0
total_recall = 0
total_precison = 0
pbar = tqdm(enumerate(train_dataloader, 1), desc='train')
for batch_id, batch in pbar:
input_ids, label, mask = batch
input_ids, label = input_ids.cuda(), label.cuda()
logits = model(input_ids)
loss = loss_fn(y_true=label, y_pred=logits)
acc, recall, precision = acc_metric(y_true=label, y_pred=logits)
total_loss += loss.item()
total_acc += acc.item()
total_recall += recall.item()
total_precison += precision.item()
pbar.set_description(f'Epoch {_}/{EPOCHS}')
pbar.set_postfix(loss=total_loss / batch_id,
acc=total_acc / batch_id,
recall = total_recall / batch_id,
precision = total_precison / batch_id,
lr=optimizer.param_groups[0]["lr"])
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
scheduler.step()
optimizer.zero_grad()
pbar = tqdm(enumerate(valid_dataloader, 1), desc='dev')
model.eval()
total_acc = 0
total_recall = 0
total_precison = 0
for batch_id, batch in pbar:
input_ids, label, mask = batch
input_ids, label = input_ids.cuda(), label.cuda()
with torch.no_grad():
logits = model(input_ids)
acc, recall, precision = acc_metric(y_true=label, y_pred=logits)
total_acc += acc.item()
total_recall += recall.item()
total_precison += precision.item()
pbar.set_postfix(acc=total_acc / batch_id,
recall=total_recall / batch_id,
precision=total_precison / batch_id,
)
if total_acc / batch_id > best_acc:
best_acc = total_acc / batch_id
torch.save(model.state_dict(), 'best_model.pt')
print(f'best model saved at epoch {_} with best acc {best_acc}')
def evaluate():
    model = textCNN()
model.load_state_dict(torch.load('best_model.pt'))
model.cuda()
pbar = tqdm(enumerate(test_dataloader, 1), desc='test')
model.eval()
total_acc = 0
total_recall = 0
total_precison = 0
for batch_id, batch in pbar:
input_ids, label, mask = batch
input_ids, label = input_ids.cuda(), label.cuda()
with torch.no_grad():
logits = model(input_ids)
acc, recall, precision = acc_metric(y_true=label, y_pred=logits)
total_acc += acc.item()
total_recall += recall.item()
total_precison += precision.item()
pbar.set_postfix(acc=total_acc / batch_id,
recall=total_recall / batch_id,
precision=total_precison / batch_id,
)
def convert2onnx():
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import torch
if torch.cuda.is_available():
device = 'cuda:0'
else:
device = 'cpu'
    model = textCNN()
model.load_state_dict(torch.load('best_model.pt', map_location=device))
model.to(device)
model.eval()
x = torch.zeros(1, 300, requires_grad=True).long()
torch.onnx.export(model, # model being run
x, # model input (or a tuple for multiple inputs)
"best_model.onnx", # where to save the model (can be a file or file-like object)
export_params=True, # store the trained parameter weights inside the model file
opset_version=14, # the ONNX version to export the model to
do_constant_folding=True, # whether to execute constant folding for optimization
input_names = ['x'], # the model's input names
output_names = ['output'], # the model's output names
dynamic_axes={'x' : {0 : 'batch_size', 1: 'seqlen'},# variable length axes
'output' : {0 : 'batch_size', 1: 'seqlen'}})
if __name__ == '__main__':
train()
# evaluate()
# convert2onnx()
Pretrained word vectors were mentioned above, so I also compared a model that combines jieba word segmentation with the pretrained Sogou word vectors. The best result was 0.927 accuracy, a 0.7-point gain, which is a decent improvement, so the word-vector model is the one I use in practice. The full code:
import json
import re
import glob
import torch
import jieba
import numpy as np
import torch.nn.functional as F
np.random.seed(42)
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from torch import nn
from tqdm import tqdm
from transformers import AdamW, get_polynomial_decay_schedule_with_warmup
torch.manual_seed(3407)
torch.cuda.manual_seed(3407)
torch.cuda.manual_seed_all(3407)
HIDDEN_SIZE = 300
EPOCHS = 10
def load_vocab(vocab_file):
vocab = open(vocab_file, encoding='utf8').read().splitlines()
vocab2id = {x:i for i,x in enumerate(vocab)}
return vocab, vocab2id
vocab, vocab2id = load_vocab('word_vocab.txt')
def load_embedding():
vocab2embed = {}
with open(r'D:\PekingInfoResearch\pretrain_models\word2vec\sgns.sogou.char', encoding='utf8') as f:
f.readline()
for line in tqdm(f, 'load embedding'):
line = line.split()
vocab2embed[line[0]] = list(map(float, line[1:]))
out_embedding = []
for word in vocab:
if word in vocab2embed:
out_embedding.append(vocab2embed[word])
else:
out_embedding.append(np.zeros(300))
return np.array(out_embedding)
def exists(obj):
"""目标是否存在"""
if obj:
return True
else:
return False
def load_data():
"""加载数据"""
out = []
# with open(r'D:\open_data\ner\bid_data\labeled_data_for_mrc/labeled_data_for_mrc.jsonl', 'r', encoding='utf8') as f:
# for i, line in enumerate(f):
# if i == 1000:
# break
# line = json.loads(line)
# text = re.sub('\s+', '', line['text'])
# out.append([text, int(exists(line['labels']))])
#
# updated_files = glob.glob(r'D:\open_data\ner\bid_data\labeled_data_for_mrc/updated*')
# for file in updated_files:
# with open(file, 'r', encoding='utf8') as f:
# for i, line in enumerate(f):
# line = json.loads(line)
# text = re.sub('\s+', '', line['text'])
# out.append([text, int(exists(line['label']))])
with open(r'D:\PekingInfoOtherSearch\bert-mrc-pytorch\predicted_labeled_data_for_mrc.jsonl', 'r', encoding='utf8') as f:
for i, line in enumerate(f):
line = json.loads(line)
text = re.sub('\s+', '', line['text'])
out.append([text, int(exists(line['labels']))])
print(f"total samples: {len(out)} has labels: {sum(list(zip(*out))[1])}")
return out
def split_data(data):
"""切分数据集"""
np.random.shuffle(data)
length = len(data)
train_data, valid_data, test_data = data[:int(length * 0.8)], data[int(length * 0.8):int(length * 0.9)], data[int(length * 0.9):]
return train_data, valid_data, test_data
data = load_data()
train_data, valid_data, test_data = split_data(data)
def tokenize(text):
"""将文本映射成数字向量"""
return [vocab2id[x] if x in vocab2id else vocab2id['[UNK]'] for x in text]
def sequence_padding(inputs, length=None, value=0, seq_dims=1, mode='post'):
"""Numpy函数,将序列padding到同一长度
"""
if length is None:
length = np.max([np.shape(x)[:seq_dims] for x in inputs], axis=0)
elif not hasattr(length, '__getitem__'):
length = [length]
slices = [np.s_[:length[i]] for i in range(seq_dims)]
slices = tuple(slices) if len(slices) > 1 else slices[0]
pad_width = [(0, 0) for _ in np.shape(inputs[0])]
outputs = []
for x in inputs:
x = x[slices]
for i in range(seq_dims):
if mode == 'post':
pad_width[i] = (0, length[i] - np.shape(x)[i])
elif mode == 'pre':
pad_width[i] = (length[i] - np.shape(x)[i], 0)
else:
raise ValueError('"mode" argument must be "post" or "pre".')
x = np.pad(x, pad_width, 'constant', constant_values=value)
outputs.append(x)
return np.array(outputs)
class BidDataset(Dataset):
def __init__(self, data):
super(BidDataset, self).__init__()
self.data = data
def __getitem__(self, index):
        d = self.data[index]  # (text, whether an entity exists)
input_ids = tokenize(jieba.lcut(d[0]))
labels = [d[1]]
mask = [1] * len(input_ids)
return input_ids, labels, mask
def __len__(self):
return len(self.data)
def collate_fn(batch):
input_ids, labels, mask = list(zip(*batch))
input_ids = torch.LongTensor(sequence_padding(input_ids))
labels = torch.FloatTensor(sequence_padding(labels))
mask = torch.LongTensor(sequence_padding(mask))
return input_ids, labels, mask
def get_dataloader(dataset):
return DataLoader(dataset, batch_size=8, collate_fn=collate_fn)
train_dataset, valid_dataset, test_dataset = BidDataset(train_data), BidDataset(valid_data), BidDataset(test_data)
train_dataloader, valid_dataloader, test_dataloader = get_dataloader(train_dataset), get_dataloader(valid_dataset), get_dataloader(test_dataset)
class ResidualGatedConv1D(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, dilation_rate):
super(ResidualGatedConv1D, self).__init__()
self.out_channels = out_channels
self.conv1d = nn.Conv1d(
in_channels=in_channels,
out_channels=out_channels * 2,
kernel_size=kernel_size,
dilation=dilation_rate,
padding=dilation_rate
)
self.layernorm = nn.LayerNorm([out_channels])
self.alpha = nn.Parameter(torch.zeros(1))
def forward(self, x):
x = x.transpose(2,1)
outputs = self.conv1d(x)
gate = torch.sigmoid(outputs[:, self.out_channels:])
outputs = outputs[:, :self.out_channels] * gate
outputs = self.layernorm(outputs.transpose(2,1))
x = x.transpose(2,1) + self.alpha * outputs
return x
class GlobalAveragePopl1D(nn.Module):
"""对某一维进行平均"""
def __init__(self):
super(GlobalAveragePopl1D, self).__init__()
def forward(self, x):
return torch.mean(x, dim=1)
class Embedding(nn.Module):
def __init__(self, embedding=None):
super(Embedding, self).__init__()
self.embed = nn.Embedding(len(vocab2id), HIDDEN_SIZE, padding_idx=0)
        if embedding is not None:
self.embed.weight.data.copy_(torch.tensor(embedding).float())
def forward(self, x):
return self.embed(x)
class DGCNN(nn.Module):
def __init__(self):
super(DGCNN, self).__init__()
drop_rate = 0.1
hidden_size = 300
self.dgcnn = nn.Sequential(
Embedding(),
ResidualGatedConv1D(hidden_size, hidden_size, 3, 1),
nn.Dropout(drop_rate),
ResidualGatedConv1D(hidden_size, hidden_size, 3, 2),
nn.Dropout(drop_rate),
ResidualGatedConv1D(hidden_size, hidden_size, 3, 4),
nn.Dropout(drop_rate),
ResidualGatedConv1D(hidden_size, hidden_size, 3, 8),
nn.Dropout(drop_rate),
ResidualGatedConv1D(hidden_size, hidden_size, 3, 1),
nn.Dropout(drop_rate),
ResidualGatedConv1D(hidden_size, hidden_size, 3, 1),
nn.Dropout(drop_rate),
GlobalAveragePopl1D(),
nn.Linear(hidden_size, hidden_size),
nn.Dropout(drop_rate),
nn.Linear(hidden_size, 1),
nn.Sigmoid()
)
def forward(self, x):
return self.dgcnn(x)
class textCNN(nn.Module):
def __init__(self, embedding=None):
super(textCNN, self).__init__()
self.embed = Embedding(embedding)
kernel_wins = [3,4,5]
dim_channel = 100
# Convolutional Layers with different window size kernels
self.convs = nn.ModuleList([nn.Conv2d(1, dim_channel, (w, HIDDEN_SIZE)) for w in kernel_wins])
# Dropout layer
self.dropout = nn.Dropout(0.1)
# FC layer
self.fc = nn.Linear(len(kernel_wins) * dim_channel, 1)
def forward(self, x):
emb_x = self.embed(x)
emb_x = emb_x.unsqueeze(1)
con_x = [conv(emb_x) for conv in self.convs]
pool_x = [F.adaptive_max_pool1d(x.squeeze(-1), 1) for x in con_x]
fc_x = torch.cat(pool_x, dim=1)
fc_x = fc_x.squeeze(-1)
fc_x = self.dropout(fc_x)
logit = torch.sigmoid(self.fc(fc_x))
return logit
def loss_fn(y_true, y_pred):
loss = nn.BCELoss()(y_pred, y_true)
return loss
def acc_metric(y_true, y_pred):
y_pred = (y_pred > 0.5).float()
correct = torch.sum(y_true == y_pred)
acc = correct / y_true.shape[0]
recall = torch.sum(y_true * y_pred) / torch.sum(y_true).clamp(1e-9)
precision = torch.sum(y_true * y_pred) / torch.sum(y_pred).clamp(1e-9)
return acc, recall, precision
def build_optimizer_and_scheduler(model, warmup_proportion, total_steps):
module = (model.module if hasattr(model, "module") else model)
model_param = module.parameters()
warmup_steps = int(warmup_proportion * total_steps)
optimizer = AdamW(model_param, lr=0.001, eps=1e-8)
scheduler = get_polynomial_decay_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps, lr_end=1e-5)
return optimizer, scheduler
def train(embedding=None):
model = textCNN(embedding)
model.cuda()
optimizer, scheduler = build_optimizer_and_scheduler(model, 0.1, len(train_dataloader)*EPOCHS)
best_acc = 0
for _ in range(EPOCHS):
model.train()
total_loss = 0
total_acc = 0
total_recall = 0
total_precison = 0
pbar = tqdm(enumerate(train_dataloader, 1), desc='train')
for batch_id, batch in pbar:
input_ids, label, mask = batch
input_ids, label = input_ids.cuda(), label.cuda()
logits = model(input_ids)
loss = loss_fn(y_true=label, y_pred=logits)
acc, recall, precision = acc_metric(y_true=label, y_pred=logits)
total_loss += loss.item()
total_acc += acc.item()
total_recall += recall.item()
total_precison += precision.item()
pbar.set_description(f'Epoch {_}/{EPOCHS}')
pbar.set_postfix(loss=total_loss / batch_id,
acc=total_acc / batch_id,
recall = total_recall / batch_id,
precision = total_precison / batch_id,
lr=optimizer.param_groups[0]["lr"])
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
scheduler.step()
optimizer.zero_grad()
pbar = tqdm(enumerate(valid_dataloader, 1), desc='dev')
model.eval()
total_acc = 0
total_recall = 0
total_precison = 0
for batch_id, batch in pbar:
input_ids, label, mask = batch
input_ids, label = input_ids.cuda(), label.cuda()
with torch.no_grad():
logits = model(input_ids)
acc, recall, precision = acc_metric(y_true=label, y_pred=logits)
total_acc += acc.item()
total_recall += recall.item()
total_precison += precision.item()
pbar.set_postfix(acc=total_acc / batch_id,
recall=total_recall / batch_id,
precision=total_precison / batch_id,
)
if total_acc / batch_id > best_acc:
best_acc = total_acc / batch_id
torch.save(model.state_dict(), 'best_model.word.pt')
print(f'best model saved at epoch {_} with best acc {best_acc}')
def evaluate():
model = textCNN()
model.load_state_dict(torch.load('best_model.word.pt'))
model.cuda()
pbar = tqdm(enumerate(test_dataloader, 1), desc='test')
model.eval()
total_acc = 0
total_recall = 0
total_precison = 0
for batch_id, batch in pbar:
input_ids, label, mask = batch
input_ids, label = input_ids.cuda(), label.cuda()
with torch.no_grad():
logits = model(input_ids)
acc, recall, precision = acc_metric(y_true=label, y_pred=logits)
total_acc += acc.item()
total_recall += recall.item()
total_precison += precision.item()
pbar.set_postfix(acc=total_acc / batch_id,
recall=total_recall / batch_id,
precision=total_precison / batch_id,
)
def convert2onnx():
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import torch
if torch.cuda.is_available():
device = 'cuda:0'
else:
device = 'cpu'
model = textCNN()
model.load_state_dict(torch.load('best_model.word.pt', map_location=device))
model.to(device)
model.eval()
x = torch.zeros(1, 300, requires_grad=True).long()
torch.onnx.export(model, # model being run
x, # model input (or a tuple for multiple inputs)
"best_model.word.onnx", # where to save the model (can be a file or file-like object)
export_params=True, # store the trained parameter weights inside the model file
opset_version=14, # the ONNX version to export the model to
do_constant_folding=True, # whether to execute constant folding for optimization
input_names = ['x'], # the model's input names
output_names = ['output'], # the model's output names
dynamic_axes={'x' : {0 : 'batch_size', 1: 'seqlen'},# variable length axes
'output' : {0 : 'batch_size', 1: 'seqlen'}})
if __name__ == '__main__':
    # embedding = load_embedding()
    # train(embedding)
# evaluate()
convert2onnx()
In actual use it is fast and works well!
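For reference, a minimal sketch of serving the exported word-level model with onnxruntime; the predict helper below is illustrative and assumes the tokenize and vocab2id defined above are available in scope (it is not part of the original deployment code):
import re
import jieba
import numpy as np
import onnxruntime as ort
session = ort.InferenceSession('best_model.word.onnx', providers=['CPUExecutionProvider'])
def predict(text):
    # Same preprocessing as training: drop whitespace, segment with jieba, map words to ids.
    # Note: the TextCNN kernels (3/4/5) need at least 5 tokens per input.
    text = re.sub(r'\s+', '', text)
    input_ids = np.array([tokenize(jieba.lcut(text))], dtype=np.int64)
    prob = session.run(['output'], {'x': input_ids})[0]  # sigmoid output, shape (1, 1)
    return float(prob[0][0])  # probability that the paragraph contains an entity
Paragraphs scoring above 0.5 are then handed to the heavier entity-extraction model, and everything else is skipped.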