Train a BiLSTM-CRF named entity recognition model on train.txt and train_TAG.txt, then use it to label the sequences in test.txt and write out a tag file in the same format as train_TAG.txt: the line order, line breaks, and within-line order of test.txt are preserved, and the tags of the characters in each line are separated by spaces. The output file is named <student ID>.txt.
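For concreteness, the data format assumed throughout is one sentence per line with characters separated by spaces, and a parallel tag line in the *_TAG file. The example below is made up for illustration; the actual tag set is whatever appears in train_TAG.txt:

train.txt:      我 在 北 京 读 书
train_TAG.txt:  O O B-LOC I-LOC O O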
_forward_alg_new_parallel
def _forward_alg_new_parallel(self, feats):
    # feats: batch_size x seq_len x tagset_size emission scores
    init_alphas = torch.full([feats.shape[0], self.tagset_size], -10000.).to(self.device)
    init_alphas[:, self.tag_to_ix[START_TAG]] = 0.
    forward_var_list = []
    forward_var_list.append(init_alphas)
    for feat_index in range(feats.shape[1]):
        gamar_r_l = torch.stack([forward_var_list[feat_index]] * feats.shape[2]).transpose(0, 1)
        t_r1_k = torch.unsqueeze(feats[:, feat_index, :], 1).transpose(1, 2)
        aa = gamar_r_l + t_r1_k + torch.unsqueeze(self.transitions, 0)
        forward_var_list.append(torch.logsumexp(aa, dim=2))
    # forward_var_list[-1]: batch_size x tag_num
    terminal_var = forward_var_list[-1] + self.transitions[self.tag_to_ix[STOP_TAG]].repeat([feats.shape[0], 1])
    alpha = torch.logsumexp(terminal_var, dim=1)
    return alpha
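Spelled out in my own notation (not taken from the code), this is the standard CRF forward algorithm in log space, where $T$ is self.transitions indexed as $T_{\text{next},\,\text{prev}}$ and $E_t(j)$ is the emission score feats[:, t, j]:

\alpha_0(j) = \begin{cases} 0, & j = \mathrm{START} \\ -\infty, & \text{otherwise} \end{cases}
\qquad
\alpha_t(j) = E_t(j) + \log\sum_i \exp\bigl(\alpha_{t-1}(i) + T_{j,i}\bigr)
\qquad
\log Z = \log\sum_j \exp\bigl(\alpha_T(j) + T_{\mathrm{STOP},\,j}\bigr)

The returned alpha is $\log Z$, one value per batch element; the broadcasted three-dimensional sum simply carries out this recursion for every batch element at once.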
_score_sentence_parallel
def _score_sentence_parallel(self, feats, tags):
    # Gives the score of provided tag sequences
    score = torch.zeros(tags.shape[0]).to(self.device)
    tags = torch.cat([torch.full([tags.shape[0], 1], self.tag_to_ix[START_TAG], dtype=torch.long).to(self.device), tags], dim=1)
    for i in range(feats.shape[1]):
        feat = feats[:, i, :]
        score = score + \
            self.transitions[tags[:, i + 1], tags[:, i]] + feat[range(feat.shape[0]), tags[:, i + 1]]
    score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[:, -1]]
    return score
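The training loss used below, neg_log_likelihood_parallel, is not reproduced in this excerpt, but with the two functions above the usual CRF negative log-likelihood it presumably computes is simply, per batch element:

\mathrm{NLL}(x, y) = \log Z(x) - s(x, y), \qquad
s(x, y) = \sum_{t=1}^{T}\bigl(T_{y_t,\,y_{t-1}} + E_t(y_t)\bigr) + T_{\mathrm{STOP},\,y_T}, \quad y_0 = \mathrm{START}

where $\log Z(x)$ comes from _forward_alg_new_parallel and $s(x, y)$ is the gold-path score from _score_sentence_parallel.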
loadData
def loadData(textPath, tagPath):
    sents = []
    with open(textPath, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            sents.append(line.split())
    tags = []
    with open(tagPath, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            tags.append(line.split())
    dataset = list(zip(sents, tags))
    return sents, tags, dataset
get_word_to_ix
def get_word_to_ix(data):
    word_to_ix = {}
    word_to_ix[PAD_TAG] = 0
    for sentence, tags in tqdm(data):
        for word in sentence:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
    return word_to_ix
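tag_to_ix and ix_to_tag, which the training script below relies on, are not defined in this excerpt. A sketch of how they could be built in the same spirit, assuming the special-tag constants (START_TAG, STOP_TAG, PAD_TAG) are the same ones the model uses:

# Sketch only (not from the original code): build the tag dictionaries from the
# training tags; START_TAG / STOP_TAG / PAD_TAG are assumed to match the model's constants.
tag_to_ix = {PAD_TAG: 0, START_TAG: 1, STOP_TAG: 2}
for tag_seq in train_tags:
    for t in tag_seq:
        if t not in tag_to_ix:
            tag_to_ix[t] = len(tag_to_ix)
ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}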
prepare_sequence
def prepare_sequence(seq, to_ix):
    idxs = []
    for w in seq:
        if w not in to_ix:
            w = PAD_TAG
        idxs.append(to_ix[w])
    return torch.tensor(idxs, dtype=torch.long).to(device)
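Note that characters never seen in training fall back to the PAD_TAG index rather than a dedicated UNK token. A quick illustration (the sentence is hypothetical):

# Hypothetical example: any character missing from word_to_ix is mapped to
# word_to_ix[PAD_TAG], i.e. index 0.
ids = prepare_sequence(['我', '爱', '北', '京'], word_to_ix)
print(ids)  # out-of-vocabulary characters show up as 0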
prepare_sequence_batch
def prepare_sequence_batch(data, word_to_ix, tag_to_ix, max_len=100):
    print("==============================【Data Processing】================================")
    seqs = [i[0] for i in data]
    tags = [i[1] for i in data]
    # max_len = max([len(seq) for seq in seqs])
    seqs_pad = []
    tags_pad = []
    for seq, tag in zip(seqs, tags):
        if len(seq) > max_len:
            seq_pad = seq[: max_len]
            tag_pad = tag[: max_len]
        else:
            seq_pad = seq + [PAD_TAG] * (max_len - len(seq))
            tag_pad = tag + [PAD_TAG] * (max_len - len(tag))
        seqs_pad.append(seq_pad)
        tags_pad.append(tag_pad)
    idxs_pad = torch.tensor([[word_to_ix[w] for w in seq] for seq in tqdm(seqs_pad)], dtype=torch.long)
    tags_pad = torch.tensor([[tag_to_ix[t] for t in tag] for tag in tqdm(tags_pad)], dtype=torch.long)
    return idxs_pad, tags_pad
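Every training sentence is therefore truncated or padded to a fixed length, and the padded positions are tagged with PAD_TAG as well, so PAD_TAG must appear in both word_to_ix and tag_to_ix. A usage sketch mirroring the call in the script further down:

# With MAX_LEN = 150 and N training sentences, both tensors come out as N x 150.
idxs_pad, tags_pad = prepare_sequence_batch(train_data, word_to_ix, tag_to_ix, max_len=150)
print(idxs_pad.shape, tags_pad.shape)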
to_tag
def to_tag(tag_list, ix_to_tag):
    temp = []
    for tag in tag_list:
        temp.append(ix_to_tag[tag])
    return temp
predict
def predict(model, sentence_set, word_to_ix, ix_to_tag, pre_tag_path=None):
    if pre_tag_path is None:
        pre_tags = []
        for sentence in tqdm(sentence_set):
            precheck_sent = prepare_sequence(sentence, word_to_ix)
            score, tags = model(precheck_sent)
            pre_tags.extend(to_tag(tags, ix_to_tag))
        return pre_tags
    else:
        with open(pre_tag_path, "w") as f:
            for sentence in tqdm(sentence_set):
                precheck_sent = prepare_sequence(sentence, word_to_ix)
                score, tags = model(precheck_sent)
                f.write(' '.join(to_tag(tags, ix_to_tag)))
                f.write('\n')
evaluate
def evaluate(model, dev_sents, dev_tags, word_to_ix, ix_to_tag):
    pre_tags = predict(model, dev_sents, word_to_ix, ix_to_tag)
    tags = []
    for dev_tag in dev_tags:
        tags.extend(dev_tag)
    f1 = f1_score(tags, pre_tags)
    report = classification_report(tags, pre_tags)
    return f1, report
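The imports for f1_score and classification_report are not part of this excerpt. The calls match the entity-level metrics exposed by seqeval, so the following is a reasonable (but unconfirmed) guess at where they come from:

# Assumed import, not shown in the original excerpt; seqeval provides
# entity-level f1_score and classification_report under these exact names.
from seqeval.metrics import f1_score, classification_report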
# SummaryWriter for logging how the loss and F1 score change
writer = SummaryWriter('./result')
# Embedding dimension
EMBEDDING_DIM = 300
# Hidden-layer dimension
HIDDEN_DIM = 400
# batch size
BATCH_SIZE = 256
# Number of training epochs
NUM_EPOCHS = 30
# Maximum input length
MAX_LEN = 150
# Use the GPU if one is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Create a logger to record training information
logger = logging.getLogger('logger')
logger.setLevel(logging.DEBUG)
rotating_handler = logging.handlers.RotatingFileHandler(
    'training_log.log', encoding='UTF-8')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
rotating_handler.setFormatter(formatter)
logger.addHandler(rotating_handler)
# Load the training and evaluation data
train_sents, train_tags, train_data = loadData("./data/train.txt", "./data/train_TAG.txt")
dev_sents, dev_tags, dev_data = loadData("./data/dev.txt", "./data/dev_TAG.txt")
# Build the word_to_ix dictionary
word_to_ix = get_word_to_ix(train_data)
# Initialize the model and optimizer
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
# Move the model to the GPU if one is available
model.to(device)
# Check predictions before training
with torch.no_grad():
    print(predict(model, [train_sents[0]], word_to_ix, ix_to_tag))
# Preprocess the training data
sentence_in_pad, targets_pad = prepare_sequence_batch(train_data, word_to_ix, tag_to_ix, max_len=MAX_LEN)
# Create the DataLoader
batch_dataset = Data.TensorDataset(sentence_in_pad, targets_pad)
batch_loader = Data.DataLoader(
    dataset=batch_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=8
)
# Start training
logger.info("=================================【Begin Training】=================================")
for epoch in trange(NUM_EPOCHS):
    logger.info("=================================【Begin Epoch_{}】=================================".format(str(epoch)))
    epoch_iterator = tqdm(batch_loader, desc="Iteration")
    for step, batch in enumerate(epoch_iterator):
        model.zero_grad()
        sentence_in, targets = batch
        loss = model.neg_log_likelihood_parallel(sentence_in.to(device), targets.to(device))
        loss.backward()
        optimizer.step()
        print("\n" + time.strftime("%a %b %d %H:%M:%S %Y", time.localtime()) + \
              ", epoch: " + str(epoch) + ", loss: " + str(float(loss)))
        # logger.info("epoch: " + str(epoch) + ", step: " + str(step) + \
        #             ", loss: " + str(float(loss)))
    # Save the model after each epoch as a checkpoint
    torch.save(model, './checkpoints/checkpoint_{}.pkl'.format(epoch))
    logger.info("Checkpoint has been saved as checkpoint_{}.pkl in ./checkpoints".format(epoch))
    # Record the loss value
    writer.add_scalar('loss', loss, epoch)
    logger.info("epoch: " + str(epoch) + ", loss: " + str(float(loss)))
    logger.info("=================================【Evaluating】=================================")
    # Evaluate the model on the dev set
    f1, report = evaluate(model, dev_sents, dev_tags, word_to_ix, ix_to_tag)
    # Record the F1 score
    writer.add_scalar('f1', f1, epoch)
    print("f1_score: " + str(f1))
    logger.info("f1_score: " + str(f1))
    logger.info("Report: \n" + report)
# Training finished
with torch.no_grad():
    print(predict(model, [train_sents[0]], word_to_ix, ix_to_tag))
logger.info("=================================【Completed】=================================")
Training parameters
EMBEDDING_DIM = 300
HIDDEN_DIM = 400
BATCH_SIZE = 256
NUM_EPOCHS = 30
MAX_LEN = 150
lr=0.01
weight_decay=1e-4
Loss curve
F1 curve
Evaluation report for the final epoch
https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html
https://zhuanlan.zhihu.com/p/61227299
https://www.jiqizhixin.com/articles/2018-10-24-13
https://www.jianshu.com/p/566c6faace64
https://blog.csdn.net/leadai/article/details/80731463