本博文是对上一篇博客(https://blog.csdn.net/jmh1996/article/details/84779680 BiLSTM+CRF(二)命名实体识别 )的完善。
语料库数据格式:
训练集:
source_data.txt :文本
每一行为一个句子,每个句子用“\n”隔开,句子内部词之间用空格分开。
精 品 、 专 题 、 系 列 、 稀 见 程 度 才 是 质 量 的 核 心 。
藏 书 的 数 量 多 少 不 能 反 映 收 藏 的 质 量 , 更 不 是 工 薪 层 的 承 受 范 围 。
书 籍 浩 如 烟 海 , 靠 个 人 的 精 力 与 财 力 不 可 能 广 而 博 之 。
source_label.txt :命名实体 标注文本
第i行为对应source_data第i行的标注结果
结果与结果之间用空格分开
O O O O O O O O O O O O O O O O
O O O O O O O O O O O B-LOC I-LOC I-LOC I-LOC I-LOC O O O O O O O O O O
O O O O O O O O O O O O O O O O O O B-ORG I-ORG I-ORG O O O O O O O O O O O O O O
O O O O B-PER I-PER I-PER O O O B-PER I-PER O O O O B-ORG I-ORG I-ORG I-ORG O O
测试集:
test_data.txt :文本
test_label.txt :标注答案
格式同训练集。
(github上有数据集):https://github.com/jmhIcoding/bilstm-crf/tree/rewrite
__author__ = 'jmh081701'
import json
import copy
import numpy as np
import random
class DATAPROCESS:
    """Load an NER corpus and serve fixed-length, padded batches of ids.

    The constructor loads the word-embedding matrix and the vocabulary,
    converts every token/tag of the train and test corpora to integer ids,
    randomly splits the training corpus into a training part and a
    validation part, and prepares the list of full batches for both.

    Public attributes read by callers include ``state``, ``sentence_length``,
    ``embedding_length``, ``word_embeddings``, ``train_batches`` and
    ``valid_batches``.
    """

    def __init__(self, train_data_path, train_label_path, test_data_path, test_label_path,
                 word_embedings_path, vocb_path, seperate_rate=0.1, batch_size=100,
                 state=None):
        """Load embeddings, vocabulary, training data and test data.

        :param train_data_path: text file with one space-separated sentence per line
        :param train_label_path: tag file whose i-th line labels the i-th sentence
        :param test_data_path: test sentences, same format as the training text
        :param test_label_path: test tags, same format as the training tags
        :param word_embedings_path: file readable by ``numpy.load``
        :param vocb_path: JSON file mapping id -> word
        :param seperate_rate: probability that a training sentence goes to the validation set
        :param batch_size: number of sentences per batch
        :param state: mapping tag name -> tag id; ``None`` selects the default 7-tag scheme
        """
        if state is None:
            # Built per call so that instances never share (and mutate) one default dict.
            state = {'O': 0, 'B-LOC': 1, 'I-LOC': 2, 'B-PER': 3, 'I-PER': 4, 'B-ORG': 5, 'I-ORG': 6}
        self.train_data_path = train_data_path
        self.train_label_path = train_label_path
        self.test_data_path = test_data_path
        self.test_label_path = test_label_path
        self.word_embedding_path = word_embedings_path
        self.vocb_path = vocb_path
        self.state = state
        self.seperate_rate = seperate_rate
        self.batch_size = batch_size
        # Every sentence is padded or truncated to this many positions.
        self.sentence_length = 100

        # Data structures filled in by the loaders below.
        self.train_data_raw = []
        self.train_label_raw = []
        self.valid_data_raw = []
        self.valid_label_raw = []
        self.test_data_raw = []
        self.test_label_raw = []
        self.word_embeddings = None
        self.id2word = None
        self.word2id = None
        self.embedding_length = 0

        self.__load_wordebedding()
        self.__load_train_data()
        self.__load_test_data()
        self.last_batch = 0

    def __load_wordebedding(self):
        """Load the embedding matrix and build the word -> id lookup table."""
        self.word_embeddings = np.load(self.word_embedding_path)
        # The last axis of the matrix is the embedding dimension.
        self.embedding_length = np.shape(self.word_embeddings)[-1]
        with open(self.vocb_path, encoding="utf8") as fp:
            self.id2word = json.load(fp)
        self.word2id = {}
        for each in self.id2word:
            # setdefault keeps the first id seen for a word that occurs twice.
            self.word2id.setdefault(self.id2word[each], each)

    def _read_aligned_pairs(self, data_path, label_path):
        """Read a text file and its tag file; return a list of (word_ids, tag_ids).

        Sentences whose token and tag counts differ are cut to the shorter of
        the two. Sentences that end up empty are skipped because they cannot
        be padded or labelled. Unknown words map to id 0 and unknown tags to
        the id of 'O'.
        """
        with open(data_path, encoding='utf8') as fp:
            data_rawlines = fp.readlines()
        with open(label_path, encoding='utf8') as fp:
            label_rawlines = fp.readlines()
        assert len(data_rawlines) == len(label_rawlines), \
            "data file and label file must have the same number of lines"

        pairs = []
        outside_id = self.state['O']
        for data_raw, label_raw in zip(data_rawlines, label_rawlines):
            # NOTE(review): [:-1] drops the last space-separated field, which
            # presumably is the bare "\n" of a line ending in " \n" -- confirm
            # against the corpus; a line without the trailing space loses its
            # last token here.
            data_line = data_raw.split(" ")[:-1]
            label_line = label_raw.split(" ")[:-1]
            # Align both sequences on the shorter one.
            common = min(len(data_line), len(label_line))
            if common == 0:
                continue
            data = [int(self.word2id.get(each, 0)) for each in data_line[:common]]
            label = [int(self.state.get(each, outside_id)) for each in label_line[:common]]
            pairs.append((data, label))
        return pairs

    def __load_train_data(self):
        """Load the training corpus and split it into train/validation parts."""
        for data, label in self._read_aligned_pairs(self.train_data_path, self.train_label_path):
            # Each sentence goes to the validation set with probability seperate_rate.
            if random.random() < self.seperate_rate:
                self.valid_data_raw.append(data)
                self.valid_label_raw.append(label)
            else:
                self.train_data_raw.append(data)
                self.train_label_raw.append(label)
        # Only full batches are served; every index below addresses batch_size sentences.
        self.train_batches = list(range(len(self.train_data_raw) // self.batch_size))
        self.train_batch_index = 0
        self.valid_batches = list(range(len(self.valid_data_raw) // self.batch_size))
        self.valid_batch_index = 0

    def __load_test_data(self):
        """Load the test corpus."""
        for data, label in self._read_aligned_pairs(self.test_data_path, self.test_label_path):
            self.test_data_raw.append(data)
            self.test_label_raw.append(label)

    def pad_sequence(self, sequence, object_length, pad_value=None):
        """Return a copy of ``sequence`` that is exactly ``object_length`` long.

        :param sequence: the sequence to pad
        :param object_length: the target length
        :param pad_value: value appended to short sequences; when ``None`` the
            sequence is repeated cyclically instead (an empty sequence is
            filled with 0 because there is nothing to repeat)
        :return: a new list of length ``object_length``
        """
        sequence = copy.deepcopy(list(sequence))
        if pad_value is None:
            if not sequence:
                return [0] * object_length
            # Repeat often enough to cover the target length, then cut below.
            sequence = sequence * (1 + object_length // len(sequence))
        else:
            sequence = sequence + [pad_value] * (object_length - len(sequence))
        return sequence[:object_length]

    def _pad_batch(self, datas, labels):
        """Pad every sentence of a batch; return (word_ids, tag_ids, real_lengths)."""
        output_x = []
        output_label = []
        efficient_sequence_length = []
        for data, label in zip(datas, labels):
            # Padding repeats the sentence cyclically (pad_value is None).
            output_x.append(self.pad_sequence(data, self.sentence_length))
            output_label.append(self.pad_sequence(label, self.sentence_length))
            # The real length can never exceed the padded/truncated length.
            efficient_sequence_length.append(min(self.sentence_length, len(label)))
        return output_x, output_label, efficient_sequence_length

    def _next_batch_bounds(self, batches, position):
        """Return (start, stop, next_position) for the batch at ``position``."""
        if not batches:
            raise IndexError("no full batch available: fewer sentences than batch_size")
        index = batches[position]
        start = index * self.batch_size
        return start, start + self.batch_size, (position + 1) % len(batches)

    def next_train_batch(self):
        """Return the next training batch, cycling back to the first at the end.

        All returned values are ids; the third element holds the real
        (unpadded) length of every sentence.
        """
        start, stop, self.train_batch_index = self._next_batch_bounds(
            self.train_batches, self.train_batch_index)
        return self._pad_batch(self.train_data_raw[start:stop], self.train_label_raw[start:stop])

    def test_data(self):
        """Return the whole test set as (word_ids, tag_ids, real_lengths)."""
        return self._pad_batch(self.test_data_raw, self.test_label_raw)

    def next_valid_batch(self):
        """Return the next validation batch, cycling back to the first at the end."""
        start, stop, self.valid_batch_index = self._next_batch_bounds(
            self.valid_batches, self.valid_batch_index)
        return self._pad_batch(self.valid_data_raw[start:stop], self.valid_label_raw[start:stop])
# Tag name -> tag id; must agree with the ids used in the label sequences.
state = {'O': 0, 'B-LOC': 1, 'I-LOC': 2, 'B-PER': 3, 'I-PER': 4, 'B-ORG': 5, 'I-ORG': 6}


def extract_named_entity(labels, lens):
    """Collect the named entities encoded in one sentence's tag ids.

    :param labels: sequence of tag ids (values of ``state``) for one sentence
    :param lens: number of leading positions of ``labels`` to inspect
    :return: set of ``(entity_type, start_index, length)`` tuples, where
        ``entity_type`` is the tag name without its ``B-``/``I-`` prefix

    An entity starts at a ``B-X`` tag and grows with every following ``I-X``
    tag of the same type. ``O`` or any ``B-*`` tag closes the open entity.
    An ``I-*`` tag that does not match the open entity, and any id that is
    not in ``state``, is ignored. An entity that is still open at position
    ``lens - 1`` is closed there, so entities ending at the sentence end are
    reported too.
    """
    id2tag = {tag_id: tag for tag, tag_id in state.items()}
    rst = set()
    # At most one entity can be open at a time, because every 'O' and every
    # 'B-*' closes whatever was open before.
    open_type = None
    open_start = -1
    open_length = 0

    for index in range(lens):
        tag = id2tag.get(labels[index])
        if tag is None:
            continue
        if tag == 'O' or tag.startswith('B-'):
            if open_type is not None:
                rst.add((open_type, open_start, open_length))
                open_type = None
            if tag != 'O':
                open_type = tag[2:]
                open_start = index
                open_length = 1
        elif tag.startswith('I-') and tag[2:] == open_type:
            open_length += 1

    # Flush the entity that runs up to the end of the inspected range.
    if open_type is not None:
        rst.add((open_type, open_start, open_length))
    return rst
def evaluate(predict_labels, real_labels, efficient_length):
    """Compute entity-level precision, recall and F1 for a batch of sentences.

    :param predict_labels: predicted tag ids, one sequence per sentence
    :param real_labels: gold tag ids, one sequence per sentence
    :param efficient_length: real (unpadded) length of every sentence
    :return: dict with the keys ``'precision'``, ``'recall'`` and ``'F1'``

    A predicted entity counts as correct only when an entity with the same
    type, start index and length exists in the gold tags. A sentence whose
    entities cannot be extracted is reported on stdout and left out of all
    counts; it does not abort the evaluation.
    """
    sentence_nums = len(predict_labels)  # number of sentences
    predict_cnt = 0
    predict_right_cnt = 0
    real_cnt = 0
    for sentence_index in range(sentence_nums):
        try:
            length = efficient_length[sentence_index]
            predict_set = extract_named_entity(predict_labels[sentence_index], length)
            real_set = extract_named_entity(real_labels[sentence_index], length)
        except Exception as exp:
            # Best effort: say what went wrong and where, then carry on.
            print("skip sentence %d: %r" % (sentence_index, exp))
            print(predict_labels[sentence_index])
            print(real_labels[sentence_index])
            continue
        predict_right_cnt += len(predict_set.intersection(real_set))
        predict_cnt += len(predict_set)
        real_cnt += len(real_set)
    # The small constants keep the divisions defined when a count is zero.
    precision = predict_right_cnt/(predict_cnt+0.000000000001)
    recall = predict_right_cnt/(real_cnt+0.000000000001)
    F1 = 2 * precision*recall/(precision+recall+0.00000000001)
    return {'precision': precision, 'recall': recall, 'F1': F1}
if __name__ == '__main__':
    # Smoke test: load the corpus and score the test labels against
    # themselves through evaluate().
    corpus_files = {
        "train_data_path": "data/source_data.txt",
        "train_label_path": "data/source_label.txt",
        "test_data_path": "data/test_data.txt",
        "test_label_path": "data/test_label.txt",
        "word_embedings_path": "data/source_data.txt.ebd.npy",
        "vocb_path": "data/source_data.txt.vab",
    }
    dataGen = DATAPROCESS(batch_size=90, seperate_rate=0.3, **corpus_files)
    datas, labels, efficient_sequence_length = dataGen.test_data()
    self_score = evaluate(labels, labels, efficient_sequence_length)
    print(self_score)
__author__ = 'jmh081701'
import tensorflow as tf
from tensorflow.contrib import crf
import random
from utils import *
import logging
import datetime
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
logger = logging.getLogger(__name__)

# Hyper-parameters
batch_size = 100
dataGen = DATAPROCESS(train_data_path="data/source_data.txt",
                      train_label_path="data/source_label.txt",
                      test_data_path="data/test_data.txt",
                      test_label_path="data/test_label.txt",
                      word_embedings_path="data/source_data.txt.ebd.npy",
                      vocb_path="data/source_data.txt.vab",
                      batch_size=batch_size
                      )

# Model hyper-parameters
tag_nums = len(dataGen.state)            # number of tags
hidden_nums = 600                        # hidden units of each LSTM direction
learning_rate = 0.00075                  # learning rate
sentence_len = dataGen.sentence_length   # sequence length fed to the network
frame_size = dataGen.embedding_length    # length of each word vector
train_embedding_keep_prob = 0.8          # dropout keep probability on the word vectors
train_lstm_keep_prob = 0.9               # dropout keep probability on the Bi-LSTM output

# Network variables
word_embeddings = tf.Variable(initial_value=dataGen.word_embeddings, trainable=True)  # fine-tuned during training

# Input placeholders
input_x = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_word_id')  # ids of the input words
input_y = tf.placeholder(dtype=tf.int32, shape=[None, sentence_len], name='input_labels')
sequence_lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='sequence_lengths_vector')
# Keep probabilities default to 1.0, so dropout is active only when a training
# step feeds the training values; validation and test run without dropout.
embedding_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='embedding_keep_prob')
lstm_keep_prob = tf.placeholder_with_default(1.0, shape=[], name='lstm_keep_prob')

with tf.name_scope('projection'):
    # Projection layer: look up the word vector of every input word id.
    word_id = input_x
    word_vectors = tf.nn.embedding_lookup(word_embeddings, ids=word_id, name='word_vectors')
    word_vectors = tf.nn.dropout(word_vectors, embedding_keep_prob)

with tf.name_scope('bi-lstm'):
    labels = tf.reshape(input_y, shape=[-1, sentence_len], name='labels')
    fw_lstm_cell = tf.nn.rnn_cell.LSTMCell(hidden_nums)
    bw_lstm_cell = tf.nn.rnn_cell.LSTMCell(hidden_nums)
    # Run both directions; sequence_length carries the real sentence lengths.
    output, _state = tf.nn.bidirectional_dynamic_rnn(fw_lstm_cell, bw_lstm_cell, inputs=word_vectors,
                                                     sequence_length=sequence_lengths, dtype=tf.float32)
    fw_output = output[0]  # [batch_size, sentence_len, hidden_nums]
    bw_output = output[1]  # [batch_size, sentence_len, hidden_nums]
    contact = tf.concat([fw_output, bw_output], -1, name='bi_lstm_concat')  # [batch_size, sentence_len, 2*hidden_nums]
    contact = tf.nn.dropout(contact, lstm_keep_prob)
    s = tf.shape(contact)
    contact_reshape = tf.reshape(contact, shape=[-1, 2*hidden_nums], name='contact')
    W = tf.get_variable('W', dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer(),
                        shape=[2*hidden_nums, tag_nums], trainable=True)
    b = tf.get_variable('b', initializer=tf.zeros(shape=[tag_nums]))
    p = tf.matmul(contact_reshape, W) + b
    # Per-position tag scores, reshaped back to [batch, time, tag_nums].
    logit = tf.reshape(p, shape=[-1, s[1], tag_nums], name='omit_matrix')

with tf.name_scope("crf"):
    log_likelihood, transition_matrix = crf.crf_log_likelihood(logit, labels, sequence_lengths=sequence_lengths)
    cost = -tf.reduce_mean(log_likelihood)

with tf.name_scope("train-op"):
    global_step = tf.Variable(0, name='global_step', trainable=False)
    optim = tf.train.AdamOptimizer(learning_rate)
    grads_and_vars = optim.compute_gradients(cost)
    # Clip every gradient element to [-5, 5]; variables without a gradient are skipped.
    grads_and_vars = [[tf.clip_by_value(g, -5, 5), v] for g, v in grads_and_vars if g is not None]
    train_op = optim.apply_gradients(grads_and_vars, global_step)

# Restore earlier parameters when a checkpoint exists.
checkpoint_prefix = "paras/bilstm-crf-models"
# get_checkpoint_state expects the directory that holds the "checkpoint" index
# file, not the prefix of the checkpoint files themselves.
checkpoint_dir = checkpoint_prefix.rsplit("/", 1)[0]
tf.gfile.MakeDirs(checkpoint_dir)  # saving needs the directory to exist
saver = tf.train.Saver()
display_step = len(dataGen.train_batches)
epoch_nums = 40  # number of passes over the training batches
max_batch = len(dataGen.train_batches)*epoch_nums
step = 1
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    cpkt = tf.train.get_checkpoint_state(checkpoint_dir)
    if cpkt and cpkt.model_checkpoint_path:
        saver.restore(sess, cpkt.model_checkpoint_path)
        logger.info("restore from history models.")
    else:
        logger.warning("retrain a models.")

    while step < max_batch:
        batch_x, batch_y, efficient_sequence_length = dataGen.next_train_batch()
        _, loss = sess.run([train_op, cost],
                           {input_x: batch_x, input_y: batch_y,
                            sequence_lengths: efficient_sequence_length,
                            embedding_keep_prob: train_embedding_keep_prob,
                            lstm_keep_prob: train_lstm_keep_prob})
        logger.info({'loss': loss, 'step': step})
        if step % display_step == 0:
            # Once per pass over the training batches: decode one validation
            # batch (when there is one) and save a checkpoint.
            if dataGen.valid_batches:
                valid_x, valid_y, efficient_sequence_length = dataGen.next_valid_batch()
                scores, transition_matrix_out = sess.run(
                    [logit, transition_matrix],
                    {input_x: valid_x, input_y: valid_y, sequence_lengths: efficient_sequence_length})
                for i in range(len(scores)):
                    # Decode the real positions only, so padding cannot change the best path.
                    label, _ = crf.viterbi_decode(scores[i][:efficient_sequence_length[i]],
                                                  transition_params=transition_matrix_out)
                    print(label)
            logger.info("Save a stage model para.")
            saver.save(sess, checkpoint_prefix)
        step += 1

    saver.save(sess, checkpoint_prefix)
    logger.info("save models well.")

    # Final evaluation on the test set.
    data_x, label_y, efficient_sequence_length = dataGen.test_data()
    scores, transition_matrix_out = sess.run(
        [logit, transition_matrix],
        {input_x: data_x, input_y: label_y, sequence_lengths: efficient_sequence_length})
    real_labels = label_y
    predict_labels = []
    for i in range(len(scores)):
        decoded, _ = crf.viterbi_decode(scores[i][:efficient_sequence_length[i]], transition_matrix_out)
        predict_labels.append(decoded)
    print("====================TEST======================")
    print(evaluate(predict_labels, real_labels, efficient_sequence_length))
    print("===================END MODEL==================")
在经过大约30分钟的训练后:
训练结果:
{'recall': 0.7971918876755058, 'F1': 0.8195669607006962, 'precision': 0.8432343234323418}
本实验中,最需要注意的是 bidirectional_dynamic_rnn 的 sequence_length 参数(需要传入每个句子的真实长度),以及各个超参数的设置。
使用的超参数为:
参数名 | 参数值 |
---|---|
lstm 隐藏层神经元个数 | 600 |
学习速率 | 0.00075 |
batch_size | 100 |
句子截断长度 | 100 |
梯度截断 | [-5,5] |
epoch数目 | 40 |
标签数目 | 7 |
40个epoch,跑了30分钟后最终在test数据集的性能为:
precision | recall | F1 |
---|---|---|
0.84 | 0.797 | 0.8195 |
超参数应该还可以继续调整,以获得更好的 precision、recall 和 F1。
Tensorflow:1.3.0
python :3.5