BiLSTM+CRF(三)命名实体识别 实践与总结

本博文是对上一篇博客(https://blog.csdn.net/jmh1996/article/details/84779680 BiLSTM+CRF(二)命名实体识别 )的完善。

数据处理功能模块

语料库数据格式:
训练集:
source_data.txt :文本
每一行为一个句子,句子之间用“\n”隔开,句子内部的字(词)之间用空格分开。

精 品 、 专 题 、 系 列 、 稀 见 程 度 才 是 质 量 的 核 心 。
藏 书 的 数 量 多 少 不 能 反 映 收 藏 的 质 量 , 更 不 是 工 薪 层 的 承 受 范 围 。
书 籍 浩 如 烟 海 , 靠 个 人 的 精 力 与 财 力 不 可 能 广 而 博 之 。

source_label.txt :命名实体 标注文本
第 i 行是 source_data.txt 第 i 行对应的标注结果,
标签与标签之间用空格分开。

O O O O O O O O O O O O O O O O
O O O O O O O O O O O B-LOC I-LOC I-LOC I-LOC I-LOC O O O O O O O O O O
O O O O O O O O O O O O O O O O O O B-ORG I-ORG I-ORG O O O O O O O O O O O O O O
O O O O B-PER I-PER I-PER O O O B-PER I-PER O O O O B-ORG I-ORG I-ORG I-ORG O O

测试集:
test_data.txt :文本
test_label.txt :标注答案
格式同训练集。
(github上有数据集):https://github.com/jmhIcoding/bilstm-crf/tree/rewrite

__author__ = 'jmh081701'
import  json

import  copy
import  numpy as np
import  random

class DATAPROCESS:
    """Load an NER corpus and serve fixed-length batches of word ids and tag ids.

    On construction the embedding matrix and the vocabulary are loaded, the
    training corpus is read and randomly split into a training and a
    validation set, and the test corpus is read.  Batches are returned as
    plain Python lists of ids, every sentence being padded or truncated to
    ``sentence_length`` entries.
    """

    # Default tag -> id mapping.  It is copied for every instance so that two
    # instances never share (and accidentally mutate) the same dict.
    DEFAULT_STATE = {'O':0,'B-LOC':1,'I-LOC':2,'B-PER':3,'I-PER':4,'B-ORG':5,'I-ORG':6}

    def __init__(self,train_data_path,train_label_path,test_data_path,test_label_path,word_embedings_path,vocb_path,seperate_rate=0.1,batch_size=100,
                 state=None):
        """Read every input file and prepare the batch indices.

        :param train_data_path: training sentences, one per line, tokens separated by spaces
        :param train_label_path: training tags, line i labels line i of the data file
        :param test_data_path: test sentences, same format as the training data
        :param test_label_path: test tags, same format as the training labels
        :param word_embedings_path: ``.npy`` file holding the embedding matrix
        :param vocb_path: JSON file mapping id -> word
        :param seperate_rate: probability for a training sentence to go to the validation set
        :param batch_size: number of sentences per training / validation batch
        :param state: tag -> id mapping; ``None`` selects a copy of ``DEFAULT_STATE``
        :raises ValueError: if a data file and its label file differ in line count
        """
        self.train_data_path =train_data_path
        self.train_label_path =train_label_path
        self.test_data_path = test_data_path
        self.test_label_path = test_label_path
        self.word_embedding_path = word_embedings_path
        self.vocb_path  = vocb_path
        self.state = dict(self.DEFAULT_STATE) if state is None else state
        self.seperate_rate =seperate_rate
        self.batch_size = batch_size
        # Every sentence handed to the network is padded / cut to this length.
        self.sentence_length = 100

        # Data structures filled by the loaders below.
        self.train_data_raw=[]
        self.train_label_raw =[]
        self.valid_data_raw=[]
        self.valid_label_raw = []

        self.test_data_raw =[]
        self.test_label_raw =[]

        self.word_embeddings=None
        self.id2word=None
        self.word2id=None
        self.embedding_length =0

        self.__load_wordebedding()
        self.__load_train_data()
        self.__load_test_data()

        self.last_batch=0

    def __load_wordebedding(self):
        """Load the embedding matrix and build the word -> id lookup table."""
        self.word_embeddings=np.load(self.word_embedding_path)
        # The last axis of the matrix is the length of one word vector.
        self.embedding_length = np.shape(self.word_embeddings)[-1]
        with open(self.vocb_path,encoding="utf8") as fp:
            self.id2word = json.load(fp)
        self.word2id={}
        for word_id, word in self.id2word.items():
            # setdefault keeps the first id seen when a word occurs twice.
            self.word2id.setdefault(word, word_id)

    def _read_corpus(self, data_path, label_path):
        """Read a data / label file pair and return a list of ``(word_ids, tag_ids)``.

        Unknown words map to id 0 and unknown tags map to the id of ``'O'``.
        When a sentence and its tag line differ in length, both are cut to the
        shorter one.

        :raises ValueError: if the two files do not have the same number of lines
        """
        with open(data_path,encoding='utf8') as fp:
            data_lines = fp.readlines()
        with open(label_path,encoding='utf8') as fp:
            label_lines = fp.readlines()
        if len(data_lines) != len(label_lines):
            raise ValueError("%s has %d lines but %s has %d lines"
                             % (data_path, len(data_lines), label_path, len(label_lines)))

        outside_id = self.state['O']
        samples = []
        for data_raw, label_raw in zip(data_lines, label_lines):
            # The last space-separated field of each line is discarded (it
            # carries the line terminator).
            # NOTE(review): this presumes every line ends with a space before
            # the newline; otherwise the final token is lost - confirm against
            # the corpus files.
            data_tokens = data_raw.split(" ")[:-1]
            label_tokens = label_raw.split(" ")[:-1]
            keep = min(len(data_tokens), len(label_tokens))
            word_ids = [int(self.word2id.get(token, 0)) for token in data_tokens[:keep]]
            tag_ids = [int(self.state.get(token, outside_id)) for token in label_tokens[:keep]]
            samples.append((word_ids, tag_ids))
        return samples

    def __load_train_data(self):
        """Read the training corpus and split it into training and validation sets."""
        for word_ids, tag_ids in self._read_corpus(self.train_data_path, self.train_label_path):
            # Each sentence goes to the validation set with probability seperate_rate.
            if random.uniform(0,1) < self.seperate_rate:
                self.valid_data_raw.append(word_ids)
                self.valid_label_raw.append(tag_ids)
            else:
                self.train_data_raw.append(word_ids)
                self.train_label_raw.append(tag_ids)
        # Only full batches are served.  Floor division already excludes the
        # trailing partial batch, so no further "- 1" is applied (that would
        # silently drop one complete batch).
        self.train_batches = list(range(len(self.train_data_raw) // self.batch_size))
        self.train_batch_index =0
        self.valid_batches = list(range(len(self.valid_data_raw) // self.batch_size))
        self.valid_batch_index = 0

    def __load_test_data(self):
        """Read the test corpus."""
        for word_ids, tag_ids in self._read_corpus(self.test_data_path, self.test_label_path):
            self.test_data_raw.append(word_ids)
            self.test_label_raw.append(tag_ids)

    def pad_sequence(self,sequence,object_length,pad_value=None):
        '''
        Return a copy of ``sequence`` that is exactly ``object_length`` long.

        :param sequence: the sequence (list) to pad
        :param object_length: the target length
        :param pad_value: value appended to short sequences; when ``None`` the
            sequence is repeated cyclically instead
        :return: a new list of length ``object_length``; the input is not
            modified.  An empty sequence with ``pad_value=None`` has nothing
            to repeat and is filled with 0.
        '''
        sequence =copy.deepcopy(sequence)
        if pad_value is None:
            if len(sequence) == 0:
                # Nothing to repeat; cyclic padding would divide by zero.
                return [0]*object_length
            sequence = sequence*(object_length//len(sequence) + 1)
        else:
            sequence = sequence+[pad_value]*(object_length- len(sequence))
        # Truncate in both modes so over-long inputs are cut as well.
        return sequence[:object_length]

    def _make_batch(self, datas, labels):
        """Pad every sentence / tag list and collect the effective lengths.

        :return: ``(word_ids, tag_ids, lengths)`` where ``lengths[i]`` is the
            number of real (non padded) positions of sentence i, clamped to
            ``sentence_length``.
        """
        output_x=[]
        output_label=[]
        efficient_sequence_length=[]
        for word_ids, tag_ids in zip(datas, labels):
            # Cyclic (copy) padding.
            output_x.append(self.pad_sequence(word_ids,self.sentence_length))
            output_label.append(self.pad_sequence(tag_ids,self.sentence_length))
            efficient_sequence_length.append(min(self.sentence_length,len(tag_ids)))
        return output_x,output_label,efficient_sequence_length

    def _next_batch_slice(self, batches, batch_index, what):
        """Return ``(start, stop, next_batch_index)`` for a round-robin batch list.

        :raises IndexError: if ``batches`` is empty, i.e. the split holds fewer
            than ``batch_size`` sentences
        """
        if len(batches) == 0:
            raise IndexError("no full %s batch available: fewer than batch_size=%d sentences"
                             % (what, self.batch_size))
        start = batches[batch_index]*self.batch_size
        return start, start+self.batch_size, (batch_index+1) % len(batches)

    def next_train_batch(self):
        """Return the next training batch, cycling through the batches.

        All returned values are ids; ``efficient_sequence_length`` holds the
        effective (unpadded) length of every sentence.

        :return: ``(output_x, output_label, efficient_sequence_length)``
        :raises IndexError: if there is no full training batch
        """
        start, stop, self.train_batch_index = self._next_batch_slice(
            self.train_batches, self.train_batch_index, "training")
        return self._make_batch(self.train_data_raw[start:stop], self.train_label_raw[start:stop])

    def test_data(self):
        """Return the whole test set as one padded batch.

        :return: ``(output_x, output_label, efficient_sequence_length)``
        """
        return self._make_batch(self.test_data_raw, self.test_label_raw)

    def next_valid_batch(self):
        """Return the next validation batch, cycling through the batches.

        :return: ``(output_x, output_label, efficient_sequence_length)``
        :raises IndexError: if there is no full validation batch
        """
        start, stop, self.valid_batch_index = self._next_batch_slice(
            self.valid_batches, self.valid_batch_index, "validation")
        return self._make_batch(self.valid_data_raw[start:stop], self.valid_label_raw[start:stop])


state={'O':0,'B-LOC':1,'I-LOC':2,'B-PER':3,'I-PER':4,'B-ORG':5,'I-ORG':6}
def extract_named_entity(labels,lens):
    """Collect the named entities found in the tag ids of one sentence.

    :param labels: tag ids of a single sentence (ids as defined in ``state``)
    :param lens: number of leading positions of ``labels`` to scan
    :return: a set of ``(entity_type, start_index, length)`` tuples, with
        ``entity_type`` one of ``'PER'``, ``'LOC'``, ``'ORG'``

    A ``B-`` tag opens an entity.  The next ``O`` or ``B-`` tag closes it, and
    so does reaching the end of the scanned range, so an entity that runs up
    to the last position is reported too.  An ``I-`` tag extends the open
    entity only when the types match; any other ``I-`` tag, and any id that is
    not in ``state``, is ignored.
    """
    tag_of = {tag_id: tag for tag, tag_id in state.items()}
    rst = set()
    # Every B- tag closes whatever was open, so at most one entity is open.
    open_type = None
    open_start = -1
    open_length = 0
    for index in range(lens):
        tag = tag_of.get(labels[index])
        if tag is None:
            continue
        if tag == 'O' or tag.startswith('B-'):
            if open_type is not None:
                rst.add((open_type, open_start, open_length))
                open_type = None
            if tag != 'O':
                open_type, open_start, open_length = tag[2:], index, 1
        elif tag[2:] == open_type:
            open_length += 1
    # Flush the entity that is still open when the scan ends.
    if open_type is not None:
        rst.add((open_type, open_start, open_length))
    return rst

def evaluate(predict_labels,real_labels,efficient_length):
    """Compute entity-level precision, recall and F1 for a batch of sentences.

    :param predict_labels: predicted tag ids, one sequence per sentence
    :param real_labels: reference tag ids, one sequence per sentence
    :param efficient_length: effective length of every sentence; only that many
        leading positions of each sequence are scored
    :return: ``{'precision': ..., 'recall': ..., 'F1': ...}``

    An entity counts as correct only when its type, start and length all match
    a reference entity.  A sentence whose entities cannot be extracted is
    reported on stdout and left out of the counts.
    """
    predict_cnt=0
    predict_right_cnt=0
    real_cnt=0
    for sentence_index in range(len(predict_labels)):
        predicted = predict_labels[sentence_index]
        real = real_labels[sentence_index]
        length = efficient_length[sentence_index]
        try:
            predict_set=extract_named_entity(predicted,length)
            real_set=extract_named_entity(real,length)
        except Exception as exp:
            # Best effort: skip the sentence, but say which one failed and why.
            print("evaluate: skipped sentence %d: %r" % (sentence_index, exp))
            print(predicted)
            print(real)
            continue
        predict_right_cnt+=len(predict_set.intersection(real_set))
        predict_cnt += len(predict_set)
        real_cnt +=len(real_set)
    # The small constants keep the divisions defined when a count is zero.
    precision = predict_right_cnt/(predict_cnt+0.000000000001)
    recall = predict_right_cnt/(real_cnt+0.000000000001)
    F1 = 2 * precision*recall/(precision+recall+0.00000000001)
    return {'precision':precision,'recall':recall,'F1':F1}

if __name__ == '__main__':
    # Smoke test: load the corpus and score the test labels against
    # themselves through evaluate().
    corpus_options = dict(
        train_data_path="data/source_data.txt",
        train_label_path="data/source_label.txt",
        test_data_path="data/test_data.txt",
        test_label_path="data/test_label.txt",
        word_embedings_path="data/source_data.txt.ebd.npy",
        vocb_path="data/source_data.txt.vab",
        batch_size=90,
        seperate_rate=0.3,
    )
    dataGen = DATAPROCESS(**corpus_options)
    test_batch = dataGen.test_data()
    labels = test_batch[1]
    efficient_sequence_length = test_batch[2]
    self_score = evaluate(labels, labels, efficient_sequence_length)
    print(self_score)


bilstm+crf 网络部分

__author__ = 'jmh081701'
import  tensorflow as tf
from  tensorflow.contrib import  crf
import  random
from  utils import *
import logging
import datetime

logging.basicConfig(level=logging.INFO,format="%(asctime)s  - %(message)s")
logger = logging.getLogger(__name__)


def viterbi_labels(sentence_scores, length, transition_params):
    """Decode the best tag sequence for the first ``length`` steps of one sentence.

    Only the effective steps are decoded, so the scores of padded positions
    cannot influence the chosen path.  Returns an empty list for length 0.
    """
    if length <= 0:
        return []
    tags, _ = crf.viterbi_decode(sentence_scores[:length], transition_params)
    return tags


# Hyper-parameters
batch_size=100
dataGen = DATAPROCESS(train_data_path="data/source_data.txt",
                          train_label_path="data/source_label.txt",
                          test_data_path="data/test_data.txt",
                          test_label_path="data/test_label.txt",
                          word_embedings_path="data/source_data.txt.ebd.npy",
                          vocb_path="data/source_data.txt.vab",
                          batch_size=batch_size
                        )
# Model hyper-parameters
tag_nums =len(dataGen.state)    # number of tags
hidden_nums = 600                # hidden units of each LSTM direction
learning_rate = 0.00075          # learning rate
sentence_len = dataGen.sentence_length # sequence length fed to the network
frame_size = dataGen.embedding_length # length of one word vector
train_embedding_keep_prob = 0.8  # dropout keep probability on the word vectors (training only)
train_lstm_keep_prob = 0.9       # dropout keep probability on the Bi-LSTM output (training only)

# Network variables
word_embeddings =  tf.Variable(initial_value=dataGen.word_embeddings,trainable=True) # fine-tuned during training
# Input placeholders
input_x = tf.placeholder(dtype=tf.int32,shape=[None,None],name='input_word_id') # word ids
input_y = tf.placeholder(dtype=tf.int32,shape=[None,sentence_len],name='input_labels')
sequence_lengths=tf.placeholder(dtype=tf.int32,shape=[None],name='sequence_lengths_vector')
# Keep probabilities default to 1.0 (no dropout); only the training step feeds
# smaller values, so validation and test run without dropout.
embedding_keep_prob = tf.placeholder_with_default(1.0,shape=[],name='embedding_keep_prob')
lstm_keep_prob = tf.placeholder_with_default(1.0,shape=[],name='lstm_keep_prob')

with tf.name_scope('projection'):
    # Projection layer: look up the word vector of every input word id.
    word_id = input_x
    word_vectors = tf.nn.embedding_lookup(word_embeddings,ids=word_id,name='word_vectors')
    word_vectors = tf.nn.dropout(word_vectors,embedding_keep_prob)
with tf.name_scope('bi-lstm'):

    labels = tf.reshape(input_y,shape=[-1,sentence_len],name='labels')
    fw_lstm_cell =tf.nn.rnn_cell.LSTMCell(hidden_nums)
    bw_lstm_cell = tf.nn.rnn_cell.LSTMCell(hidden_nums)
    # Run the LSTM in both directions over the effective length of each sentence.
    output,_state = tf.nn.bidirectional_dynamic_rnn(fw_lstm_cell,bw_lstm_cell,inputs=word_vectors,sequence_length=sequence_lengths,dtype=tf.float32)
    fw_output = output[0]#[batch_size,sentence_len,hidden_nums]
    bw_output =output[1]#[batch_size,sentence_len,hidden_nums]
    contact = tf.concat([fw_output,bw_output],-1,name='bi_lstm_concat')#[batch_size,sentence_len,2*hidden_nums]
    contact = tf.nn.dropout(contact,lstm_keep_prob)
    s=tf.shape(contact)
    contact_reshape=tf.reshape(contact,shape=[-1,2*hidden_nums],name='contact')
    W=tf.get_variable('W',dtype=tf.float32,initializer=tf.contrib.layers.xavier_initializer(),shape=[2*hidden_nums,tag_nums],trainable=True)
    b=tf.get_variable('b',initializer=tf.zeros(shape=[tag_nums]))
    p=tf.matmul(contact_reshape,W)+b
    # Per-step tag scores, [batch_size, steps, tag_nums].
    logit= tf.reshape(p,shape=[-1,s[1],tag_nums],name='omit_matrix')


with tf.name_scope("crf") :
    log_likelihood,transition_matrix=crf.crf_log_likelihood(logit,labels,sequence_lengths=sequence_lengths)
    cost = -tf.reduce_mean(log_likelihood)
with tf.name_scope("train-op"):
    global_step = tf.Variable(0,name='global_step',trainable=False)
    optim = tf.train.AdamOptimizer(learning_rate)
    # Clip every gradient to [-5, 5]; variables without a gradient are skipped
    # because clip_by_value cannot handle None.
    grads_and_vars = [(tf.clip_by_value(g,-5,5),v)
                      for g,v in optim.compute_gradients(cost) if g is not None]
    train_op = optim.apply_gradients(grads_and_vars,global_step)

# Checkpoints: get_checkpoint_state() takes the directory, save() / restore()
# take the file prefix inside that directory.
checkpoint_dir="paras"
checkpoint_prefix=checkpoint_dir+"/bilstm-crf-models"
saver = tf.train.Saver()

display_step = len(dataGen.train_batches)   # validate and save once per epoch
epoch_nums = 40 # number of passes over the training data
max_batch = len(dataGen.train_batches)*epoch_nums
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Make sure the checkpoint directory exists before anything is saved.
    tf.gfile.MakeDirs(checkpoint_dir)
    # Resume from saved parameters when a checkpoint exists.
    cpkt=tf.train.get_checkpoint_state(checkpoint_dir)
    if  cpkt and cpkt.model_checkpoint_path:
        saver.restore(sess,cpkt.model_checkpoint_path)
        logger.info("restore from history models.")
    else:
        logger.warning("retrain a models.")
    # Steps are numbered 1..max_batch so that exactly epoch_nums epochs run.
    for step in range(1,max_batch+1):
        batch_x,batch_y,train_lengths = dataGen.next_train_batch()
        _,loss=sess.run([train_op,cost],{input_x:batch_x,
                                         input_y:batch_y,
                                         sequence_lengths:train_lengths,
                                         embedding_keep_prob:train_embedding_keep_prob,
                                         lstm_keep_prob:train_lstm_keep_prob})
        logger.info({'loss':loss,'step':step})
        if step % display_step ==0:
            valid_x,valid_y,valid_lengths=dataGen.next_valid_batch()
            scores,transition_matrix_out=sess.run([logit,transition_matrix],{input_x:valid_x,input_y:valid_y,sequence_lengths:valid_lengths})
            for sentence_scores,length in zip(scores,valid_lengths):
                print(viterbi_labels(sentence_scores,length,transition_matrix_out))
            logger.info("Save a stage model para.")
            saver.save(sess,checkpoint_prefix)
    saver.save(sess,checkpoint_prefix)
    logger.info("save models well.")
    # Final evaluation on the whole test set.
    data_x,label_y,test_lengths=dataGen.test_data()
    scores,transition_matrix_out=sess.run([logit,transition_matrix],{input_x:data_x,input_y:label_y,sequence_lengths:test_lengths})
    real_labels = label_y
    predict_labels =[viterbi_labels(sentence_scores,length,transition_matrix_out)
                     for sentence_scores,length in zip(scores,test_lengths)]
    print("====================TEST======================")
    print(evaluate(predict_labels,real_labels,test_lengths))
    print("===================END MODEL==================")

经过大约30分钟的训练后,
在测试集上的结果:

{'recall': 0.7971918876755058, 'F1': 0.8195669607006962, 'precision': 0.8432343234323418}

总结

本实验中,最需要注意的是 bidirectional_dynamic_rnn 的 sequence_length 参数(要传入每个句子的有效长度),以及各个超参数的设置。
使用的超参数为:

参数名 参数值
lstm 隐藏层神经元个数 600
学习速率 0.00075
batch_size 100
句子截断长度 100
梯度截断 [-5,5]
epoch数目 40
标签数目 7

40个epoch,跑了30分钟后最终在test数据集的性能为:

precision recall F1
0.84 0.797 0.8195

应该还可以调整一下,获取更好的准确率。

运行环境

Tensorflow:1.3.0
python :3.5

你可能感兴趣的:(自然语言处理)