Original paper: https://arxiv.org/abs/1603.01354
The word-level input is a sequence of token ids, input_word = tf.placeholder(tf.int32, [None, seqlen]): simply the sentence after word segmentation, e.g. "我 在 吃饭".
The char-level input is input_char = tf.placeholder(tf.int32, [None, seqlen, maxchar_perword]), where the last axis covers the characters that make up each word. English words often run to seven or eight characters; after Chinese word segmentation most words have at most 4 characters, with the occasional 5. For the sentence above, the corresponding input looks roughly like [[我], [在], [吃, 饭]], except that for training every batch should be padded to the same length, as in the sketch below.
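As a concrete illustration, here is a minimal sketch of how such padded inputs might be built. The vocabularies word2id/char2id and the padding sizes are hypothetical, not part of the original post:

import numpy as np

# Hypothetical vocabularies; 0 is reserved as the padding id.
word2id = {"<pad>": 0, "我": 1, "在": 2, "吃饭": 3}
char2id = {"<pad>": 0, "我": 1, "在": 2, "吃": 3, "饭": 4}

seqlen, maxchar_perword = 5, 4  # every batch is padded to these sizes
words = ["我", "在", "吃饭"]

# word-level input: shape [seqlen]
input_word = np.zeros(seqlen, dtype=np.int32)
for i, w in enumerate(words):
    input_word[i] = word2id[w]

# char-level input: shape [seqlen, maxchar_perword]
input_char = np.zeros((seqlen, maxchar_perword), dtype=np.int32)
for i, w in enumerate(words):
    for j, c in enumerate(w):
        input_char[i, j] = char2id[c]

print(input_word)  # [1 2 3 0 0]
print(input_char)  # rows: [1 0 0 0], [2 0 0 0], [3 4 0 0], then all-zero padding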
Both the word-level and char-level inputs then go through an embedding layer. After embedding, input_char becomes a 4-D tensor, which passes through a 2-D convolution, a ReLU, and a max-pooling layer; the result is concatenated with the embedded word-level tensor, fed into the LSTM, and finally into the CRF layer. That is essentially the whole idea, and it is fairly straightforward to implement in TensorFlow. A minimal sketch of the char branch follows.
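Here is one way that char branch might look in TF 1.x style, matching the code below. All sizes (char_vocab_size, char_embedd_dim, num_filters, window) are assumptions for illustration, not the paper's exact hyperparameters:

import tensorflow as tf

# Assumed sizes, for illustration only.
char_vocab_size, char_embedd_dim = 1000, 30
num_filters, window = 30, 3
seq_len, maxchar_perword = 80, 4

# char ids, padded as described above: [batch, seq_len, maxchar_perword]
input_char = tf.placeholder(tf.int32, [None, seq_len, maxchar_perword])

# embedding turns the input into a 4-D tensor:
# [batch, seq_len, maxchar_perword, char_embedd_dim]
w_char = tf.Variable(tf.random_uniform([char_vocab_size, char_embedd_dim], -1, 1))
embedded_chars = tf.nn.embedding_lookup(w_char, input_char)

# fold the word axis into the batch axis so one 2-D convolution
# runs over the characters of every word
conv_in = tf.reshape(embedded_chars, [-1, maxchar_perword, char_embedd_dim, 1])
conv = tf.layers.conv2d(conv_in, filters=num_filters,
                        kernel_size=[window, char_embedd_dim],
                        padding="valid", activation=tf.nn.relu)
# max-pool over the character positions: one vector per word
pooled = tf.reduce_max(conv, axis=[1, 2])                   # [batch*seq_len, num_filters]
char_repr = tf.reshape(pooled, [-1, seq_len, num_filters])  # [batch, seq_len, num_filters]

# char_repr is then concatenated with the embedded word-level tensor
# along the last axis and fed into the BiLSTM + CRF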
There is also a word-only variant: the input input_word = tf.placeholder(tf.int32, [None, seqlen]) goes through a BiLSTM and a CNN layer separately, the two outputs are concatenated, and the result is fed into a CRF layer. I wrote a quick implementation of this variant; here is the code:
import tensorflow as tf
import numpy as np
from tensorflow.contrib import rnn
class BiLstmCnnCRF(object):
    def __init__(
            self, input_x, input_y, batch_size, num_tags, word_vocab_size,
            word_embedd_dim, grad_clip, dropout, regularization, seq_len,
            n_hidden_LSTM=200):
        self.word_vocab_size = word_vocab_size
        self.word_embedd_dim = word_embedd_dim
        self.input_x = input_x
        self.input_y = input_y
        self.batch_size = batch_size
        self.regularization = regularization
        self.dropout_keep_prob = dropout
        self.seq_len = seq_len
        # every batch is assumed to be padded to seq_len, so the lengths
        # handed to the CRF are all the same constant
        self.max_sequence_in_batch = tf.constant(value=self.seq_len, dtype=tf.int32)
        self.sequence_lengths = tf.convert_to_tensor(
            self.batch_size * [self.max_sequence_in_batch], dtype=tf.int32)
with tf.name_scope("word_embedding"):
self.w_word = tf.Variable(tf.random_uniform([self.word_vocab_size, self.word_embedd_dim], -1, 1), trainable=True,
name="w_word")
self.embedded_words = tf.nn.embedding_lookup(self.w_word, self.input_x, name="embedded_words")
with tf.name_scope("cnn"):
#batchsize*80*200*1
cnn_input = tf.reshape(self.embedded_words,[-1,self.seq_len, self.word_embedd_dim,1])
cnn_filter = tf.get_variable(name='filter',
shape=[1, 1, 2, 30],
initializer=tf.random_uniform_initializer(-0.01, 0.01),
dtype=tf.float32)
cnn_bias = tf.get_variable(name='cnn_bias',
shape=[30],
initializer=tf.random_uniform_initializer(-0.01, 0.01),
dtype=tf.float32)
# batchsize*80*100*30
cnn_network = tf.add(tf.nn.conv2d(cnn_input ,
cnn_filter,
strides=[1, 1, 2, 1],
padding="VALID",
name="conv"),
cnn_bias);
relu_applied = tf.nn.relu(cnn_network)
max_pool = tf.nn.max_pool(relu_applied,
ksize=[1, 1, 100, 1],
strides=[1, 1, 1, 1],
padding='VALID')
self.cnn_output = tf.reshape(max_pool, [-1, self.seq_len, 30])
with tf.name_scope("biLSTM"):
# forward LSTM cell
lstm_fw_cell = rnn.BasicLSTMCell(n_hidden_LSTM, state_is_tuple=True)
lstm_bw_cell = rnn.BasicLSTMCell(n_hidden_LSTM, state_is_tuple=True)
(output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell,
lstm_bw_cell, self.embedded_words,
dtype=tf.float32) # output : [batch_size, timesteps, cell_fw.output_size]
self.biLstm = tf.concat([output_fw, output_bw], axis=-1, name="biLstm")
self.biLstm_clip = tf.clip_by_value(self.biLstm, -grad_clip, grad_clip)
self.biLstm_dropout = tf.nn.dropout(self.biLstm_clip, self.dropout_keep_prob)
with tf.name_scope("concat"):
self.outpu_concat=tf.concat([self.cnn_output,self.biLstm_dropout],axis=-1)
with tf.name_scope("output"):
W_out = tf.get_variable("W_out", shape=[2 * n_hidden_LSTM+30, num_tags],
initializer=tf.contrib.layers.xavier_initializer())
b_out = tf.Variable(tf.constant(0.0, shape=[num_tags]), name="b_out")
self.biLstm_reshaped = tf.reshape(self.outpu_concat, [-1,
2 * n_hidden_LSTM+30]) # [batch_size * timesteps , 2*n_hidden_LSTM] obtained by statement print(self.biLstm.get_shape())
self.predictions = tf.nn.xw_plus_b(self.biLstm_reshaped, W_out, b_out,
name="predictions") # input : [batch_size * timesteps , 2*n_hidden_LSTM] * [2*n_hidden_LSTM, num_classes] = [batch_size * timesteps , num_classes]
self.logits = tf.reshape(self.predictions, [self.batch_size, -1, num_tags],
name="logits") # output [batch_size, max_seq_len]
# self.logits_soft=tf.nn.softmax(logits=self.logits,name="logits_soft")
#
# self.pred=tf.reshape(self.logits_soft,[self.batch_size,-1],name="pred")
labels_softmax_argmax = tf.argmax(self.logits, axis=-1)
self.pred = tf.cast(labels_softmax_argmax, tf.int32,name="pred")
with tf.name_scope("l2loss"):
self.tv = tf.trainable_variables()
self.regularization_cost = self.regularization * tf.reduce_sum([tf.nn.l2_loss(v) for v in self.tv])
with tf.name_scope("loss"):
log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
self.logits, self.input_y,self.sequence_lengths)
#+self.regularization_cost +self.regularization_cost
self.loss = tf.reduce_mean(-log_likelihood, name="loss")+self.regularization_cost
self.train_op = tf.train.AdamOptimizer().minimize(self.loss)
with tf.name_scope("crf_pred"):
self.viterbi_sequence, viterbi_score=tf.contrib.crf.crf_decode(self.logits, self.transition_params, self.sequence_lengths)
This is just a quick sketch of one way to implement it.
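For reference, here is how the class might be driven end to end. The toy sizes, placeholder names, and random ids are assumptions purely for illustration, not values from the post:

import numpy as np
import tensorflow as tf

batch_size, seq_len, num_tags = 32, 80, 10
input_x = tf.placeholder(tf.int32, [None, seq_len], name="input_x")
input_y = tf.placeholder(tf.int32, [None, seq_len], name="input_y")

model = BiLstmCnnCRF(input_x, input_y, batch_size=batch_size, num_tags=num_tags,
                     word_vocab_size=5000, word_embedd_dim=200, grad_clip=5.0,
                     dropout=0.5, regularization=1e-4, seq_len=seq_len)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # one toy training step on random ids/tags
    x = np.random.randint(0, 5000, size=(batch_size, seq_len))
    y = np.random.randint(0, num_tags, size=(batch_size, seq_len))
    _, loss, tags = sess.run([model.train_op, model.loss, model.viterbi_sequence],
                             feed_dict={input_x: x, input_y: y})
    print(loss, tags.shape)  # tags: [batch_size, seq_len], decoded by the CRF

At evaluation time you would read model.viterbi_sequence (the CRF decode) rather than the greedy model.pred.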