Code source: flyai
Deep learning framework: TensorFlow
WeChat Official Account: 深度学习视觉
Full code: reply "命名实体识别demo代码解读" in the Official Account backend
Input and output:
1. Every character in the vocabulary is represented by an embedding, {index: embedding}; TensorFlow looks up each word's embedding vector by its index.
2. input_x: [batch_size, max_sentence_length, embedding]
3. input_y: [batch_size, max_sentence_length]
4. batch_size: the number of sentences per batch.
5. max_sentence_length: the length of the longest sentence in the current batch; shorter sentences are padded to this length, and every position up to this length carries a label.
6. label: ['B-LAW', 'I-LOC', …]
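As a concrete illustration of these shapes, here is a minimal TensorFlow 1.x sketch of the embedding lookup; the word ids, tags, and embedding values are made up purely for illustration and are not the project's real data.

import numpy as np
import tensorflow as tf

# Toy sizes, not the real config values.
vocab_size, embedding_size = 10, 4
embedding_matrix = np.random.rand(vocab_size, embedding_size).astype(np.float32)

# batch_size = 2, max_sentence_length = 3; the second sentence is padded with a padding id.
input_x_ids = np.array([[1, 2, 3],
                        [4, 5, 9]], dtype=np.int32)   # [batch_size, max_sentence_length]
input_y = np.array([[12, 13, 13],
                    [5, 6, 15]], dtype=np.int32)      # [batch_size, max_sentence_length]

ids = tf.placeholder(tf.int32, shape=[None, None])
x = tf.nn.embedding_lookup(embedding_matrix, ids)     # [batch_size, max_sentence_length, embedding_size]

with tf.Session() as sess:
    print(sess.run(x, feed_dict={ids: input_x_ids}).shape)  # (2, 3, 4)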
# -*- coding: utf-8 -*-
import os
import json
import path
# dictionary of all characters {word: index}
src_vocab_file = os.path.join(path.DATA_PATH,'words.dict')
# vector representation of every character {word: [embedding]}
word_embedding_file = os.path.join(path.DATA_PATH,'embedding.json')
# dimension of the character embeddings
embeddings_size = 200
# maximum sentence length; shorter sentences are padded, longer ones are truncated
max_sequence = 100
# dropout keep probability (passed as output_keep_prob); dropout guards against overfitting and is also the cheapest form of ensembling
dropout = 0.6
# learning rate
leanrate = 0.001
# number of characters in the vocabulary
with open(os.path.join(path.DATA_PATH,'words.dict'), 'r') as vocab_file:
    vocab_size = len(json.load(vocab_file))
src_unknown_id = vocab_size
src_padding = vocab_size + 1
# This is just one possible tagging scheme.
# B (begin) marks the first character of an entity and I (inner) the characters inside it,
# so an I-TIME tag can only follow B-TIME or another I-TIME.
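# Example: the three-character LOC entity "北京市" is tagged B-LOC I-LOC I-LOC, characters
# outside any entity are tagged O, and the extra 'padding' tag below is only used to fill
# padded label positions during training.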
label_dic=['B-LAW','B-ROLE','B-TIME','I-LOC','I-LAW','B-PER','I-PER','B-ORG','I-ROLE','I-CRIME','B-CRIME','I-ORG','B-LOC','I-TIME','O','padding']
label_len=len(label_dic)
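The training script below imports load_word2vec_embedding from utils, whose implementation is not part of this excerpt. Here is a minimal sketch of what it might look like, assuming embedding.json maps each character to a 200-dimensional list and words.dict maps each character to its index; the _sketch suffix, the random-initialization range, and the file handling are assumptions, not the actual utils code.

# Hedged sketch only: the real utils.load_word2vec_embedding is not shown in this excerpt.
import os
import json
import numpy as np
import path

def load_word2vec_embedding_sketch(vocab_size, embeddings_size=200):
    # Returns a (vocab_size + 2, embeddings_size) float32 matrix: one row per character,
    # plus one row for unknown characters (src_unknown_id) and one for padding (src_padding).
    with open(os.path.join(path.DATA_PATH, 'embedding.json'), 'r') as f:
        word2vec = json.load(f)      # {word: [embedding]}
    with open(os.path.join(path.DATA_PATH, 'words.dict'), 'r') as f:
        word2index = json.load(f)    # {word: index}
    matrix = np.random.uniform(-0.25, 0.25,
                               (vocab_size + 2, embeddings_size)).astype(np.float32)
    for word, idx in word2index.items():
        if word in word2vec:
            matrix[idx] = np.asarray(word2vec[word], dtype=np.float32)
    return matrix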
# -*- coding: utf-8 -*-
import argparse
from flyai.dataset import Dataset
from tensorflow.contrib.rnn import DropoutWrapper
import tensorflow as tf
from model import Model
from path import MODEL_PATH, LOG_PATH
import config
from utils import load_word2vec_embedding
import numpy as np
# hyperparameters
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--EPOCHS", default=30, type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=128, type=int, help="batch size")
args = parser.parse_args()
# helper class for fetching data
dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH)
# helper class for model operations (saving, etc.)
modelpp = Model(dataset)
# hyperparameters taken from config
unit_num = config.embeddings_size  # by default the embedding size equals the number of hidden units per RNN time step (or CNN column); the model uses unit_num everywhere to avoid confusion.
time_step = config.max_sequence  # the maximum sentence length equals time_step; the model uses time_step everywhere to avoid confusion.
DROPOUT_RATE = config.dropout
LEARN_RATE = config.leanrate
TAGS_NUM = config.label_len
# —————————————————— define the network ——————————————————
class NER_net:
    def __init__(self, embedding, batch_size=args.BATCH):
        '''
        :param embedding: pre-trained word embedding matrix
        :param batch_size: number of sentences per batch
        '''
        self.batch_size = batch_size
        self.embedding = embedding
        # —————————————————— feed in the data ——————————————————
        self.input = tf.placeholder(tf.int32, shape=[None, None], name="input")
        self.label = tf.placeholder(tf.int32, shape=[None, None], name="label")
        self.seq_length = tf.placeholder(tf.int32, shape=[None], name="max_sequence_in_batch")
        self._build_net()
    def _build_net(self):
        # embedding_lookup is just an index: for every word id in the input sentence it
        # pulls out the corresponding embedding vector of length embedding_size.
        # There are batch_size sentences, each with time_step (max_sentence_length) words.
        # x: [batch_size, time_step, embedding_size], float32
        self.x = tf.nn.embedding_lookup(self.embedding, self.input)
        # time_step == max_sentence_length
        # y: [batch_size, time_step]
        self.y = self.label
        cell_forward = tf.contrib.rnn.BasicLSTMCell(unit_num)
        cell_backward = tf.contrib.rnn.BasicLSTMCell(unit_num)
        if DROPOUT_RATE is not None:
            cell_forward = DropoutWrapper(cell_forward, input_keep_prob=1.0, output_keep_prob=DROPOUT_RATE)
            cell_backward = DropoutWrapper(cell_backward, input_keep_prob=1.0, output_keep_prob=DROPOUT_RATE)
        # time_major=False (the default), so the inputs are [batch, time, depth].
        outputs, bi_state = \
            tf.nn.bidirectional_dynamic_rnn(cell_forward, cell_backward, self.x, dtype=tf.float32)
        forward_out, backward_out = outputs
        outputs = tf.concat([forward_out, backward_out], axis=2)
        # projection: map each token's 2 * unit_num BiLSTM features to TAGS_NUM scores
        W = tf.get_variable("projection_w", [2 * unit_num, TAGS_NUM])
        b = tf.get_variable("projection_b", [TAGS_NUM])
        x_reshape = tf.reshape(outputs, [-1, 2 * unit_num])
        projection = tf.add(tf.matmul(x_reshape, W), b, name='projection')
        nsteps = tf.shape(outputs)[1]
        # -1 recovers the batch dimension, nsteps is the time dimension
        self.outputs = tf.reshape(projection, [-1, nsteps, TAGS_NUM], name='output')
        # CRF layer: log-likelihood of the gold tag sequences given the per-token scores
        self.log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
            self.outputs, self.y, self.seq_length)
        self.transition_params = tf.add(self.transition_params, 0, name='transition_params')
        # Add a training op to tune the parameters.
        self.loss = tf.reduce_mean(-self.log_likelihood)
        self.train_op = tf.train.AdamOptimizer(LEARN_RATE).minimize(self.loss)
        tf.summary.scalar('loss', self.loss)
# —————————————————— train the network ——————————————————
# Vectorize every word in the vocabulary; the function reads the embedding file from disk.
embedding = load_word2vec_embedding(config.vocab_size)
net = NER_net(embedding)  # embedding matrix for all words: (vocab_size + 2 = 20000 + 2, embeddings_size = 200)
with tf.Session() as sess:
    merged = tf.summary.merge_all()  # merge the graph, training curves, etc. into one summary op
    writer = tf.summary.FileWriter(LOG_PATH, sess.graph)  # write the training log to the logs folder
    sess.run(tf.global_variables_initializer())
    print(dataset.get_step())
    for i in range(dataset.get_step()):
        x_train, y_train, x_test, y_test = dataset.next_batch(args.BATCH)
        max_sentenc_length = max(map(len, x_train))
        sequence_len = np.asarray([len(x) for x in x_train])
        # padding
        # x_train: [batch_size, max_sentenc_length], int32 word ids
        # positions beyond a sentence's real length are filled with vocab_size + 1 (src_padding);
        # a dedicated padding token would arguably be clearer than a raw number.
        # Labels are padded with TAGS_NUM - 1, i.e. the extra 'padding' tag; the same remark applies.
        x_train = np.asarray([list(x[:]) + (max_sentenc_length - len(x)) * [config.src_padding] for x in x_train])
        y_train = np.asarray([list(y[:]) + (max_sentenc_length - len(y)) * [TAGS_NUM - 1] for y in y_train])
        res, loss_, _ = sess.run([merged, net.loss, net.train_op],
                                 feed_dict={net.input: x_train, net.label: y_train, net.seq_length: sequence_len})
        print('steps:{} loss:{}'.format(i, loss_))
        writer.add_summary(res, i)  # write the summary data to the log file
        if i % 50 == 0:
            modelpp.save_model(sess, MODEL_PATH, overwrite=True)
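At prediction time the per-token scores (net.outputs) and the learned transition matrix (net.transition_params) are combined by Viterbi decoding to get the best tag sequence. Below is a minimal sketch using tf.contrib.crf.viterbi_decode; the score values are random placeholders, and in practice they would come from running net.outputs for one sentence and trimming it to its true length.

# Minimal CRF decoding sketch (TensorFlow 1.x); the numbers are placeholders, not real model output.
import numpy as np
import tensorflow as tf

true_length = 5                                                       # real length of one sentence
scores = np.random.rand(true_length, TAGS_NUM).astype(np.float32)     # slice of net.outputs, [seq_len, TAGS_NUM]
transitions = np.random.rand(TAGS_NUM, TAGS_NUM).astype(np.float32)   # stands in for the learned net.transition_params

# viterbi_decode runs outside the graph on numpy arrays and returns the best tag-id path.
best_path, best_score = tf.contrib.crf.viterbi_decode(scores, transitions)
print([config.label_dic[t] for t in best_path])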