Neural Machine Translation with seq2seq (NLP Practice 2)

This experiment uses the IWSLT TED talks en-zh dataset and follows the book 《TensorFlow实战Google深度学习框架》 (2nd edition). Most of the code is taken directly from the book; I am writing it up as a blog post to better consolidate the material.

First, download the dataset from the WIT3 (IWSLT) site and unpack it:

wget https://wit3.fbk.eu/archive/2015-01//texts/en/zh/en-zh.tgz
tar xzvf en-zh.tgz
cd en-zh/

We only care about the two files train.tags.en-zh.en and train.tags.en-zh.zh, but both still contain lines with HTML-style tags that need to be filtered out. The extracted directory looks like this:

IWSLT15.TED.dev2010.en-zh.en.xml  IWSLT15.TED.tst2011.en-zh.en.xml  IWSLT15.TED.tst2013.en-zh.en.xml  train.tags.en-zh.zh
IWSLT15.TED.dev2010.en-zh.zh.xml  IWSLT15.TED.tst2011.en-zh.zh.xml  IWSLT15.TED.tst2013.en-zh.zh.xml  train.zh
IWSLT15.TED.tst2010.en-zh.en.xml  IWSLT15.TED.tst2012.en-zh.en.xml  README
IWSLT15.TED.tst2010.en-zh.zh.xml  IWSLT15.TED.tst2012.en-zh.zh.xml  train.tags.en-zh.en

Preprocessing the raw data involves tokenization, vocabulary building and conversion to integer IDs, and the data also has to be padded before it can be fed to the model. Here I use stanfordcorenlp as the tokenizer for both Chinese and English; see my earlier blog post on it for background.
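
As a quick aside, here is a minimal sketch of how the stanfordcorenlp wrapper is used on its own (the CoreNLP path is an assumption; point it at wherever you unpacked stanford-corenlp-full-2018-10-05):

#coding:utf-8
from stanfordcorenlp import StanfordCoreNLP

#Starts a local CoreNLP server; pass lang='zh' instead for Chinese segmentation.
nlp = StanfordCoreNLP("../stanford-corenlp-full-2018-10-05",lang='en')
print(nlp.word_tokenize("It is very beautiful!"))  # -> ['It', 'is', 'very', 'beautiful', '!']
nlp.close()  # shut the server down when finished

The full preprocessing script follows: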

#coding:utf-8
import collections
from operator import itemgetter
from stanfordcorenlp import StanfordCoreNLP
import tqdm

#Step 1: get the English and Chinese files into a one-sentence-per-line format (dropping HTML-tagged lines)
def deletehtml(filename1,filename2):
	f1 = open(filename1,'r')
	f2 = open(filename2,'r') 

	data1 = f1.readlines()
	data2 = f2.readlines()
	assert len(data1)==len(data2)  # using codecs here caused an error, not sure why
	fw1 = open(filename1+".deletehtml",'w')
	fw2 = open(filename2+".deletehtml",'w')

	print("deletehtml...")

	for line1,line2 in tqdm.tqdm(zip(data1,data2)):
		line1 = line1.strip()
		line2 = line2.strip()
		if line1 and line2:
			if '<' not in line1 and '>' not in line1 and '<' not in line2 and '>' not in line2:
				fw1.write(line1+"\n")
				fw2.write(line2+"\n")
	fw1.close()
	f1.close()
	fw2.close()
	f2.close()

	return filename1+".deletehtml",filename2+".deletehtml"

#Step 2: tokenize and build the vocabulary
def segement_sentence(filename,vocab_size,lang='en'):
	nlp = StanfordCoreNLP("../stanford-corenlp-full-2018-10-05",lang=lang)
	with open(filename,'r') as f:
		data = f.readlines()
		counter = collections.Counter()
		f1 = open(filename+".segment",'w')
		print("segmenting...")
		for line in tqdm.tqdm(data):
			line = line.strip()
			word_list = nlp.word_tokenize(line.strip())
			sentence = ' '.join(word_list)
			f1.write(sentence+"\n")
			for word in word_list:
				counter[word] += 1
		f1.close()
	nlp.close()

	sorted_word_to_cnt = sorted(counter.items(),key=itemgetter(1),reverse=True)
	sorted_words = ["<unk>","<sos>","<eos>"] + [x[0] for x in sorted_word_to_cnt]  # ids 0/1/2 are reserved for <unk>/<sos>/<eos>

	if len(sorted_words)>vocab_size:
		sorted_words = sorted_words[:vocab_size]
	assert len(sorted_words)<=vocab_size
	with open(filename+".vocab",'w') as fw:
		for word in sorted_words:
			fw.write(word+"\n")
	return filename+".segment"

#Step 3: convert the text into integer IDs
def convert_to_id(filename,vocab_file):
	with open(vocab_file,"r") as f:
		data = f.readlines()
		vocab = [w.strip() for w in data]
	word_to_id = {k:v for (k,v) in zip(vocab,range(len(vocab)))}

	with open(filename,"r") as f:
		data = f.readlines()
		f1 = open(filename+".id",'w')
		print("converting...")
		for line in tqdm.tqdm(data):
			words = line.strip().split()+["<eos>"]
			ids = ' '.join([str(word_to_id[word])
				if word in word_to_id else str(word_to_id["<unk>"])
				for word in words])
			f1.write(ids+"\n")
		f1.close()
	return filename+".id"

def main():
	src = "train.tags.en-zh.en"  # still contains HTML markup
	trg = "train.tags.en-zh.zh"  # same
	src_vocab_size = 10000
	trg_vocab_size = 4000

	src1,trg1 = deletehtml(src,trg)

	src2 = segement_sentence(src1,src_vocab_size,lang='en')
	trg2 = segement_sentence(trg1,trg_vocab_size,lang='zh')

	src3 = convert_to_id(src2,src1+".vocab")
	trg3 = convert_to_id(trg2,trg1+".vocab")

if __name__ == '__main__':
	main()

After processing the data files with the script above, each line of the output is one sentence in which every word has been replaced by its ID, with an <eos> marker appended at the end. Two things remain: ① since sentences have different lengths, the shorter sentences in a batch have to be padded up to the length of the longest one, otherwise the encoder cannot process the batch; ② the decoder input has to start with <sos>, so the target sentence needs one more transformation: "X Y Z <eos>" becomes "<sos> X Y Z". A short sketch of these two points is given first, followed by the complete training code.
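
This toy snippet is separate from the training script (the sentences and IDs below are made up) and only illustrates how tf.data builds the shifted decoder input and pads a batch:

#coding:utf-8
#Toy illustration only; the sentence IDs below are made up.
import tensorflow as tf

SOS_ID = 1  # id of <sos>

#Two toy target sentences, already converted to IDs and ending with <eos> (= 2).
trg = tf.data.Dataset.from_generator(lambda: iter([[4,5,2],[6,7,8,9,2]]),
	output_types=tf.int32,output_shapes=tf.TensorShape([None]))
#Build the decoder input by prepending <sos> and dropping the final <eos>.
trg = trg.map(lambda x: (tf.concat([[SOS_ID],x[:-1]],axis=0),x,tf.size(x)))
#Pad every sequence in the batch to the longest one; the default padding value is 0.
batched = trg.padded_batch(2,(tf.TensorShape([None]),tf.TensorShape([None]),tf.TensorShape([])))

with tf.Session() as sess:
	trg_input,trg_label,trg_size = sess.run(batched.make_one_shot_iterator().get_next())
	print(trg_input)  # [[1 4 5 0 0] [1 6 7 8 9]] -> starts with <sos>, padded with 0
	print(trg_label)  # [[4 5 2 0 0] [6 7 8 9 2]] -> ends with <eos>, padded with 0
	print(trg_size)   # [3 5]

The training script below does exactly this on the real data, inside MakeTrgInput and padded_batch.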

#coding:utf-8
import tensorflow as tf

MAX_LEN = 50
SOS_ID = 1

SRC_TRAIN_DATA = "../train.tags.en-zh.en.deletehtml.segment.id"
TRG_TRAIN_DATA = "../train.tags.en-zh.zh.deletehtml.segment.id"
CHECKPOINT_PATH = "./seq2seq_ckpt"

HIDDEN_SIZE = 1024
NUM_LAYERS = 2
SRC_VOCAB_SIZE = 10000
TRG_VOCAB_SIZE = 4000
BATCH_SIZE = 100
NUM_EPOCH = 5
KEEP_PROB = 0.8
MAX_GRAD_NORM = 5
SHARE_EMB_AND_SOFTMAX = True

class NMTModel(object):
	def __init__(self):
		self.enc_cell = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE)\
		 for _ in range(NUM_LAYERS)])
		self.dec_cell = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE)\
		 for _ in range(NUM_LAYERS)])

		self.src_embedding = tf.get_variable(
			"src_emb",[SRC_VOCAB_SIZE,HIDDEN_SIZE])
		self.trg_embedding = tf.get_variable(
			"trg_emb",[TRG_VOCAB_SIZE,HIDDEN_SIZE])
		
		if SHARE_EMB_AND_SOFTMAX:
			self.softmax_weight = tf.transpose(self.trg_embedding)
		else:
			self.softmax_weight = tf.get_variable("weight",[HIDDEN_SIZE,TRG_VOCAB_SIZE])
		self.softmax_bias = tf.get_variable("softmax_bias",[TRG_VOCAB_SIZE])

	def forward(self,src_input,src_size,trg_input,trg_label,trg_size):
		batch_size = tf.shape(src_input)[0]
		src_emb = tf.nn.embedding_lookup(self.src_embedding,src_input)
		trg_emb = tf.nn.embedding_lookup(self.trg_embedding,trg_input)

		src_emb = tf.nn.dropout(src_emb,KEEP_PROB)
		trg_emb = tf.nn.dropout(trg_emb,KEEP_PROB)

		with tf.variable_scope("encoder"):
			enc_outputs,enc_state = tf.nn.dynamic_rnn(
				self.enc_cell,src_emb,src_size,dtype=tf.float32)

		with tf.variable_scope("decoder"):
			dec_outputs, _ = tf.nn.dynamic_rnn(
				self.dec_cell,trg_emb,trg_size,initial_state=enc_state)

		output = tf.reshape(dec_outputs,[-1,HIDDEN_SIZE])
		logits = tf.matmul(output,self.softmax_weight) + self.softmax_bias
		loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.reshape(trg_label,[-1]),logits=logits)

		label_weights = tf.sequence_mask(trg_size,maxlen=tf.shape(trg_label)[1],dtype=tf.float32)
		label_weights = tf.reshape(label_weights,[-1])

		cost = tf.reduce_sum(loss*label_weights)
		cost_per_token = cost / tf.reduce_sum(label_weights)

		trainable_variables = tf.trainable_variables()

		grads = tf.gradients(cost / tf.to_float(batch_size), trainable_variables)
		grads,_ = tf.clip_by_global_norm(grads,MAX_GRAD_NORM)
		optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
		train_op = optimizer.apply_gradients(zip(grads,trainable_variables))

		return cost_per_token,train_op

def run_epoch(session,cost_op,train_op,saver,step):
	while True:
		try:
			cost,_ = session.run([cost_op,train_op])
			if step%10 == 0:
				print("steps %d, per token cost is %.3f"%(step,cost))
			if step%200 == 0:
				saver.save(session,CHECKPOINT_PATH,global_step=step)
			step += 1
		except tf.errors.OutOfRangeError:
			break
	return step


def MakeDataset(file_path):
	dataset = tf.data.TextLineDataset(file_path)
	dataset = dataset.map(lambda string: tf.string_split([string]).values)
	dataset = dataset.map(lambda string: tf.string_to_number(string,tf.int32))
	dataset = dataset.map(lambda x: (x,tf.size(x)))
	return dataset

def MakeSrcTrgDataset(src_path,trg_path,batch_size):
	src_data = MakeDataset(src_path)
	trg_data = MakeDataset(trg_path)

	dataset = tf.data.Dataset.zip((src_data,trg_data))

	def FilterLength(src_tuple,trg_tuple):
		((src_input,src_len),(trg_label,trg_len)) = (src_tuple,trg_tuple)
		src_len_ok = tf.logical_and(tf.greater(src_len,1),tf.less_equal(src_len,MAX_LEN))
		trg_len_ok = tf.logical_and(tf.greater(trg_len,1),tf.less_equal(trg_len,MAX_LEN))
		return tf.logical_and(src_len_ok,trg_len_ok)
	dataset = dataset.filter(FilterLength)

	def MakeTrgInput(src_tuple,trg_tuple):
		((src_input,src_len),(trg_label,trg_len)) = (src_tuple,trg_tuple)
		trg_input = tf.concat([[SOS_ID],trg_label[:-1]],axis=0)
		return ((src_input,src_len),(trg_input,trg_label,trg_len))
	dataset = dataset.map(MakeTrgInput)
	dataset = dataset.shuffle(10000)

	padded_shapes = (
		(tf.TensorShape([None]),
		 tf.TensorShape([])),
		(tf.TensorShape([None]),
		 tf.TensorShape([None]),
		 tf.TensorShape([])))
	batched_dataset = dataset.padded_batch(batch_size,padded_shapes)
	return batched_dataset
				

def main():
	initializer = tf.random_uniform_initializer(-0.05,0.05)
	with tf.variable_scope("nmt_model",reuse=None,initializer=initializer):
		train_model = NMTModel()

	data = MakeSrcTrgDataset(SRC_TRAIN_DATA,TRG_TRAIN_DATA,BATCH_SIZE)
	iterator = data.make_initializable_iterator()
	(src,src_size),(trg_input,trg_label,trg_size) = iterator.get_next()

	cost_op,train_op = train_model.forward(src,src_size,trg_input,trg_label,trg_size)
	saver = tf.train.Saver()
	step = 0

	gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7,allow_growth=True)
	session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

	with session as sess:
		tf.global_variables_initializer().run()
		for i in range(NUM_EPOCH):
			print("In iteration: %d"%(i+1))
			sess.run(iterator.initializer)
			step = run_epoch(sess,cost_op,train_op,saver,step)

if __name__ == '__main__':
	main()

When running inference on a new sentence, the target length is not known in advance: decoding has to continue until the model emits <eos> or a maximum length is reached, so the decoding loop is built with tf.while_loop.
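
tf.while_loop takes a condition function, a body function and the initial loop variables; the body has to return an updated value for every loop variable. A minimal, self-contained example of the pattern (unrelated to the NMT graph):

#coding:utf-8
import tensorflow as tf

#Keep doubling x until it reaches at least 100, counting the steps taken.
def cond(x,step):
	return tf.less(x,100)  # keep looping while x < 100

def body(x,step):
	return x*2,step+1  # must return a new value for every loop variable

x_final,steps = tf.while_loop(cond,body,[tf.constant(3),tf.constant(0)])

with tf.Session() as sess:
	print(sess.run([x_final,steps]))  # [192, 6]

The decoder below uses the same pattern: its loop variables are the RNN state, a TensorArray holding the generated IDs and a step counter, and the condition stops the loop once <eos> is produced or MAX_DEC_LEN is reached. The full prediction code: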

#coding:utf-8
import tensorflow as tf


CHECKPOINT_PATH = "./seq2seq_ckpt-9000"

HIDDEN_SIZE = 1024
NUM_LAYERS = 2
SRC_VOCAB_SIZE = 10000
TRG_VOCAB_SIZE = 4000
BATCH_SIZE = 100
SHARE_EMB_AND_SOFTMAX = True
SOS_ID = 1
EOS_ID = 2

class NMTModel(object):
	def __init__(self):
		self.enc_cell = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE)\
		 for _ in range(NUM_LAYERS)])
		self.dec_cell = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE)\
		 for _ in range(NUM_LAYERS)])

		self.src_embedding = tf.get_variable(
			"src_emb",[SRC_VOCAB_SIZE,HIDDEN_SIZE])
		self.trg_embedding = tf.get_variable(
			"trg_emb",[TRG_VOCAB_SIZE,HIDDEN_SIZE])
		
		if SHARE_EMB_AND_SOFTMAX:
			self.softmax_weight = tf.transpose(self.trg_embedding)
		else:
			self.softmax_weight = tf.get_variable("weight",[HIDDEN_SIZE,TRG_VOCAB_SIZE])
		self.softmax_bias = tf.get_variable("softmax_bias",[TRG_VOCAB_SIZE])

	def inference(self,src_input):
		src_size = tf.convert_to_tensor([len(src_input)],dtype=tf.int32)
		src_input = tf.convert_to_tensor([src_input],dtype=tf.int32)
		src_emb = tf.nn.embedding_lookup(self.src_embedding,src_input)

		with tf.variable_scope("encoder"):
			enc_outputs,enc_state = tf.nn.dynamic_rnn(
				self.enc_cell,src_emb,src_size,dtype=tf.float32)
		MAX_DEC_LEN = 100

		with tf.variable_scope("decoder/rnn/multi_rnn_cell"):
			init_array = tf.TensorArray(dtype=tf.int32,size=0,dynamic_size=True,clear_after_read=False)
			init_array = init_array.write(0,SOS_ID)

			init_loop_var = (enc_state,init_array,0)

			def continue_loop_condition(state,trg_ids,step):
				return tf.reduce_all(tf.logical_and(tf.not_equal(trg_ids.read(step),EOS_ID),tf.less(step,MAX_DEC_LEN-1)))

			def loop_body(state,trg_ids,step):
				trg_input = [trg_ids.read(step)]
				trg_emb = tf.nn.embedding_lookup(self.trg_embedding,trg_input)

				dec_outputs,next_state = self.dec_cell.call(state=state,inputs=trg_emb)
				output = tf.reshape(dec_outputs,[-1,HIDDEN_SIZE])
				logits = (tf.matmul(output,self.softmax_weight) + self.softmax_bias)
				next_id = tf.argmax(logits,axis=1,output_type=tf.int32)

				trg_ids = trg_ids.write(step+1,next_id[0])
				return next_state,trg_ids,step+1

			state,trg_ids,step = tf.while_loop(
				continue_loop_condition,loop_body,init_loop_var)
			return trg_ids.stack()

def main():
	from stanfordcorenlp import StanfordCoreNLP
	nlp = StanfordCoreNLP("../../stanford-corenlp-full-2018-10-05",lang='en')
	with tf.variable_scope("nmt_model",reuse=None):
		model = NMTModel()
	vocab_file = "../train.tags.en-zh.en.deletehtml.vocab"
	sentence = "It is very beautiful!"
	with open(vocab_file,'r') as f:
		data = f.readlines()
		words = [w.strip() for w in data]
	word_to_id = {k:v for (k,v) in zip(words,range(len(words)))}
	wordlist = nlp.word_tokenize(sentence.strip()) + ["<eos>"]
	# print(wordlist)
	idlist = [str(word_to_id[w]) if w in word_to_id else str(word_to_id["<unk>"]) for w in wordlist]
	idlist = [int(i) for i in idlist]
	# print(idlist)

	output_op = model.inference(idlist)
	gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7,allow_growth=True)
	session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
	saver = tf.train.Saver()
	saver.restore(session,CHECKPOINT_PATH)

	output = session.run(output_op)

	vocab_file2 = "../train.tags.en-zh.zh.deletehtml.vocab"
	with open(vocab_file2,'r') as f2:
		data2 = f2.readlines()
		words = [w.strip() for w in data2]
	id_to_word = {k:v for (k,v) in zip(range(len(words)),words)}
	print([id_to_word[i] for i in output])
	session.close()

	nlp.close()

if __name__ == '__main__':
	main()

The prediction result:

['<sos>', '这', '是', '非常', '美丽', '的', '!', '<eos>']
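
If you want a plain string instead of a token list, a small optional post-processing step (using the output above) is to drop the <sos>/<eos> markers and join the remaining tokens:

tokens = ['<sos>', '这', '是', '非常', '美丽', '的', '!', '<eos>']
print(''.join(t for t in tokens if t not in ('<sos>', '<eos>')))  # 这是非常美丽的!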

To keep this post from getting too long I have only included the essentials; for the underlying theory please refer to the book mentioned at the beginning, and feel free to leave a comment below if anything is unclear. My environment: python3.6.5 + tensorflow-gpu==1.12 + cuda9.0.
