Hello everyone! The code I reproduced here implements a sentence generator with an RNN, following the paper 'On the difficulty of training Recurrent Neural Networks'.
Han Xiaoyang previously published a blog post (http://blog.csdn.net/han_xiaoyang/article/details/51253274) reproducing this code, but it does not run as posted because some details were omitted. After an afternoon of tinkering I finally got it running OK!
Original paper: https://arxiv.org/abs/1211.5063
Dataset: reddit-comments-2015-08.csv (https://github.com/dennybritz/rnn-tutorial-rnnlm/tree/master/data)
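One setup detail that is easy to miss: the nltk.sent_tokenize and nltk.word_tokenize calls below rely on NLTK's punkt tokenizer data. If it is not already installed, a one-time download (a standard NLTK call) takes care of it:

import nltk
nltk.download('punkt')   # one-time download of the sentence/word tokenizer models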
Fully runnable code (Python):
RNN part (main script):
# coding:utf-8
import csv
import itertools
import operator
import timeit
import nltk
import numpy as np
import sys
import utils
from datetime import datetime

'''Build the word-index mapping'''
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

# Read the data and prepend/append SENTENCE_START and SENTENCE_END tokens
print "Reading CSV file..."
with open('C:/Users/admin/Desktop/test_txt/reddit-comments-2015-08.csv', 'rb') as f:
    reader = csv.reader(f, skipinitialspace=True)
    reader.next()  # skip the header row
    # Split each comment into sentences
    sentences = itertools.chain(*[nltk.sent_tokenize(x[0].decode('utf-8').lower()) for x in reader])
    # Add SENTENCE_START and SENTENCE_END
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print "Parsed %d sentences." % (len(sentences))

# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print "Found %d unique words tokens." % len(word_freq.items())

# Keep the most frequent words and build the word-to-index and index-to-word lookups
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

print "Using vocabulary size %d." % vocabulary_size
print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1])

# Replace all words not in the vocabulary with unknown_token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

print "\nExample sentence: '%s'" % sentences[0]
print "\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0]

# Build the full training set: X is a sentence without its last word,
# y is the same sentence shifted left by one word
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])
print X_train

'''Build the RNN'''
# Initialize the parameters U, V and W
class RNNNumpy:
    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        # Assign vocabulary size, hidden-state dimension, etc.
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Randomly initialize the parameters U, V, W
        self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))

# Forward propagation: compute the output and hidden state at each time step (naive implementation)
def forward_propagation(self, x):
    # Total number of time steps
    T = len(x)
    # We keep all hidden states s during the forward pass because we need them later;
    # the initial hidden state is set to 0
    s = np.zeros((T + 1, self.hidden_dim))
    s[-1] = np.zeros(self.hidden_dim)
    # We also keep the outputs o at every time step
    o = np.zeros((T, self.word_dim))
    # For each time step...
    for t in np.arange(T):
        # Indexing U by x[t] is the one-hot matrix product, i.e. column selection
        s[t] = np.tanh(self.U[:, x[t]] + self.W.dot(s[t-1]))
        o[t] = utils.softmax(self.V.dot(s[t]))
    return [o, s]

RNNNumpy.forward_propagation = forward_propagation

# Pick the word with the highest probability at each step
def predict(self, x):
    # Do a forward pass and return the index of the highest-probability word
    o, s = self.forward_propagation(x)
    return np.argmax(o, axis=1)

RNNNumpy.predict = predict

# Try a forward pass
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
o, s = model.forward_propagation(X_train[2])
print o.shape
print o

# Check the predict function
predictions = model.predict(X_train[10])
print predictions.shape
print predictions

# Loss computation
def calculate_total_loss(self, x, y):
    L = 0
    # For each sentence...
    for i in np.arange(len(y)):
        o, s = self.forward_propagation(x[i])
        # We only care about the predicted probability of the correct next word
        correct_word_predictions = o[np.arange(len(y[i])), y[i]]
        # Add to the total loss
        L += -1 * np.sum(np.log(correct_word_predictions))
    return L
def calculate_loss(self, x, y):
    # Divide the total loss by the number of training words
    N = np.sum([len(y_i) for y_i in y])
    return self.calculate_total_loss(x, y) / N

RNNNumpy.calculate_total_loss = calculate_total_loss
RNNNumpy.calculate_loss = calculate_loss

# Train the RNN with stochastic gradient descent and backpropagation through time (BPTT)
def bptt(self, x, y):
    T = len(y)
    # Perform forward propagation
    o, s = self.forward_propagation(x)
    # We accumulate the gradients in these variables
    dLdU = np.zeros(self.U.shape)
    dLdV = np.zeros(self.V.shape)
    dLdW = np.zeros(self.W.shape)
    delta_o = o
    delta_o[np.arange(len(y)), y] -= 1.
    # For each output backwards...
    for t in np.arange(T)[::-1]:
        dLdV += np.outer(delta_o[t], s[t].T)
        # Initial delta calculation
        delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
        # Backpropagation through time (for at most self.bptt_truncate steps)
        for bptt_step in np.arange(max(0, t - self.bptt_truncate), t + 1)[::-1]:
            # print "Backpropagation step t=%d bptt step=%d " % (t, bptt_step)
            dLdW += np.outer(delta_t, s[bptt_step - 1])
            dLdU[:, x[bptt_step]] += delta_t
            # Update delta for the next (earlier) step
            delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step - 1] ** 2)
    return [dLdU, dLdV, dLdW]

RNNNumpy.bptt = bptt

# Gradient checking
def gradient_check(self, x, y, h=0.001, error_threshold=0.01):
    # Calculate the gradients using backpropagation. We want to check if these are correct.
    bptt_gradients = self.bptt(x, y)
    # List of all parameters we want to check.
    model_parameters = ['U', 'V', 'W']
    # Gradient check for each parameter
    for pidx, pname in enumerate(model_parameters):
        # Get the actual parameter value from the model, e.g. model.W
        parameter = operator.attrgetter(pname)(self)
        print "Performing gradient check for parameter %s with size %d." % (pname, np.prod(parameter.shape))
        # Iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
        it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            ix = it.multi_index
            # Save the original value so we can reset it later
            original_value = parameter[ix]
            # Estimate the gradient using (f(x+h) - f(x-h))/(2*h)
            parameter[ix] = original_value + h
            gradplus = self.calculate_total_loss([x], [y])
            parameter[ix] = original_value - h
            gradminus = self.calculate_total_loss([x], [y])
            estimated_gradient = (gradplus - gradminus) / (2 * h)
            # Reset parameter to original value
            parameter[ix] = original_value
            # The gradient for this parameter calculated using backpropagation
            backprop_gradient = bptt_gradients[pidx][ix]
            # Calculate the relative error: (|x - y|/(|x| + |y|))
            relative_error = np.abs(backprop_gradient - estimated_gradient) / (np.abs(backprop_gradient) + np.abs(estimated_gradient))
            # If the error is too large, fail the gradient check
            if relative_error > error_threshold:
                print "Gradient Check ERROR: parameter=%s ix=%s" % (pname, ix)
                print "+h Loss: %f" % gradplus
                print "-h Loss: %f" % gradminus
                print "Estimated_gradient: %f" % estimated_gradient
                print "Backpropagation gradient: %f" % backprop_gradient
                print "Relative Error: %f" % relative_error
                return
            it.iternext()
        print "Gradient check for parameter %s passed." % (pname)

RNNNumpy.gradient_check = gradient_check

# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 100
np.random.seed(10)
model = RNNNumpy(grad_check_vocab_size, 10, bptt_truncate=1000)
model.gradient_check([0, 1, 2, 3], [1, 2, 3, 4])

# SGD implementation
# Performs one step of SGD.
def numpy_sgd_step(self, x, y, learning_rate):
    # Calculate the gradients
    dLdU, dLdV, dLdW = self.bptt(x, y)
    # Change parameters according to gradients and learning rate
    self.U -= learning_rate * dLdU
    self.V -= learning_rate * dLdV
    self.W -= learning_rate * dLdW

RNNNumpy.sgd_step = numpy_sgd_step

# Iteratively update the parameters along the negative gradient, i.e. walk downhill along the slope
# Outer SGD loop
# - model: The RNN model instance
# - X_train: The training data set
# - y_train: The training data labels
# - learning_rate: Initial learning rate for SGD
# - nepoch: Number of times to iterate through the complete dataset
# - evaluate_loss_after: Evaluate the loss after this many epochs
def train_with_sgd(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    """
    :type model: object
    """
    # We keep track of the losses so we can plot them later
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        # Optionally evaluate the loss
        if (epoch % evaluate_loss_after == 0):
            loss = model.calculate_loss(X_train, y_train)
            losses.append((num_examples_seen, loss))
            time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print "%s: Loss after num_examples_seen=%d epoch=%d: %f" % (time, num_examples_seen, epoch, loss)
            # Halve the learning rate if the loss increases
            if (len(losses) > 1 and losses[-1][1] > losses[-2][1]):
                learning_rate = learning_rate * 0.5
                print "Setting learning rate to %f" % learning_rate
            sys.stdout.flush()
        # For each training example...
        for i in range(len(y_train)):
            # One SGD step
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1
    return losses

# Try SGD on a small subset of the data
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
losses = train_with_sgd(model, X_train[:100], y_train[:100], nepoch=10, evaluate_loss_after=1)

# Training our network with Theano and the GPU
from rnn_theano import RNNTheano, gradient_check_theano

np.random.seed(10)
# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 5
model = RNNTheano(grad_check_vocab_size, 10)
gradient_check_theano(model, [0, 1, 2, 3], [1, 2, 3, 4])

np.random.seed(10)
model = RNNTheano(vocabulary_size)
model.sgd_step(X_train[10], y_train[10], 0.005)

model = RNNTheano(vocabulary_size, hidden_dim=50)
# losses = train_with_sgd(model, X_train, y_train, nepoch=50)
# save_model_parameters_theano('./data/trained-model-theano.npz', model)
utils.load_model_parameters_theano('C:/Users/admin/Desktop/test_txt/trained-model-theano.npz', model)

# Generating text
def generate_sentence(model):
    # We start the sentence with the start token
    new_sentence = [word_to_index[sentence_start_token]]
    # Repeat until we get an end token
    while not new_sentence[-1] == word_to_index[sentence_end_token]:
        next_word_probs = model.forward_propagation(new_sentence)
        sampled_word = word_to_index[unknown_token]
        # We don't want to sample unknown words
        while sampled_word == word_to_index[unknown_token]:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
    sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
    return sentence_str

num_sentences = 20
senten_min_length = 10

for i in range(num_sentences):
    sent = []
    # We want long sentences, not sentences with one or two words
    while len(sent) < senten_min_length:
        sent = generate_sentence(model)
    print " ".join(sent)
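Note that the save/load helpers in utils.py only work for the Theano model, because they call get_value/set_value on shared variables. If you also want to keep the weights of the small numpy run, a minimal sketch of equivalent helpers (save_rnn_numpy and load_rnn_numpy are my own hypothetical additions, not part of the original code):

# Hypothetical helpers (not in the original code): persist an RNNNumpy model,
# whose U/V/W are plain numpy arrays rather than Theano shared variables.
def save_rnn_numpy(outfile, model):
    np.savez(outfile, U=model.U, V=model.V, W=model.W)

def load_rnn_numpy(path, model):
    npzfile = np.load(path)
    model.U, model.V, model.W = npzfile["U"], npzfile["V"], npzfile["W"]
    model.hidden_dim, model.word_dim = model.U.shape

# Example usage, for an RNNNumpy instance (path is arbitrary):
# save_rnn_numpy('./rnn-numpy-small.npz', model)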
-------------------------------------------------------------------------------------------------------
Helper module called by the main script:
utils.py
import numpy as np

def softmax(x):
    xt = np.exp(x - np.max(x))
    return xt / np.sum(xt)

def save_model_parameters_theano(outfile, model):
    U, V, W = model.U.get_value(), model.V.get_value(), model.W.get_value()
    np.savez(outfile, U=U, V=V, W=W)
    print "Saved model parameters to %s." % outfile

def load_model_parameters_theano(path, model):
    npzfile = np.load(path)
    U, V, W = npzfile["U"], npzfile["V"], npzfile["W"]
    model.hidden_dim = U.shape[0]
    model.word_dim = U.shape[1]
    model.U.set_value(U)
    model.V.set_value(V)
    model.W.set_value(W)
    print "Loaded model parameters from %s. hidden_dim=%d word_dim=%d" % (path, U.shape[0], U.shape[1])
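As a quick sanity check of the softmax helper (the input values below are arbitrary): subtracting np.max(x) keeps the exponentials from overflowing, and the output is a valid probability distribution.

from utils import softmax
scores = np.array([2.0, 1.0, 0.1])   # arbitrary example scores
probs = softmax(scores)
print probs          # roughly [0.659 0.242 0.099]
print np.sum(probs)  # 1.0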
-------------------------------------------------------------------------------------------------------
Helper module called by the main script:
rnn_theano.py
import numpy as np
import theano as theano
import theano.tensor as T
from utils import *
import operator

class RNNTheano:
    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        # Assign instance variables
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Randomly initialize the network parameters
        U = np.random.uniform(-np.sqrt(1. / word_dim), np.sqrt(1. / word_dim), (hidden_dim, word_dim))
        V = np.random.uniform(-np.sqrt(1. / hidden_dim), np.sqrt(1. / hidden_dim), (word_dim, hidden_dim))
        W = np.random.uniform(-np.sqrt(1. / hidden_dim), np.sqrt(1. / hidden_dim), (hidden_dim, hidden_dim))
        # Theano: create shared variables
        self.U = theano.shared(name='U', value=U.astype(theano.config.floatX))
        self.V = theano.shared(name='V', value=V.astype(theano.config.floatX))
        self.W = theano.shared(name='W', value=W.astype(theano.config.floatX))
        # We store the Theano graph here
        self.theano = {}
        self.__theano_build__()

    def __theano_build__(self):
        U, V, W = self.U, self.V, self.W
        x = T.ivector('x')
        y = T.ivector('y')

        def forward_prop_step(x_t, s_t_prev, U, V, W):
            s_t = T.tanh(U[:, x_t] + W.dot(s_t_prev))
            o_t = T.nnet.softmax(V.dot(s_t))
            return [o_t[0], s_t]

        [o, s], updates = theano.scan(
            forward_prop_step,
            sequences=x,
            outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))],
            non_sequences=[U, V, W],
            truncate_gradient=self.bptt_truncate,
            strict=True)

        prediction = T.argmax(o, axis=1)
        o_error = T.sum(T.nnet.categorical_crossentropy(o, y))

        # Gradients
        dU = T.grad(o_error, U)
        dV = T.grad(o_error, V)
        dW = T.grad(o_error, W)

        # Assign functions
        self.forward_propagation = theano.function([x], o)
        self.predict = theano.function([x], prediction)
        self.ce_error = theano.function([x, y], o_error)
        self.bptt = theano.function([x, y], [dU, dV, dW])

        # SGD
        learning_rate = T.scalar('learning_rate')
        self.sgd_step = theano.function([x, y, learning_rate], [],
                                        updates=[(self.U, self.U - learning_rate * dU),
                                                 (self.V, self.V - learning_rate * dV),
                                                 (self.W, self.W - learning_rate * dW)])

    def calculate_total_loss(self, X, Y):
        return np.sum([self.ce_error(x, y) for x, y in zip(X, Y)])

    def calculate_loss(self, X, Y):
        # Divide the total loss by the number of words
        num_words = np.sum([len(y) for y in Y])
        return self.calculate_total_loss(X, Y) / float(num_words)


def gradient_check_theano(model, x, y, h=0.001, error_threshold=0.01):
    # Overwrite the bptt truncation: we need to backpropagate all the way back to get the correct gradient
    model.bptt_truncate = 1000
    # Calculate the gradients using backprop
    bptt_gradients = model.bptt(x, y)
    # List of all parameters we want to check.
    model_parameters = ['U', 'V', 'W']
    # Gradient check for each parameter
    for pidx, pname in enumerate(model_parameters):
        # Get the actual parameter value from the model, e.g. model.W
        parameter_T = operator.attrgetter(pname)(model)
        parameter = parameter_T.get_value()
        print "Performing gradient check for parameter %s with size %d." % (pname, np.prod(parameter.shape))
        # Iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
        it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            ix = it.multi_index
            # Save the original value so we can reset it later
            original_value = parameter[ix]
            # Estimate the gradient using (f(x+h) - f(x-h))/(2*h)
            parameter[ix] = original_value + h
            parameter_T.set_value(parameter)
            gradplus = model.calculate_total_loss([x], [y])
            parameter[ix] = original_value - h
            parameter_T.set_value(parameter)
            gradminus = model.calculate_total_loss([x], [y])
            estimated_gradient = (gradplus - gradminus) / (2 * h)
            # Reset the parameter to its original value
            parameter[ix] = original_value
            parameter_T.set_value(parameter)
            # The gradient for this parameter calculated using backpropagation
            backprop_gradient = bptt_gradients[pidx][ix]
            # Calculate the relative error: (|x - y|/(|x| + |y|))
            relative_error = np.abs(backprop_gradient - estimated_gradient) / (
                np.abs(backprop_gradient) + np.abs(estimated_gradient))
            # If the error is too large, fail the gradient check
            if relative_error > error_threshold:
                print "Gradient Check ERROR: parameter=%s ix=%s" % (pname, ix)
                print "+h Loss: %f" % gradplus
                print "-h Loss: %f" % gradminus
                print "Estimated_gradient: %f" % estimated_gradient
                print "Backpropagation gradient: %f" % backprop_gradient
                print "Relative Error: %f" % relative_error
                return
            it.iternext()
        print "Gradient check for parameter %s passed." % (pname)
-------------------------------------------------------------------------------------------------------
Runtime environment: Win10 64-bit + Python 2.7
Result screenshot: