循环神经网络对长度可变的序列数据有较强的处理能力,在NLP领域独领风骚。
10.1 循环神经网络简介
1.循环神经网络的前向传播程序设计
import numpy as np

# --- Minimal RNN forward pass, unrolled by hand over a length-2 sequence. ---
# (Fix: the original listing lost its loop-body indentation.)
x = [0.8, 0.1]                 # scalar input at each time step
init_state = [0.3, 0.6]        # initial hidden state h_0

W = np.asarray([[0.2, 0.4], [0.7, 0.3]])   # state-to-state weights
U = np.asarray([0.8, 0.1])                 # input-to-state weights
b_h = np.asarray([0.2, 0.1])               # hidden-layer bias
V = np.asarray([[0.5], [0.5]])             # state-to-output weights
b_o = 0.1                                  # output bias

for i in range(len(x)):
    # h_t = tanh(h_{t-1} . W + x_t * U + b_h)
    before_activation = np.dot(init_state, W) + x[i] * U + b_h
    state = np.tanh(before_activation)
    init_state = state                     # carry hidden state to next step
    # o_t = h_t . V + b_o
    final_output = np.dot(state, V) + b_o
    print("t%s state : %s" % (i + 1, state))
    print("t%s output: %s\n" % (i + 1, final_output))
2.循环神经网络的梯度
3.循环神经网络的不同设计模式
(1)每个时刻都有输出,并且在隐藏层之间引入定向循环。
(2)每个时刻都有输出,且在该时刻的输出到下一时刻的隐藏层之间有循环连接。相对于前一种一般效果较差,因为o比h而言缺少对过去的重要信息。
(3)隐藏层之间存在着循环连接,但是输出在若干时刻后,而不是每一个时刻都有输出。
10.2 自然语言建模和词向量
Word2Vec 是Google推出的一款计算词向量的工具。
1.统计学语言模型
2.Word2Vec
VSM(向量空间模型),通过统计语义假说(Statistical Semantics Hypothesis,语言的统计特征隐藏着语义的信息)。比较流行的两个派生版本为:Bag of Words Hypothesis(统计一篇文章的词频,使用较高频次出现的词代表文档的主题) 和Distributional Hypothesis(上下文环境相似的两个词语义也相近)。
向量空间模型可大致分为两类:一类是计数模型(如Latent Semantic Analysis):统计在语料库中相邻出现的词的频率,再把这些计数的结果转化为小而稠密的矩阵。另一类是预测模型(如Neural Probabilistic Language Models):根据某个词相邻的词推测出这个词及其空间向量。
3.使用TensorFlow实现Word2Vec
import numpy as np
import tensorflow as tf
import collections
import random
import zipfile
# Number of distinct words kept in the vocabulary (top-N most frequent).
vocabulary_size=50000
# Path to the text8 corpus archive.  NOTE(review): shadows the builtin `file`
# name (Python 2 heritage) -- confirm nothing relies on the builtin later.
file="./Word2vec/text8.zip"
def read_data(file):
    """Read the first member of the zip archive `file` and return its text
    as a list of whitespace-separated word tokens.

    (Fix: the original listing lost the function-body indentation.)
    """
    with zipfile.ZipFile(file=file) as f:
        # tf.compat.as_str decodes the raw bytes into a unicode string.
        original_data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return original_data
# Load the corpus from disk and report its size (text8 has ~17M tokens).
original_words=read_data(file)
print("len of original word:",len(original_words))
def build_vocabulary(original_words, max_vocab_size=None):
    """Build word <-> id mappings over the corpus.

    Args:
        original_words: list of word tokens.
        max_vocab_size: vocabulary size to keep, including the "unknown"
            bucket at id 0.  Defaults to the module-level `vocabulary_size`
            (backward compatible with the original single-argument call).

    Returns:
        data: the corpus encoded as word ids (0 == out-of-vocabulary).
        count: [word, frequency] pairs; count[0] is ["unknown", n_unknown].
        dictionary: word -> id.
        reverse_dictionary: id -> word.
    """
    if max_vocab_size is None:
        max_vocab_size = vocabulary_size
    # Reserve id 0 for out-of-vocabulary words; keep the most frequent rest.
    # (Fix: removed the original unconditional print(count), which dumped the
    # entire 50,000-entry list to stdout.)
    count = [["unknown", -1]]
    count.extend(
        collections.Counter(original_words).most_common(max_vocab_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unknown_count = 0
    for word in original_words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0            # out-of-vocabulary bucket
            unknown_count += 1
        data.append(index)
    count[0][1] = unknown_count  # back-fill the true unknown frequency
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary
# Encode the corpus and show a small sample of the id <-> word mapping.
data, count, dictionary, reverse_dictionary = build_vocabulary(original_words)
# Fix: the original printed "(+unkonwn)" -- typo in the user-facing string.
print("Most common words (+unknown)", count[:5])
print("Sample data", data[:10], [reverse_dictionary[i] for i in data[:10]])
# Global cursor into `data`, advanced by generate_batch below.
data_index = 0
# NOTE: the original rebuilt the vocabulary here with a second identical call
# to build_vocabulary(original_words); that redundant (and expensive)
# recomputation has been removed -- the results above are unchanged.
def generate_batch(batch_size, num_of_samples, skip_distance):
    """Generate one skip-gram training batch from the global `data` list.

    Args:
        batch_size: number of (center, context) pairs; must be a multiple of
            num_of_samples.
        num_of_samples: context words sampled per center word.
        skip_distance: window half-width around the center word.

    Returns:
        batch: int32 array [batch_size] of center-word ids.
        labels: int32 array [batch_size, 1] of sampled context-word ids.
    """
    global data_index
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    # Window = skip_distance words on either side plus the center word.
    num_of_sample_words = 2 * skip_distance + 1
    # Fix: the original passed `maxlen==num_of_sample_words` (a comparison
    # against the undefined name `maxlen`) instead of the keyword argument.
    buffer = collections.deque(maxlen=num_of_sample_words)
    for _ in range(num_of_sample_words):
        buffer.append(data[data_index])
        # Fix: wrap around at the end of the corpus instead of indexing past it.
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_of_samples):
        target = skip_distance            # center position within the window
        targets_to_avoid = [skip_distance]
        for j in range(num_of_samples):
            # Draw a context position not yet used for this center word.
            while target in targets_to_avoid:
                target = random.randint(0, num_of_sample_words - 1)
            targets_to_avoid.append(target)
            batch[i * num_of_samples + j] = buffer[skip_distance]
            labels[i * num_of_samples + j, 0] = buffer[target]
        # Slide the window one word to the right (deque drops the oldest).
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
# Smoke-test the batch generator: print 8 (center -> context) pairs.
# (Fix: the original listing lost the loop-body indentation.)
batch, labels = generate_batch(batch_size=8, num_of_samples=2, skip_distance=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], "->",
          labels[i, 0], reverse_dictionary[labels[i, 0]])
import numpy as np
import tensorflow.compat.v1 as tf
import math
import vocabulary
# --- Skip-gram training hyper-parameters. ---
max_steps=10000
batch_size=128
# Dimensionality of the learned word vectors.
embedding_size=128
skip_distance=1
num_of_samples=2
vocabulary_size=50000
# 16 random ids drawn from the 100 most frequent words, used for the
# periodic nearest-neighbour validation report.
valid_sample=np.random.choice(100,16,replace=False)
# Number of negative samples per batch for the NCE loss.
num_sampled=64
with tf.Graph().as_default():
    # --- Model definition -------------------------------------------------
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    # Word embedding matrix, initialised uniformly in [-1, 1).
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)
    # NCE (noise-contrastive estimation) output layer.
    nce_weight = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    nce_loss = tf.nn.nce_loss(weights=nce_weight, biases=nce_biases,
                              labels=train_labels, inputs=embed,
                              num_sampled=num_sampled,
                              num_classes=vocabulary_size)
    loss = tf.reduce_mean(nce_loss)
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # L2-normalise the embeddings so matmul below yields cosine similarity.
    # Fix: the original used reduce_mean, which does not give unit vectors.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normal_embeddings = embeddings / norm
    valid_inputs = tf.constant(valid_sample, dtype=tf.int32)
    valid_embeddings = tf.nn.embedding_lookup(normal_embeddings, valid_inputs)
    # similarity[i, j] = cosine similarity of validation word i and word j.
    similarity = tf.matmul(valid_embeddings, normal_embeddings,
                           transpose_b=True)

    # --- Training loop ----------------------------------------------------
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        total_loss = 0
        for step in range(max_steps + 1):
            batch_inputs, batch_labels = vocabulary.generate_batch(
                batch_size, num_of_samples, skip_distance)
            loss_val, _ = sess.run([loss, optimizer], feed_dict={
                train_inputs: batch_inputs,
                train_labels: batch_labels})
            total_loss += loss_val
            if step > 0 and step % 1000 == 0:
                # Report the mean loss over the last 1000 steps.
                average_loss = total_loss / 1000
                print("Average loss at %d step is : %f " % (step, average_loss))
                total_loss = 0
            if step > 0 and step % 5000 == 0:
                similar = similarity.eval()
                for i in range(16):
                    # Skip index 0 (the word itself); keep the next 8.
                    nearest = (-similar[i, :]).argsort()[1:8 + 1]
                    valid_word = vocabulary.reverse_dictionary[valid_sample[i]]
                    nearest_information = "Nearest to %s is : " % valid_word
                    for j in range(8):
                        close_word = vocabulary.reverse_dictionary[nearest[j]]
                        # Fix: the original omitted the `%` operator here,
                        # making the format string a (failing) call.
                        nearest_information = "%s %s" % (nearest_information,
                                                         close_word)
                    print("valid_word is: %s" % valid_word)
                    print(nearest_information)
        # Fix: the original referenced the undefined `normalize_embeddings`.
        final_embeddings = normal_embeddings.eval()
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import vocabulary
import Word2Vec_skip
# Project the first 100 word embeddings to 2-D with t-SNE and plot them.
# Fix: the original misspelled the keyword argument as `n_comonents`,
# which raises a TypeError in TSNE.__init__.
tsne = TSNE(perplexity=30, n_components=2, init="pca", n_iter=5000)
plot_only = 100
low_dim_embbs = tsne.fit_transform(
    Word2Vec_skip.final_embeddings[:plot_only, :])
labels = list()
for i in range(plot_only):
    labels.append(vocabulary.reverse_dictionary[i])
plt.figure(figsize=(20, 20))
for j, label in enumerate(labels):
    x, y = low_dim_embbs[j, :]
    plt.scatter(x, y)
    # Offset each word label slightly from its scatter point.
    plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords="offset points",
                 ha="right", va="bottom")
plt.savefig(fname="after_tsne.png")
10.3 LSTM实现自然语言建模
import tensorflow as tf

# Treat each 28x28 MNIST image as a length-28 sequence of 28-dim rows and
# classify it with an LSTM.
(train_images, train_labels), (test_images, test_labels) = \
    tf.keras.datasets.mnist.load_data()
train_images, test_images = train_images / 255.0, test_images / 255.0
sample, sample_label = train_images[0], train_labels[0]


def build_model():
    """Return an uncompiled Sequential model: LSTM -> BatchNorm -> softmax."""
    rnn_layer = tf.keras.layers.RNN(tf.keras.layers.LSTMCell(units=64),
                                    input_shape=(None, 28))
    # Fix: the original wrote `tf.keras.layers,Dense(...)` (comma instead of
    # dot), leaving `Dense` an undefined name.
    model = tf.keras.models.Sequential([
        rnn_layer,
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(units=10, activation='softmax')])
    return model


model = build_model()
model.compile(loss='sparse_categorical_crossentropy', optimizer='SGD',
              metrics=['accuracy'])
model.fit(train_images, train_labels,
          validation_data=(test_images, test_labels),
          batch_size=100, epochs=20)
model.summary()
LSTM在自然语言建模中的应用
http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
git clone https://github.com/tensorflow/models.git
使用Models库中/tutorials/rnn/ptb路径下的reader.py文件来操作PTB数据集的内容。
import numpy as np
import time
import tensorflow.compat.v1 as tf
import reader
class Config(object):
    """Hyper-parameters for the PTB LSTM language model.

    (Fix: the original listing lost the class-body indentation.)
    """
    init_scale = 0.1       # uniform init range for all weights
    learning_rate = 1.0    # initial SGD learning rate
    max_grad_norm = 5      # global-norm gradient clipping threshold
    num_layers = 2         # number of stacked LSTM layers
    num_steps = 20         # unrolled time steps per batch
    word_dimension = 200   # embedding size == LSTM hidden size
    max_epoch = 4          # epochs trained at the full learning rate
    total_epoch = 13       # total number of training epochs
    keep_prob = 1.0        # dropout keep probability (1.0 = no dropout)
    lr_decay = 0.5         # per-epoch learning-rate decay after max_epoch
    batch_size = 20
    vocab_size = 10000
class PTBModel(object):
    """LSTM language model over PTB data (tf.compat.v1 graph mode)."""

    def __init__(self, is_training, config, data, name=None):
        self.batch_size = config.batch_size
        self.num_steps = config.num_steps
        self.epoch_size = ((len(data) // self.batch_size) - 1) // self.num_steps
        self.input_data, self.targets = reader.ptb_producer(
            data, self.batch_size, self.num_steps, name=name)
        self.keep_prob = config.keep_prob
        self.word_dimension = config.word_dimension
        # Fix: the original wrote `tf.nn.rnn_cell,BasicLSTMCell(...)` (comma
        # instead of dot), which builds a tuple, not a cell.
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(
            self.word_dimension, forget_bias=0.0, state_is_tuple=True)
        if is_training and config.keep_prob < 1:
            lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                lstm_cell, output_keep_prob=self.keep_prob)
        # Fix: the original read config.num_steps here, silently stacking 20
        # layers instead of the intended config.num_layers (2).
        self.num_layers = config.num_layers
        cell_layer = tf.nn.rnn_cell.MultiRNNCell(
            [lstm_cell for _ in range(self.num_layers)], state_is_tuple=True)
        self.initial_state = cell_layer.zero_state(self.batch_size, tf.float32)

        self.vocab_size = config.vocab_size
        embedding = tf.get_variable(
            "embedding", [self.vocab_size, self.word_dimension],
            dtype=tf.float32)
        inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        # Unroll the LSTM over num_steps steps, sharing weights across steps.
        outputs = []
        state = self.initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(self.num_steps):
                if time_step > 0:
                    # Fix: method is reuse_variables(), not reuse_variable().
                    tf.get_variable_scope().reuse_variables()
                cell_output, state = cell_layer(inputs[:, time_step, :], state)
                outputs.append(cell_output)
        # Fix: tf.rehape -> tf.reshape (here and in the loss below).
        output = tf.reshape(tf.concat(outputs, 1), [-1, self.word_dimension])
        weight = tf.get_variable(
            "softmax_w", [self.word_dimension, self.vocab_size],
            dtype=tf.float32)
        bias = tf.get_variable("softmax_b", [self.vocab_size], dtype=tf.float32)
        logits = tf.matmul(output, weight) + bias
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(self.targets, [-1])],
            [tf.ones([self.batch_size * self.num_steps], dtype=tf.float32)])
        self.cost = tf.reduce_sum(loss) / self.batch_size
        self.final_state = state

        if not is_training:
            return
        # Fix: the original called tf.Variables(.0, ...) -- the constructor is
        # tf.Variable and the initial value is 0.0.
        self.learning_rate = tf.Variable(0.0, trainable=False)
        gradients = tf.gradients(self.cost, tf.trainable_variables())
        clipped_grads, _ = tf.clip_by_global_norm(gradients,
                                                  config.max_grad_norm)
        sgd_optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        self.train_op = sgd_optimizer.apply_gradients(
            zip(clipped_grads, tf.trainable_variables()),
            global_step=tf.train.get_or_create_global_step())
        self.new_learning_rate = tf.placeholder(
            tf.float32, shape=[], name="new_learning_rate")
        self.learning_rate_update = tf.assign(self.learning_rate,
                                              self.new_learning_rate)

    def assign_lr(self, session, lr_value):
        """Push a new learning-rate value into the graph.

        (Fix: the original first parameter was misspelled `selfself`, so the
        body's `self` was an undefined name.)
        """
        session.run(self.learning_rate_update,
                    feed_dict={self.new_learning_rate: lr_value})
def run_epoch(session, model, train_op=None, output_log=False):
    """Run one full epoch over `model`'s data and return the perplexity.

    Args:
        session: an active tf.Session.
        model: a PTBModel instance.
        train_op: optional training op; when provided, weights are updated.
        output_log: when True, print progress roughly every 10% of the epoch.

    Returns:
        The epoch perplexity exp(total_cost / total_steps).
    """
    start_time = time.time()
    costs = 0
    iters = 0
    state = session.run(model.initial_state)
    # Fix: the original fetch keys ("costs", "final_stat") did not match the
    # names read back from `result` below, raising KeyError.
    fetches = {
        "cost": model.cost,
        "final_state": model.final_state,
    }
    if train_op is not None:
        fetches["train_op"] = train_op
    for step in range(model.epoch_size):
        # Feed the previous step's final LSTM state as the initial state.
        feed_dict = {}
        for i, (c, h) in enumerate(model.initial_state):
            feed_dict[c] = state[i].c
            feed_dict[h] = state[i].h
        result = session.run(fetches, feed_dict)
        cost = result["cost"]
        state = result["final_state"]
        # Fix: the original wrote `cost += cost`, never accumulating `costs`,
        # so the returned perplexity was computed from 0.
        costs += cost
        iters += model.num_steps
        if output_log and step % (model.epoch_size // 10) == 10:
            print("step%.3f perplexity: %.3f speed: %.0f words/sec" %
                  (step, np.exp(costs / iters),
                   iters * model.batch_size / (time.time() - start_time)))
    return np.exp(costs / iters)
# --- Build train/valid/test models and run the training schedule. ----------
train_data, valid_data, test_data, _ = reader.ptb_raw_data(
    "../PTB/simplep-examples/data/")

train_config = Config()
valid_config = Config()
test_config = Config()
# At test time, feed one word at a time.
test_config.batch_size = 1
test_config.num_steps = 1

with tf.Graph().as_default():
    # Fix: the original referenced an undefined `config` here and in the
    # training loop; the training configuration is `train_config`.
    initializer = tf.random_uniform_initializer(-train_config.init_scale,
                                                train_config.init_scale)
    with tf.name_scope("Train"):
        with tf.variable_scope("Model", reuse=None, initializer=initializer):
            Model_train = PTBModel(is_training=True,
                                   config=train_config,
                                   data=train_data,
                                   name="TrainModel")
    # Fix: valid/test must share the training variables (reuse=True), and the
    # original overwrote Model_train three times while later calling the
    # undefined Model_valid / Model_test.
    with tf.name_scope("Valid"):
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            Model_valid = PTBModel(is_training=False,
                                   config=valid_config,
                                   data=valid_data,
                                   name="ValidModel")
    with tf.name_scope("Test"):
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            Model_test = PTBModel(is_training=False,
                                  config=test_config,
                                  data=test_data,
                                  name="TestModel")

    sv = tf.train.Supervisor()
    with sv.managed_session() as session:
        for i in range(train_config.total_epoch):
            # Keep the full learning rate for max_epoch epochs, then decay
            # geometrically.
            lr_decay = train_config.lr_decay ** max(
                i + 1 - train_config.max_epoch, 0.0)
            Model_train.assign_lr(session,
                                  train_config.learning_rate * lr_decay)
            print("Epoch: %d Learning rate : %.3f" %
                  (i + 1, session.run(Model_train.learning_rate)))
            train_perplexity = run_epoch(session, Model_train,
                                         train_op=Model_train.train_op,
                                         output_log=True)
            print("Epoch: %d Train Perplexity: %.3f" %
                  (i + 1, train_perplexity))
            valid_perplexity = run_epoch(session, Model_valid)
            print("Epoch: %d Valid Perplexity: %.3f" %
                  (i + 1, valid_perplexity))
        test_perplexity = run_epoch(session, Model_test)
        print("Test Perplexity : %.3f" % test_perplexity)
Dropout
import tensorflow.compat.v1 as tf
# Illustrative snippet: wrap each LSTM cell with dropout, then stack.
# Fix: the module is tf.nn.rnn_cell, not tf.nn.run_cell, and "statcked" was
# a typo.  (`lstm_size` and `number_of_layers` are placeholders assumed to
# be defined by the surrounding text.)
lstm = tf.nn.rnn_cell.BasicLSTMCell(lstm_size)
dropout_lstm = tf.nn.rnn_cell.DropoutWrapper(lstm, output_keep_prob=0.5)
stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([dropout_lstm] * number_of_layers)
10.4 循环神经网络的变种
深层循环神经网络
import tensorflow.compat.v1 as tf
# Illustrative snippet: a stacked (deep) LSTM unrolled over num_steps.
# Fixes: tf.nn.run_cell -> tf.nn.rnn_cell; the stacked cell was created as
# `statcked_lstm` but called as `stacked_lstm` (and similarly for its
# output variable), so the names are unified here; `expexted_output` typo.
lstm = tf.nn.rnn_cell.BasicLSTMCell(lstm_size)
stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([lstm] * number_of_layers)
for i in range(num_steps):
    if i > 0:
        # Share the cell variables across time steps.
        tf.get_variable_scope().reuse_variables()
    stacked_lstm_output, state = stacked_lstm(current_input, state)
    final_output = fc(stacked_lstm_output)
    loss += calculate_loss(final_output, expected_output)