TensorFlow Learning Notes 10: Recurrent Neural Networks

Recurrent neural networks are well suited to variable-length sequence data, which is why they dominate the NLP field.
10.1 Introduction to Recurrent Neural Networks
1. Programming the forward propagation of a recurrent neural network

import numpy as np

# a two-step input sequence and the initial hidden state
x=[0.8,0.1]
init_state=[0.3,0.6]
# parameters of the recurrent layer: W acts on the previous state, U on the current input
W=np.asarray([[0.2,0.4],[0.7,0.3]])
U=np.asarray([0.8,0.1])
b_h=np.asarray([0.2,0.1])

# parameters of the fully connected output layer
V=np.asarray([[0.5],[0.5]])
b_o=0.1

for i in range(len(x)):
    # state_t = tanh(state_{t-1} . W + x_t * U + b_h)
    before_activation=np.dot(init_state,W)+x[i]*U+b_h
    state=np.tanh(before_activation)
    init_state=state
    # output_t = state_t . V + b_o
    final_output=np.dot(state,V)+b_o
    print("t%s state : %s"%(i+1,state))
    print("t%s output: %s\n"%(i+1,final_output))

2. Gradients of recurrent neural networks
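A minimal sketch of my own (reusing the toy weights from the forward-propagation code above): backpropagation through time repeatedly multiplies the gradient by the tanh derivative and by the recurrent weight matrix W, which is why RNN gradients tend to vanish or explode over long sequences.

import numpy as np

x=[0.8,0.1]
state=np.asarray([0.3,0.6])
W=np.asarray([[0.2,0.4],[0.7,0.3]])
U=np.asarray([0.8,0.1])
b_h=np.asarray([0.2,0.1])

jacobian=np.eye(2)                    # d state_0 / d state_0
for i in range(len(x)):
    state=np.tanh(np.dot(state,W)+x[i]*U+b_h)
    # chain rule: d state_t / d state_0 = diag(1-state_t**2) . W^T . d state_{t-1} / d state_0
    jacobian=np.diag(1-state**2) @ W.T @ jacobian
    print("t%s  norm of d state_t / d state_0: %.4f"%(i+1,np.linalg.norm(jacobian)))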
3. Different design patterns of recurrent neural networks
(1) An output at every time step, with recurrent connections between the hidden units.
(2) An output at every time step, with recurrent connections only from the output at one time step to the hidden units at the next. This generally performs worse than the first pattern, because the output o carries less of the important information about the past than the hidden state h does.
(3) Recurrent connections between the hidden units, but an output is produced only after several time steps rather than at every step (a short tf.keras sketch contrasting patterns (1) and (3) follows this list).
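A quick tf.keras illustration of the difference between pattern (1) and pattern (3); this is a sketch of my own, not code from the notes. The return_sequences flag decides whether the layer emits an output at every time step or only after reading the whole sequence.

import tensorflow as tf

rnn_every_step=tf.keras.layers.LSTM(64,return_sequences=True)    # pattern (1): output at every step
rnn_last_step=tf.keras.layers.LSTM(64,return_sequences=False)    # pattern (3): single output at the end

x=tf.random.normal([8,20,32])      # a batch of 8 sequences, 20 time steps, 32 features each
print(rnn_every_step(x).shape)     # (8, 20, 64)
print(rnn_last_step(x).shape)      # (8, 64)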
10.2 Natural Language Modeling and Word Vectors
Word2Vec is a word-vector computation tool released by Google.
1. Statistical language models
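As a toy illustration of what a statistical language model estimates (my own example, with made-up text): an n-gram model factors the probability of a sentence into conditional probabilities of each word given its predecessor, estimated from counts.

import collections

corpus="the cat sat on the mat".split()
unigram_counts=collections.Counter(corpus)
bigram_counts=collections.Counter(zip(corpus,corpus[1:]))

def bigram_prob(prev_word,word):
    # maximum-likelihood estimate of P(word | prev_word)
    return bigram_counts[(prev_word,word)]/unigram_counts[prev_word]

print(bigram_prob("the","cat"))    # 0.5: "the" occurs twice, once followed by "cat"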
2. Word2Vec
VSM (the vector space model) builds on the Statistical Semantics Hypothesis: the statistical patterns of language carry semantic information. Two popular derived forms are the Bag of Words Hypothesis (the word frequencies of a document characterize its topic, so the most frequent words can represent what the document is about) and the Distributional Hypothesis (words that appear in similar contexts have similar meanings).
Vector space models fall roughly into two families. Count-based models (for example Latent Semantic Analysis) count how often words co-occur in a corpus and then compress those counts into small, dense vectors. Predictive models (for example Neural Probabilistic Language Models) predict a word, and its vector, from its neighbouring words (a toy count-based example follows).
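A toy example of the count-based family (my own sketch, not from the notes): build a word-word co-occurrence matrix from a tiny corpus and compress it with an SVD, which is essentially what Latent Semantic Analysis does. The predictive family is what the Word2Vec code below implements.

import numpy as np

corpus="the cat sat on the mat the dog sat on the rug".split()
vocab=sorted(set(corpus))
index={w:i for i,w in enumerate(vocab)}

window=1
counts=np.zeros((len(vocab),len(vocab)))
for i,word in enumerate(corpus):
    for j in range(max(0,i-window),min(len(corpus),i+window+1)):
        if j!=i:
            counts[index[word],index[corpus[j]]]+=1

# compress the sparse counts into small, dense 2-dimensional word vectors
u,s,_=np.linalg.svd(counts)
word_vectors=u[:,:2]*s[:2]
for w in vocab:
    print(w,word_vectors[index[w]])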
3. Implementing Word2Vec with TensorFlow

  • Download the corpus file used for training from http://mattmahoney.net/dc/text8.zip; the downloaded file is named text8.zip.
  • Word2Vec_vocabulary.py
import numpy as np
import tensorflow as tf
import collections
import random
import zipfile

vocabulary_size=50000

file="./Word2vec/text8.zip"
def read_data(file):
    with zipfile.ZipFile(file = file) as f:
        original_data=tf.compat.as_str(f.read(f.namelist()[0])).split()
    return original_data
original_words=read_data(file)
print("len of original word:",len(original_words))
def build_vocabulary(original_words):
    count=[["unknown",-1]]
    count.extend(collections.Counter(original_words).most_common(vocabulary_size-1))
    print(count)
    dictionary=dict()
    for word, _ in count:
        dictionary[word]=len(dictionary)
    
    data=list()
    unknown_count=0
    
    for word in original_words:
        if word in dictionary:
            index=dictionary[word]
        else:
            index=0
            unknown_count+=1
        data.append(index)
    # after the loop: record the number of unknown words and build the reverse lookup table
    count[0][1]=unknown_count
    reverse_dictionary=dict(zip(dictionary.values(),dictionary.keys()))
    return data,count,dictionary,reverse_dictionary

data,count,dictionary,reverse_dictionary=build_vocabulary(original_words)
print("Most common words (+unkonwn)",count[:5])
print("Sample data",data[:10],[reverse_dictionary[i] for i in data[:10]])

data_index=0

def generate_batch(batch_size,num_of_samples,skip_distance):
    # generate one batch of skip-gram training pairs: each centre word is paired
    # with num_of_samples words drawn from a window of skip_distance words on each side
    global data_index
    batch=np.ndarray(shape=(batch_size),dtype=np.int32)
    labels=np.ndarray(shape=(batch_size,1),dtype=np.int32)
    num_of_sample_words=2*skip_distance+1
    buffer=collections.deque(maxlen=num_of_sample_words)
    for _ in range(num_of_sample_words):
        buffer.append(data[data_index])
        data_index=(data_index+1)%len(data)
    
    for i in range(batch_size // num_of_samples):
        target=skip_distance
        targets_to_avoid=[skip_distance]
        for j in range(num_of_samples):
            while target in targets_to_avoid:
                target=random.randint(0,num_of_sample_words-1)
                
            targets_to_avoid.append(target)
            batch[i*num_of_samples +j]=buffer[skip_distance]
            labels[i*num_of_samples+j,0]=buffer[target]
        buffer.append(data[data_index])
        data_index=(data_index+1)%len(data)
    return batch,labels

batch,labels=generate_batch(batch_size=8,num_of_samples=2,skip_distance=1)
for i in range(8):
    print(batch[i],reverse_dictionary[batch[i]],"->",labels[i,0],reverse_dictionary[labels[i,0]])
  • Word2Vec_skip.py
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()   # the script below uses the TF1 graph/session API
import math
import Word2Vec_vocabulary as vocabulary   # the vocabulary/batch helpers defined above

max_steps=10000
batch_size=128
embedding_size=128
skip_distance=1
num_of_samples=2
vocabulary_size=50000

valid_sample=np.random.choice(100,16,replace=False)
num_sampled=64

with tf.Graph().as_default():
    train_inputs=tf.placeholder(tf.int32,shape=[batch_size])
    train_labels=tf.placeholder(tf.int32,shape=[batch_size,1])

    embeddings=tf.Variable(tf.random_uniform([vocabulary_size,embedding_size],-1.0,1.0))
    embed=tf.nn.embedding_lookup(embeddings,train_inputs)

    nce_weight=tf.Variable(tf.truncated_normal([vocabulary_size,embedding_size],stddev=1.0/math.sqrt(embedding_size)))
    nce_biases=tf.Variable(tf.zeros([vocabulary_size]))
    # NCE loss: distinguish the true target word from num_sampled randomly drawn negative words
    nce_loss=tf.nn.nce_loss(weights=nce_weight,
                            biases=nce_biases,
                            labels=train_labels,
                            inputs=embed,
                            num_sampled=num_sampled,
                            num_classes=vocabulary_size)

    loss=tf.reduce_mean(nce_loss)

    optimizer=tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # L2-normalize the embeddings so that the matmul below computes cosine similarity
    norm=tf.sqrt(tf.reduce_sum(tf.square(embeddings),1,keep_dims=True))

    normal_embeddings=embeddings/norm

    valid_inputs=tf.constant(valid_sample,dtype=tf.int32)
    valid_embeddings=tf.nn.embedding_lookup(normal_embeddings,valid_inputs)

    similarity=tf.matmul(valid_embeddings,normal_embeddings,transpose_b=True)
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        #total_loss -- avg_loss
        total_loss=0
        average_loss=0
        for step in range(max_steps+1):
            batch_inputs,batch_labels=vocabulary.generate_batch(batch_size,num_of_samples,skip_distance)
            loss_val, _ =sess.run([loss,optimizer],feed_dict={
                train_inputs:batch_inputs,
                train_labels:batch_labels})
            total_loss+=loss_val
            if step>0 and step%1000==0:
                average_loss=total_loss/1000
                print("Average loss at %d step is : %f "%(step,average_loss))
                average_loss=0
                total_loss=0
            if step>0 and step%5000==0:
                similar=similarity.eval()
                for i in range(16):
                    nearest=(-similar[i,:]).argsort()[1:8+1]
                    valid_word=vocabulary.reverse_dictionary[valid_sample[i]]
                    nearest_information="Nearest to %s is : "%valid_word
                    for j in range(8):
                        close_word=vocabulary.reverse_dictionary[nearest[j]]
                        nearest_information="%s %s"%(nearest_information,close_word)
                    print("valid_word is: %s"%valid_word)
                    print(nearest_information)
        # keep the normalized embeddings after training; Word2Vec.py uses them for the t-SNE plot
        final_embeddings=normal_embeddings.eval()
          
  • Word2Vec.py
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import Word2Vec_vocabulary as vocabulary
import Word2Vec_skip

tsne=TSNE(perplexity=30,n_components=2,init="pca",n_iter=5000)
plot_only=100

low_dim_embbs=tsne.fit_transform(Word2Vec_skip.final_embeddings[:plot_only,:])
labels=list()
for i in range(plot_only):
    labels.append(vocabulary.reverse_dictionary[i])
    
plt.figure(figsize=(20,20))

for j,label in enumerate(labels):
    x,y=low_dim_embbs[j,:]
    
    plt.scatter(x,y)
    plt.annotate(label,xy=(x,y),xytext=(5,2),textcoords="offset points",ha="right",va="bottom")
    
plt.savefig(fname="after_tsne.png")

10.3 Natural Language Modeling with LSTM
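Before the code, a plain-NumPy sketch of what a single LSTM cell step computes (my own illustration; the sizes and random weights are made up, and biases are omitted): three sigmoid gates decide what to forget, what to write into the cell state, and what to expose as the hidden state. The Keras example that follows then exercises tf.keras.layers.LSTMCell on MNIST, treating each 28x28 image as a sequence of 28 rows.

import numpy as np

def sigmoid(z):
    return 1/(1+np.exp(-z))

hidden_size,input_size=4,3
rng=np.random.default_rng(0)
# one weight matrix per gate, each acting on the concatenation [h_prev, x_t] (made-up random values)
W_f,W_i,W_o,W_c=(rng.standard_normal((hidden_size+input_size,hidden_size)) for _ in range(4))

def lstm_step(x_t,h_prev,c_prev):
    concat=np.concatenate([h_prev,x_t])
    f=sigmoid(concat@W_f)         # forget gate: what to erase from the cell state
    i=sigmoid(concat@W_i)         # input gate: how much of the candidate to write
    o=sigmoid(concat@W_o)         # output gate: what to expose as the hidden state
    c_candidate=np.tanh(concat@W_c)
    c=f*c_prev+i*c_candidate
    h=o*np.tanh(c)
    return h,c

h,c=np.zeros(hidden_size),np.zeros(hidden_size)
for x_t in rng.standard_normal((5,input_size)):   # a toy sequence of 5 time steps
    h,c=lstm_step(x_t,h,c)
print(h)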

import tensorflow as tf
(train_images,train_labels),(test_images,test_labels)=tf.keras.datasets.mnist.load_data()

train_images,test_images=train_images/255.0,test_images/255.0
sample,sample_label=train_images[0],train_labels[0]

def build_model():
    # each 28x28 image is treated as a sequence of 28 rows with 28 features per step
    rnn_layer=tf.keras.layers.RNN(tf.keras.layers.LSTMCell(units=64),input_shape=(None,28))
    model=tf.keras.models.Sequential([rnn_layer,
                                      tf.keras.layers.BatchNormalization(),
                                      tf.keras.layers.Dense(units=10,activation='softmax')])
    return model

model=build_model()
model.compile(loss='sparse_categorical_crossentropy',optimizer='SGD',metrics=['accuracy'])
model.fit(train_images,train_labels,validation_data=(test_images,test_labels),batch_size=100,epochs=20)
model.summary()

Applying LSTM to natural language modeling

  • Download the PTB dataset from Tomas Mikolov's website:
 http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
  • Clone the open-source Models repository from GitHub; it contains some APIs for working with the PTB dataset. In a terminal, run
 git clone https://github.com/tensorflow/models.git

The reader.py file under the /tutorials/rnn/ptb path of the Models repository is used to work with the contents of the PTB dataset, as in the quick check below.
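A quick sanity check of reader.py (a sketch of my own; it assumes the PTB archive has been unpacked next to the script, and that ptb_raw_data also returns the vocabulary size as its fourth value, consistent with how the two functions are used in the model further down):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import reader   # models/tutorials/rnn/ptb/reader.py must be importable

# ptb_raw_data returns the train/valid/test word-id lists plus the vocabulary size
train_data,valid_data,test_data,vocab_size=reader.ptb_raw_data("./simple-examples/data/")
print("train tokens:",len(train_data),"vocabulary size:",vocab_size)

# ptb_producer turns a flat id list into [batch_size, num_steps] input/target tensor pairs
inputs,targets=reader.ptb_producer(train_data,batch_size=4,num_steps=5)
print(inputs.shape,targets.shape)   # (4, 5) (4, 5)

The full language model below builds on these two functions.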

import numpy as np
import time
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()   # the model below uses the TF1 graph/session API
import reader
class Config(object):
    init_scale=0.1        # scale of the uniform initializer for the weights
    learning_rate=1.0     # initial learning rate
    max_grad_norm=5       # clip gradients to this global norm
    num_layers=2          # number of stacked LSTM layers
    num_steps=20          # number of unrolled time steps
    word_dimension=200    # size of the word embeddings and of the LSTM hidden state
    max_epoch=4           # number of epochs trained with the initial learning rate
    total_epoch=13        # total number of training epochs
    keep_prob=1.0         # dropout keep probability (1.0 means no dropout)
    lr_decay=0.5          # learning rate decay applied after max_epoch
    batch_size=20
    vocab_size=10000

class PTBModel(object):
    def __init__(self,is_training,config,data,name=None):
        self.batch_size=config.batch_size
        self.num_steps=config.num_steps

        self.epoch_size=((len(data)//self.batch_size)-1)//self.num_steps

        self.input_data,self.targets=reader.ptb_producer(data,self.batch_size,self.num_steps,name=name)

        self.keep_prob=config.keep_prob
        self.word_dimension=config.word_dimension
        lstm_cell=tf.nn.rnn_cell.BasicLSTMCell(self.word_dimension,forget_bias=0.0,state_is_tuple=True)

        if is_training and config.keep_prob <1:
            lstm_cell=tf.nn.rnn_cell.DropoutWrapper(lstm_cell,output_keep_prob=self.keep_prob)

        # stack num_layers LSTM layers
        self.num_layers=config.num_layers
        cell_layer=tf.nn.rnn_cell.MultiRNNCell([lstm_cell for _ in range(self.num_layers)],state_is_tuple=True)

        self.initial_state=cell_layer.zero_state(self.batch_size,tf.float32)

        self.vocab_size=config.vocab_size
        embedding=tf.get_variable("embedding",
                                  [self.vocab_size,
                                   self.word_dimension],
                                  dtype=tf.float32)
        inputs=tf.nn.embedding_lookup(embedding,self.input_data)
        outputs=[]
        state=self.initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(self.num_steps):
                if time_step>0:
                    tf.get_variable_scope().reuse_variables()
                cell_output,state=cell_layer(inputs[:,time_step,:],state)
                outputs.append(cell_output)
        output=tf.reshape(tf.concat(outputs,1),[-1,self.word_dimension])

        weight=tf.get_variable("softmax_w",[self.word_dimension,self.vocab_size],dtype=tf.float32)
        bias=tf.get_variable("softmax_b",[self.vocab_size],dtype=tf.float32)
        logits=tf.matmul(output,weight)+bias

        # per-word cross-entropy over the unrolled sequence (tf.contrib only exists in TF 1.x)
        loss=tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits],
                                                                [tf.reshape(self.targets,[-1])],
                                                                [tf.ones([self.batch_size*self.num_steps],dtype=tf.float32)])
        self.cost=tf.reduce_sum(loss)/self.batch_size
        self.final_state=state

        if not is_training:
            return

        self.learning_rate=tf.Variable(0.0,trainable=False)

        gradients=tf.gradients(self.cost,tf.trainable_variables())

        # clip the gradients so that they do not explode through the unrolled time steps
        clipped_grads, _ =tf.clip_by_global_norm(gradients,config.max_grad_norm)

        SGDOptimizer=tf.train.GradientDescentOptimizer(self.learning_rate)

        self.train_op=SGDOptimizer.apply_gradients(zip(clipped_grads,tf.trainable_variables()),
                                                   global_step=tf.train.get_or_create_global_step())
        self.new_learning_rate=tf.placeholder(tf.float32,shape=[],name="new_learning_rate")
        self.learning_rate_update=tf.assign(self.learning_rate,self.new_learning_rate)

    def assign_lr(self,session,lr_value):
        session.run(self.learning_rate_update,feed_dict={self.new_learning_rate:lr_value})

def run_epoch(session,model,train_op=None,output_log=False):
    start_time=time.time()
    costs=0
    iters=0
    state=session.run(model.initial_state)

    fetches={
        "cost":model.cost,
        "final_state":model.final_state,
    }
    if train_op is not None:
        fetches["train_op"]=train_op
    for step in range(model.epoch_size):
        feed_dict={}
        # feed the final LSTM state of the previous batch in as the initial state of this batch
        for i,(c,h) in enumerate(model.initial_state):
            feed_dict[c]=state[i].c
            feed_dict[h]=state[i].h

        result=session.run(fetches,feed_dict)
        cost=result["cost"]
        state=result["final_state"]
        costs += cost
        iters += model.num_steps

        if output_log and step % (model.epoch_size//10)==10:
            print("step %.3f perplexity: %.3f speed: %.0f words/sec"%(step,np.exp(costs/iters),iters*model.batch_size/(time.time()-start_time)))
    return np.exp(costs/iters)

train_data,valid_data,test_data,_=reader.ptb_raw_data("../PTB/simple-examples/data/")
train_config=Config()
valid_config=Config()
test_config=Config()

test_config.batch_size=1
test_config.num_steps=1

with tf.Graph().as_default():
    initializer=tf.random_uniform_initializer(-train_config.init_scale,train_config.init_scale)

    with tf.name_scope("Train"):
        with tf.variable_scope("Model",reuse=None,initializer=initializer):
            Model_train=PTBModel(is_training=True,
                                 config=train_config,
                                 data=train_data,
                                 name="TrainModel")

    with tf.name_scope("Valid"):
        with tf.variable_scope("Model", reuse=None, initializer=initializer):
            Model_train = PTBModel(is_training=False,
                                   config=valid_config,
                                   data=valid_data,
                                   name="ValidModel")

    with tf.name_scope("Test"):
        with tf.variable_scope("Model", reuse=None, initializer=initializer):
            Model_train = PTBModel(is_training=False,
                                   config=test_config,
                                   data=test_data,
                                   name="TestModel")

    sv=tf.train.Supervisor()
    with sv.managed_session() as session:
        for i in range(train_config.total_epoch):
            lr_decay=train_config.lr_decay ** max(i+1-train_config.max_epoch,0.0)
            Model_train.assign_lr(session,train_config.learning_rate*lr_decay)
            print("Epoch: %d Learning rate : %.3f" %(i+1,session.run(Model_train.learning_rate)))
            train_perplexity=run_epoch(session,Model_train,train_op=Model_train.train_op,output_log=True)
            print("Epoch: %d Train Perplexity: %.3f" % (i+1,train_perplexity))

            valid_perplexity=run_epoch(session,Model_valid)
            print("Epoch: %d Valid Perplexity: %.3f"%(i+1,valid_perplexity))
        test_perplexity=run_epoch(session,Model_test)
        print("Test Perplexity : %.3f"% test_perplexity)

Dropout can be applied between the layers of a stacked LSTM with tf.nn.rnn_cell.DropoutWrapper:

import tensorflow.compat.v1 as tf
lstm_size=128            # example value: size of the LSTM hidden state
number_of_layers=2       # example value: number of stacked layers
def dropout_lstm():
    lstm=tf.nn.rnn_cell.BasicLSTMCell(lstm_size)
    return tf.nn.rnn_cell.DropoutWrapper(lstm,output_keep_prob=0.5)
# each layer gets its own cell; reusing one cell object across layers is rejected by newer TF versions
stacked_lstm=tf.nn.rnn_cell.MultiRNNCell([dropout_lstm() for _ in range(number_of_layers)])
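For reference, a sketch of the same idea with the tf.keras API (my own example, not from the notes): the dropout argument of tf.keras.layers.LSTM applies dropout to the layer inputs, and stacking layers in a Sequential model takes the place of MultiRNNCell.

import tensorflow as tf

number_of_layers=2
lstm_size=128
model=tf.keras.Sequential(
    [tf.keras.layers.LSTM(lstm_size,return_sequences=True,dropout=0.5)
     for _ in range(number_of_layers)])
model.build(input_shape=(None,20,50))   # e.g. sequences of 20 steps with 50 features each
model.summary()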

10.4 Variants of Recurrent Neural Networks
Deep recurrent neural networks

# pseudocode sketch of a deep (stacked) LSTM unrolled by hand over num_steps time steps;
# fc and calculate_loss stand in for a fully connected output layer and a loss function
import tensorflow.compat.v1 as tf
stacked_lstm=tf.nn.rnn_cell.MultiRNNCell(
    [tf.nn.rnn_cell.BasicLSTMCell(lstm_size) for _ in range(number_of_layers)])
state=stacked_lstm.zero_state(batch_size,tf.float32)
for i in range(num_steps):
    if i>0:
        tf.get_variable_scope().reuse_variables()
    stacked_lstm_output,state=stacked_lstm(current_input,state)
    final_output=fc(stacked_lstm_output)
    loss += calculate_loss(final_output,expected_output)
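A runnable counterpart of the sketch above (my own example; the sizes are arbitrary): tf.nn.dynamic_rnn performs the per-time-step unrolling loop internally, so the manual loop and the variable-scope reuse bookkeeping disappear.

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

lstm_size,number_of_layers,num_steps,input_dim=128,2,20,50
inputs=tf.placeholder(tf.float32,[None,num_steps,input_dim])

stacked_lstm=tf.nn.rnn_cell.MultiRNNCell(
    [tf.nn.rnn_cell.BasicLSTMCell(lstm_size) for _ in range(number_of_layers)])
outputs,final_state=tf.nn.dynamic_rnn(stacked_lstm,inputs,dtype=tf.float32)

# outputs has shape [batch_size, num_steps, lstm_size]; a fully connected layer
# maps each step's output to the prediction
final_output=tf.layers.dense(outputs,units=10)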
