import os
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
# Seed the global TF and NumPy RNGs so repeated runs are reproducible.
tf.random.set_seed(22)
np.random.seed(22)
# Silence TensorFlow's C++ INFO/WARNING log output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Environment sanity checks. NOTE: `assert` is stripped when Python runs
# with -O, so these are best-effort guards rather than hard validation.
assert tf.__version__.startswith('2.'), 'this script requires TensorFlow 2.x'
# The original pinned numpy to exactly 1.16.2; the only numpy API used here
# (np.random.seed) is stable across 1.x, so only the major version is checked.
assert np.__version__.startswith('1.'), 'this script was written against numpy 1.x'
# Data-pipeline hyper-parameters.
batchsz = 512
total_words = 10000     # vocabulary size: keep only the 10k most frequent words
max_review_len = 80     # every review is padded/truncated to 80 tokens
embedding_len = 100     # each word id is embedded into a 100-d vector

# IMDB movie-review sentiment dataset; the loader maps every word id
# >= total_words to a single out-of-vocabulary token.
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(num_words=total_words)

# Pad with zeros (or truncate) so every review is exactly max_review_len
# tokens long: x_train/x_test end up with shape [num_samples, 80].
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=max_review_len)
x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen=max_review_len)

# drop_remainder=True discards the final short batch so every batch holds
# exactly batchsz samples (the model's RNN state tensors are sized to batchsz).
db_train = (tf.data.Dataset.from_tensor_slices((x_train, y_train))
            .shuffle(1000)
            .batch(batchsz, drop_remainder=True))
db_test = (tf.data.Dataset.from_tensor_slices((x_test, y_test))
           .batch(batchsz, drop_remainder=True))
# 定义MyRNN类
class MyRNN(keras.Model):
    """Two-layer SimpleRNN sentiment classifier for padded IMDB reviews.

    Pipeline: [b, 80] int word ids -> Embedding -> [b, 80, 100]
              -> two stacked SimpleRNNCells unrolled over time -> [b, units]
              -> Dense(1) -> BatchNorm -> sigmoid -> [b, 1] = P(positive).
    """

    def __init__(self, units):
        """units: hidden-state dimension (h_dim) of each RNN cell."""
        super(MyRNN, self).__init__()
        # Initial hidden state for each cell, wrapped in a one-element list
        # (the shape SimpleRNNCell expects). The states are sized to the
        # global batchsz, which is why the datasets are batched with
        # drop_remainder=True.
        self.state0 = [tf.zeros([batchsz, units])]
        self.state1 = [tf.zeros([batchsz, units])]
        # BatchNormalization applied to the final logit before the sigmoid.
        # NOTE(review): normalizing a [b, 1] logit is unusual — confirm this
        # is intentional rather than left over from an experiment.
        self.bn1 = layers.BatchNormalization()
        # [b, 80] int ids -> [b, 80, 100] float embeddings.
        self.embedding = layers.Embedding(total_words, embedding_len,
                                          input_length=max_review_len)
        # Two stacked RNN cells, each with hidden size `units`. dropout=0.5
        # matches the documented setting; the experiment log at the bottom of
        # this file shows the previous dropout=0.99 drops almost every unit
        # and leaves the model at chance-level (0.50) validation accuracy.
        self.rnn_cell0 = layers.SimpleRNNCell(units, dropout=0.5)
        self.rnn_cell1 = layers.SimpleRNNCell(units, dropout=0.5)
        # Classification head: [b, units] -> [b, 1] logit.
        self.outlayer = layers.Dense(1)

    def call(self, inputs, training=None):
        """Return P(review is positive), shape [b, 1].

        Cell dropout is active in train mode:
        net(x), net(x, training=True), net(x, training=None) -> train mode;
        net(x, training=False) -> test mode (dropout disabled).
        """
        x = self.embedding(inputs)      # [b, 80] -> [b, 80, 100]
        state0 = self.state0            # each: [batchsz, units]
        state1 = self.state1
        # Unroll over the time axis; each `word` slice is [b, 100].
        for word in tf.unstack(x, axis=1):
            # Pass training explicitly so cell dropout follows train/test mode.
            out0, state0 = self.rnn_cell0(word, state0, training=training)
            out1, state1 = self.rnn_cell1(out0, state1, training=training)
        # Last time step's top-layer output -> logit -> BN -> probability.
        logit = self.outlayer(out1)     # [b, units] -> [b, 1]
        prob = tf.sigmoid(self.bn1(logit))
        return prob
# Model hyper-parameters.
units = 64      # RNN hidden-state dimension
epochs = 100

# Build, compile, train, and evaluate the network.
model = MyRNN(units)
model.compile(
    optimizer=keras.optimizers.Adam(1e-3),
    loss=tf.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)
model.fit(db_train, epochs=epochs, validation_data=db_test)
model.evaluate(db_test)
# Experiment log: best validation accuracy per SimpleRNNCell dropout rate.
# (Converted to comments — the bare table was a Python syntax error.)
# dropout | best val_accuracy | epoch reached | notes
# 0.0     | 0.8175            | 3             |
# 0.1     | 0.8168            | 2             |
# 0.2     | 0.8187            | 4             |
# 0.3     | 0.8260            | 4             |
# 0.4     | 0.8258            | 4             |
# 0.5     | 0.8259            | 4             |
# 0.6     | 0.8310            | 5             |
# 0.7     | 0.8336            | 7             |
# 0.8     | 0.8310            | ~15           |
# 0.9     | 0.8331            | 40-43         | unstable
# 0.99    | 0.5000            | ~             | chance level — model fails to learn