A convolutional neural network is mainly composed of the following five kinds of structures: an input layer, convolutional layers, pooling layers, fully connected layers, and a softmax layer. The convolution itself is provided by tf.nn.conv2d:
conv2d(
    input,
    filter,
    strides,
    padding,
    use_cudnn_on_gpu=None,
    data_format=None,
    name=None
)
Let's walk through the convolution arithmetic with a concrete example:
(1) The input is a [100, 80, 100, 4] tensor, the convolution kernel is [8, 8, 4, 32], the strides are [1, 4, 4, 1], and the padding is VALID. The batch dimension of 100 is unchanged, and the depth goes from the input's 4 to the kernel's 32 output channels. Along the height, a kernel of size 8 sweeps over 80 pixels with stride 4, so the trailing edge of the sliding window lands at coordinates 8, 12, 16, 20, …, 80 — 19 positions in total — so the output height is 19. (In general, VALID padding gives out = ceil((in − k + 1) / s).) By the same reasoning the output width is ceil((100 − 8 + 1) / 4) = 24, so the output shape is [100, 19, 24, 32].
(2) Feeding the result of step 1 into a [4, 4, 32, 64] kernel with strides [1, 2, 2, 1], still VALID, gives ceil((19 − 4 + 1) / 2) = 8 and ceil((24 − 4 + 1) / 2) = 11, so the output is [100, 8, 11, 64].
(3) Feeding the result of step 2 into a [3, 3, 64, 128] kernel with strides [1, 1, 1, 1] and SAME padding preserves the spatial dimensions, so the output is [100, 8, 11, 128].
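These shapes can be checked at graph-construction time without running anything; here is a minimal sketch, assuming TensorFlow 1.x (the variable names are illustrative):

import tensorflow as tf  # TF 1.x

x = tf.placeholder(tf.float32, [100, 80, 100, 4])
k1 = tf.Variable(tf.truncated_normal([8, 8, 4, 32], stddev=0.1))
c1 = tf.nn.conv2d(x, k1, strides=[1, 4, 4, 1], padding='VALID')
k2 = tf.Variable(tf.truncated_normal([4, 4, 32, 64], stddev=0.1))
c2 = tf.nn.conv2d(c1, k2, strides=[1, 2, 2, 1], padding='VALID')
k3 = tf.Variable(tf.truncated_normal([3, 3, 64, 128], stddev=0.1))
c3 = tf.nn.conv2d(c2, k3, strides=[1, 1, 1, 1], padding='SAME')
print(c1.get_shape())  # (100, 19, 24, 32)
print(c2.get_shape())  # (100, 8, 11, 64)
print(c3.get_shape())  # (100, 8, 11, 128)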
The figure above illustrates max pooling (max_pooling), which takes the maximum of each region. In the left part of the figure, the maximum of the top-left 2×2 block is 6, of the top-right 2×2 block 8, of the bottom-left 2×2 block 3, and of the bottom-right 2×2 block 4, giving the result shown on the right: [[6, 8], [3, 4]].
Average pooling (avg_pooling) takes the mean of each region instead; running the same input through avg_pooling yields $[[\frac{13}{4}, \frac{21}{4}], [2, 2]]$.
tf.nn.max_pool(value, ksize, strides, padding, name=None)
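Both pooling variants can be reproduced in a few lines. This is a small sketch, assuming TensorFlow 1.x and a 4×4 input chosen to be consistent with the block maxima and means quoted above (the figure itself is not reproduced here, so the matrix is an inferred reconstruction):

import numpy as np
import tensorflow as tf  # TF 1.x

# A 4x4 input whose 2x2 blocks have maxima (6, 8, 3, 4) and means (13/4, 21/4, 2, 2)
x = np.array([[1, 1, 2, 4],
              [5, 6, 7, 8],
              [3, 2, 1, 0],
              [1, 2, 3, 4]], dtype=np.float32).reshape(1, 4, 4, 1)
mp = tf.nn.max_pool(tf.constant(x), ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
ap = tf.nn.avg_pool(tf.constant(x), ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
with tf.Session() as sess:
    print(sess.run(mp).reshape(2, 2))  # [[6. 8.] [3. 4.]]
    print(sess.run(ap).reshape(2, 2))  # [[3.25 5.25] [2.   2.  ]]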
What the CNN in the figure above has to do: given an image, it is unknown whether it shows a car or a horse, and if a car, what kind of car. The model must decide what exactly is in the image and output a result: if it is a car, which kind of car it is.
RNNs are designed to process sequence data. In a traditional neural network, data flows from the input layer to the hidden layer to the output layer; adjacent layers are fully connected, but the nodes within a layer are unconnected. This ordinary architecture is powerless for many problems. For example, to predict the next word in a sentence you generally need the preceding words, because the words in a sentence are not independent of one another. RNNs are called recurrent neural networks because the current output of a sequence also depends on the previous outputs. Concretely, the network remembers preceding information and applies it to the computation of the current output: the hidden-layer nodes are now connected across time steps, and the hidden layer's input includes not only the output of the input layer but also the hidden layer's own output from the previous time step. In theory, RNNs can process sequences of any length; in practice, to reduce complexity, it is often assumed that the current state depends only on the previous few states.
First, look at a simple recurrent neural network, consisting of an input layer, a single hidden layer, and an output layer:
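To make the recurrence concrete, here is a minimal NumPy sketch of one forward pass of such a network (the weight names W_xh, W_hh, W_hy are illustrative, not from the text):

import numpy as np

def simple_rnn_forward(xs, W_xh, W_hh, W_hy, b_h, b_y):
    """xs: list of input vectors, one per time step; returns the output at each step."""
    h = np.zeros(W_hh.shape[0])  # initial hidden state
    outputs = []
    for x in xs:
        # The hidden state depends on the current input AND the previous hidden state
        h = np.tanh(W_xh @ x + W_hh @ h + b_h)
        outputs.append(W_hy @ h + b_y)
    return outputs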
import numpy as np

# Beam search: keep the beam_size highest-scoring prefixes at each step
def beam_decode(y, beam_size=10):
    T, V = y.shape
    log_y = np.log(y)
    beam = [([], 0)]  # (prefix, accumulated log-probability)
    for t in range(T):
        new_beam = []
        for prefix, score in beam:
            for i in range(V):  # for every state
                new_prefix = prefix + [i]
                new_score = score + log_y[t, i]
                new_beam.append((new_prefix, new_score))
        # keep the top beam_size candidates
        new_beam.sort(key=lambda x: x[1], reverse=True)
        beam = new_beam[:beam_size]
    return beam
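For example, decoding a toy 5-step, 3-symbol posterior (the values are made up for illustration):

np.random.seed(0)
y = np.random.rand(5, 3)
y /= y.sum(axis=1, keepdims=True)  # normalize each row into a distribution
best_prefix, best_score = beam_decode(y, beam_size=4)[0]
print(best_prefix, best_score)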
Data preprocessing
Define X and Y
import tensorflow as tf  # TF 1.x

# Input images: [batch, 100, 32, 1]; labels as a sparse tensor for CTC
inputs = tf.placeholder(tf.float32, [None, 100, 32, 1], name='inputs')
targets = tf.sparse_placeholder(tf.int32, name='targets')
# Dynamic batch size, taken from the first dimension of inputs
batch_size = tf.shape(inputs)[0]
Build the CNN
def CNN(inputs):
    """
    :param inputs: shape [?, 100, 32, 1]
    :return: feature map of shape [?, 24, 1, 512]
    """
    # conv1 : [?, 100, 32, 64]
    conv1 = tf.layers.conv2d(inputs=inputs, filters=64, kernel_size=(3, 3), strides=(1, 1), padding="same",
                             activation=tf.nn.relu)
    # pool1 : [?, 50, 16, 64]
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
    # conv2 : [?, 50, 16, 128]
    conv2 = tf.layers.conv2d(inputs=pool1, filters=128, kernel_size=(3, 3), strides=(1, 1), padding="same",
                             activation=tf.nn.relu)
    # pool2 : [?, 25, 8, 128]
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
    # conv3 : [?, 25, 8, 256]
    conv3 = tf.layers.conv2d(inputs=pool2, filters=256, kernel_size=(3, 3), strides=(1, 1), padding="same",
                             activation=tf.nn.relu)
    # Batch normalization to make training easier
    bnorm1 = tf.layers.batch_normalization(conv3)
    # conv4 : [?, 25, 8, 256]
    conv4 = tf.layers.conv2d(inputs=bnorm1, filters=256, kernel_size=(3, 3), padding="same",
                             activation=tf.nn.relu)
    # pool3 : [?, 25, 4, 256] (stride 1 along height, 2 along width)
    pool3 = tf.layers.max_pooling2d(inputs=conv4, pool_size=[2, 2], strides=[1, 2], padding="same")
    # conv5 : [?, 25, 4, 512]
    conv5 = tf.layers.conv2d(inputs=pool3, filters=512, kernel_size=(3, 3), padding="same",
                             activation=tf.nn.relu)
    # Batch normalization layer
    bnorm2 = tf.layers.batch_normalization(conv5)
    # conv6 : [?, 25, 4, 512]
    conv6 = tf.layers.conv2d(inputs=bnorm2, filters=512, kernel_size=(3, 3), padding="same",
                             activation=tf.nn.relu)
    # pool4 : [?, 25, 2, 512]
    pool4 = tf.layers.max_pooling2d(inputs=conv6, pool_size=[2, 2], strides=[1, 2], padding="same")
    # conv7 : [?, 24, 1, 512]
    conv7 = tf.layers.conv2d(inputs=pool4, filters=512, kernel_size=(2, 2), padding="valid",
                             activation=tf.nn.relu)
    return conv7
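As a quick sanity check of the shape comments above, the static output shape can be inspected at graph-construction time (a one-line sketch; run it in a scratch graph so it doesn't create duplicate layers in the real model):

print(CNN(tf.placeholder(tf.float32, [None, 100, 32, 1])).get_shape())  # (?, 24, 1, 512)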
Process the CNN output
# Run the CNN
cnn_output = CNN(inputs)
# The feature map is [?, 24, 1, 512]; squeeze the height-1 axis so the RNN
# receives a 3-D [batch, time, depth] tensor: [?, 24, 512]
cnn_output = tf.reshape(cnn_output, [batch_size, -1, 512])
# Number of RNN time steps (the width of the CNN feature map, here 24)
max_char_count = tf.shape(cnn_output)[1]
# sequence_length: one entry per example in the batch, each equal to max_char_count
sequence_length = tf.fill([batch_size], value=max_char_count, name='seq_len')
Build the RNN with bidirectional LSTMs
from tensorflow.contrib import rnn

def BidirectionnalRNN(inputs, seq_len):
    """
    Bidirectional LSTM recurrent part of the network.
    """
    with tf.variable_scope(name_or_scope='bidirectional-rnn-1'):
        # Forward cell
        lstm_fw_cell_1 = rnn.BasicLSTMCell(256)
        # Backward cell
        lstm_bw_cell_1 = rnn.BasicLSTMCell(256)
        inter_output, _ = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell_1, lstm_bw_cell_1, inputs, seq_len, dtype=tf.float32)
        # Concatenate the forward and backward outputs: [?, 24, 512]
        inter_output = tf.concat(inter_output, 2)
    with tf.variable_scope(name_or_scope='bidirectional-rnn-2'):
        # Forward cell
        lstm_fw_cell_2 = rnn.BasicLSTMCell(256)
        # Backward cell
        lstm_bw_cell_2 = rnn.BasicLSTMCell(256)
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell_2, lstm_bw_cell_2, inter_output, seq_len, dtype=tf.float32)
        # Concatenate again: [?, 24, 512]
        outputs = tf.concat(outputs, 2)
    return outputs
Initialize the CRNN
crnn_model = BidirectionnalRNN(cnn_output, sequence_length)
Build the fully connected layer
# Flatten batch and time together: [? * 24, 512]
logits = tf.reshape(crnn_model, [-1, 512])
# Fully connected weights. NUM_CLASSES: digits 0-9 plus letters a-z = 36 classes, + 1 CTC blank = 37
W = tf.Variable(tf.truncated_normal([512, config.NUM_CLASSES], stddev=0.1), name="W")
# Fully connected bias
b = tf.Variable(tf.constant(0., shape=[config.NUM_CLASSES]), name="b")
# Affine projection onto the character classes
logits = tf.matmul(logits, W) + b
# Restore the batch and time dimensions: [?, 24, NUM_CLASSES]
logits = tf.reshape(logits, [-1, max_char_count, config.NUM_CLASSES])
# Swap the first two dimensions to time-major [24, ?, NUM_CLASSES], as ctc_loss and the CTC decoders expect
logits = tf.transpose(logits, (1, 0, 2))
Compute predictions
# Lookup table mapping class indices back to characters
label_text = tf.contrib.lookup.HashTable(
    tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(config.ALPHABET_INDEX, dtype=tf.int64),
                                                tf.constant(config.ALPHABET, dtype=tf.string)),
    default_value='-'
)
# beam_width is the width of the beam search; top_paths is how many best paths to return
decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, sequence_length, beam_width=100,
                                                  top_paths=1, merge_repeated=False)
# Convert the sparse decode result into a dense tensor
dense_decoded = tf.sparse_tensor_to_dense(decoded[0], default_value=-1)
predict_out = label_text.lookup(dense_decoded, name='prediction')
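A hedged usage sketch: after running the initializers (the inits list defined at the end of this section), decoding a batch might look like this, where batch_images is an illustrative NumPy array of shape [N, 100, 32, 1]:

with tf.Session() as sess:
    sess.run(inits)  # initializes both the variables and the lookup table
    print(sess.run(predict_out, feed_dict={inputs: batch_images}))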
Define the CTC loss
# loss is a 1-D tensor of length batch_size (one loss per example)
loss = tf.nn.ctc_loss(targets, logits, sequence_length)
# Average over the batch
cost = tf.reduce_mean(loss)
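tf.nn.ctc_loss expects the labels as a sparse tensor. A minimal helper for turning a batch of integer label sequences into the SparseTensorValue that the targets placeholder accepts might look like this (the function name is illustrative):

import numpy as np

def to_sparse(sequences):
    """sequences: list of lists of int label indices, one list per example."""
    indices, values = [], []
    for n, seq in enumerate(sequences):
        for t, label in enumerate(seq):
            indices.append([n, t])
            values.append(label)
    dense_shape = [len(sequences), max(len(s) for s in sequences)]
    return tf.SparseTensorValue(np.asarray(indices, dtype=np.int64),
                                np.asarray(values, dtype=np.int32),
                                np.asarray(dense_shape, dtype=np.int64))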
Define the optimizer
# Learning rate 0.0001; see gradient descent for background on learning rates
optimizer = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(cost)
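One caveat: tf.layers.batch_normalization maintains its moving statistics through update ops that TensorFlow does not run automatically. The usual remedy is to attach those ops to the training step, as a replacement for the line above:

update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    optimizer = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(cost)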
Define the evaluation metric
# Despite the name, this is the mean edit distance between the decode and the target (lower is better)
accuracy = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))
Add TensorBoard summary views
tf.summary.scalar('loss', cost)
tf.summary.scalar('accuracy', accuracy)
summary_merged = tf.summary.merge_all()
Initialize the variables and the lookup table
inits = [tf.global_variables_initializer(), tf.tables_initializer()]
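Putting it all together, a minimal training-loop sketch (next_batch is an illustrative data-feeding function returning images and sparse labels, e.g. built with to_sparse above):

with tf.Session() as sess:
    sess.run(inits)
    for step in range(10000):
        batch_x, batch_y = next_batch()  # hypothetical data source
        _, batch_cost, dist = sess.run([optimizer, cost, accuracy],
                                       feed_dict={inputs: batch_x, targets: batch_y})
        if step % 100 == 0:
            print(step, batch_cost, dist)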