本文旨在记录TensorFlow学习过程中对所遇到的op的理解和思考,同时记录用CNN实现文本分类的一种思路。文中所用代码来自网络,为了突出主题,对代码作了一定的简化。
先上代码:
文本数据加载及预处理,inputs.py
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from nltk.tokenize import word_tokenize
import codecs
import tensorflow as tf
import re
import collections
import numpy as np
# Number of rows sliced out of the dataset for each training batch.
BATCH_SIZE = 33
# Vocabulary size exposed to the model through Inputs.vocab_size.
VOCAB_SIZE = 18592
# Sentence length exposed to the model through Inputs.sequence_length.
SEQUENCE_LENGTH = 37
# Number of target classes exposed to the model through Inputs.num_classes.
NUM_CLASSES = 2
# NOTE(review): not referenced by the code shown in this file; presumably the
# size of the full dataset -- confirm before relying on it.
NUM_EXAMPLES_PER_EPOCH = 10662
# Presumably the intended number of training epochs (the name is misspelled
# but kept as-is because other code may refer to it) -- confirm.
NUM_EXPOCHES = 2
def clean_sentence(sentence):
    """
    Normalise a raw sentence into lower-case, single-space-separated tokens.

    Tokenization/sentence cleaning for all datasets except for SST.
    Adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Unlike the code it was adapted from, parentheses and question marks are
    padded with plain spaces only.  The earlier replacement strings contained
    a backslash, which re.sub copied into the output, so those tokens came
    out with a stray leading backslash.

    Args:
        sentence: the raw text of one sentence.

    Returns:
        The cleaned sentence: characters outside the whitelist replaced by
        spaces, contractions and punctuation split off as separate tokens,
        whitespace runs collapsed, stripped and lower-cased.
    """
    # Anything that is not a letter, digit or one of ( ) , ! ? ' ` becomes a space.
    sentence = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", sentence)

    # Detach English contractions from the word they follow ("it's" -> "it 's").
    for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        sentence = sentence.replace(suffix, " " + suffix)

    # Surround punctuation with spaces so each mark becomes its own token.
    for mark in (",", "!", "(", ")", "?"):
        sentence = sentence.replace(mark, " " + mark + " ")

    # Collapse the whitespace runs introduced above.
    sentence = re.sub(r"\s{2,}", " ", sentence)
    return sentence.strip().lower()
def build_vocab():
    """
    Build the vocabulary and the id-encoded corpus from the two label files.

    Reads "test.pos" and "test.neg" from the current working directory,
    cleans and tokenizes every line, and numbers the words by decreasing
    frequency (ties broken by the word itself), starting at id 2.  Ids 0 and 1
    are reserved and wrapped around every encoded sentence; id -1 is reserved
    for padding.

    Returns:
        A tuple (vocab, word2id, array, num_positive) where
        vocab is the list of words in frequency order followed by the three
        reserved tokens, word2id maps each token to its id, array holds one
        list of ids per sentence (positive sentences first), and num_positive
        is the number of lines read from "test.pos".
    """
    # Context managers make sure both files are closed after reading.
    with codecs.open("test.pos") as pos_file:
        positive_sentences = pos_file.readlines()
    with codecs.open("test.neg") as neg_file:
        negative_sentences = neg_file.readlines()
    num_positive = len(positive_sentences)
    sentences = positive_sentences + negative_sentences

    # A real list is required: the token lists are walked twice (once for
    # counting, once for encoding) and a lazy map() would be exhausted after
    # the first pass on Python 3.
    clean = [word_tokenize(clean_sentence(sentence)) for sentence in sentences]

    # Counting sentence by sentence avoids concatenating every token list
    # into one giant list first.
    counter = collections.Counter()
    for tokens in clean:
        counter.update(tokens)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words = [word for word, _ in count_pairs]

    word2id = dict(zip(words, range(2, len(words) + 2)))
    # The three reserved entries used to share the empty string as their key
    # and therefore overwrote one another; they now have distinct names.
    word2id["<pad>"] = -1
    word2id["<sos>"] = 0
    word2id["<eos>"] = 1
    vocab = words + ["<pad>", "<sos>", "<eos>"]

    # Every sentence is wrapped in the reserved ids 0 ... 1.
    array = [[0] + [word2id[word] for word in sent] + [1] for sent in clean]
    return vocab, word2id, array, num_positive
def input_producer(train=True):
    """
    Turn the encoded corpus into input and label tensors.

    The sentences returned by build_vocab() are padded with -1 to the length
    of the longest one, paired with their labels (1 for the first
    num_positive sentences, 0 for the rest), shuffled once with NumPy and
    converted to tensors.

    Args:
        train: when true, return one BATCH_SIZE-row slice whose position is
            driven by a range queue; otherwise return the whole dataset.

    Returns:
        A tuple (inputs, labels): an int32 tensor of token ids and an int64
        tensor of class labels.

    Raises:
        ValueError: in training mode, when there are fewer sentences than
            BATCH_SIZE so that no full batch can be sliced.
    """
    _, _, array, num_positive = build_vocab()
    num_sents = len(array)

    # Positive sentences occupy indices [0, num_positive); everything from
    # num_positive onwards is negative.
    labels = np.ones([num_sents])
    labels[num_positive:] = 0

    # NOTE(review): padding uses id -1, and these ids are returned unchanged
    # in the inputs tensor -- confirm the consumer tolerates negative ids.
    max_length = max(len(sent) for sent in array)
    padded = [sent + [-1] * (max_length - len(sent)) for sent in array]

    # Keep each row next to its label while shuffling, then split again.
    data = np.hstack((np.array(padded), np.expand_dims(labels, 1)))
    np.random.shuffle(data)
    total_inputs = tf.cast(x=tf.convert_to_tensor(data[:, :-1]), dtype=tf.int32)
    total_labels = tf.cast(x=tf.convert_to_tensor(data[:, -1]), dtype=tf.int64)

    if not train:
        return total_inputs, total_labels

    # The queue must range over the number of full batches; the epoch count
    # was used as the limit before, which restricted training to the first
    # two batches.
    num_batches = num_sents // BATCH_SIZE
    if num_batches == 0:
        raise ValueError(
            "need at least %d sentences to form a batch, got %d"
            % (BATCH_SIZE, num_sents))
    i = tf.train.range_input_producer(num_batches, shuffle=False).dequeue()
    inputs = tf.slice(total_inputs, [i * BATCH_SIZE, 0], [BATCH_SIZE, max_length])
    labels = tf.slice(total_labels, [i * BATCH_SIZE], [BATCH_SIZE])
    return inputs, labels
class Inputs(object):
    """Bundle of the training tensors and the dataset constants the model reads."""

    def __init__(self):
        # Tensors produced by the input pipeline (training mode by default).
        batch_inputs, batch_labels = input_producer()
        self.inputs = batch_inputs
        self.labels = batch_labels

        # Dataset-level constants, copied from the module so the model only
        # needs this object.
        self.batch_size = BATCH_SIZE
        self.num_classes = NUM_CLASSES
        self.sequence_length = SEQUENCE_LENGTH
        self.vocab_size = VOCAB_SIZE
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
import tensorflow as tf
class Config(object):
    """Hyper-parameters read by TextCNN."""

    def __init__(self):
        # Heights of the convolution kernels, one branch per entry.
        self.kernel_sizes = list(range(3, 6))
        # Number of output channels of every convolution branch.
        self.num_kernels = 128
        # Length of each word vector in the embedding table.
        self.embedding_size = 100
class TextCNN(object):
    """
    Convolutional text classifier whose graph is assembled in the constructor.

    The constructor builds, in order: an embedding lookup, one
    convolution / ReLU / max-pool branch per configured kernel size, the
    concatenation of the pooled outputs, a linear output layer, the mean
    sparse softmax cross-entropy loss, a gradient-descent training op and an
    accuracy op.  Intermediate shapes are printed while the graph is built.
    """

    def __init__(self, config, inputs):
        """
        Args:
            config: object providing embedding_size, kernel_sizes and
                num_kernels.
            inputs: object providing inputs, labels, vocab_size,
                sequence_length and num_classes.
        """
        emb_dim = config.embedding_size
        n_filters = config.num_kernels
        seq_len = inputs.sequence_length

        with tf.variable_scope("embedding"):
            table = tf.get_variable(
                "embedding",
                shape=[inputs.vocab_size, emb_dim],
                initializer=tf.truncated_normal_initializer(stddev=0.05),
                dtype=tf.float32)
            embed = tf.nn.embedding_lookup(table, inputs.inputs)
            print("embed shape: %s" % str(embed.shape))
            # conv2d expects a trailing channel axis, so add one of size 1.
            expand = tf.expand_dims(embed, 3)
            print("expand shape: %s" % str(expand.shape))

        # One pooled tensor per kernel size, built in configuration order.
        outputs = []
        for branch, height in enumerate(config.kernel_sizes):
            outputs.append(
                self._conv_pool(expand, branch, height, emb_dim, n_filters, seq_len))

        # Join the branches along the channel axis, then drop axes 1 and 2.
        concat = tf.concat(outputs, 3)
        print("concat shape: %s" % str(concat.shape))
        squeeze = tf.squeeze(concat, squeeze_dims=[1, 2])
        feature_dim = squeeze.get_shape().as_list()[-1]

        with tf.variable_scope("output"):
            softmax_w = tf.get_variable(
                "softmax_w",
                shape=[feature_dim, inputs.num_classes],
                initializer=tf.truncated_normal_initializer(stddev=0.05),
                dtype=tf.float32)
            softmax_b = tf.get_variable(
                "softmax_b",
                shape=[inputs.num_classes],
                initializer=tf.constant_initializer(value=0.),
                dtype=tf.float32)
            logits = tf.nn.xw_plus_b(squeeze, softmax_w, softmax_b)

        with tf.name_scope("loss"):
            per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=inputs.labels, logits=logits)
            self.__loss = tf.reduce_mean(per_example)

        with tf.name_scope("train"):
            sgd = tf.train.GradientDescentOptimizer(learning_rate=0.05)
            self.__train_op = sgd.minimize(self.__loss)

        with tf.name_scope("validatin"):
            # Fraction of examples whose arg-max class equals the label.
            hits = tf.equal(tf.argmax(logits, 1), inputs.labels)
            self.__validation_op = tf.reduce_mean(tf.cast(hits, tf.float32))

    @staticmethod
    def _conv_pool(images, branch, height, emb_dim, n_filters, seq_len):
        """
        Build one convolution / ReLU / max-pool branch and return its output.

        The kernel has shape [height, emb_dim, 1, n_filters]; the pooling
        window has height seq_len - height + 1.  Variables live in the scope
        "conv_pool_<branch>".
        """
        with tf.variable_scope("conv_pool_%d" % branch):
            kernel = tf.get_variable(
                "kernel",
                shape=[height, emb_dim, 1, n_filters],
                initializer=tf.truncated_normal_initializer(stddev=0.05),
                dtype=tf.float32)
            print("kernel %d shape: %s" %(branch, str(kernel.get_shape())))
            bias = tf.get_variable(
                "bias",
                shape=[n_filters],
                initializer=tf.constant_initializer(value=0.),
                dtype=tf.float32)
            conv = tf.nn.conv2d(
                input=images,
                filter=kernel,
                strides=[1, 1, 1, 1],
                padding="VALID")
            print("conv %d shape: %s" % (branch,str(conv.shape)))
            activated = tf.nn.relu(tf.nn.bias_add(conv, bias))
            window = [1, seq_len - height + 1, 1, 1]
            pool = tf.nn.max_pool(
                activated,
                ksize=window,
                strides=[1, 1, 1, 1],
                padding="VALID")
            print("maxpool ksize %d = %s" % (branch, str(window)))
            print("pool %d shape: %s" % (branch, str(pool.shape)))
        return pool

    @property
    def cost(self):
        """Scalar loss tensor (mean cross-entropy over the batch)."""
        return self.__loss

    @property
    def train_op(self):
        """Op that applies one gradient-descent step on the loss."""
        return self.__train_op

    @property
    def validation_op(self):
        """Scalar accuracy tensor for the current inputs."""
        return self.__validation_op
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
import tensorflow as tf
from inputs import Inputs
from model import Config, TextCNN
def main(*args, **kwargs):
    """
    Build the input pipeline and the model, then run the training loop.

    Positional and keyword arguments are accepted for compatibility and
    ignored.  The loop stops after a single step, on OutOfRangeError from the
    input queues, or on Ctrl-C; queue-runner threads and the session are
    always cleaned up.
    """
    inputs = Inputs()
    print("inputs shape: %s" % str(inputs.inputs.shape))
    config = Config()
    with tf.variable_scope("inference") as scope:
        m = TextCNN(config, inputs)
        # Anything built later in this scope would reuse the variables above.
        scope.reuse_variables()
    init = tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer())

    # The context manager closes the session even if setup fails part-way.
    with tf.Session() as sess:
        # Initialise variables before any queue-runner thread starts, so the
        # runners never touch uninitialised state.
        sess.run(init)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            index = 0
            # Only one training step is run here (demo of the graph shapes).
            while not coord.should_stop() and index < 1:
                _, loss_value = sess.run([m.train_op, m.cost])
                index += 1
                print("step: %d, loss: %f" % (index, loss_value))
        except tf.errors.OutOfRangeError:
            print("Done traing:-------Epoch limit reached")
        except KeyboardInterrupt:
            print("keyboard interrput detected, stop training")
        finally:
            # Stop and join the queue threads before the session is closed.
            coord.request_stop()
            coord.join(threads)


if __name__ == "__main__":
    main()
本文重点是网络结构的学习,这里主要讲一下model.py的代码。
程序输出内容如下:
inputs shape: (33, 37)
embed shape: (33, 37, 100)
expand shape: (33, 37, 100, 1)
kernel 0 shape: (3, 100, 1, 128)
conv 0 shape: (33, 35, 1, 128)
maxpool ksize 0 = [1, 35, 1, 1]
pool 0 shape: (33, 1, 1, 128)
kernel 1 shape: (4, 100, 1, 128)
conv 1 shape: (33, 34, 1, 128)
maxpool ksize 1 = [1, 34, 1, 1]
pool 1 shape: (33, 1, 1, 128)
kernel 2 shape: (5, 100, 1, 128)
conv 2 shape: (33, 33, 1, 128)
maxpool ksize 2 = [1, 33, 1, 1]
pool 2 shape: (33, 1, 1, 128)
concat shape: (33, 1, 1, 384)
step: 1, loss: 0.696079
网络结构示意图如下:
embedding dim expand 3个卷积核,strides=[1, 1, 1, 1],padding=VALID maxpool ksize 最后一维拼接 去掉中间两个为1的维度
(33,37) --> (33, 37, 100) --> (33, 37, 100, 1) -----> (3, 100, 1, 128) --> (33, 35, 1, 128) --> [1, 35, 1, 1] --> (33, 1, 1, 128) ---> (33, 1, 1, 384) --> (33, 384) -->softmax
|--> (4, 100, 1, 128) --> (33, 34, 1, 128) --> [1, 34, 1, 1] --> (33, 1, 1, 128) --|
|--> (5, 100, 1, 128) --> (33, 33, 1, 128) --> [1, 33, 1, 1] --> (33, 1, 1, 128) --|
首先,一个batch的数据有33个样本,最长的样本有37个词,因此inputs shape=(33,37);
26~30行对数据做embedding处理,即每个词用一个词向量表示,因此shape变成 (33, 37, 100),这里用到tf.nn.embedding_lookup;
CNN一般用于图片,图片数据的4个维度分别表示样本序号、图片高度、图片宽度、像素通道(一般有R、G、B三个通道)。为了把CNN用到文本上,32行用tf.expand_dims对数据做了增维,使其同样具有4个维度,只不过这里通道数为1,shape变成(33, 37, 100, 1);
接下来是经过3个卷积核外加对应的maxpool。48行的卷积操作用的是tf.nn.conv2d,这里有必要详细分析一下 (33, 37, 100, 1)经过卷积核 (3, 100, 1, 128)为什么会输出(33, 35, 1, 128) 。
看一下conv2d的签名:
tf.nn.conv2d(input, filter, strides, padding, use_cudnn_on_gpu=None, data_format=None, name=None)
这个方法强制要求strides[0] = strides[3] = 1,同时,假设数据shape为[batch, in_height, in_width, in_channels],卷积核shape为[filter_height, filter_width, in_channels, out_channels],要求卷积核的第3维必须为输入数据的通道数,卷积输出的shape为[batch, out_height, out_width, out_channels],输出的每个元素为filter_height*filter_width*in_channels个乘积项的加和,因此卷积有综合输入多个通道数据的功能。那么out_height和out_width怎么确定呢?根据padding的不同,计算方法也不一样:SAME是长度不够时补0,VALID是长度不够时直接丢弃剩余部分。以一维数据为例,假设数据为[1, 2, 3, 4, 5],核的长度为2,stride为2,则两种padding方式匹配结果如下:
[1, 2, 3, 4, 5]
SAME [1,2] [3,4] [5,0]
VALID [1,2] [3,4] (末尾的5不足一个窗口,被丢弃)
至于具体计算,padding=SAME时,
out_height = ceil(float(in_height) / float(strides[1]))
out_width = ceil(float(in_width) / float(strides[2]))
padding=VALID时,
out_height = ceil(float(in_height - filter_height + 1) / float(strides[1]))
out_width = ceil(float(in_width - filter_width + 1) / float(strides[2]))
对于本文的例子,input的shape为 (33, 37, 100, 1),filter的shape为(3, 100, 1, 128),strides=[1, 1, 1, 1],则
out_height = ceil(float(in_height - filter_height + 1) / float(strides[1]))=ceil(float(37- 3+ 1) / float(1))=35
out_width = ceil(float(in_width - filter_width + 1) / float(strides[2]))=ceil(float(100- 100+ 1) / float(1))=1
因此卷积后的shape为(33, 35, 1, 128) 。
55行调用tf.nn.max_pool对第2维作maxpool(其它维ksize的值为1,相当于没有maxpool),输出shape变成(33, 1, 1, 128)
63行用tf.concat对第4维的数据进行拼接,shape变成(33, 1, 1, 384)
65行用tf.squeeze去掉长度为1的2、3两维,shape变成(33, 384)
然后就是全连接做softmax分类了