tensorflow使用CNN做文本分类

前言

本文旨在记录学习tensorflow过程中对所遇到的op的理解和思考,同时记录用CNN做文本分类的一种实现思路。文中所用代码来自网络,为了突出主题,对代码作了一定的简化。

代码

先上代码:

文本数据加载及预处理,inputs.py

from __future__ import division
from __future__ import absolute_import
from __future__ import print_function


from nltk.tokenize import word_tokenize
import codecs
import tensorflow as tf
import re
import collections
import numpy as np


# Hyper-parameters shared by the input pipeline and the model.

# Number of rows sliced out per training batch; exposed as Inputs.batch_size.
BATCH_SIZE = 33
# Exposed as Inputs.vocab_size (first dimension of the embedding table).
VOCAB_SIZE = 18592
# Exposed as Inputs.sequence_length.
SEQUENCE_LENGTH = 37
# Exposed as Inputs.num_classes (width of the output layer).
NUM_CLASSES = 2
# Not referenced by any code visible in this file.
NUM_EXAMPLES_PER_EPOCH = 10662
# (sic) presumably the intended number of training epochs -- verify against
# how input_producer actually uses it.
NUM_EXPOCHES = 2


def clean_sentence(sentence):
    """
    Normalise a raw sentence into lower-case, space-separated tokens.

    Tokenization/sentence cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character outside ``A-Za-z0-9(),!?'`` and the backtick is
         replaced by a space;
      2. the clitics 's, 've, n't, 're, 'd and 'll are split off the
         preceding word;
      3. ``,``, ``!``, ``(``, ``)`` and ``?`` are surrounded by spaces;
      4. runs of whitespace are collapsed, the result is stripped and
         lower-cased.

    Unlike the upstream snippet, step 3 does not insert a backslash before
    ``(``, ``)`` and ``?``: the upstream replacement strings contained a
    regex-style escape which ``re.sub`` copies verbatim into the output.

    Args:
        sentence: the raw sentence as a string.

    Returns:
        The cleaned, lower-cased sentence.
    """
    sentence = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", sentence)
    # Same order as the upstream code; the patterns are plain literals, so
    # str.replace is an exact equivalent of the original re.sub calls.
    for clitic in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        sentence = sentence.replace(clitic, " " + clitic)
    # One pass is enough: the replacement never introduces new punctuation.
    sentence = re.sub(r"([,!()?])", r" \1 ", sentence)
    sentence = re.sub(r"\s{2,}", " ", sentence)
    return sentence.strip().lower()


def build_vocab(positive_path="test.pos", negative_path="test.neg"):
    """
    Read the two corpus files and turn every sentence into a list of word ids.

    Each line of the files is treated as one sentence; it is cleaned with
    clean_sentence and tokenised with nltk's word_tokenize. Words are ranked
    by descending frequency (ties broken alphabetically) and numbered from 2
    upwards. Ids 0 and 1 are reserved for the start / end markers that wrap
    every encoded sentence, and -1 is reserved for padding.

    Args:
        positive_path: file with one positive sentence per line.
        negative_path: file with one negative sentence per line.

    Returns:
        A tuple ``(vocab, word2id, array, num_positive)``:
          * vocab: list of the corpus words in rank order followed by the
            three special tokens;
          * word2id: dict mapping every entry of vocab to its id;
          * array: one list of ids per sentence (positives first, then
            negatives), each starting with 0 and ending with 1;
          * num_positive: number of sentences read from positive_path.
    """
    # Context managers so the file handles are closed deterministically.
    with codecs.open(positive_path) as handle:
        positive_sentences = handle.readlines()
    with codecs.open(negative_path) as handle:
        negative_sentences = handle.readlines()
    num_positive = len(positive_sentences)
    sentences = positive_sentences + negative_sentences

    # A real list, not a lazy map(): it is walked twice (counting, encoding).
    clean = [word_tokenize(clean_sentence(sentence)) for sentence in sentences]

    # Counter.update is linear; folding the lists together with "+" is
    # quadratic and relies on a builtin reduce that Python 3 does not have.
    counter = collections.Counter()
    for tokens in clean:
        counter.update(tokens)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words = [word for word, _ in count_pairs]

    word2id = dict(zip(words, range(2, len(words) + 2)))
    # NOTE(review): the previous revision used the empty string for all three
    # keys, so only the last assignment survived. The names below are
    # conventional placeholders -- confirm against the intended token names.
    # They cannot clash with corpus words because clean_sentence strips the
    # "<" and ">" characters.
    word2id["<pad>"] = -1
    word2id["<sos>"] = 0
    word2id["<eos>"] = 1
    vocab = words + ["<pad>", "<sos>", "<eos>"]

    # 0 and 1 are the start / end marker ids registered above.
    array = [[0] + [word2id[word] for word in sent] + [1] for sent in clean]
    return vocab, word2id, array, num_positive


def input_producer(train=True):
    """
    Build the input tensors for the model from the corpus files.

    All sentences returned by build_vocab are right-padded with -1 to the
    length of the longest sentence, labelled 1 (positive) or 0 (negative) and
    shuffled once with NumPy before being embedded in the graph as constants.

    Args:
        train: when True, return one batch of BATCH_SIZE rows whose position
            is driven by a queue that cycles over all full batches in order;
            when False, return the whole shuffled data set.

    Returns:
        A tuple ``(inputs, labels)`` of tensors: int32 token ids of shape
        [rows, max_length] and int64 labels of shape [rows].

    Raises:
        ValueError: if there are no sentences, or (in training mode) fewer
            sentences than BATCH_SIZE.
    """
    _, _, array, num_positive = build_vocab()
    num_sents = len(array)
    if num_sents == 0:
        raise ValueError("no sentences to build inputs from")

    # Positives come first in `array`, so they occupy indices
    # [0, num_positive); everything from num_positive onwards is negative.
    labels = np.ones([num_sents], dtype=np.int64)
    labels[num_positive:] = 0

    max_length = max(len(sent) for sent in array)
    # List comprehension rather than map(): np.array needs a real sequence.
    padded = np.array(
        [sent + [-1] * (max_length - len(sent)) for sent in array],
        dtype=np.int32)
    # NOTE(review): -1 is outside the embedding table's index range; confirm
    # the lookup in the model tolerates it on the target device.

    # One shared permutation keeps every row aligned with its label without
    # round-tripping the integer ids through a float matrix.
    order = np.random.permutation(num_sents)
    total_inputs = tf.convert_to_tensor(padded[order], dtype=tf.int32)
    total_labels = tf.convert_to_tensor(labels[order], dtype=tf.int64)

    if not train:
        return total_inputs, total_labels

    num_batches = num_sents // BATCH_SIZE
    if num_batches == 0:
        raise ValueError(
            "need at least %d sentences for one batch, got %d"
            % (BATCH_SIZE, num_sents))
    # The producer's limit is the number of batches, so the index walks over
    # the whole data set. An epoch limit (num_epochs=...) is deliberately not
    # set: it would add a local counter variable that has to be initialised
    # before the queue runners are started.
    i = tf.train.range_input_producer(num_batches, shuffle=False).dequeue()
    inputs = tf.slice(total_inputs, [i * BATCH_SIZE, 0], [BATCH_SIZE, max_length])
    labels = tf.slice(total_labels, [i * BATCH_SIZE], [BATCH_SIZE])
    return inputs, labels


class Inputs(object):
    """Holds the batch tensors together with the static sizes the model reads."""

    def __init__(self):
        # Static sizes, copied from the module-level constants.
        self.num_classes = NUM_CLASSES
        self.sequence_length = SEQUENCE_LENGTH
        self.vocab_size = VOCAB_SIZE
        self.batch_size = BATCH_SIZE
        # input_producer is called with its default arguments.
        batch_inputs, batch_labels = input_producer()
        self.inputs = batch_inputs
        self.labels = batch_labels

网络结构定义,model.py

from __future__ import division
from __future__ import absolute_import
from __future__ import print_function


import tensorflow as tf


class Config(object):
    """Architecture hyper-parameters consumed by TextCNN."""

    def __init__(self):
        # Number of output channels of every convolution kernel.
        self.num_kernels = 128
        # Kernel heights; TextCNN builds one conv/pool branch per entry.
        self.kernel_sizes = list(range(3, 6))
        # Length of each word vector in the embedding table.
        self.embedding_size = 100


class TextCNN(object):
    """
    Convolutional text classifier: embedding -> parallel conv/max-pool
    branches (one per kernel size) -> concatenation -> linear softmax layer.

    The whole graph is built in the constructor. Intermediate tensor shapes
    are printed while building, which is how the shape walk-through that
    accompanies this code was produced.

    Args:
        config: object with ``embedding_size``, ``kernel_sizes`` (iterable of
            kernel heights) and ``num_kernels`` attributes.
        inputs: object with ``inputs`` (integer id tensor; rank 2 is implied
            by the rank-4 convolution input built from it), ``labels``
            (compared with the int64 argmax, so expected to be int64),
            ``vocab_size``, ``sequence_length`` and ``num_classes``.

    Properties:
        cost: scalar mean cross-entropy over the batch.
        train_op: one gradient-descent step minimising ``cost``.
        validation_op: fraction of the batch whose argmax equals the label.
    """

    def __init__(self, config, inputs):
        embedding_size = config.embedding_size
        kernel_sizes = config.kernel_sizes
        num_kernels = config.num_kernels

        vocab_size = inputs.vocab_size
        sequence_length = inputs.sequence_length
        num_classes = inputs.num_classes
        with tf.variable_scope("embedding"):
            embedding = tf.get_variable("embedding",
                                        shape=[vocab_size, embedding_size],
                                        initializer=tf.truncated_normal_initializer(stddev=0.05),
                                        dtype=tf.float32)
            embed = tf.nn.embedding_lookup(embedding, inputs.inputs)
            print("embed shape: %s" % str(embed.shape))
            # conv2d needs a rank-4 input; add a trailing channel axis of 1.
            expand = tf.expand_dims(embed, 3)
            print("expand shape: %s" % str(expand.shape))

        outputs = []

        for i, kernel_size in enumerate(kernel_sizes):
            with tf.variable_scope("conv_pool_%d" % i):
                # The kernel is as wide as the embedding, so with VALID
                # padding the convolution output has width 1.
                kernel = tf.get_variable("kernel",
                                         shape=[kernel_size, embedding_size, 1, num_kernels],
                                         initializer=tf.truncated_normal_initializer(stddev=0.05),
                                         dtype=tf.float32)
                print("kernel %d shape: %s" %(i, str(kernel.get_shape())))
                bias = tf.get_variable("bias",
                                       shape=[num_kernels],
                                       initializer=tf.constant_initializer(value=0.),
                                       dtype=tf.float32)
                conv = tf.nn.conv2d(input=expand,
                                    filter=kernel,
                                    strides=[1, 1, 1, 1],
                                    padding="VALID")
                print("conv %d shape: %s" % (i,str(conv.shape)))
                conv_bias = tf.nn.bias_add(conv, bias)
                relu = tf.nn.relu(conv_bias)
                # Pool over the full height of the feature map so each kernel
                # yields exactly one value. Prefer the statically known
                # height of the conv output: the padded input length need not
                # equal the configured sequence_length, and a mismatch would
                # leave a height > 1 that the squeeze below rejects.
                pool_height = conv.get_shape().as_list()[1]
                if pool_height is None:
                    pool_height = sequence_length - kernel_size + 1
                ksize = [1, pool_height, 1, 1]
                pool = tf.nn.max_pool(relu,
                                      ksize=ksize,
                                      strides=[1, 1, 1, 1],
                                      padding="VALID")
                print("maxpool ksize %d = %s" % (i, str(ksize)))
                print("pool %d shape: %s" % (i, str(pool.shape)))
                outputs.append(pool)

        # Join the branches along the channel axis, then drop the two
        # singleton spatial axes.
        concat = tf.concat(outputs, 3)
        print("concat shape: %s" % str(concat.shape))
        # `axis` replaces the deprecated `squeeze_dims` keyword.
        squeeze = tf.squeeze(concat, axis=[1, 2])
        dim = squeeze.get_shape().as_list()[-1]

        with tf.variable_scope("output"):
            softmax_w = tf.get_variable("softmax_w",
                                        shape=[dim, num_classes],
                                        initializer=tf.truncated_normal_initializer(stddev=0.05),
                                        dtype=tf.float32)
            softmax_b = tf.get_variable("softmax_b",
                                        shape=[num_classes],
                                        initializer=tf.constant_initializer(value=0.),
                                        dtype=tf.float32)
            logits = tf.nn.xw_plus_b(squeeze, softmax_w, softmax_b)
        with tf.name_scope("loss"):
            cross_entropy_per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=inputs.labels, logits=logits)
            self.__loss = tf.reduce_mean(cross_entropy_per_example)

        with tf.name_scope("train"):
            optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.05)
            self.__train_op = optimizer.minimize(self.__loss)

        # NOTE(review): the scope name is misspelled; it is left untouched
        # because it becomes part of the op names in the graph.
        with tf.name_scope("validatin"):
            predict = tf.argmax(logits, 1)
            equal = tf.equal(predict, inputs.labels)
            self.__validation_op = tf.reduce_mean(tf.cast(equal, tf.float32))

    @property
    def cost(self):
        """Scalar tensor: mean cross-entropy loss of the batch."""
        return self.__loss

    @property
    def train_op(self):
        """Op that applies one gradient-descent update."""
        return self.__train_op

    @property
    def validation_op(self):
        """Scalar tensor: accuracy of the argmax prediction on the batch."""
        return self.__validation_op

模型学习入口,main.py

from __future__ import division
from __future__ import absolute_import
from __future__ import print_function


import tensorflow as tf
from inputs import Inputs
from model import Config, TextCNN


def main(*args, **kwargs):
    """
    Build the input pipeline and the TextCNN graph, then run training steps.

    Keyword Args:
        max_steps: number of training steps to run before stopping.
            Defaults to 1, the value that used to be hard-coded in the loop.

    Positional arguments are accepted for compatibility and ignored.
    """
    max_steps = kwargs.get("max_steps", 1)

    inputs = Inputs()
    print("inputs shape: %s" % str(inputs.inputs.shape))
    config = Config()
    with tf.variable_scope("inference") as scope:
        m = TextCNN(config, inputs)
        scope.reuse_variables()

    init = tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer())
    # The context manager closes the session even if training raises
    # something other than the two exceptions handled below.
    with tf.Session() as sess:
        # Initialise variables before the queue-runner threads start running
        # ops in this session.
        sess.run(init)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            index = 0
            while not coord.should_stop() and index < max_steps:
                _, loss_value = sess.run([m.train_op, m.cost])
                index += 1
                print("step: %d, loss: %f" % (index, loss_value))
        except tf.errors.OutOfRangeError:
            print("Done traing:-------Epoch limit reached")
        except KeyboardInterrupt:
            print("keyboard interrput detected, stop training")
        finally:
            # Stop and join the queue threads while the session is still open.
            coord.request_stop()
            coord.join(threads)


if __name__ == "__main__":
    main()

代码要点

本文重点是网络结构的学习,这里主要讲一下model.py的代码。

程序输出内容如下:

inputs shape: (33, 37)
embed shape: (33, 37, 100)
expand shape: (33, 37, 100, 1)
kernel 0 shape: (3, 100, 1, 128)
conv 0 shape: (33, 35, 1, 128)
maxpool ksize 0 = [1, 35, 1, 1]
pool 0 shape: (33, 1, 1, 128)
kernel 1 shape: (4, 100, 1, 128)
conv 1 shape: (33, 34, 1, 128)
maxpool ksize 1 = [1, 34, 1, 1]
pool 1 shape: (33, 1, 1, 128)
kernel 2 shape: (5, 100, 1, 128)
conv 2 shape: (33, 33, 1, 128)
maxpool ksize 2 = [1, 33, 1, 1]
pool 2 shape: (33, 1, 1, 128)
concat shape: (33, 1, 1, 384)

step: 1, loss: 0.696079

网络结构示意图如下:


            embedding          dim expand        3个卷积核,strides=[1, 1, 1, 1],padding=VALID         maxpool ksize                         最后一维拼接      去掉中间两个为1的维度

(33,37)   -->   (33, 37, 100)   -->   (33, 37, 100, 1)   -----> (3, 100, 1, 128)   -->   (33, 35, 1, 128) --> [1, 35, 1, 1]  -->  (33, 1, 1, 128)   --->  (33, 1, 1, 384) --> (33, 384) -->softmax

                                                                                           |--> (4, 100, 1, 128)   -->   (33, 34, 1, 128) --> [1, 34, 1, 1]  -->  (33, 1, 1, 128)   --|  

                                                                                           |--> (5, 100, 1, 128)   -->   (33, 33, 1, 128) --> [1, 33, 1, 1]  -->  (33, 1, 1, 128)   --|


首先,一个batch的数据有33个样本,最长的样本有37个词,因此inputs shape=(33,37);

26~30行对数据做embedding处理,即每个词用一个词向量表示,因此shape变成 (33, 37, 100),这里用到tf.nn.embedding_lookup;

CNN一般用于图片,图片数据的4个维度分别表示样本序号、图片高度、图片宽度、像素通道(一般有R、G、B三个通道)。为了把CNN用到文本上,32行用tf.expand_dims对数据做了增维,使其同样具有4个维度,只不过这里通道数为1,shape变成(33, 37, 100, 1);

接下来是经过3个卷积核外加对应的maxpool。48行的卷积操作用的是tf.nn.conv2d,这里有必要详细分析一下 (33, 37, 100, 1)经过卷积核 (3, 100, 1, 128)为什么会输出(33, 35, 1, 128) 。

看一下conv2d的签名:

tf.nn.conv2d(input, filter, strides, padding, use_cudnn_on_gpu=None, data_format=None, name=None)

这个方法强制要求strides[0] = strides[3] = 1,同时,假设数据shape为[batch, in_height, in_width, in_channels],卷积核shape为[filter_height, filter_width, in_channels, out_channels],要求卷积核的第3维必须为输入数据的通道数,卷积输出的shape为[batch, out_height, out_width, out_channels],输出的每个元素为filter_height*filter_width*in_channels个乘积的加和,因此卷积有综合输入多个通道数据的功能。那么out_height和out_width怎么确定呢?根据padding的不同,计算方法也不一样,SAME就是不够补0的方式,VALID就是不够截取的方式。以一维数据为例,假设数据为[1, 2, 3, 4, 5],核的长度为2,stride为2,则两种padding方式匹配结果如下:

                [1, 2, 3, 4, 5]

SAME     [1,2] [3,4] [5,0]

VALID     [1,2] [3,4]    (末尾的5凑不满一个完整的窗口,被丢弃)

至于具体计算,padding=SAME时,

out_height = ceil(float(in_height) / float(strides[1]))
out_width  = ceil(float(in_width) / float(strides[2]))

padding=VALID时,

out_height = ceil(float(in_height - filter_height + 1) / float(strides[1]))
out_width  = ceil(float(in_width - filter_width + 1) / float(strides[2]))

对于本文的例子,input的shape为 (33, 37, 100, 1),filter的shape为(3, 100, 1, 128),strides=[1, 1, 1, 1],则

out_height = ceil(float(in_height - filter_height + 1) / float(strides[1])) = ceil(float(37 - 3 + 1) / float(1)) = 35

out_width  = ceil(float(in_width - filter_width + 1) / float(strides[2]))=ceil(float(100- 100+ 1) / float(1))=1

因此卷积后的shape为(33, 35, 1, 128) 。

55行调用tf.nn.max_pool对第2维作maxpool(其它维ksize的值为1,相当于没有maxpool),输出shape变成(33, 1, 1, 128)

63行用tf.concat对第4维的数据进行拼接,shape变成(33, 1, 1, 384)

65行用tf.squeeze去掉长度为1的2、3两维,shape变成(33, 384)

然后就是全连接做softmax分类了


你可能感兴趣的:(深度学习)