I have written an LSTM-based text classifier before, but the code structure was a mess. After reading the BERT source code, I decided to rewrite my LSTM code by imitating BERT's structure. This is purely an exercise in getting familiar with the various TensorFlow APIs and with a reasonably clean NLP model layout; I am not chasing higher accuracy.
The training corpus is a set of 60,000+ product reviews covering 10 product categories, already labeled with positive/negative sentiment. The original file is a csv with three fields: cat (category), label (positive/negative sentiment) and review (the actual review text). It can serve as a binary corpus for sentiment classification, or (at a stretch) as a 10-class corpus for product-category classification.
The data comes from https://github.com/SophonPlus/ChineseNlpCorpus, many thanks.
The data has been split into a training set and a test set at roughly an 8:2 ratio and converted to tsv files.
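The split itself is only a few lines of pandas; a minimal sketch of how it can be done (the raw csv name, the shuffle and the seed are assumptions, only the 8:2 ratio and the output tsv names come from the rest of this post):

import pandas as pd

# shuffle, then cut at 80% (a sketch; the raw csv file name is a placeholder)
raw_df = pd.read_csv('online_shopping_10_cats.csv')
raw_df = raw_df.sample(frac=1, random_state=42)
split_point = int(len(raw_df) * 0.8)
# keep the index column so the tsv fields become: index, cat, label, review
raw_df.iloc[:split_point].to_csv('online_shopping_train.tsv', sep='\t', index=True)
raw_df.iloc[split_point:].to_csv('online_shopping_dev.tsv', sep='\t', index=True)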
jieba is used as the word segmenter.
Since I have not found a suitable Chinese word-level vocabulary file, and BERT's vocabulary file uses single characters as the smallest unit, for now I simply segment the training corpus and deduplicate the tokens to get a fairly small vocabulary file.
import pandas as pd
import jieba

# RAW_DATA, VOCAB (and the unused STOPWORDS) are path constants defined elsewhere
def create_vocab():
    raw_df = pd.read_csv(RAW_DATA)  # read the raw csv into a DataFrame
    # one record in the water-heater category is broken, so drop that whole category
    raw_df = raw_df[raw_df.cat != '热水器']
    raw_document = raw_df['review'].tolist()  # raw corpus as a list of sentences
    # load the stopword list (currently unused)
    # with open(STOPWORDS, 'r', encoding='utf-8') as s:
    #     stopwords = [word.strip() for word in s.readlines()]
    document_words = []  # the segmented corpus
    for sentence in raw_document:
        cut_sentence = [word for word in jieba.lcut(sentence)]
        document_words.extend(cut_sentence)
    vocab_list = set(document_words)
    with open(VOCAB, 'w', encoding='utf-8') as f:
        f.write('[PAD]' + '\n')
        f.write('[UNK]' + '\n')
        for vocab in vocab_list:
            f.write(vocab + '\n')
This part is taken straight from the BERT source code, heavily trimmed down. It only covers the basics: to_unicode, loading the vocabulary, tokenizing, converting tokens to ids and ids back to tokens. Nothing much to add.
import collections
import tensorflow as tf
import jieba

def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if isinstance(text, str):
        return text
    elif isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    else:
        raise ValueError("Unsupported string type: %s" % (type(text)))

# build a collections.OrderedDict of (token, index) pairs from the vocabulary file
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    index = 0
    with tf.gfile.GFile(vocab_file, "r") as reader:
        while True:
            token = convert_to_unicode(reader.readline())
            if not token:
                break
            token = token.strip()
            vocab[token] = index
            index += 1
    return vocab

def convert_by_vocab(vocab, items):
    """Converts a sequence of [tokens|ids] using the vocab."""
    output = []
    for item in items:
        # tokens missing from the vocab fall back to [UNK]
        output.append(vocab.get(item, vocab['[UNK]']))
    return output

class FullTokenizer(object):
    """Runs end-to-end tokenization."""

    def __init__(self, vocab_file):
        # (token, index) dictionary built from the vocab file
        self.vocab = load_vocab(vocab_file)
        # inverted to (index, token) form
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

    # split a sentence into a list of tokens
    @staticmethod
    def tokenize(text):
        split_tokens = jieba.lcut(text)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab(self.inv_vocab, ids)
The model's mode argument can be train, eval or predict. With eval the model only returns cost and accuracy, with predict only the logits. The rest should be self-explanatory.
import tensorflow as tf
import json
import six

class LstmConfig(object):
    def __init__(self,
                 vocab_size,               # number of tokens in the vocabulary
                 hidden_size=128,
                 keep_prob=0.9,
                 embedding_keep_prob=0.9,  # keep probability for embedding dropout
                 max_grad_norm=5,
                 learning_rate=0.5,        # SGD learning rate used by the train op (default chosen arbitrarily)
                 num_of_classes=2,         # number of classes
                 num_of_layers=2,          # number of LSTM layers
                 initializer_range=0.02):  # initializer range
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.keep_prob = keep_prob
        self.embedding_keep_prob = embedding_keep_prob
        self.max_grad_norm = max_grad_norm
        self.learning_rate = learning_rate
        self.num_of_classes = num_of_classes
        self.num_of_layers = num_of_layers
        self.initializer_range = initializer_range

    @classmethod
    def from_dict(cls, json_object):
        """Constructs a `LstmConfig` from a Python dictionary of parameters."""
        config = cls(vocab_size=None)
        for (key, value) in six.iteritems(json_object):
            config.__dict__[key] = value
        return config

    @classmethod
    def from_json_file(cls, json_file):
        """Constructs a `LstmConfig` from a json file of parameters."""
        with tf.gfile.GFile(json_file, "r") as reader:
            text = reader.read()
        return cls.from_dict(json.loads(text))
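For completeness, from_json_file expects a json file whose keys match the attribute names above; a hypothetical example (the file name and values here are made up):

# contents of a hypothetical lstm_config.json:
# {"vocab_size": 68355, "hidden_size": 128, "num_of_classes": 2, "num_of_layers": 2}
config = LstmConfig.from_json_file('./lstm_config.json')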
# bidirectional LSTM model
class LstmModel(object):
    # build the network variables
    def __init__(self, config, mode):
        self.config = config
        self.embedding_keep_prob = config.embedding_keep_prob
        self.mode = mode
        output_keep_prob = config.keep_prob if mode == 'train' else 1.0
        # word embeddings
        self.word_embedding = tf.get_variable('word_emb', shape=[config.vocab_size, config.hidden_size])
        # LSTM layers
        # forward cells
        lstm_cells_fw = [tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(config.hidden_size),
                                                       output_keep_prob=output_keep_prob)
                         for _ in range(config.num_of_layers)]
        self.lstm_fw = tf.nn.rnn_cell.MultiRNNCell(lstm_cells_fw)
        # backward cells
        lstm_cells_bw = [tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(config.hidden_size),
                                                       output_keep_prob=output_keep_prob)
                         for _ in range(config.num_of_layers)]
        self.lstm_bw = tf.nn.rnn_cell.MultiRNNCell(lstm_cells_bw)
        # softmax layer variables
        self.weight = tf.get_variable('weight', [config.hidden_size * 2, config.num_of_classes])
        self.bias = tf.get_variable('bias', [config.num_of_classes])

    # the forward computation graph of the model
    def forward(self, src_input, src_size, label):
        # look up word embeddings for the input token ids
        inputs = tf.nn.embedding_lookup(self.word_embedding, src_input)
        if self.mode == 'train':
            inputs = tf.nn.dropout(inputs, self.embedding_keep_prob)
        # bidirectional LSTM
        with tf.variable_scope('lstm'):
            outputs, states = tf.nn.bidirectional_dynamic_rnn(self.lstm_fw,
                                                              self.lstm_bw,
                                                              inputs,
                                                              dtype=tf.float32,
                                                              sequence_length=src_size)
            final_outputs = tf.concat(outputs, 2)
            final_outputs = final_outputs[:, -1, :]
            # alternative: average over the time steps
            # final_outputs = tf.reduce_mean(tf.concat(outputs, 2), 1)
        # fully connected layer
        with tf.variable_scope('fc'):
            logits = tf.matmul(final_outputs, self.weight) + self.bias
        if self.mode == 'predict':
            return logits
        # loss
        with tf.variable_scope('loss'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label,
                                                                  logits=logits)
            cost = tf.reduce_mean(loss)
        # accuracy
        with tf.variable_scope('accuracy'):
            correct_prediction = tf.equal(tf.argmax(logits, 1), label)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        if self.mode == 'eval':
            return cost, accuracy
        # backward pass: clip gradients, define the optimizer and the train op
        trainable_variables = tf.trainable_variables()
        grads = tf.gradients(cost, trainable_variables)
        grads, _ = tf.clip_by_global_norm(grads, self.config.max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.config.learning_rate)
        train_op = optimizer.apply_gradients(zip(grads, trainable_variables))
        return logits, cost, accuracy, train_op
I named the script run_classifier, copying BERT wholesale.
import os
import csv
import collections
import tensorflow as tf
import tokenization
import modelling

flags = tf.flags
FLAGS = flags.FLAGS

flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.")
flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")
flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.")
flags.DEFINE_integer("num_train_epochs", 4, "Total epochs for training.")
flags.DEFINE_string(
    "data_dir", "E:/NLP/NLP_Deep_Learning_Summary/datasets",
    "The input data dir. Should contain the .tsv files (or other data files) for the task.")
flags.DEFINE_string("init_checkpoint", None, "Initial checkpoint")
flags.DEFINE_string("vocab_file", "./vocab.txt", "The vocabulary file.")
flags.DEFINE_string("output_file", "./model1", "The output file for the trained model.")
flags.DEFINE_bool("do_train", True, "Whether to run training.")
flags.DEFINE_bool("do_eval", True, "Whether to run eval on the dev set.")
flags.DEFINE_bool("do_predict", False, "Whether to run the model in inference mode on the test set.")
To turn the training/test data into tfrecord files, the data has to go through Example and Feature classes. DataProcessor is almost identical to the BERT source; its job is to read the tsv files into InputExample objects.
The tsv fields are: index (id), category (product category), polarity (sentiment, 0 or 1) and text (the original review).
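For reference, a single row would look roughly like this, with the fields separated by tabs (a made-up illustration of the field order, not an actual record from the corpus):

1	酒店	1	房间很干净,服务也不错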
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text, label=None):
        """Constructs an InputExample."""
        self.guid = guid
        self.text = text
        self.label = label
Since the model is an LSTM, the features of each example are: the token ids of the original text, the original length (i.e. the number of tokens), and the class label.
class InputFeatures(object):
    def __init__(self, input_ids, input_size, label):
        self.input_ids = input_ids
        self.input_size = input_size
        self.label = label

class DataProcessor(object):
    def get_train_examples(self, data_dir):
        lines = self._read_tsv(os.path.join(data_dir, "online_shopping_train.tsv"))
        return self._create_examples(lines, 'train')

    def get_dev_examples(self, data_dir):
        lines = self._read_tsv(os.path.join(data_dir, "online_shopping_dev.tsv"))
        return self._create_examples(lines, 'dev')

    def get_test_examples(self, data_dir):
        lines = self._read_tsv(os.path.join(data_dir, "online_shopping_test.tsv"))
        return self._create_examples(lines, 'test')

    @staticmethod
    def get_labels():
        return ["0", "1"]
        # return ['蒙牛', '水果', '洗发水', '平板', '酒店', '手机', '计算机', '书籍', '衣服', '热水器']

    @staticmethod
    def _create_examples(lines, set_type):
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "%s-%s" % (set_type, i)
            text = tokenization.convert_to_unicode(line[3])
            label = tokenization.convert_to_unicode(line[2])
            examples.append(
                InputExample(guid=guid, text=text, label=label))
        return examples

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with tf.gfile.Open(input_file, "r") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                lines.append(line)
            return lines
This is also written after the BERT code (I even kept the logging of the first five examples).
# convert a single InputExample into an InputFeatures object
def convert_single_example(ex_index, example, tokenizer):
    text = example.text
    tokens = tokenizer.tokenize(text)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_size = len(input_ids)
    label = int(example.label)
    # log the first 5 converted examples
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % example.guid)
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_size: %s" % input_size)
        tf.logging.info("label: %s" % label)
    feature = InputFeatures(input_ids=input_ids, input_size=input_size, label=label)
    return feature

# write the features that will be fed to the model into a tfrecord file
def file_based_convert_examples_to_features(examples, tokenizer, output_file):
    writer = tf.python_io.TFRecordWriter(output_file)
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
        feature = convert_single_example(ex_index, example, tokenizer)

        def create_int_feature(values):
            f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
            return f

        features = collections.OrderedDict()
        features['input_ids'] = create_int_feature(feature.input_ids)
        features['input_size'] = create_int_feature([feature.input_size])
        features['label'] = create_int_feature([feature.label])
        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
        writer.write(tf_example.SerializeToString())
    writer.close()
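To sanity-check what got written, the records can be read back without building a graph; a small sketch (the path matches the train_file used further down):

# inspect the first record of the written tfrecord file
for serialized in tf.python_io.tf_record_iterator('./tmp/train.tfrecord'):
    example = tf.train.Example()
    example.ParseFromString(serialized)
    print(example.features.feature['input_ids'].int64_list.value)
    print(example.features.feature['input_size'].int64_list.value)
    print(example.features.feature['label'].int64_list.value)
    break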
Finally some relatively original code again. Here the input_ids get padded while batching. It plays the same role as input_fn_builder in BERT, just renamed and restructured.
def file_based_dataset_loader(input_file, is_training, batch_size, num_epochs):
    # parsing function for the tfrecord file
    def parse_func(serialized_example):
        name_to_features = {
            "input_ids": tf.VarLenFeature(tf.int64),
            "input_size": tf.FixedLenFeature(shape=(1,), dtype=tf.int64),
            "label": tf.FixedLenFeature(shape=(1,), dtype=tf.int64),
        }
        parsed_example = tf.parse_single_example(serialized_example, features=name_to_features)
        parsed_example['input_ids'] = tf.sparse_tensor_to_dense(parsed_example['input_ids'])
        input_ids = parsed_example['input_ids']
        input_size = parsed_example['input_size']
        label = parsed_example['label']
        return input_ids, input_size, label

    dataset = tf.data.TFRecordDataset(input_file)
    dataset = dataset.map(parse_func)
    if is_training:
        dataset = dataset.repeat(num_epochs).shuffle(buffer_size=100)
    padded_shapes = (tf.TensorShape([None]),   # input_ids, padded to the longest sequence in the batch
                     tf.TensorShape([None]),   # original length of each sentence
                     tf.TensorShape([None]))   # label
    # batch and pad at the same time with padded_batch
    batched_dataset = dataset.padded_batch(batch_size, padded_shapes)
    # dataset = dataset.batch(batch_size)
    return batched_dataset
The function that actually runs the model, pulled out on its own.
def run_epoch(session, cost_op, accuracy_op, train_op, step):
    while True:
        try:
            cost, accuracy, _ = session.run([cost_op, accuracy_op, train_op])
            if step % 100 == 0:
                tf.logging.info('Steps: {0}, Loss value: {1}, Accuracy: {2}'.format(
                    step, cost, accuracy))
            step += 1
        except tf.errors.OutOfRangeError:
            break
The main program. The number of tokens in the vocabulary file is written directly into the config.
Also, neither when building the vocabulary file nor when processing the sentences to be predicted is there any stopword removal.
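Hard-coding 68355 is a bit brittle; a possible alternative (not what the code below does) would be to read the size straight from the vocabulary file:

# sketch of deriving vocab_size from the vocab file instead of hard-coding it
vocab_size = len(tokenization.load_vocab(FLAGS.vocab_file))
config = modelling.LstmConfig(vocab_size=vocab_size)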
def main():
    config = modelling.LstmConfig(vocab_size=68355)
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file)
    processor = DataProcessor()
    if FLAGS.do_train:
        # the recurrent model used for training
        train_model = modelling.LstmModel(config, mode='train')
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        train_file = r'./tmp/train.tfrecord'
        file_based_convert_examples_to_features(train_examples, tokenizer, train_file)
        # build the training dataset
        train_data = file_based_dataset_loader(train_file,
                                               is_training=True,
                                               batch_size=FLAGS.train_batch_size,
                                               num_epochs=FLAGS.num_train_epochs)
        train_iterator = train_data.make_initializable_iterator()
        input_ids, input_sizes, labels = train_iterator.get_next()
        input_sizes = tf.reshape(input_sizes, shape=(-1,))
        labels = tf.reshape(labels, shape=(-1,))
        # build the forward graph; the inputs are handed to forward() as tensors
        _, cost_op, accuracy_op, train_op = train_model.forward(input_ids, input_sizes, labels)
        # TensorFlow saver for persistence
        saver = tf.train.Saver()
        step = 0
        # train the model
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Total training steps: {}".format(num_train_steps))
        with tf.Session() as sess:
            tf.global_variables_initializer().run()
            sess.run(train_iterator.initializer)
            run_epoch(sess, cost_op, accuracy_op, train_op, step)
            tf.logging.info("********* Training Step Finished *********")
            saver.save(sess, r'./model/lstm.ckpt')
    if FLAGS.do_eval:
        tf.reset_default_graph()
        eval_model = modelling.LstmModel(config, mode='eval')
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = r'./tmp/eval.tfrecord'
        file_based_convert_examples_to_features(eval_examples, tokenizer, eval_file)
        # build the eval dataset
        eval_data = file_based_dataset_loader(eval_file,
                                              is_training=False,
                                              batch_size=FLAGS.eval_batch_size,
                                              num_epochs=None)
        eval_iterator = eval_data.make_initializable_iterator()
        eval_input_ids, eval_input_sizes, eval_labels = eval_iterator.get_next()
        eval_input_sizes = tf.reshape(eval_input_sizes, shape=(-1,))
        eval_labels = tf.reshape(eval_labels, shape=(-1,))
        eval_cost_op, eval_accuracy_op = eval_model.forward(eval_input_ids, eval_input_sizes, eval_labels)
        # evaluate the model
        eval_step = 0
        tf.logging.info("***** Running Eval *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        with tf.Session() as sess_eval:
            tf.global_variables_initializer().run()
            sess_eval.run(eval_iterator.initializer)
            saver = tf.train.Saver()
            saver.restore(sess_eval, r'./model/lstm.ckpt')
            tf.logging.info("*** Restore finished ***")
            eval_cost = 0
            eval_correct = 0
            while True:
                try:
                    cost, accuracy = sess_eval.run([eval_cost_op, eval_accuracy_op])
                    eval_cost += cost
                    eval_correct += accuracy * FLAGS.eval_batch_size
                    eval_step += 1
                except tf.errors.OutOfRangeError:
                    break
            # average the per-batch cost and compute the overall accuracy
            eval_cost = eval_cost / eval_step
            eval_accuracy = eval_correct / len(eval_examples)
            print('Eval Cost: {0}, Eval Accuracy: {1}'.format(eval_cost, eval_accuracy))
            with open(r'./model/eval_result.txt', 'w', encoding='utf-8') as f:
                f.write('Eval Cost: {0}, Eval Accuracy: {1}'.format(eval_cost, eval_accuracy))

if __name__ == '__main__':
    main()
For prediction I rewrote the convert_single_example function.
Since this is only prediction on a handful of single sentences, I don't bother writing and parsing a tfrecord file; the inputs are padded directly.
import tensorflow as tf
import tokenization
import modelling
from run_classifier import InputExample, InputFeatures

# convert a single InputExample into an InputFeatures object (no label needed for prediction)
def convert_single_example(example, tokenizer):
    text = example.text
    tokens = tokenizer.tokenize(text)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_size = len(input_ids)
    feature = InputFeatures(input_ids=input_ids, input_size=input_size, label=None)
    return feature

def main(data):
    tokenizer = tokenization.FullTokenizer(vocab_file=r'./vocab.txt')
    config = modelling.LstmConfig(vocab_size=68355)
    input_ids = []
    input_sizes = []
    max_length = 0
    for index in range(len(data)):
        guid = 'test-%d' % index
        text = tokenization.convert_to_unicode(str(data[index]))
        data_example = InputExample(guid=guid, text=text, label=None)
        data_feature = convert_single_example(example=data_example, tokenizer=tokenizer)
        print(data_feature.input_ids)
        if len(data_feature.input_ids) > max_length:
            max_length = len(data_feature.input_ids)
        input_ids.append(data_feature.input_ids)
        input_sizes.append(data_feature.input_size)
    # pad every sentence to the length of the longest one with the [PAD] id (0)
    for input_id in input_ids:
        if len(input_id) < max_length:
            input_id.extend((max_length - len(input_id)) * [0])
    input_ids = tf.convert_to_tensor(input_ids)
    input_sizes = tf.convert_to_tensor(input_sizes)
    predict_model = modelling.LstmModel(config, mode='predict')
    output_op = predict_model.forward(input_ids, input_sizes, label=None)
    with tf.Session() as sess:
        saver = tf.train.Saver()
        saver.restore(sess, r'./model/lstm.ckpt')
        output = sess.run(output_op)
        print(output)
        print(tf.argmax(output, 1).eval())

if __name__ == '__main__':
    text = ['这个房间真的很棒,又舒服又便宜', '这个房间太差了,又贵又破,不推荐', '一句很普通很中立的话']
    main(text)
On the same training and test sets, BERT finishes with an accuracy of 0.946, while the LSTM reaches 0.897 after 2 epochs and without any learning-rate tuning. BERT took three days and two nights on a server; the LSTM took an hour and a half on my local CPU. Not a bad result.
tf.estimator: this high-level API looks really powerful, and BERT uses it as well, so I want to keep studying it. It revolves around building a model_fn. My understanding is that this means refactoring the model: modelling should only define the model structure, and the whole forward pass should move into model_fn. The concrete details still need more digging.
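As a first impression of what that refactor might look like, here is a minimal model_fn skeleton (an assumption about the shape of the code, not a working port; build_lstm_graph is a hypothetical helper standing in for the LstmModel forward pass):

def model_fn(features, labels, mode, params):
    # the graph is built once; mode decides which fields of the EstimatorSpec get filled in
    logits = build_lstm_graph(features['input_ids'], features['input_size'], params)  # hypothetical helper
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions={'class': tf.argmax(logits, 1)})
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
    if mode == tf.estimator.ModeKeys.EVAL:
        metrics = {'accuracy': tf.metrics.accuracy(labels, tf.argmax(logits, 1))}
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
    optimizer = tf.train.GradientDescentOptimizer(params['learning_rate'])
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir='./model',
                                   params={'learning_rate': 0.5, 'vocab_size': 68355})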
I have to say that even if this code is still nothing fancy, it is already far clearer in structure than the first version I wrote myself. Along the way I also got a much better grip on tf.data (I used to write next_batch entirely by hand, whereas here the dataset iterator handles all of it). Reading good source code really does pay off.