AI in Practice: Shanghai Garbage Sorting Series (1): Quickly Building a Garbage Classification Model
AI in Practice: Shanghai Garbage Sorting Series (2): Quickly Building a Backend Service for the Garbage Classification Model
AI in Practice: Shanghai Garbage Sorting Series (3): Quickly Building a Garbage-Sorting Q&A Chatbot
Quickly building a garbage-sorting Q&A chatbot: we use a deep-learning approach to build the algorithm framework, aiming to keep it simple, fast, and effective.
Reference: 2019 Shanghai household-waste sorting standard: http://sh.bendibao.com/zffw/2019225/202535.shtm
Reference: Shanghai household-waste sorting standard and disposal requirements: https://www.sohu.com/a/163450869_688983
There are many frameworks for building a Q&A system in a vertical domain. Here we take the simplest approach: split garbage-sorting questions into 8 categories and return a randomly chosen canned answer for each category.
Algorithm framework 1:
1. word2vec + TextCNN for the classification model
2. A category-to-answer mapping table for the reply
Algorithm framework 2:
1. word2index + TextCNN for the classification model
2. A category-to-answer mapping table for the reply
Pros and cons of the two approaches (a minimal sketch of the two embedding strategies follows this list):
1. Approach 1 has a clear advantage in capturing semantics, but the model performs poorly when the training corpus is small (hundreds to a few thousand samples).
2. Approach 2 converges better on a small corpus, but handles unseen words poorly.
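To make the contrast concrete, here is a minimal, self-contained sketch of the two embedding strategies. The helper names and the `word2vec` package usage are illustrative assumptions, not this project's actual code.

# Sketch only: two ways to turn a segmented question into the fixed-length
# index sequence that TextCNN consumes.

# Approach 2 (word2index): build a vocabulary from the training corpus itself.
def build_vocab(corpus):
    """corpus: list of space-separated, word-segmented sentences."""
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for sentence in corpus:
        for word in sentence.split():
            vocab.setdefault(word, len(vocab))
    return vocab

def to_indices(sentence, vocab, max_len=10):
    ids = [vocab.get(w, vocab.get('<UNK>', 0)) for w in sentence.split()][:max_len]
    return ids + [0] * (max_len - len(ids))  # pad with 0 up to max_len

# Approach 1 (word2vec): reuse the index space of pre-trained vectors so the
# embedding matrix can be initialised from them, e.g. (assumed usage of the
# `word2vec` package listed in the requirements below):
#   w2v = word2vec.load('./data/word2vec.bin')
#   ids = to_indices('猪肉 饺子 是 什么 垃圾', w2v.vocab_hash)

if __name__ == '__main__':
    corpus = ['猪肉 饺子 是 什么 垃圾', '电池 属于 哪类 垃圾']
    vocab = build_vocab(corpus)
    print(to_indices('猪肉 饺子 是 什么 垃圾', vocab))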
Model training: train.py
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import data_helper as data_helpers
from text_cnn import TextCNN
import math
from tensorflow.contrib import learn
# Parameters
# Data loading params
tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation")
tf.flags.DEFINE_string("train_data_file", "./data/train_data.txt", "Data source for the positive data.")
tf.flags.DEFINE_string("train_label_data_file", "", "Data source for the label data.")
tf.flags.DEFINE_string("w2v_file", "./data/word2vec.bin", "w2v_file path")
#tf.flags.DEFINE_string("w2v_file", "", "w2v_file path")
#w2v_file非空表示使用词向量做embedding,否则用vocabulary方法做embedding
# Model Hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "2, 3, 4", "Comma-separated filter sizes (default: '3, 4, 5')")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")
# Training parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)")
tf.flags.DEFINE_integer("evaluate_every", 20, "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 20, "Save model after this many steps (default: 100)")
tf.flags.DEFINE_integer("num_checkpoints", 3, "Number of checkpoints to store (default: 5)")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
print("{}={}".format(attr.upper(), value))
print("")
max_words_length = 10
def load_data(w2v_model):
    """Loads starter word-vectors and the train/dev data."""
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.train_data_file)
    x = []
    vocab_size = 0
    if w2v_model is None:
        # Vocabulary-index embedding: build and save the vocabulary from the training corpus
        vocab_processor = learn.preprocessing.VocabularyProcessor(max_words_length)
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        vocab_size = len(vocab_processor.vocabulary_)
        vocab_processor.save("./data/vocab.pkl")
        print('save "./data/vocab.pkl"')
    else:
        # Word2vec embedding: map words to the indices of the pre-trained vectors
        x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash, max_words_length)
        vocab_size = len(w2v_model.vocab_hash)
        print('use w2v .bin')
    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    # Split train/dev sets
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    return x_train, x_dev, y_train, y_dev, vocab_size
def train(w2v_model):
    x_train, x_dev, y_train, y_dev, vocab_size = load_data(w2v_model)
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(
                w2v_model,
                sequence_length=x_train.shape[1],
                num_classes=y_train.shape[1],
                vocab_size=vocab_size,
                embedding_size=FLAGS.embedding_dim,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(", "))),
                num_filters=FLAGS.num_filters,
                l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs"))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            # Checkpoint directory. TensorFlow assumes this directory already exists, so create it
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            def train_step(x_batch, y_batch):
                """A single training step."""
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                """Evaluates the model on a dev set."""
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy = sess.run(
                    [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                if writer:
                    writer.add_summary(summaries, step)

            # Generate batches
            batches = data_helpers.batch_iter(
                list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)

            def dev_test():
                batches_dev = data_helpers.batch_iter(list(zip(x_dev, y_dev)), FLAGS.batch_size, 1)
                for batch_dev in batches_dev:
                    x_batch_dev, y_batch_dev = zip(*batch_dev)
                    dev_step(x_batch_dev, y_batch_dev, writer=dev_summary_writer)

            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    dev_test()
                if current_step % FLAGS.checkpoint_every == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))

if __name__ == "__main__":
    w2v_wr = data_helpers.w2v_wrapper(FLAGS.w2v_file)
    train(w2v_wr.model)
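The three scripts in this article all import data_helper (aliased as data_helpers), which is not listed here. Purely as a hedged reconstruction of the interface those calls assume, it could look roughly like the sketch below; the "label<TAB>segmented text" file format and every implementation detail are my assumptions, not the project's actual code.

# data_helper.py (sketch): mirrors only how it is called from train.py / eval.py / predict.py.
import numpy as np
import word2vec

num_classfication = 8  # number of question categories (name kept from the project notes)

class w2v_wrapper:
    def __init__(self, file_path):
        # An empty path means "no pre-trained vectors": fall back to the vocabulary method
        self.model = word2vec.load(file_path) if file_path else None

def load_data_and_labels(data_file):
    """Return (list of word-segmented sentences, one-hot label matrix)."""
    x_text, labels = [], []
    with open(data_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            label, text = line.split('\t', 1)  # assumed annotation format
            x_text.append(text)
            one_hot = [0] * num_classfication
            one_hot[int(label)] = 1
            labels.append(one_hot)
    return x_text, np.array(labels)

def get_text_idx(x_text, vocab, max_words_length):
    """Map each sentence to a fixed-length row of word indices (0 for padding/unknown)."""
    x = np.zeros((len(x_text), max_words_length), dtype=np.int32)
    for i, sentence in enumerate(x_text):
        for j, word in enumerate(sentence.split()[:max_words_length]):
            x[i, j] = vocab.get(word, 0)
    return x

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield mini-batches over the data for the given number of epochs."""
    data = list(data)
    data_size = len(data)
    num_batches = (data_size - 1) // batch_size + 1
    for _ in range(num_epochs):
        order = np.random.permutation(data_size) if shuffle else np.arange(data_size)
        for b in range(num_batches):
            start, end = b * batch_size, min((b + 1) * batch_size, data_size)
            yield [data[i] for i in order[start:end]]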
Model evaluation: eval.py
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import data_helper as data_helpers
from text_cnn import TextCNN
from tensorflow.contrib import learn
import csv
# Parameters
# Data Parameters
tf.flags.DEFINE_string("valid_data_file", "./data/valid_data.txt", "Data source for the positive data.")
#tf.flags.DEFINE_string("w2v_file", "./data/word2vec.bin", "w2v_file path")
tf.flags.DEFINE_string("w2v_file", "", "w2v_file path")
#w2v_file非空表示使用词向量做embedding,否则用vocabulary方法做embedding
# Eval Parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_string("checkpoint_dir", "./runs/checkpoints/", "Checkpoint directory from training run")
tf.flags.DEFINE_boolean("eval_train", True, "Evaluate on all training data")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
print("{}={}".format(attr.upper(), value))
print("")
max_words_length = 10
def load_data(w2v_model):
    """Loads the validation data and, if needed, the saved vocabulary."""
    print("Loading data...")
    x_text, y_test = data_helpers.load_data_and_labels(FLAGS.valid_data_file)
    y_test = np.argmax(y_test, axis=1)
    if w2v_model is not None:
        x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash, max_words_length)
    else:
        vocab = learn.preprocessing.VocabularyProcessor.restore('./data/vocab.pkl')
        x = data_helpers.get_text_idx(x_text, vocab.vocabulary_._mapping, max_words_length)
    return x, y_test
def eval(w2v_model):
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name("output/predictions").outputs[0]

            x_test, y_test = load_data(w2v_model)

            # Generate batches for one epoch
            batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False)

            # Collect the predictions here
            all_predictions = []
            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0})
                all_predictions = np.concatenate([all_predictions, batch_predictions])

            # Print accuracy if y_test is defined
            if y_test is not None:
                correct_predictions = float(sum(all_predictions == y_test))
                print("Total number of test examples: {}".format(len(y_test)))
                print("Accuracy: {:g}".format(correct_predictions / float(len(y_test))))

            # Save the evaluation to a csv
            predictions_human_readable = np.column_stack(all_predictions)
            out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv")
            print("Saving evaluation to {0}".format(out_path))
            with open(out_path, 'w') as f:
                csv.writer(f).writerows(predictions_human_readable)

if __name__ == "__main__":
    w2v_wr = data_helpers.w2v_wrapper(FLAGS.w2v_file)
    eval(w2v_wr.model)
Model inference: predict.py
import tensorflow as tf
import numpy as np
import os, sys, random
import data_helper as data_helpers
import jieba
from tensorflow.contrib import learn
# Parameters
# Data Parameters
#tf.flags.DEFINE_string("w2v_file", "./data/word2vec.bin", "w2v_file path")
tf.flags.DEFINE_string("w2v_file", "", "w2v_file path")
#w2v_file非空表示使用词向量做embedding,否则用vocabulary方法做embedding
# Eval Parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_string("checkpoint_dir", "./runs/checkpoints/", "Checkpoint directory from training run")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
max_words_length = 10  # maximum number of words taken from the input text
class RefuseClassification():

    def __init__(self):
        self.w2v_wr = data_helpers.w2v_wrapper(FLAGS.w2v_file)  # load the word vectors (if any)
        self.init_model()
        # Category id -> candidate replies; one is picked at random for each answer
        self.answer_map = {
            0: ['可回收垃圾', '是可回收垃圾哦', '可回收垃圾哟!', '这个是可回收垃圾。'],
            1: ['有害垃圾', '是有害垃圾哦', '有害垃圾哟!', '这个是有害垃圾。'],
            2: ['湿垃圾', '湿垃圾是哦', '湿垃圾哟!', '这个是湿垃圾。'],
            3: ['干垃圾', '是干垃圾哦', '干垃圾哟!', '这个是干垃圾。'],
            4: ['您好!', '客官好!', '你好。', '您好,您可以问我一些垃圾分类的问题额。'],
            5: ['再见!', '8', '88', '拜拜。'],
            6: ['您可以问我上海垃圾分类的问题。'],
            7: ['目前上海垃圾分为四大类:可回收垃圾、有害垃圾、湿垃圾、干垃圾。']
        }
        self.vocab = {}

    def deal_data(self, text):
        # Segment the question with jieba and convert it to word indices
        words = list(jieba.cut(text))[:max_words_length]
        x_text = [' '.join(words)]
        if self.w2v_wr.model is None:
            if len(self.vocab) == 0:
                self.vocab = learn.preprocessing.VocabularyProcessor.restore('./data/vocab.pkl')
            x = data_helpers.get_text_idx(x_text, self.vocab.vocabulary_._mapping, max_words_length)
        else:
            x = data_helpers.get_text_idx(x_text, self.w2v_wr.model.vocab_hash, max_words_length)
        return x

    def init_model(self):
        checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
        graph = tf.Graph()
        with graph.as_default():
            session_conf = tf.ConfigProto(
                allow_soft_placement=FLAGS.allow_soft_placement,
                log_device_placement=FLAGS.log_device_placement)
            self.sess = tf.Session(config=session_conf)
            self.sess.as_default()
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(self.sess, checkpoint_file)
            # Get the placeholders from the graph by name
            self.input_x = graph.get_operation_by_name("input_x").outputs[0]
            self.dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
            # Tensors we want to evaluate
            self.predictions = graph.get_operation_by_name("output/predictions").outputs[0]

    def predict(self, text):
        if len(text) > 60:
            return '您的输入有点长哦!'
        x_test = self.deal_data(text)
        predictions = self.sess.run(self.predictions, {self.input_x: x_test, self.dropout_keep_prob: 1.0})
        refuse_text = self.answer_map[predictions[0]]
        answer = refuse_text[random.randint(0, len(refuse_text) - 1)]
        return answer

if __name__ == "__main__":
    if len(sys.argv) == 2:
        test = RefuseClassification()
        res = test.predict(sys.argv[1])
        print('classify:', res)
Word2vec method
Training: 200 epochs, loss 0.829977, acc 0.686275
Evaluation accuracy: 0.791946
Vocabulary-index (word2index) method
Training: 200 epochs, loss 0.657236, acc 0.862745
Evaluation accuracy: 0.852349
python src/predict.py '猪肉饺子是什么垃圾?'
Result:
classify: 湿垃圾
python src/predict.py '猪骨 头是啥哪类'
Result:
classify: 干垃圾哟!
python src/predict.py '你会什么啊'
Result:
classify: 您可以问我上海垃圾分类的问题。
python src/predict.py hello
Result:
classify: 客官好!
python src/predict.py '猪肉粉条是什么垃圾'
Result:
classify: 湿垃圾是哦
But there are still problems:
python src/predict.py '你好 !'
Result:
classify: 可回收垃圾
Note: the training set contains only a little over 500 samples, so recognition accuracy for question categories 4-7 is not high; categories 0-3 are recognized fairly accurately.
This article only demonstrates one simple way to quickly build a vertical-domain Q&A system, so the accuracy will not be optimized further here.
Requirements
python: 3.x
tensorflow: 1.x
jieba
word2vec
Data annotation
Model training
python src/train.py
Training data: ./data/train_data.txt
Model evaluation
python src/eval.py
Test data: ./data/valid_data.txt
Single-sentence test
python src/predict.py '猪肉饺子是什么垃圾?'
Output:
classify: 湿垃圾
Customizing the replies
Edit the self.answer_map entries in src/predict.py.
Customizing the annotations
If you add question categories, also update num_classfication in src/data_helper.py.
Edit ./data/train_data.txt as the training set and
./data/valid_data.txt as the test set (hypothetical example lines are shown below).
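The article does not show what the annotated data looks like. Under the tab-separated assumption used in the data_helper.py sketch earlier (category id, then the jieba-segmented question), a few hypothetical lines of ./data/train_data.txt could be:

2	猪肉 饺子 是 什么 垃圾
1	废 电池 是 什么 垃圾
4	你好
7	上海 垃圾 分 哪 几类

If a new category were added (say id 8), the corresponding changes would be: append lines labelled 8 to both data files, raise num_classfication in src/data_helper.py accordingly, add an entry for key 8 to self.answer_map in src/predict.py, and then retrain with python src/train.py.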
Choosing the embedding method
Edit the following flag in train.py, eval.py, and predict.py:
tf.flags.DEFINE_string("w2v_file", "", "w2v_file path")
A non-empty w2v_file means pre-trained word vectors are used for the embedding; otherwise the vocabulary-index method is used.
The complete project includes:
1. Complete source code
2. Complete data
3. A trained model
4. Documentation
Project download: https://download.csdn.net/download/zengnlp/11297390