Most TensorFlow tutorials and demos you find online feed training data through tf.placeholder. For large text or image datasets that approach does not scale: the whole dataset has to sit in Python memory and be copied into the graph on every step, so both performance and memory become bottlenecks. Most sample code also skips model saving entirely, let alone deployment. This post walks through the full pipeline: converting raw data to TFRecord files, feeding the TFRecord data into model training, saving the model, converting the saved checkpoint to a pb graph, and finally deploying it with TensorFlow's Java API or TensorFlow Serving, as mentioned in an earlier post. This is only a brief introduction to get you started.
Raw data (each line is a label, a tab, then whitespace-tokenized text):
1 可惜 关门 了 关门 了 啥时候 再 开 家
0 没 去过 不知道 不知道 为什么 点评
0 还 不错 价格 搬 哈哈哈 哈哈哈 哈哈哈 哈哈哈 哈哈哈
1 已经 关门 了 已经 关门 了 已经 关门 了 已经 关门 了 已经 关门 了 已经 关门 了
1 口味 好 服务 也好 经常 来吃 可惜 现在 好像 倒闭 了
Converting to TFRecords:
import tensorflow.contrib.keras as kr
import codecs
import tensorflow as tf
def writerecord(savepath, vocabpath, filename, maxlen=50):
    writer = tf.python_io.TFRecordWriter(savepath)
    _, word_to_id = _read_vocab(vocabpath)
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            try:
                label, contet = line.strip().split('\t')
                conntlist = contet.split(" ")
                label = int(label)
                lablehot = [0, 0]
                lablehot[label] = 1
                features = [word_to_id[x] if x in word_to_id else word_to_id["_UNK"] for x in conntlist]
                # pad with 0 or truncate so every example has exactly maxlen ids
                if len(features) >= maxlen:
                    x_pad = features[0:maxlen]
                else:
                    x_pad = features + [0] * (maxlen - len(features))
                example = tf.train.Example(
                    features=tf.train.Features(
                        feature={'input': tf.train.Feature(int64_list=tf.train.Int64List(value=x_pad)),
                                 'lable': tf.train.Feature(int64_list=tf.train.Int64List(value=lablehot))}))
                serialized = example.SerializeToString()
                writer.write(serialized)
            except Exception as e:
                print(e)
    writer.close()
    print("finish")
def read_and_decode(filename_queue):
    # create a reader to read examples from the TFRecord file
    reader = tf.TFRecordReader()
    # read one serialized example from the file
    _, serialized_example = reader.read(filename_queue)
    # parse the serialized example
    features = tf.parse_single_example(serialized_example, features={
        'input': tf.FixedLenFeature([50], tf.int64),
        'lable': tf.FixedLenFeature([2], tf.int64)
    })
    x = tf.cast(features['input'], tf.int32)
    y = tf.cast(features['lable'], tf.float32)
    return x, y
"""
inputs获批量的数据,其中这里num_epochs通常设置为None,
如果使用的会报错
"""
def inputs(file, batch_size, num_epochs):
if not num_epochs:
num_epochs = None
filename_queue = tf.train.string_input_producer([file])
image, label = read_and_decode(filename_queue)
x,y=tf.train.shuffle_batch([image,label],batch_size=batch_size, capacity=300, min_after_dequeue=10)
return x, y
def getvocablen(filename):
    words = list(map(lambda line: line.strip(), codecs.open(filename, 'r', encoding='utf-8').readlines()))
    return len(words)
def _read_vocab(filename):
    """Read the vocabulary list (one token per line) and build a token-to-id map."""
    words = list(map(lambda line: line.strip(), codecs.open(filename, 'r', encoding='utf-8').readlines()))
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id
def file_to_ids_single(content, word_to_id, maxlen=50):
    contents = []
    contents.append(list(content.lower()))
    data_id = []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
    # print("data_id is :", data_id)
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, maxlen=maxlen, padding='post',
                                                    truncating='post')
    return x_pad
if __name__ == '__main__':
    """data_id is : [[266, 1548, 255]]"""
    words, word_to_id = _read_vocab("tensorflow/vocab.txt")
    print("len word_to_id:", len(word_to_id))
    result = file_to_ids_single("准备一个小时", word_to_id=word_to_id)
    print(result[0][49])
    print(result)
    # build_vocab(Path.baseabusepath)
    writerecord("tensorflow/tf.records", "tensorflow/vocab.txt", "tensorflow/cnn.txt")
    # x_train, y_train, words = preocess_file()
    # print(x_train.shape, y_train.shape)
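After the conversion it is worth confirming that the records round-trip correctly. This is a small verification sketch I am adding (not part of the original script); it assumes the file was written to tensorflow/tf.records as above and uses the TF 1.x tf.python_io.tf_record_iterator:
import tensorflow as tf

# quick check: read the first record back and inspect its two fields
path = "tensorflow/tf.records"  # path assumed from the writerecord call above
for serialized in tf.python_io.tf_record_iterator(path):
    example = tf.train.Example.FromString(serialized)
    ids = example.features.feature['input'].int64_list.value    # 50 padded token ids
    label = example.features.feature['lable'].int64_list.value  # one-hot label, e.g. [0, 1]
    print(list(ids), list(label))
    break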
Model class:
import tensorflow as tf
class TextCNNMulFilterSize(object):
    def __init__(self, config, input_x, label, keep_prob):
        self.config = config
        self.input_x = input_x
        self.input_y = label
        self.keep_prob = keep_prob
        self.filter_sizes = list(map(int, self.config.multi_kernel_size.split(",")))
        self.cnn()
    # Embedding layer
    def input_embedding(self):
        """Word embedding.
        The gpu device pinning is removed here: the model is served on cpu online,
        and a graph pinned to a gpu device would fail there.
        """
        # with tf.device('/gpu:0'):
        embedding = tf.get_variable("embedding", [self.config.vocab_size, self.config.embedding_dim])
        _input = tf.nn.embedding_lookup(embedding, self.input_x)
        _input_expanded = tf.expand_dims(_input, -1)
        return _input_expanded
    def cnn(self):
        l2_loss = tf.constant(0.0)
        pooled_outputs = []
        embedding_inputs = self.input_embedding()
        print(tf.shape(embedding_inputs))
        for i, filter_size in enumerate(self.filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, self.config.embedding_dim, 1, self.config.num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[self.config.num_filters]), name="b")
                conv = tf.nn.conv2d(embedding_inputs, W, strides=[1, 1, 1, 1], padding="VALID", name="conv")
                tf.losses.add_loss(self.config.l2_reg_lambda * tf.nn.l2_loss(W), tf.GraphKeys.REGULARIZATION_LOSSES)
                tf.losses.add_loss(self.config.l2_reg_lambda * tf.nn.l2_loss(b), tf.GraphKeys.REGULARIZATION_LOSSES)
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(h, ksize=[1, self.config.seq_length - filter_size + 1, 1, 1],
                                        strides=[1, 1, 1, 1], padding='VALID', name="pool")
                pooled_outputs.append(pooled)
        # Combine all the pooled features
        num_filters_total = (self.config.num_filters) * len(self.filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
        # dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.keep_prob)
        with tf.name_scope("score"):
            W = tf.get_variable("W",
                                shape=[num_filters_total, self.config.num_classes],
                                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[self.config.num_classes]), name="b")
            # l2_loss += tf.nn.l2_loss(W)
            # l2_loss += tf.nn.l2_loss(b)
            tf.losses.add_loss(self.config.l2_reg_lambda * tf.nn.l2_loss(W), tf.GraphKeys.REGULARIZATION_LOSSES)
            tf.losses.add_loss(self.config.l2_reg_lambda * tf.nn.l2_loss(b), tf.GraphKeys.REGULARIZATION_LOSSES)
            self.logits = tf.nn.xw_plus_b(self.h_drop, W, b, name="logits")
            self.pred_y = tf.nn.softmax(self.logits, name="pred_y")
            tf.add_to_collection('pred_network', self.pred_y)
            self.predictions = tf.argmax(self.pred_y, 1, name="predictions")
        with tf.name_scope("loss"):
            tf.losses.softmax_cross_entropy(logits=self.logits, onehot_labels=self.input_y)
            # get_total_loss() sums the cross entropy plus the regularization losses registered above
            self.loss = tf.losses.get_total_loss()
            # losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
            # self.loss = tf.reduce_mean(losses) + self.config.l2_reg_lambda * l2_loss
        with tf.name_scope("optimize"):
            # optimizer
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.config.learning_rate)
            self.optim = optimizer.minimize(self.loss)
        with tf.name_scope("accuracy"):
            correct_pred = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.acc = tf.reduce_mean(tf.cast(correct_pred, "float"), name="accuracy")
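The TCNNConfig class is imported from configuration.py, which this post does not reproduce. As a rough sketch of what it needs to contain, the field names below are exactly what the model above reads, while the concrete values are my own illustrative guesses:
class TCNNConfig(object):
    """Hyper-parameters read by TextCNNMulFilterSize; the values are illustrative only."""
    embedding_dim = 128          # embedding size
    seq_length = 50              # must match the padded length used in the TFRecords
    num_classes = 2              # matches the one-hot label [0, 1]
    num_filters = 128            # filters per kernel size
    multi_kernel_size = "2,3,4"  # parsed with split(",") in the model
    is_multi_kernel_size = True  # the training script only builds the model when True
    l2_reg_lambda = 0.1
    learning_rate = 1e-3
    vocab_size = 5000            # overwritten at runtime with the length of vocab.txt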
Model training and saving:
from model import TextCNNMulFilterSize
from configuration import TCNNConfig
from data_utils import inputs,getvocablen
import time
import tensorflow as tf
import os
from datetime import timedelta
#basepath="/Users/shuubiasahi/Documents/python"
basepath="/home/zhoumeixu"
data_path=basepath+"/credit-tftextclassify-poi/tensorflow/tf.records"
vocapath=basepath+"/credit-tftextclassify-poi/tensorflow/vocab.txt"
modelpath=basepath+"/credit-tftextclassify-poi/tensorflow/"
print(modelpath, "starting training of the poi classification model")
def run_epoch():
    # load data
    print('Loading data...')
    vocablen = getvocablen(vocapath)
    x_train, y_train = inputs(data_path, batch_size=30, num_epochs=3)
    keep_prob = tf.constant(0.9, dtype=tf.float32)
    print('Using CNN model...')
    config = TCNNConfig()
    config.vocab_size = vocablen
    print("vocab_size is:", config.vocab_size)
    if config.is_multi_kernel_size == True:
        model = TextCNNMulFilterSize(config, x_train, y_train, keep_prob)
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord, sess=session)
    start_time = time.time()
    for i in range(1000):
        # session.run(model.optim)
        _, loss_train, acc_train = session.run([model.optim, model.loss, model.acc])
        if i % 25 == 0:
            end_time = time.time()
            time_dif = end_time - start_time
            time_dif = timedelta(seconds=int(round(time_dif)))
            msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                  + ' Time: {3}'
            print(msg.format(i + 1, loss_train, acc_train, time_dif))
            start_time = time.time()
        if i % 100 == 0:
            saver.save(session, "/tmp/model/model.ckpt", global_step=i)
    coord.request_stop()
    coord.join(threads)
    session.close()
if __name__ == '__main__':
    run_epoch()
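Before exporting, it is easy to check what actually landed in /tmp/model. A quick inspection sketch (my addition, not part of the original training script):
import tensorflow as tf

ckpt = tf.train.latest_checkpoint("/tmp/model")
print(ckpt)  # e.g. /tmp/model/model.ckpt-900
# list the variables stored in the checkpoint and their shapes
reader = tf.train.NewCheckpointReader(ckpt)
for name, shape in sorted(reader.get_variable_to_shape_map().items()):
    print(name, shape)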
Converting the model to pb format for prediction:
from model import TextCNNMulFilterSize
from configuration import TCNNConfig
from data_utils import inputs,getvocablen
import time
import tensorflow as tf
#basepath="/Users/shuubiasahi/Documents/python"
basepath="/home/zhoumeixu"
data_path=basepath+"/credit-tftextclassify-poi/tensorflow/tf.records"
vocapath=basepath+"/credit-tftextclassify-poi/tensorflow/vocab.txt"
modelpath=basepath+"/credit-tftextclassify-poi/tensorflow/"
def run_epoch():
    # rebuild the graph with placeholders for export
    graph = tf.Graph()  # after 1.4.1 you can also use tf.get_default_graph()
    vocablen = getvocablen(vocapath)
    with graph.as_default():
        x_train = tf.placeholder(tf.int32, [None, None], name="input_x")
        y_train = tf.placeholder(tf.float32, [None, None], name="input_y")
        keep_prob = tf.placeholder(tf.float32, name="keep_prob")
        config = TCNNConfig()
        config.vocab_size = vocablen
        print("vocab_size is:", config.vocab_size)
        model = TextCNNMulFilterSize(config, x_train, y_train, keep_prob)
        output = model.pred_y
        restore_saver = tf.train.Saver()
    with tf.Session(graph=graph) as sess:
        sess.run(tf.global_variables_initializer())
        latest_ckpt = tf.train.latest_checkpoint("/tmp/model")
        print(latest_ckpt)
        print("keep_prob is:", sess.run(model.keep_prob, feed_dict={model.keep_prob: 1.0}))
        restore_saver.restore(sess, latest_ckpt)
        # freeze the restored variables into constants, keeping the input and output nodes
        output_graph_def = tf.graph_util. \
            convert_variables_to_constants(sess, sess.graph_def, [x_train.op.name, keep_prob.op.name, output.op.name])
        tf.train.write_graph(output_graph_def, '.', "./tensorflow/graph.model", as_text=False)
if __name__ == '__main__':
    run_epoch()
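Before wiring up the Java side, the frozen graph can be sanity-checked from Python. A minimal loading sketch (my addition; it assumes the graph was written to ./tensorflow/graph.model as above and that vocab.txt contains the "_UNK" token used by writerecord):
import tensorflow as tf
from data_utils import _read_vocab

# load the frozen GraphDef written by the export step above
graph_def = tf.GraphDef()
with tf.gfile.GFile("./tensorflow/graph.model", "rb") as f:
    graph_def.ParseFromString(f.read())
graph = tf.Graph()
with graph.as_default():
    tf.import_graph_def(graph_def, name="")  # keep the original node names

# build one padded input row the same way writerecord does (pad with 0 to length 50)
_, word_to_id = _read_vocab("tensorflow/vocab.txt")
tokens = "可惜 关门 了".split(" ")
ids = [word_to_id.get(t, word_to_id["_UNK"]) for t in tokens]
ids = [ids + [0] * (50 - len(ids))]  # shape [1, 50]

with tf.Session(graph=graph) as sess:
    pred = sess.run("score/pred_y:0",
                    feed_dict={"input_x:0": ids, "keep_prob:0": 1.0})
    print(pred)  # e.g. [[0.1, 0.9]] -> probability for each class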
Java inference code:
// arr is a padded [1][50] id matrix built from the input text; gettexttoidnews (not shown)
// is assumed to mirror the Python padding logic. sess is a Session over the imported
// graph.model GraphDef, and keep_prob is a scalar Tensor such as Tensor.create(1.0f).
int[][] arr = gettexttoidnews(text, map);
Tensor input = Tensor.create(arr);
Tensor result = sess.runner().feed("input_x", input).feed("keep_prob", keep_prob).fetch("score/pred_y").run()
        .get(0);
long[] rshape = result.shape();
int nlabels = (int) rshape[1];
int batchSize = (int) rshape[0];
float[][] logits = result.copyTo(new float[batchSize][nlabels]);
if (nlabels > 1 && batchSize > 0) {
    return logits[0][1];
}
The steps above cover what is, more or less, the standard path from training a TensorFlow model to deploying it. Feel free to contact me if you run into problems.