This is the main script. It converts each input sentence into word ids, averages the word embeddings with reduce_mean, applies a single wx+b layer, and finally a softmax to get the class probabilities; hierarchical softmax is not used. (A minimal inference sketch follows the script.)
import tensorflow as tf
from sklearn import metrics
import sys
import os
import time
import numpy as np
from datetime import timedelta
from data.loader import read_vocab, read_category, batch_iter, process_file, build_vocab
tf.app.flags.DEFINE_string('train_board_dir', 'tensorboard/fasttext/train', 'Path to tensorboard')
tf.app.flags.DEFINE_string('test_board_dir', 'tensorboard/fasttext/test', 'Path to tensorboard')
tf.app.flags.DEFINE_string('train_dir', 'data/cnews/cnews.train.txt', 'Path to train txt')
tf.app.flags.DEFINE_string('val_dir', 'data/cnews/cnews.val.txt', 'Path to val txt')
tf.app.flags.DEFINE_string('test_dir', 'data/cnews/cnews.test.txt', 'Path to test txt')
tf.app.flags.DEFINE_string('vocab_dir', 'data/cnews/cnews.vocab.txt', 'Path to vocabulary txt')
tf.app.flags.DEFINE_string('checkpoints_dir', 'checkpoints/fasttext/best_val', 'Path to checkpoints')
tf.app.flags.DEFINE_integer('batch_size', 256, 'default 256')
tf.app.flags.DEFINE_integer('epoch', 150, 'number of training epochs, default 150')
tf.app.flags.DEFINE_float('keep_prob', 0.8, 'default 0.8')
tf.app.flags.DEFINE_integer('batch_saver_summary', 10, 'write a TensorBoard summary every N batches, default 10')
tf.app.flags.DEFINE_integer('batch_test_acc', 20, 'evaluate on the validation set every N batches, default 20')
tf.app.flags.DEFINE_float('learn_rate', 1e-3, 'default 0.001')
FLAGS = tf.app.flags.FLAGS
class FastText:
    def __init__(self, embedding_size=150, num_classes=22, vocab_size=5000, seq_length=600):
        self.embedding_size = embedding_size
        self.num_classes = num_classes
        self.vocab_size = vocab_size
        self.input_x = tf.placeholder(dtype=tf.int32, shape=[None, seq_length], name='input_x')
        # the one-hot labels from kr.utils.to_categorical are floats, and
        # softmax_cross_entropy_with_logits_v2 expects float labels as well
        self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, num_classes], name='input_y')
        # keep_prob is fed everywhere, although no dropout layer actually uses it yet
        self.keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')
self.run()
    def run(self):
        with tf.name_scope('embedding'):
            embedding = tf.get_variable('embedding', [self.vocab_size, self.embedding_size])
            # keep the looked-up embeddings on self so getdictionary() can fetch them later
            self.embedding_input = tf.nn.embedding_lookup(embedding, self.input_x)
        with tf.name_scope('init_w_b'):
            W = tf.Variable(initial_value=tf.random_normal([self.embedding_size, self.num_classes]), name='W')
            b = tf.Variable(initial_value=tf.random_normal([self.num_classes]), name='b')
        with tf.name_scope('fasttext'):
            # fastText-style model: average the word embeddings, then one linear layer
            reduce_input = tf.reduce_mean(self.embedding_input, axis=1)
            logits = tf.nn.xw_plus_b(reduce_input, W, b, name='logits')
            self.predict = tf.argmax(tf.nn.softmax(logits), 1, name='predict')
with tf.name_scope("accuracy"):
judge = tf.equal(tf.argmax(self.input_y, 1), self.predict)
self.acc = tf.reduce_mean(tf.cast(judge, tf.float32))
with tf.name_scope('optimize'):
self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.input_y, logits=logits))
self.optim = tf.train.AdamOptimizer(learning_rate=FLAGS.learn_rate).minimize(self.loss)
def get_time_dif(start_time):
end_time = time.time()
time_dif = end_time - start_time
return timedelta(seconds=int(round(time_dif)))
def feed_data(x_batch, y_batch, keep_prob):
feed_dict = {
model.input_x: x_batch,
model.input_y: y_batch,
model.keep_prob: keep_prob
}
return feed_dict
def train():
    train_board_dir = FLAGS.train_board_dir
    test_board_dir = FLAGS.test_board_dir
    # Saver.save() also needs the checkpoint's parent directory to exist already
    for d in [train_board_dir, test_board_dir, os.path.dirname(FLAGS.checkpoints_dir)]:
        if not os.path.exists(d):
            os.makedirs(d)
tf.summary.scalar("loss", model.loss)
tf.summary.scalar("accuracy", model.acc)
merged_summary = tf.summary.merge_all()
train_writer = tf.summary.FileWriter(train_board_dir)
test_writer = tf.summary.FileWriter(test_board_dir)
saver = tf.train.Saver()
session = tf.Session()
session.run(tf.global_variables_initializer())
train_writer.add_graph(session.graph)
_, x_train, y_train = process_file(FLAGS.train_dir, word_to_id, cat_to_id)
_, x_val, y_val = process_file(FLAGS.val_dir, word_to_id, cat_to_id)
total_batch = 0
best_acc_val = 0.0
start_time = time.time()
for epoch in range(FLAGS.epoch):
print('Epoch:', epoch + 1)
batch_train = batch_iter(x_train, y_train, FLAGS.batch_size)
for x_batch, y_batch in batch_train:
feed_dict = feed_data(x_batch, y_batch, FLAGS.keep_prob)
if total_batch % FLAGS.batch_saver_summary == 0:
s = session.run(merged_summary, feed_dict=feed_dict)
train_writer.add_summary(s, total_batch)
            if total_batch % FLAGS.batch_test_acc == 0:
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                # evaluate with a separate feed dict so the optimizer step below still
                # uses the current training batch rather than the whole validation set
                val_feed_dict = feed_data(x_val, y_val, 1.0)
                val_loss, val_acc, summ = session.run([model.loss, model.acc, merged_summary], feed_dict=val_feed_dict)
                test_writer.add_summary(summ, total_batch)
if val_acc > best_acc_val:
best_acc_val = val_acc
saver.save(sess=session, save_path=FLAGS.checkpoints_dir)
improved_str = '*'
else:
improved_str = ''
time_dif = get_time_dif(start_time)
msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
+ ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
print(msg.format(total_batch, loss_train, acc_train, val_loss, val_acc, time_dif, improved_str))
#getdictionary(session, ['一', '想', '津', '京', '建', '设', '广', '点', '微', '阿'])
session.run([model.optim], feed_dict=feed_dict)
total_batch += 1
def test():
contents, x_test, y_test = process_file(FLAGS.test_dir, word_to_id, cat_to_id)
batch_test = batch_iter(x_test, y_test, FLAGS.batch_size)
session = tf.Session()
session.run(tf.global_variables_initializer())
saver = tf.train.Saver()
saver.restore(sess=session, save_path=FLAGS.checkpoints_dir)
y_batch_test = np.zeros(0, dtype=np.int32)
predict_list = np.zeros(0, dtype=np.int32)
for x_batch, y_batch in batch_test:
feed_dict = feed_data(x_batch, y_batch, 1.0)
temp = session.run([model.predict], feed_dict=feed_dict)
predict_list = np.append(predict_list, temp)
y_batch_test = np.append(y_batch_test, np.argmax(y_batch, axis=1))
print("Precision, Recall and F1-Score...")
print(metrics.classification_report(y_batch_test, predict_list.tolist(), target_names=categories))
if __name__ == '__main__':
if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
raise ValueError("""usage: python run_cnn.py [train / test]""")
    categories, cat_to_id = read_category()
    if not os.path.exists(FLAGS.vocab_dir):
        build_vocab(FLAGS.train_dir, FLAGS.vocab_dir)  # build the vocabulary on the first run
    words, word_to_id = read_vocab(FLAGS.vocab_dir)
    model = FastText(num_classes=len(cat_to_id))  # size the output layer from the category list
if sys.argv[1] == 'train':
train()
else:
test()
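As a sanity check, here is a minimal inference sketch (my own addition, not part of the original script) showing how the pieces fit together. It assumes training has already produced a checkpoint at FLAGS.checkpoints_dir and that the globals model, word_to_id and categories from the __main__ block are in scope; the helper name predict_sentence is hypothetical.

import tensorflow.contrib.keras as kr

def predict_sentence(text):
    """Hypothetical helper: classify one raw sentence with the trained checkpoint."""
    # map characters to ids, keeping only those present in the vocabulary
    ids = [word_to_id[w] for w in text if w in word_to_id]
    x = kr.preprocessing.sequence.pad_sequences([ids], 600)  # same max_length as process_file
    session = tf.Session()
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=FLAGS.checkpoints_dir)
    pred = session.run(model.predict, feed_dict={model.input_x: x, model.keep_prob: 1.0})
    return categories[pred[0]]

Calling predict_sentence(u'<some news text>') then returns one of the labels from read_category().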
This is the text-processing script; it is also quite simple. (A short usage sketch follows it.)
# coding: utf-8
import sys
from collections import Counter
import numpy as np
import tensorflow.contrib.keras as kr
if sys.version_info[0] > 2:
is_py3 = True
else:
reload(sys)
sys.setdefaultencoding("utf-8")
is_py3 = False
def native_word(word, encoding='utf-8'):
"""如果在python2下面使用python3训练的模型,可考虑调用此函数转化一下字符编码"""
if not is_py3:
return word.encode(encoding)
else:
return word
def native_content(content):
if not is_py3:
return content.decode('utf-8')
else:
return content
def open_file(filename, mode='r'):
"""
    Common file-opening helper that works under both Python 2 and Python 3.
mode: 'r' or 'w' for read or write
"""
if is_py3:
return open(filename, mode, encoding='utf-8', errors='ignore')
else:
return open(filename, mode)
def read_file(filename):
"""读取文件数据"""
contents, labels = [], []
with open_file(filename) as f:
for line in f:
try:
label, content = line.strip().split('\t')
if content:
contents.append(list(native_content(content)))
labels.append(native_content(label))
except:
pass
return contents, labels
def build_vocab(train_dir, vocab_dir, vocab_size=5000):
"""根据训练集构建词汇表,存储"""
data_train, _ = read_file(train_dir)
all_data = []
for content in data_train:
all_data.extend(content)
counter = Counter(all_data)
count_pairs = counter.most_common(vocab_size - 1)
words, _ = list(zip(*count_pairs))
    # add a '<PAD>' token so that every text can later be padded to the same length
    words = ['<PAD>'] + list(words)
open_file(vocab_dir, mode='w').write('\n'.join(words) + '\n')
def read_vocab(vocab_dir):
"""读取词汇表"""
# words = open_file(vocab_dir).read().strip().split('\n')
with open_file(vocab_dir) as fp:
        # under py2, convert every value to unicode
words = [native_content(_.strip()) for _ in fp.readlines()]
word_to_id = dict(zip(words, range(len(words))))
return words, word_to_id
def read_category():
"""读取分类目录,固定"""
categories = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
'11', '12', '13', '14', '15', '15', '17']
categories = [native_content(x) for x in categories]
cat_to_id = dict(zip(categories, range(len(categories))))
return categories, cat_to_id
def to_words(content, words):
"""将id表示的内容转换为文字"""
return ''.join(words[x] for x in content)
def process_file(filename, word_to_id, cat_to_id, max_length=600):
"""将文件转换为id表示"""
contents, labels = read_file(filename)
data_id, label_id = [], []
for i in range(len(contents)):
data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
label_id.append(cat_to_id[labels[i]])
    # use pad_sequences from keras to pad every text to a fixed length
x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))  # convert the labels to a one-hot representation
return contents, x_pad, y_pad
def batch_iter(x, y, batch_size=64):
"""生成批次数据"""
data_len = len(x)
num_batch = int((data_len - 1) / batch_size) + 1
indices = np.random.permutation(np.arange(data_len))
x_shuffle = x[indices]
y_shuffle = y[indices]
for i in range(num_batch):
start_id = i * batch_size
end_id = min((i + 1) * batch_size, data_len)
yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
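To make the data flow concrete, here is a small usage sketch of the helpers above (my own example; the file paths simply reuse the FLAGS defaults from the main script and are otherwise assumptions):

# (re)build the vocabulary from the training file, then load it
build_vocab('data/cnews/cnews.train.txt', 'data/cnews/cnews.vocab.txt', vocab_size=5000)
categories, cat_to_id = read_category()
words, word_to_id = read_vocab('data/cnews/cnews.vocab.txt')

# convert a tab-separated "label\tcontent" file into padded id matrices and one-hot labels
contents, x_train, y_train = process_file('data/cnews/cnews.train.txt', word_to_id, cat_to_id, max_length=600)
print(x_train.shape, y_train.shape)  # (num_samples, 600) and (num_samples, len(cat_to_id))

# iterate over shuffled mini-batches
for x_batch, y_batch in batch_iter(x_train, y_train, batch_size=64):
    pass  # feed each (x_batch, y_batch) pair to the model here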
The function below looks up, during training, the characters most similar to a given list of characters. It feels a bit buggy: after running for a long time it crashes, most likely because new TensorFlow ops are created inside the loop on every call, so the graph keeps growing. (A numpy-only variant is sketched after the function.)
def getdictionary(session, word_list=[], top_k=10):
    # run ids 0..4799 through the embedding layer as 8 fake "sentences" of length 600
    input_x = np.array([list(range(4800))])
    input_x = np.reshape(input_x, (-1, 600))
    feed_dict = {
        model.input_x: input_x,
        model.keep_prob: 1.0
    }
    aaa = session.run([model.embedding_input], feed_dict=feed_dict)
    # one row per character id; use the real embedding size instead of a hard-coded 100
    bbb = np.reshape(aaa, (-1, model.embedding_size))
    # Euclidean distance
    sim = np.zeros((0, 4800), dtype=np.float32)
    for i in range(len(word_list)):
        wi = word_list[i]
        index = word_to_id[wi]
        w = bbb[index]
        # careful: this builds new tf ops on every call, so the graph keeps growing
        r = tf.sqrt(tf.reduce_sum(tf.square([w] - bbb), 1)).eval(session=session)
        sim = np.append(sim, [r], axis=0)
'''
    # cosine distance
norm = tf.sqrt(tf.reduce_sum(tf.square(bbb), 1, keepdims=True))
normalized_embeddings = bbb / norm
valid_dataset = np.array(list(range(4800)))
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
sim = similarity.eval(session=session)
'''
for i in range(len(word_list)):
wi = word_list[i]
#print(i)
nearest = (sim[i, :]).argsort()[1:top_k + 1]
log_str = 'Nearest to %s:' % wi
for k in range(top_k):
close_word = words[nearest[k]]
log_str = '%s %s,' % (log_str, close_word)
print(log_str)
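If the crash really is caused by the ops created inside the loop, a variant like the following (my own sketch, not the original code; the name getdictionary_np is hypothetical) does the same nearest-character lookup purely in numpy after a single session.run, so the graph never grows:

def getdictionary_np(session, word_list=(), top_k=10):
    # fetch the embeddings of ids 0..4799 once; no new graph ops are created per call
    input_x = np.reshape(np.arange(4800), (-1, 600))
    emb = session.run(model.embedding_input,
                      feed_dict={model.input_x: input_x, model.keep_prob: 1.0})
    emb = np.reshape(emb, (-1, model.embedding_size))
    for wi in word_list:
        w = emb[word_to_id[wi]]
        dist = np.sqrt(np.sum(np.square(emb - w), axis=1))  # Euclidean distance to every character
        nearest = dist.argsort()[1:top_k + 1]  # skip the first hit, which is the character itself
        print('Nearest to %s: %s' % (wi, ' '.join(words[k] for k in nearest)))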
Below is the training curve plot (TensorBoard screenshot).
Overall the results are decent, but not as good as textcnn; this was mostly written for fun.
Because I did not use hierarchical softmax, training is still not as fast as Facebook's official fastText implementation; a rough sketch of a sampled-softmax alternative follows.
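For reference, TensorFlow's built-in way to cheapen a large softmax during training is sampled softmax rather than fastText's hierarchical softmax. A rough, untested sketch of how the loss inside run() could be swapped is below (W, b, reduce_input and self.input_y are the names from the code above; with only ~17 categories this is hardly worthwhile, it only pays off for very large label sets):

        with tf.name_scope('sampled_loss'):
            # sampled_softmax_loss expects the weight matrix as [num_classes, dim] and integer class ids
            labels = tf.reshape(tf.argmax(self.input_y, 1), [-1, 1])  # one-hot -> [batch, 1] ids
            self.loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(
                weights=tf.transpose(W),   # [num_classes, embedding_size]
                biases=b,
                labels=labels,
                inputs=reduce_input,       # the averaged embeddings, [batch, embedding_size]
                num_sampled=5,             # negative classes sampled per example
                num_classes=self.num_classes))

Sampled softmax is a training-time approximation only; evaluation and prediction should still go through the full softmax path.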