A simple fastText implementation

This is the main file. It converts the input sentences into word ids, applies reduce_mean over their embeddings, runs a wx_plus_b (single linear layer) step, and finally a softmax to get the classification result. Hierarchical softmax is not used here.
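
Before the full TensorFlow code, here is a minimal numpy sketch of that forward pass, just to make the data flow concrete (the sizes and names below are illustrative only, they are not taken from the real code):

import numpy as np

# toy sizes for illustration
vocab_size, embedding_size, num_classes, seq_length = 5000, 150, 17, 600

embedding = np.random.randn(vocab_size, embedding_size)   # word vectors
W = np.random.randn(embedding_size, num_classes)          # classifier weights
b = np.random.randn(num_classes)

x = np.random.randint(0, vocab_size, size=seq_length)     # one sentence as word ids
sent_vec = embedding[x].mean(axis=0)                      # reduce_mean over the sequence
logits = sent_vec.dot(W) + b                              # the wx_plus_b step
probs = np.exp(logits - logits.max())
probs /= probs.sum()                                       # softmax over the classes
print(probs.argmax())                                      # predicted class id

The actual TensorFlow implementation follows: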

import tensorflow as tf
from sklearn import metrics
import sys
import os
import time
import numpy as np
from datetime import timedelta
from data.loader import read_vocab, read_category, batch_iter, process_file, build_vocab

tf.app.flags.DEFINE_string('train_board_dir', 'tensorboard/fasttext/train', 'Path to tensorboard')
tf.app.flags.DEFINE_string('test_board_dir', 'tensorboard/fasttext/test', 'Path to tensorboard')
tf.app.flags.DEFINE_string('train_dir', 'data/cnews/cnews.train.txt', 'Path to train txt')
tf.app.flags.DEFINE_string('val_dir', 'data/cnews/cnews.val.txt', 'Path to val txt')
tf.app.flags.DEFINE_string('test_dir', 'data/cnews/cnews.test.txt', 'Path to test txt')
tf.app.flags.DEFINE_string('vocab_dir', 'data/cnews/cnews.vocab.txt', 'Path to vocabulary txt')
tf.app.flags.DEFINE_string('checkpoints_dir', 'checkpoints/fasttext/best_val', 'Path to checkpoints')
tf.app.flags.DEFINE_integer('batch_size', 256, 'default 256')
tf.app.flags.DEFINE_integer('epoch', 150, 'default 150')
tf.app.flags.DEFINE_float('keep_prob', 0.8, 'default 0.8')
tf.app.flags.DEFINE_integer('batch_saver_summary', 10, 'default 10')
tf.app.flags.DEFINE_integer('batch_test_acc', 20, 'default 20')
tf.app.flags.DEFINE_float('learn_rate', 1e-3, 'default 0.001')

FLAGS = tf.app.flags.FLAGS


class FastText:
    def __init__(self, embedding_size=150, num_classes=22, vocab_size=5000, seq_length=600):
        self.embedding_size = embedding_size
        self.num_classes = num_classes
        self.vocab_size = vocab_size
        self.input_x = tf.placeholder(dtype=tf.int32, shape=[None, seq_length], name='input_x')
        # the labels are fed as one-hot vectors from kr.utils.to_categorical, so use float32
        self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, num_classes], name='input_y')
        # fed by the training loop, but no dropout layer actually uses it in this graph
        self.keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')
        self.run()

    def run(self):
        with tf.name_scope('embedding'):
            embedding = tf.get_variable('embedding', [self.vocab_size, self.embedding_size])
            # keep the looked-up embeddings as an attribute so getdictionary() can fetch them
            self.embedding_input = tf.nn.embedding_lookup(embedding, self.input_x)

        with tf.name_scope('init_w_b'):
            W = tf.Variable(initial_value=tf.random_normal([self.embedding_size, self.num_classes]), name='W')
            b = tf.Variable(initial_value=tf.random_normal([self.num_classes]), name='b')

        with tf.name_scope('fasttext'):
            # average the word embeddings over the sequence, then a single linear layer
            reduce_input = tf.reduce_mean(self.embedding_input, axis=1)
            logits = tf.nn.xw_plus_b(reduce_input, W, b, name='logits')
            self.predict = tf.argmax(tf.nn.softmax(logits), 1, name='predict')

        with tf.name_scope("accuracy"):
            judge = tf.equal(tf.argmax(self.input_y, 1), self.predict)
            self.acc = tf.reduce_mean(tf.cast(judge, tf.float32))

        with tf.name_scope('optimize'):
            self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.input_y, logits=logits))
            self.optim = tf.train.AdamOptimizer(learning_rate=FLAGS.learn_rate).minimize(self.loss)


def get_time_dif(start_time):
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))


def feed_data(x_batch, y_batch, keep_prob):
    feed_dict = {
        model.input_x: x_batch,
        model.input_y: y_batch,
        model.keep_prob: keep_prob
    }
    return feed_dict


def train():
    train_board_dir = FLAGS.train_board_dir
    test_board_dir = FLAGS.test_board_dir
    if not os.path.exists(train_board_dir):
        os.makedirs(train_board_dir)
    # tf.train.Saver does not create missing directories, so create the checkpoint dir up front
    if not os.path.exists(os.path.dirname(FLAGS.checkpoints_dir)):
        os.makedirs(os.path.dirname(FLAGS.checkpoints_dir))

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(train_board_dir)
    test_writer = tf.summary.FileWriter(test_board_dir)

    saver = tf.train.Saver()

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    train_writer.add_graph(session.graph)

    _, x_train, y_train = process_file(FLAGS.train_dir, word_to_id, cat_to_id)
    _, x_val, y_val = process_file(FLAGS.val_dir, word_to_id, cat_to_id)

    total_batch = 0
    best_acc_val = 0.0
    start_time = time.time()

    for epoch in range(FLAGS.epoch):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, FLAGS.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, FLAGS.keep_prob)
            if total_batch % FLAGS.batch_saver_summary == 0:
                s = session.run(merged_summary, feed_dict=feed_dict)
                train_writer.add_summary(s, total_batch)

            if total_batch % FLAGS.batch_test_acc == 0:
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)

                # evaluate on the validation set with a separate feed dict; reusing feed_dict here
                # would make the optimizer step below train on the validation data
                val_feed = feed_data(x_val, y_val, 1.0)
                val_loss, val_acc, summ = session.run([model.loss, model.acc, merged_summary], feed_dict=val_feed)
                test_writer.add_summary(summ, total_batch)
                if val_acc > best_acc_val:
                    best_acc_val = val_acc
                    saver.save(sess=session, save_path=FLAGS.checkpoints_dir)
                    improved_str = '*'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, val_loss, val_acc, time_dif, improved_str))
                
            #getdictionary(session, ['一', '想', '津', '京', '建', '设', '广', '点', '微', '阿'])
            # restore the training keep_prob before the parameter update
            feed_dict[model.keep_prob] = FLAGS.keep_prob
            session.run([model.optim], feed_dict=feed_dict)
            total_batch += 1


def test():
    contents, x_test, y_test = process_file(FLAGS.test_dir, word_to_id, cat_to_id)
    batch_test = batch_iter(x_test, y_test, FLAGS.batch_size)
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=FLAGS.checkpoints_dir)

    y_batch_test = np.zeros(0, dtype=np.int32)
    predict_list = np.zeros(0, dtype=np.int32)
    for x_batch, y_batch in batch_test:
        feed_dict = feed_data(x_batch, y_batch, 1.0)
        temp = session.run([model.predict], feed_dict=feed_dict)
        predict_list = np.append(predict_list, temp)
        y_batch_test = np.append(y_batch_test, np.argmax(y_batch, axis=1))

    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_batch_test, predict_list.tolist(), target_names=categories))


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python {} [train / test]""".format(sys.argv[0]))
    categories, cat_to_id = read_category()
    if not os.path.exists(FLAGS.vocab_dir):
        # build the vocabulary from the training set the first time
        build_vocab(FLAGS.train_dir, FLAGS.vocab_dir, vocab_size=5000)
    words, word_to_id = read_vocab(FLAGS.vocab_dir)
    # make the label placeholder width match the number of categories returned by read_category()
    model = FastText(num_classes=len(categories))

    if sys.argv[1] == 'train':
        train()
    else:
        test()

This is the text-processing script (data/loader.py, imported above); it is also quite simple.

# coding: utf-8

import sys
from collections import Counter

import numpy as np
import tensorflow.contrib.keras as kr

if sys.version_info[0] > 2:
    is_py3 = True
else:
    reload(sys)
    sys.setdefaultencoding("utf-8")
    is_py3 = False


def native_word(word, encoding='utf-8'):
    """If a model trained under Python 3 is used under Python 2, this can convert the character encoding."""
    if not is_py3:
        return word.encode(encoding)
    else:
        return word


def native_content(content):
    if not is_py3:
        return content.decode('utf-8')
    else:
        return content


def open_file(filename, mode='r'):
    """
    Common file helper that works under both Python 2 and Python 3.
    mode: 'r' or 'w' for read or write
    """
    if is_py3:
        return open(filename, mode, encoding='utf-8', errors='ignore')
    else:
        return open(filename, mode)


def read_file(filename):
    """Read a data file; each line holds a label and its content, separated by a tab."""
    contents, labels = [], []
    with open_file(filename) as f:
        for line in f:
            try:
                label, content = line.strip().split('\t')
                if content:
                    contents.append(list(native_content(content)))
                    labels.append(native_content(label))
            except ValueError:
                # skip lines that do not split into exactly a label and content
                pass
    return contents, labels
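
For reference, read_file expects one sample per line in the form label<TAB>content. A couple of made-up example lines (the tab is written as \t here; these are not real cnews lines):

1\tfirst example document text belonging to category 1 ...
9\tanother example document text belonging to category 9 ...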


def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build the vocabulary from the training set and save it to disk."""
    data_train, _ = read_file(train_dir)

    all_data = []
    for content in data_train:
        all_data.extend(content)

    counter = Counter(all_data)
    count_pairs = counter.most_common(vocab_size - 1)
    words, _ = list(zip(*count_pairs))
    # add a <PAD> token so that all texts can be padded to the same length
    words = ['<PAD>'] + list(words)
    with open_file(vocab_dir, mode='w') as f:
        f.write('\n'.join(words) + '\n')


def read_vocab(vocab_dir):
    """Read the vocabulary file."""
    # words = open_file(vocab_dir).read().strip().split('\n')
    with open_file(vocab_dir) as fp:
        # under py2, convert every entry to unicode
        words = [native_content(_.strip()) for _ in fp.readlines()]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id


def read_category():
    """Return the fixed list of category labels."""
    categories = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                  '11', '12', '13', '14', '15', '16', '17']

    categories = [native_content(x) for x in categories]

    cat_to_id = dict(zip(categories, range(len(categories))))

    return categories, cat_to_id


def to_words(content, words):
    """Convert id-encoded content back to text."""
    return ''.join(words[x] for x in content)


def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """Convert a data file to its id representation."""
    contents, labels = read_file(filename)

    data_id, label_id = [], []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])

    # use keras pad_sequences to pad every text to the same fixed length
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))  # one-hot encode the labels

    return contents, x_pad, y_pad


def batch_iter(x, y, batch_size=64):
    """Yield shuffled mini-batches of the data."""
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1

    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
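
A rough sketch of how these loader pieces fit together (the paths and sizes are just the defaults used above; run it from the project root so the data package is importable):

import os
from data.loader import build_vocab, read_vocab, read_category, process_file, batch_iter

train_dir = 'data/cnews/cnews.train.txt'
vocab_dir = 'data/cnews/cnews.vocab.txt'

if not os.path.exists(vocab_dir):
    build_vocab(train_dir, vocab_dir, vocab_size=5000)   # build the vocabulary once

categories, cat_to_id = read_category()
words, word_to_id = read_vocab(vocab_dir)

contents, x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, max_length=600)
print(x_train.shape)   # (num_samples, 600) padded word ids
print(y_train.shape)   # (num_samples, len(cat_to_id)) one-hot labels

for x_batch, y_batch in batch_iter(x_train, y_train, batch_size=256):
    pass               # each iteration yields one shuffled mini-batch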

The function below prints, during training, the characters most similar to a given list of characters. It feels a bit buggy: after running for a while it crashes, and I have not tracked down the cause.

def getdictionary(session, word_list=[], top_k=10):
    # fetch the embeddings of word ids 0..4799 (eight fake "sentences" of length 600)
    input_x = np.array([list(range(4800))])
    input_x = np.reshape(input_x, (-1, 600))
    feed_dict = {
        model.input_x: input_x,
        model.keep_prob: 1.0
    }
    aaa = session.run([model.embedding_input], feed_dict=feed_dict)
    # one row per word id; the second dimension must be the embedding size, not a hard-coded 100
    bbb = np.reshape(aaa, (-1, model.embedding_size))

    # Euclidean distance, computed in numpy: building new tf ops here on every call
    # would keep growing the graph while training runs
    sim = np.zeros((0, 4800), dtype=np.float32)
    for i in range(len(word_list)):
        wi = word_list[i]
        index = word_to_id[wi]
        w = bbb[index]
        r = np.sqrt(np.sum(np.square(bbb - w), axis=1))
        sim = np.append(sim, [r], axis=0)
    '''
    # cosine similarity
    norm = tf.sqrt(tf.reduce_sum(tf.square(bbb), 1, keepdims=True))
    normalized_embeddings = bbb / norm
    valid_dataset = np.array(list(range(4800)))
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
    sim = similarity.eval(session=session)
    '''
    for i in range(len(word_list)):
        wi = word_list[i]
        #print(i)
        nearest = (sim[i, :]).argsort()[1:top_k + 1]
        log_str = 'Nearest to %s:' % wi
        for k in range(top_k):
            close_word = words[nearest[k]]
            log_str = '%s %s,' % (log_str, close_word)
        print(log_str)
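
To try it, uncomment the getdictionary(...) call inside the training loop above; for every character in word_list it prints the top_k characters whose embedding vectors are closest (smallest Euclidean distance). A likely cause of the crash mentioned above: building TensorFlow ops inside this function on every call (as the commented-out cosine-similarity block does) keeps adding nodes to the graph, so memory use grows the longer training runs; doing the distance computation in plain numpy avoids that.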

This is the training curve from TensorBoard:

(training curve screenshot)

Overall it works reasonably well, though not as well as TextCNN; this was just written for fun.

Since I did not use hierarchical softmax, training is still not as fast as Facebook's official fastText.
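
For completeness, here is a tiny self-contained sketch of the hierarchical-softmax idea (not this post's code and not Facebook's actual implementation): each class sits at a leaf of a binary tree, and its probability is the product of sigmoid decisions along the path from the root, so only about log2(num_classes) node vectors are touched per example instead of all of them.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# toy tree over 4 classes: internal nodes n0, n1, n2; leaves c0..c3
#        n0
#      /    \
#    n1      n2
#   /  \    /  \
#  c0  c1  c2  c3
embedding_size = 8
node_vectors = np.random.randn(3, embedding_size)   # one vector per internal node

# for each class: (internal node, branch taken) along its path; 0 = left, 1 = right
paths = {
    0: [(0, 0), (1, 0)],
    1: [(0, 0), (1, 1)],
    2: [(0, 1), (2, 0)],
    3: [(0, 1), (2, 1)],
}

def class_probability(sentence_vector, cls):
    """P(cls | sentence) as a product of binary decisions along the tree path."""
    p = 1.0
    for node, branch in paths[cls]:
        s = sigmoid(node_vectors[node].dot(sentence_vector))
        p *= s if branch == 1 else (1.0 - s)
    return p

h = np.random.randn(embedding_size)                 # e.g. the averaged word embeddings
print([round(class_probability(h, c), 3) for c in range(4)])
print(sum(class_probability(h, c) for c in range(4)))   # sums to 1.0

With only a handful of document classes the saving is tiny; hierarchical softmax really pays off when the output layer has thousands of labels, which suggests the speed gap here comes mostly from other factors (the official tool is optimized C++ and also uses tricks like n-gram hashing).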
