w2v

# -*- encoding: utf-8 -*-


import os
import tensorflow as tf


class CBOWNetwork(object):
    def __init__(self, name="W2V", num_sampled=100, window=4, vocab_size=3365, embedding_size=128, is_mean=True,
                 regularization=0.001, optimizer_name='adam', learning_rate=0.01, checkpoint_dir="./running/model"):
        self.name = name  # network name
        self.vocab_size = vocab_size  # vocabulary size
        self.embedding_size = embedding_size  # dimensionality of the learned word vectors
        self.is_mean = is_mean  # whether to merge the context vectors by averaging (otherwise by summing)
        self.window = window  # window size, i.e. the number of context words (excluding the center word)
        self.num_sampled = num_sampled  # number of classes (words) sampled as negatives
        self.regularization = regularization  # regularization coefficient
        self.optimizer_name = optimizer_name.lower()  # optimizer name
        self.learning_rate = learning_rate  # learning rate
        self.adam_beta1 = 0.9  # Adam optimizer parameter
        self.adam_beta2 = 0.999  # Adam optimizer parameter
        self.epsilon = 1e-8  # Adam / Adadelta optimizer parameter
        self.adadelta_rho = 0.95  # Adadelta optimizer parameter
        self.checkpoint_dir = checkpoint_dir  # directory for model checkpoints
        self.checkpoint_path = os.path.join(self.checkpoint_dir, "{}.ckpt".format(self.name.lower()))

        # Make sure the checkpoint directory exists
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)

        self.input_x = None  # [B,T]
        self.target = None  # [B,1]
        self.training = None  # []
        self.global_step = None  # []
        self.features = None  # [B,E]
        self.embedding_table = None  # [V,E]
        self.saver = None  # tf.train.Saver used to restore / persist the model weights

    def interface(self):
        """
        前向网络的构建
        :return:
        """
        with tf.variable_scope(self.name):
            # 1. Define the inputs
            with tf.variable_scope("placeholder"):
                self.input_x = tf.placeholder(dtype=tf.int32, shape=[None, self.window], name="input_x")  # [B,T]
                self.target = tf.placeholder(dtype=tf.int32, shape=[None, 1], name="target")  # [B,1]
                self.training = tf.placeholder_with_default(True, shape=[], name="training")
                self.global_step = tf.train.get_or_create_global_step()

            # 2. Embedding: map word ids to word vectors
            with tf.variable_scope("embedding"), tf.device("/cpu:0"):
                # a. Define the embedding lookup table
                self.embedding_table = tf.get_variable("embedding_table",
                                                       shape=[self.vocab_size, self.embedding_size],
                                                       dtype=tf.float32)
                # b. Convert word ids to word vectors, [B,T] --> [B,T,E]
                vectors = tf.nn.embedding_lookup(params=self.embedding_table, ids=self.input_x)

            # 3. Merge the context vectors into the final feature
            with tf.variable_scope("merge"):
                if self.is_mean:
                    # Merge the T word vectors by averaging, [B,T,E] --> [B,E]
                    features = tf.reduce_mean(vectors, axis=1)
                else:
                    # Merge the T word vectors by summing, [B,T,E] --> [B,E]
                    features = tf.reduce_sum(vectors, axis=1)

            # Expose the feature tensor under a stable name
            self.features = tf.identity(features, "features")

    def losses(self):
        """
        损失函数的构建
        :return:
        """
        with tf.variable_scope("loss"):
            # 0. Define the output-layer parameters
            weight = tf.get_variable(name="weight", shape=[self.vocab_size, self.embedding_size])
            bias = tf.get_variable(name="bias", shape=[self.vocab_size])

            def train_loss():
                """
                训练阶段的损失函数构建
                :return:
                """
                _loss = tf.nn.sampled_softmax_loss(
                    weights=weight,  # 输出转换系数w,形状为: [V,E]
                    biases=bias,  # 输出转换系数b,形状为: [V,]
                    labels=self.target,  # 实际类别下标,形状为: [B,num_true], num_true表示每个样本存在几个预测标签
                    inputs=self.features,  # 前向过程提取出来的特征信息,形状为: [B,E]
                    num_sampled=self.num_sampled,  # 针对每个批次,会随机抽取多少个类别(负例)
                    num_classes=self.vocab_size,  # 总类别数目,也就是单词词汇数目
                    num_true=1  # 给定每个样本对应预测标签有多少个,真实的类别,实际类别
                )
                _loss = tf.reduce_mean(_loss, name="train_loss")
                return _loss

            def eval_loss():
                """
                数据验证时候的损失函数构建(不能够近似的计算,那么必须全连接)
                :return:
                """
                # 1. 做一个全连接操作,得到对应的logits值: [B,V]
                logits = tf.nn.bias_add(tf.matmul(self.features, weight, transpose_b=True), bias=bias)
                # 2. 对实际值进行转换
                labels = tf.reshape(self.target, shape=[-1])
                # 3. 损失函数构建
                _loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels,  # [N1,N2,....Nn], eg: [B,] 实际所属类别id
                    logits=logits  # [N1,N2,....Nn,num_class], eg: [B,V] 预测属于各个类别的置信度
                )
                _loss = tf.reduce_mean(_loss, name="eval_loss")
                return _loss

            # 1. Pick the training or evaluation loss depending on the `training` flag
            loss = tf.cond(
                pred=self.training,  # condition, a boolean Tensor
                true_fn=train_loss,  # called, and its result returned, when pred is True
                false_fn=eval_loss  # called, and its result returned, when pred is False
            )
            tf.summary.scalar('loss', loss)

            # 2. Add the L2 loss of the variables to the objective
            l2_loss = tf.nn.l2_loss(self.embedding_table) + tf.nn.l2_loss(weight) + tf.nn.l2_loss(bias)
            l2_loss = self.regularization * l2_loss
            tf.summary.scalar('l2_loss', l2_loss)

            # 3. Combine all the losses
            total_loss = loss + l2_loss
            tf.summary.scalar('total_loss', total_loss)
            return total_loss

    def optimizer(self, loss):
        """
        损失函数的构建
        :param loss:
        :return:
        """
        with tf.variable_scope("train"):
            # 1. Build the optimizer
            if self.optimizer_name == 'adam':
                opt = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate,
                    beta1=self.adam_beta1,
                    beta2=self.adam_beta2,
                    epsilon=self.epsilon
                )
            elif self.optimizer_name == 'adadelta':
                opt = tf.train.AdadeltaOptimizer(
                    learning_rate=self.learning_rate,
                    rho=self.adadelta_rho,
                    epsilon=self.epsilon
                )
            elif self.optimizer_name == 'adagrad':
                opt = tf.train.AdagradOptimizer(learning_rate=self.learning_rate)
            else:
                opt = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate)

            # 2. Build the training op
            train_op = opt.minimize(loss=loss, global_step=self.global_step)

            return opt, train_op

    def metrics(self, loss=None):
        """
        模型评估值的构建
        :param loss:
        :return:
        """
        pass

    def restore(self, session):
        """
        模型参数恢复
        :param session:
        :return:
        """
        # 0. 参数判断
        if self.saver is None:
            self.saver = tf.train.Saver()

        # 1. 所有参数初始化
        session.run(tf.global_variables_initializer())

        # 2. 从checkpoint_dir文件夹中进行模型恢复操作
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir=self.checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            tf.logging.info("Restore model weight from '{}'".format(ckpt.model_checkpoint_path))
            self.saver.restore(session, save_path=ckpt.model_checkpoint_path)
            self.saver.recover_last_checkpoints(ckpt.all_model_checkpoint_paths)

    def save(self, session):
        """
        模型持久化
        :param session:
        :return:
        """
        # 0. 参数判断
        if self.saver is None:
            self.saver = tf.train.Saver()

        # 1. 保存操作
        tf.logging.info("Store the model weight to '{}'".format(self.checkpoint_path))
        self.saver.save(session, save_path=self.checkpoint_path, global_step=self.global_step)
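
# A minimal usage sketch for CBOWNetwork (illustrative only, not part of the
# original file). It assumes a data pipeline that yields int32 word-id arrays
# `context_batch` of shape [B, window] and `center_batch` of shape [B, 1];
# both names are hypothetical:
#
#     net = CBOWNetwork(vocab_size=3365, embedding_size=128, window=4)
#     net.interface()                       # build the forward graph
#     total_loss = net.losses()             # build the loss
#     _, train_op = net.optimizer(total_loss)
#     with tf.Session() as sess:
#         net.restore(sess)                 # init, and restore if a checkpoint exists
#         _, loss_val = sess.run(
#             [train_op, total_loss],
#             feed_dict={net.input_x: context_batch, net.target: center_batch})
#         net.save(sess)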


class SkipGramNetwork(object):
    def __init__(self, name="W2V", num_sampled=100, window=4, vocab_size=3365, embedding_size=128,
                 regularization=0.001, optimizer_name='adam', learning_rate=0.01, checkpoint_dir="./running/model"):
        self.name = name  # network name
        self.vocab_size = vocab_size  # vocabulary size
        self.embedding_size = embedding_size  # dimensionality of the learned word vectors
        self.window = window  # window size, i.e. the number of context words (excluding the center word)
        self.num_sampled = num_sampled  # number of classes (words) sampled as negatives
        self.regularization = regularization  # regularization coefficient
        self.optimizer_name = optimizer_name.lower()  # optimizer name
        self.learning_rate = learning_rate  # learning rate
        self.adam_beta1 = 0.9  # Adam optimizer parameter
        self.adam_beta2 = 0.999  # Adam optimizer parameter
        self.epsilon = 1e-8  # Adam / Adadelta optimizer parameter
        self.adadelta_rho = 0.95  # Adadelta optimizer parameter
        self.checkpoint_dir = checkpoint_dir  # directory for model checkpoints
        self.checkpoint_path = os.path.join(self.checkpoint_dir, "{}.ckpt".format(self.name.lower()))

        # Make sure the checkpoint directory exists
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)

        self.input_x = None  # [B,1]
        self.target = None  # [B,T]
        self.training = None  # []
        self.global_step = None  # []
        self.features = None  # [B,E]
        self.embedding_table = None  # [V,E]
        self.saver = None  # tf.train.Saver used to restore / persist the model weights

    def interface(self):
        """
        前向网络的构建
        :return:
        """
        with tf.variable_scope(self.name):
            # 1. Define the inputs
            with tf.variable_scope("placeholder"):
                self.input_x = tf.placeholder(dtype=tf.int32, shape=[None, 1], name="input_x")  # [B,1]
                self.target = tf.placeholder(dtype=tf.int32, shape=[None, self.window], name="target")  # [B,T]
                self.training = tf.placeholder_with_default(True, shape=[], name="training")
                self.global_step = tf.train.get_or_create_global_step()

            # 2. Embedding: map word ids to word vectors
            with tf.variable_scope("embedding"), tf.device("/cpu:0"):
                # a. Define the embedding lookup table
                self.embedding_table = tf.get_variable("embedding_table",
                                                       shape=[self.vocab_size, self.embedding_size],
                                                       dtype=tf.float32)
                # b. Convert word ids to word vectors, [B,1] --> [B,1,E]
                vectors = tf.nn.embedding_lookup(params=self.embedding_table, ids=self.input_x)

            # 3. Collapse the singleton time dimension to get the final feature
            with tf.variable_scope("merge"):
                # Squeeze out the singleton dimension, [B,1,E] --> [B,E]
                features = tf.squeeze(vectors, axis=1)

            # Expose the feature tensor under a stable name
            self.features = tf.identity(features, "features")

    def losses(self):
        """
        损失函数的构建
        :return:
        """
        with tf.variable_scope("loss"):
            # 0. Define the output-layer parameters
            weight = tf.get_variable(name="weight", shape=[self.vocab_size, self.embedding_size])
            bias = tf.get_variable(name="bias", shape=[self.vocab_size])

            def train_loss():
                """
                训练阶段的损失函数构建
                :return:
                """
                _loss = tf.nn.nce_loss(
                    weights=weight,  # 输出转换系数w,形状为: [V,E]
                    biases=bias,  # 输出转换系数b,形状为: [V,]
                    labels=self.target,  # 实际类别下标,形状为: [B,num_true], num_true表示每个样本存在几个预测标签
                    inputs=self.features,  # 前向过程提取出来的特征信息,形状为: [B,E]
                    num_sampled=self.num_sampled,  # 针对每个批次,会随机抽取多少个类别(负例)
                    num_classes=self.vocab_size,  # 总类别数目,也就是单词词汇数目
                    num_true=self.window  # 给定每个样本对应预测标签有多少个
                )
                _loss = tf.reduce_mean(_loss, name="train_loss")
                return _loss

            def eval_loss():
                """
                数据验证时候的损失函数构建(不能够近似的计算,那么必须全连接)
                :return:
                """
                # 1. 做一个全连接操作,得到对应的logits值: [B,V]
                logits = tf.nn.bias_add(tf.matmul(self.features, weight, transpose_b=True), bias=bias)
                # 2. 对实际值进行哑编码操作(对应位置为1,存在多个位置为1)
                labels = tf.one_hot(self.target, depth=self.vocab_size)  # [B,T] --> [B,T,V]
                labels = tf.reduce_sum(labels, axis=1)  # [B,T,V] --> [B,V]
                # 3. 损失函数构建(由于一个条件对应多个预测标签,那么损失函数的使用sigmoid交叉熵损失函数)
                # TODO: 自己思考一下为什么使用sigmoid交叉熵损失函数,而不是softmax损失函数?
                # softmax在计算样本属于当前类别概率值的时候,会使用到所有类别的置信度,各个类别之间是互斥的,所以softmax在训练的时候,只能够让其中一个类别的置信度最大,而其它类别的置信度减小
                # sigmoid在计算概率的时候,是独立的,类别之间就没有互相影响,也就是允许存在一个样本对应多个类别的情况
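                # For intuition (hypothetical numbers): with window=2 and V=5, one
                # sample's multi-hot label row might be [0, 1, 0, 1, 0]. Softmax cross
                # entropy expects exactly one 1 per row, while sigmoid cross entropy
                # treats each of the V positions as an independent binary decision,
                # so several 1s per row are valid.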
                _loss = tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=labels,  # must have the same shape as logits, typically [B,V]
                    logits=logits
                )
                _loss = tf.reduce_mean(_loss, name="eval_loss")
                return _loss

            # 1. Pick the training or evaluation loss depending on the `training` flag
            loss = tf.cond(
                pred=self.training,  # condition, a boolean Tensor
                true_fn=train_loss,  # called, and its result returned, when pred is True
                false_fn=eval_loss  # called, and its result returned, when pred is False
            )
            tf.summary.scalar('loss', loss)

            # 2. Add the L2 loss of the variables to the objective
            l2_loss = tf.nn.l2_loss(self.embedding_table) + tf.nn.l2_loss(weight) + tf.nn.l2_loss(bias)
            l2_loss = self.regularization * l2_loss
            tf.summary.scalar('l2_loss', l2_loss)

            # 3. Combine all the losses
            total_loss = loss + l2_loss
            tf.summary.scalar('total_loss', total_loss)
            return total_loss

    def optimizer(self, loss):
        """
        损失函数的构建
        :param loss:
        :return:
        """
        with tf.variable_scope("train"):
            # 1. Build the optimizer
            if self.optimizer_name == 'adam':
                opt = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate,
                    beta1=self.adam_beta1,
                    beta2=self.adam_beta2,
                    epsilon=self.epsilon
                )
            elif self.optimizer_name == 'adadelta':
                opt = tf.train.AdadeltaOptimizer(
                    learning_rate=self.learning_rate,
                    rho=self.adadelta_rho,
                    epsilon=self.epsilon
                )
            elif self.optimizer_name == 'adagrad':
                opt = tf.train.AdagradOptimizer(learning_rate=self.learning_rate)
            else:
                opt = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate)

            # 2. Build the training op
            train_op = opt.minimize(loss=loss, global_step=self.global_step)

            return opt, train_op

    def metrics(self, loss=None):
        """
        模型评估值的构建
        :param loss:
        :return:
        """
        pass

    def restore(self, session):
        """
        模型参数恢复
        :param session:
        :return:
        """
        # 0. 参数判断
        if self.saver is None:
            self.saver = tf.train.Saver()

        # 1. 所有参数初始化
        session.run(tf.global_variables_initializer())

        # 2. 从checkpoint_dir文件夹中进行模型恢复操作
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir=self.checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            tf.logging.info("Restore model weight from '{}'".format(ckpt.model_checkpoint_path))
            self.saver.restore(session, save_path=ckpt.model_checkpoint_path)
            self.saver.recover_last_checkpoints(ckpt.all_model_checkpoint_paths)

    def save(self, session):
        """
        模型持久化
        :param session:
        :return:
        """
        # 0. 参数判断
        if self.saver is None:
            self.saver = tf.train.Saver()

        # 1. 保存操作
        tf.logging.info("Store the model weight to '{}'".format(self.checkpoint_path))
        self.saver.save(session, save_path=self.checkpoint_path, global_step=self.global_step)
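
For reference, here is a minimal, hypothetical end-to-end sketch (not part of the original file) showing how the pieces of SkipGramNetwork fit together. The toy random batches stand in for a real data pipeline that would yield int32 word-id arrays `center_batch` of shape [B,1] and `context_batch` of shape [B,window]:

if __name__ == "__main__":
    import numpy as np

    # Hypothetical toy batch; replace with a real data pipeline.
    center_batch = np.random.randint(0, 3365, size=(32, 1), dtype=np.int32)
    context_batch = np.random.randint(0, 3365, size=(32, 4), dtype=np.int32)

    net = SkipGramNetwork(vocab_size=3365, embedding_size=128, window=4)
    net.interface()                       # build the forward graph
    total_loss = net.losses()             # build the loss
    _, train_op = net.optimizer(total_loss)

    with tf.Session() as sess:
        net.restore(sess)                 # init, and restore if a checkpoint exists
        _, loss_val = sess.run(
            [train_op, total_loss],
            feed_dict={net.input_x: center_batch, net.target: context_batch})
        net.save(sess)
        embeddings = sess.run(net.embedding_table)  # learned [V,E] embedding matrix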
