Building a Dialogue System with Word2vec + seq2seq

After building a dialogue system with ChatScript up to a certain point, I hit a ceiling; the main bottleneck is that a good Chinese WordNet is hard to come by.

So below I try a machine-learning-based dialogue system using word2vec + seq2seq.

As the title says, the program consists of two parts: word2vec first turns the training corpus into word vectors, which are then fed into a seq2seq model for training. TensorFlow actually ships with a complete seq2seq model that can be used for training directly, and the bundled model even includes bucketing and an attention mechanism. Here, though, I'll start by writing the seq2seq part with TFLearn.

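For reference, a minimal, untested sketch of how TF 1.x's built-in seq2seq with attention is wired up through tf.contrib.legacy_seq2seq; all sizes below are illustrative, not from this project:

import tensorflow as tf

vocab_size, emb_dim, seq_len = 40000, 200, 16   # illustrative sizes
cell = tf.contrib.rnn.BasicLSTMCell(emb_dim)
# legacy_seq2seq expects python lists of per-timestep [batch] int32 tensors
encoder_inputs = [tf.placeholder(tf.int32, [None]) for _ in range(seq_len)]
decoder_inputs = [tf.placeholder(tf.int32, [None]) for _ in range(seq_len)]
# feed_previous=False means teacher forcing; True feeds predictions back in
outputs, state = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
    encoder_inputs, decoder_inputs, cell,
    num_encoder_symbols=vocab_size, num_decoder_symbols=vocab_size,
    embedding_size=emb_dim, feed_previous=False)
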
This post draws on Li's blog: http://www.shareditor.com/blogshow/?blogId=121


I had already used word2vec a while back, so here I simply segment the corpus and train on it directly. Once the word-vector model is ready, the remaining work splits into three parts: loading the word vectors and reading the data (essentially initialization), building the graph and training, and predicting results.
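
For completeness, the word2vec training itself can be done with gensim. A minimal sketch, assuming an old-style gensim API and a made-up corpus file name; the binary output matches the format the loader below parses:

# -*- coding: utf-8 -*-
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# 'dialogue_cut.txt' is illustrative: one space-segmented sentence per line
sentences = LineSentence('dialogue_cut.txt')
model = Word2Vec(sentences, size=200, min_count=5, workers=4)
# save in the C binary format so the struct-based loader below can read it
model.wv.save_word2vec_format('w2v_dialogue.bin', binary=True)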

The word-vector loading code is as follows:


# -*- coding: utf-8 -*-
# Python 2; loads a word2vec model stored in the C binary format.
import re
import struct
import numpy as np

class load_word_and_vector(object):
    def __init__(self):
        self.max_c = 50          # maximum word length in bytes
        self.float_size = 4      # a float32 takes 4 bytes
        self.word_vector = {}

    def load_vectors(self, input):
        print "begin load vectors:"
        with open(input, 'rb') as file_vector:
            # header line: "<word count> <vector size>"
            words_and_size = file_vector.readline().strip()
            words = long(words_and_size.split(' ')[0])
            size = re.findall(r'\d+', words_and_size.split(' ')[1])
            size_long = long(size[0])
            print "words=", words
            print "size=", size_long

            for b in range(0, words):
                # read the word byte by byte until the separating space
                word = ''
                a = 0
                while True:
                    c = file_vector.read(1)
                    if c == '' or c == ' ':   # EOF or end of word
                        break
                    if a < self.max_c and c != '\n':
                        word = word + c
                        a = a + 1
                word = word.strip()

                # read size_long float32 weights for this word
                vector = np.empty([size_long])
                for index in range(0, size_long):
                    m = file_vector.read(self.float_size)
                    (weight,) = struct.unpack('f', m)
                    vector[index] = weight

                self.word_vector[word.decode('utf-8')] = vector

            print "finish vector loading"
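
A quick usage example; after loading, word_vector maps each unicode word to its numpy vector:

loader = load_word_and_vector()
loader.load_vectors('source/w2v_dialogue_neg15_20170628.bin')
print len(loader.word_vector)   # vocabulary size
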
Reading the corpus data in batches:

def read_source_file(self, source):
        question_seq = []
        answer_seq = []
        line = source.readline()
        # read up to batch_size pairs; both the training and prediction paths use this
        for i in range(self.batch_size):
            if line:
                # each line is "question|answer", both sides space-segmented
                line_pair = line.split('|')
                line_question = line_pair[0]
                line_answer = line_pair[1]
                question_words = []
                answer_words = []
                for word in line_question.decode("utf-8").split(' '):
                    if word != u'' and word != u'\n':
                        question_words.append(word)
                # str.decode('utf-8') returns unicode objects, so the words are
                # stored in the lists as unicode -- that is expected behavior
                for word in line_answer.decode("utf-8").split(' '):
                    if word != u'' and word != u'\n':
                        answer_words.append(word)
                question_seq.append(question_words)
                answer_seq.append(answer_words)
                line = source.readline()
        print "seq_num", len(question_seq)
        print "read sentences"
        return question_seq, answer_seq
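
The pair file is expected to hold one pre-segmented question/answer pair per line, separated by '|'; the lines below are made-up examples:

how are you|i am fine thank you
where do you live|i live in the north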


Vectorizing the corpus:

def get_batches(self):
        print "start get batch!"
        xy_data = []
        y_data = []
        # the same layout serves both training and prediction batches
        for i in range(self.seq_len):
            x_data_single = []
            y_data_single = []
            for word in self.question_seq[i]:
                # word2vec only keeps words seen at least 5 times, so OOV words are skipped
                if word in self.word_vector_dict:
                    x_data_single.append(self.word_vector_dict[word])
            for word in self.answer_seq[i]:
                if word in self.word_vector_dict:
                    y_data_single.append(self.word_vector_dict[word])
            if len(x_data_single) < self.max_seq_len and len(y_data_single) < self.max_seq_len:
                # encoder input: left-pad with zero vectors and reverse the question
                sequence_xy = [np.zeros(self.word_vec_dim)] * (self.max_seq_len - len(x_data_single)) + list(
                    reversed(x_data_single))
                # decoder target: the answer, right-padded with zero vectors
                sequence_y = y_data_single + [np.zeros(self.word_vec_dim)] * (self.max_seq_len - len(y_data_single))
                sequence_xy = sequence_xy + sequence_y
                # prepend an all-ones GO vector to the target sequence
                sequence_y = [np.ones(self.word_vec_dim)] + sequence_y
                xy_data.append(sequence_xy)
                y_data.append(sequence_y)

        return np.array(xy_data), np.array(y_data)
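
To make the layout concrete: for one pair with question vectors [q1, q2] and answer vectors [a1, a2, a3] at max_seq_len=4, the packed sequences are

sequence_xy = [0, 0, q2, q1, a1, a2, a3, 0]   (encoder half left-padded and reversed, decoder half right-padded)
sequence_y  = [GO, a1, a2, a3, 0]             (GO is the all-ones vector)

so trainXY has shape [batch, 2*max_seq_len, word_vec_dim] and trainY has shape [batch, max_seq_len + 1, word_vec_dim].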

TFLearn greatly simplifies working with TensorFlow, though admittedly it can't do much on its own and feels a bit point-and-click. The graph is built with TFLearn plus a few basic TensorFlow ops:


def model(self, feed_previous=False):
        # input is the concatenated [encoder sequence; decoder sequence]
        input_data = tflearn.input_data(shape=[None, self.max_seq_len*2, self.word_vec_dim], dtype=tf.float32, name="XY")
        encoder_inputs = tf.slice(input_data, [0, 0, 0], [-1, self.max_seq_len, self.word_vec_dim], name="enc_in")
        decoder_input_tmp = tf.slice(input_data, [0, self.max_seq_len, 0], [-1, self.max_seq_len-1, self.word_vec_dim], name="dec_in_tmp")
        # build an all-ones GO vector and prepend it to the decoder inputs
        go_inputs = tf.ones_like(decoder_input_tmp)
        go_inputs = tf.slice(go_inputs, [0, 0, 0], [-1, 1, self.word_vec_dim])
        decoder_inputs = tf.concat([go_inputs, decoder_input_tmp], axis=1, name="dec_in")

        (encoder_output_tensor, states) = tflearn.lstm(encoder_inputs, self.word_vec_dim, return_state=True, scope="encoder_lstm")
        encoder_output_sequence = tf.stack([encoder_output_tensor], axis=1)
        # tf.contrib.seq2seq.LuongAttention(self.word_vec_dim, encoder_output_tensor)  # an attention mechanism could be added here
        # tf.contrib.legacy_seq2seq.model_with_buckets  # bucketing could be added here

        if feed_previous:
            # inference: the decoder starts from the GO vector and feeds its own output back
            first_dec_input = go_inputs
        else:
            # training: teacher forcing with the real decoder inputs
            first_dec_input = tf.slice(decoder_inputs, [0, 0, 0], [-1, 1, self.word_vec_dim])
        decoder_output_tensor = tflearn.lstm(first_dec_input, self.word_vec_dim, initial_state=states, return_seq=False, reuse=False, scope='decoder_lstm')
        # in TF 1.0 pack() was renamed stack(); same arguments, same internals
        decoder_out_sequence_single = tf.stack([decoder_output_tensor], axis=1)
        decoder_out_sequence_list = [decoder_output_tensor]

        for i in range(self.max_seq_len - 1):
            if feed_previous:
                next_dec_input = decoder_out_sequence_single
            else:
                next_dec_input = tf.slice(decoder_inputs, [0, i+1, 0], [-1, 1, self.word_vec_dim])
            # decoder LSTM weights are shared across timesteps via reuse=True
            decoder_output_tensor = tflearn.lstm(next_dec_input, self.word_vec_dim, return_seq=False, reuse=True, scope='decoder_lstm')
            decoder_out_sequence_single = tf.stack([decoder_output_tensor], axis=1)
            decoder_out_sequence_list.append(decoder_output_tensor)
        decoder_out_sequence = tf.stack(decoder_out_sequence_list, axis=1)
        real_output_sequence = tf.concat([encoder_output_sequence, decoder_out_sequence], 1)
        net = tflearn.regression(real_output_sequence, optimizer='sgd', learning_rate=0.1, loss='mean_square')
        model = tflearn.DNN(net)

        return model


A reminder: when TF was upgraded to 1.0, tf.pack was renamed tf.stack; the source shows only the name changed, the internals are identical. The source for TF's basic ops lives in the xxx_ops.py files under tensorflow/python/ops; tf.stack is an array op that stacks a list of tensors along a new axis.
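
A quick illustration of what tf.stack does in the model code above (the shapes are the point):

import tensorflow as tf

t = tf.zeros([32, 200])             # e.g. one LSTM output: [batch, word_vec_dim]
single = tf.stack([t], axis=1)      # shape [32, 1, 200]: a one-step sequence
seq = tf.stack([t, t, t], axis=1)   # shape [32, 3, 200]: a three-step sequence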


On to training. Since my machine is weak and the word2vec model stays resident in memory throughout training, the corpus has to be fed in batch by batch.

def train(self):
        print "step1: start init ..."
        self.init_sys()
        model = self.model(feed_previous=False)
        print "step1 finished"

        print "step2: start train ..."
        with open(self.source_file, 'r') as source:
            # python 2 integer division: number of batches to read
            batch_num = self.total_pair_num / self.batch_size
            print "batch_num=", batch_num
            for j in range(batch_num):
                (question_seq, answer_seq) = self.read_source_file(source)
                self.question_seq = question_seq
                self.answer_seq = answer_seq
                question_seq_len = len(self.question_seq)
                answer_seq_len = len(self.answer_seq)
                self.seq_len = min(question_seq_len, answer_seq_len)

                trainXY, trainY = self.get_batches()
                print "XY=", trainXY.shape
                print "Y=", trainY.shape
                model.fit(trainXY, trainY, n_epoch=5, snapshot_epoch=False, batch_size=20)
        model.save("out/model")
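
Kicking off training then looks something like this; the training file name and pair count here are illustrative, mirroring the prediction setup shown further down:

seq2seq = cb_seq2seq(max_seq_len=16,
                     word_vec_dim=200,
                     vector_file='source/w2v_dialogue_neg15_20170628.bin',
                     source_file='source/ass_cut_pair',   # hypothetical training pair file
                     batch_size=100,
                     total_pair_num=10000,                # hypothetical corpus size
                     phrase='train')
seq2seq.train()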


Loading the model:


def load(self):
        self.init_sys()
        with open(self.source_file, 'r') as source:
            (question_seq, answer_seq) = self.read_source_file(source)
            self.question_seq = question_seq
            self.answer_seq = answer_seq
            self.seq_len = len(self.question_seq)
            #batch_num = self.seq_len / self.batch_size
            # feed_previous=True: at inference the decoder consumes its own output
            model = self.model(feed_previous=True)
            model.load('out/model')
        return model

Computing the words closest to the predicted vectors and printing them:


seq2seq = cb_seq2seq(max_seq_len=16,
                     word_vec_dim=200,
                     vector_file='source/w2v_dialogue_neg15_20170628.bin',
                     source_file='source/ass_cut_pair_test',
                     batch_size=100,
                     total_pair_num=100,
                     phrase='predict')

model = seq2seq.load()
testXY, testY = seq2seq.get_batches()
predict = model.predict(testXY)
distance_calc = cb_word2vec_distance.word2vec_distance(word_vector_dict=seq2seq.word_vector_dict)
for sample in predict:
    print "predict answer:"
    # sample[0] is the encoder output step; the decoder outputs follow
    for v in sample[1:]:
        # map each predicted vector back to its nearest word by cosine similarity
        (matched_word, max_cos) = distance_calc.distance(v)
        print matched_word, max_cos

Word-distance calculation:

# -*- coding: utf-8 -*-
import math
import numpy as np

class word2vec_distance(object):
    def __init__(self, word_vector_dict):
        self.word_vector_dict = word_vector_dict

    def vector_len(self, vector):
        # Euclidean norm: sqrt(v . v)
        vector = np.array(vector)
        vector_transpose = vector.transpose()
        length = np.dot(vector, vector_transpose)
        len_sqrt = math.sqrt(length)

        return len_sqrt

    def vector_cosine(self, vector1, vector2):
        if len(vector1) != len(vector2):
            raise ValueError("vectors must have the same dimension")
        v1_sqrtlen = self.vector_len(vector1)
        v2_sqrtlen = self.vector_len(vector2)
        v1 = np.array(vector1)
        v2_transpose = np.array(vector2).transpose()
        value = np.dot(v1, v2_transpose)
        cosine = value / (float(v1_sqrtlen) * float(v2_sqrtlen))

        return cosine

    def distance(self, vector):
        # linear scan over the vocabulary for the highest cosine similarity
        max_cos = -10000
        matched_word = ''
        for word in self.word_vector_dict:
            v = self.word_vector_dict[word]
            v_cos = self.vector_cosine(vector, v)

            if v_cos > max_cos:
                matched_word = word
                max_cos = v_cos

        return (matched_word, max_cos)
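
The python-level loop in distance() scans the whole vocabulary per predicted vector, which is slow. A vectorized variant (my sketch, not part of the original code) pre-normalizes all vocabulary vectors into one matrix so each lookup is a single matrix-vector product:

import numpy as np

class fast_word2vec_distance(object):
    def __init__(self, word_vector_dict):
        self.words = list(word_vector_dict.keys())
        matrix = np.array([word_vector_dict[w] for w in self.words])
        # pre-normalize rows so cosine similarity reduces to a dot product
        self.matrix = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)

    def distance(self, vector):
        v = np.array(vector)
        v = v / np.linalg.norm(v)
        cosines = np.dot(self.matrix, v)   # one cosine per vocabulary word
        best = int(np.argmax(cosines))
        return (self.words[best], cosines[best])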



Having played with doc2vec these past couple of days, I think the predicted output vectors could be turned into sentence vectors and matched against a predefined template library to pick the best reply, giving a model + template hybrid; worth trying later. Also, TF's built-in seq2seq is probably more efficient, since it predicts words directly and there's no need to compute word distances by hand.
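
A rough sketch of that idea; everything here is hypothetical (the template library, the mean-pooling choice):

import numpy as np

def sentence_vector(word_vectors):
    # naive sentence embedding: mean of the word vectors
    return np.mean(np.array(word_vectors), axis=0)

def best_template_reply(predicted_vectors, template_vectors, template_replies):
    # template_vectors: precomputed sentence vectors of hand-written replies
    s = sentence_vector(predicted_vectors)
    s = s / np.linalg.norm(s)
    t = template_vectors / np.linalg.norm(template_vectors, axis=1, keepdims=True)
    return template_replies[int(np.argmax(np.dot(t, s)))]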
