After building a dialogue system with ChatScript up to a point, I hit a ceiling: the main bottleneck is that a Chinese WordNet is hard to come by.
So below I try a machine-learning-based dialogue system, using word2vec + seq2seq.
As the title says, the program has two main parts: word2vec first turns the training corpus into word vectors, and those vectors are then fed into seq2seq for training. TensorFlow actually ships with a complete seq2seq model that can be trained directly, and the bundled model even includes a bucketing algorithm and an attention mechanism. Here, though, I'll first write the seq2seq with TFLearn.
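For reference, a minimal sketch of what calling TF 1.x's built-in model looks like, via tf.contrib.legacy_seq2seq (the vocabulary size, embedding size, and sequence lengths below are made-up placeholders; this is not the code used in this post):

import tensorflow as tf

vocab_size, embed_dim, enc_len, dec_len = 5000, 128, 16, 16  # placeholders
encoder_inputs = [tf.placeholder(tf.int32, [None], name="enc%d" % i) for i in range(enc_len)]
decoder_inputs = [tf.placeholder(tf.int32, [None], name="dec%d" % i) for i in range(dec_len)]
cell = tf.contrib.rnn.BasicLSTMCell(embed_dim)
# attention and embeddings come built in; bucketing would go through model_with_buckets
outputs, state = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
    encoder_inputs, decoder_inputs, cell,
    num_encoder_symbols=vocab_size, num_decoder_symbols=vocab_size,
    embedding_size=embed_dim, feed_previous=False)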
This post draws on this blog: http://www.shareditor.com/blogshow/?blogId=121
I had already used word2vec a while back, so here I simply segment the corpus and feed it to the trainer. Once the word-vector model is ready, what remains splits into three parts: loading the word vectors and reading in the data (essentially initialization), building the graph and training, and predicting results.
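For completeness, a minimal sketch of that word2vec step, assuming gensim is used (the corpus filename is a placeholder of mine; size=200, min_count=5, and negative=15 match the model file referenced below):

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# one space-segmented sentence per line
sentences = LineSentence('dialogue_corpus_segmented.txt')
model = Word2Vec(sentences, size=200, min_count=5, negative=15, workers=4)
# save in the C binary format that the loader below reads
model.wv.save_word2vec_format('source/w2v_dialogue_neg15_20170628.bin', binary=True)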
The word-vector loading code:
# -*- coding: utf-8 -*-
import re
import struct
import numpy as np

class load_word_and_vector(object):
    def __init__(self):
        self.max_c = 50      # longest word (in bytes) we expect
        self.float_size = 4  # each float32 weight takes 4 bytes
        self.word_vector = {}

    def load_vectors(self, input):
        print "begin load vectors:"
        with open(input, 'rb') as file_vector:
            # header line of a word2vec binary file: "<word count> <vector size>"
            words_and_size = file_vector.readline().strip()
            words = long(words_and_size.split(' ')[0])
            size = re.findall('\d+', words_and_size.split(' ')[1])
            size_long = long(size[0])
            print "words=", words
            print "size=", size_long
            for b in range(0, words):
                # read the word one byte at a time until the separating space
                word = ''
                a = 0
                while True:
                    c = file_vector.read(1)
                    word = word + c
                    if a < self.max_c and c != '\n':
                        a = a + 1
                    if not c or c == ' ':  # '' means EOF; a space ends the word
                        break
                word = word.strip()
                # vector size comes from the header, not a hard-coded 200
                vector = np.empty([size_long])
                for index in range(0, size_long):
                    m = file_vector.read(self.float_size)
                    (weight,) = struct.unpack('f', m)
                    vector[index] = weight
                self.word_vector[word.decode('utf-8')] = vector
        print "finish vector loading"
Reading corpus data in batches:
def read_source_file(self, source):
    question_seq = []
    anwser_seq = []
    line = source.readline()
    if self.phrase == 'train':
        for i in range(self.batch_size):
            if line:
                line_pair = line.split('|')
                line_question = line_pair[0]
                line_answer = line_pair[1]
                question_words = []
                anwser_words = []
                for word in line_question.decode("utf-8").split(' '):
                    if word != u'' and word != u'\n':
                        question_words.append(word)
                # decode('utf-8') returns a unicode object in Python 2,
                # so the items stored in the lists are unicode -- as expected
                for word in line_answer.decode("utf-8").split(' '):
                    if word != u'' and word != u'\n':
                        anwser_words.append(word)
                question_seq.append(question_words)
                anwser_seq.append(anwser_words)
                line = source.readline()
    print "seq_num", len(question_seq)
    print "read sentences"
    return question_seq, anwser_seq
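As the parser above implies, each corpus line is a space-segmented question and answer joined by '|'. A made-up example line:

今天 天气 怎么样|还 不错 挺 暖和 的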
Vectorizing the corpus:
def get_batches(self):
    # assumes `import random` and `import numpy as np` at module level
    print "start get batch!"
    xy_data = []
    y_data = []
    if self.phrase == 'train':
        index_list = range(0, self.seq_len)
        for i in range(self.seq_len):
            # sample a random question/answer pair (with replacement)
            index = random.choice(index_list)
            x_data_single = []
            y_data_single = []
            for word in self.question_seq[index]:
                # word2vec only keeps words occurring at least 5 times,
                # so out-of-vocabulary words are silently skipped
                if word in self.word_vector_dict:
                    x_data_single.append(self.word_vector_dict[word])
            for word in self.anwser_seq[index]:
                if word in self.word_vector_dict:
                    y_data_single.append(self.word_vector_dict[word])
            if len(x_data_single) < self.max_seq_len and len(y_data_single) < self.max_seq_len:
                # left-pad the reversed question with zero vectors
                sequence_xy = [np.zeros(self.word_vec_dim)] * (self.max_seq_len - len(x_data_single)) \
                              + list(reversed(x_data_single))
                # right-pad the answer with zero vectors
                sequence_y = y_data_single + [np.zeros(self.word_vec_dim)] * (self.max_seq_len - len(y_data_single))
                sequence_xy = sequence_xy + sequence_y
                # an all-ones vector acts as the GO symbol at the head of the target
                sequence_y = [np.ones(self.word_vec_dim)] + sequence_y
                xy_data.append(sequence_xy)
                y_data.append(sequence_y)
    return np.array(xy_data), np.array(y_data)
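To sanity-check the padding layout, here is a small standalone demo of mine, using the max_seq_len=16 and word_vec_dim=200 values from the predict section:

import numpy as np

max_seq_len, word_vec_dim = 16, 200
x = [np.random.rand(word_vec_dim) for _ in range(3)]  # a 3-word question
y = [np.random.rand(word_vec_dim) for _ in range(5)]  # a 5-word answer
# left-padded reversed question + right-padded answer
xy = [np.zeros(word_vec_dim)] * (max_seq_len - len(x)) + list(reversed(x)) \
     + y + [np.zeros(word_vec_dim)] * (max_seq_len - len(y))
# GO vector + right-padded answer
target = [np.ones(word_vec_dim)] + y + [np.zeros(word_vec_dim)] * (max_seq_len - len(y))
print np.array(xy).shape      # (32, 200)
print np.array(target).shape  # (17, 200)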
TFLearn greatly reduces the complexity of using TensorFlow, but frankly it can't do much beyond the basics; it feels a bit point-and-click. The graph is built with TFLearn plus a few basic TensorFlow ops:
def model(self, feed_previous=False):
    # assumes `import tensorflow as tf` and `import tflearn` at module level
    # input: concatenated question + answer sequence, shape [batch, 2*max_seq_len, dim]
    input_data = tflearn.input_data(shape=[None, self.max_seq_len * 2, self.word_vec_dim], dtype=tf.float32, name="XY")
    # first half of the input is the encoder input
    encoder_inputs = tf.slice(input_data, [0, 0, 0], [-1, self.max_seq_len, self.word_vec_dim], name="enc_in")
    # second half (minus the last step) becomes the decoder input, shifted right
    decoder_input_tmp = tf.slice(input_data, [0, self.max_seq_len, 0], [-1, self.max_seq_len - 1, self.word_vec_dim], name="dec_in_tmp")
    # an all-ones GO vector starts the decoder
    go_inputs = tf.ones_like(decoder_input_tmp)
    go_inputs = tf.slice(go_inputs, [0, 0, 0], [-1, 1, self.word_vec_dim])
    decoder_inputs = tf.concat([go_inputs, decoder_input_tmp], axis=1, name="dec_in")
    (encoder_output_tensor, states) = tflearn.lstm(encoder_inputs, self.word_vec_dim, return_state=True, scope="encoder_lstm")
    encoder_output_sequence = tf.stack([encoder_output_tensor], axis=1)
    # tf.contrib.seq2seq.LuongAttention(...)        # an attention mechanism could be added here
    # tf.contrib.legacy_seq2seq.model_with_buckets  # bucketing could be added here as well
    if feed_previous:
        # at predict time the decoder starts from the GO vector and feeds on its own output
        first_dec_input = go_inputs
    else:
        # at train time the decoder sees the ground-truth previous word (teacher forcing)
        first_dec_input = tf.slice(decoder_inputs, [0, 0, 0], [-1, 1, self.word_vec_dim])
    decoder_output_tensor = tflearn.lstm(first_dec_input, self.word_vec_dim, initial_state=states, return_seq=False, reuse=False, scope='decoder_lstm')
    decoder_out_sequence_single = tf.stack([decoder_output_tensor], axis=1)
    decoder_out_sequence_list = [decoder_output_tensor]
    for i in range(self.max_seq_len - 1):
        if feed_previous:
            next_dec_input = decoder_out_sequence_single
        else:
            next_dec_input = tf.slice(decoder_inputs, [0, i + 1, 0], [-1, 1, self.word_vec_dim])
        # reuse=True shares the decoder LSTM weights across time steps
        decoder_output_tensor = tflearn.lstm(next_dec_input, self.word_vec_dim, return_seq=False, reuse=True, scope='decoder_lstm')
        decoder_out_sequence_single = tf.stack([decoder_output_tensor], axis=1)
        decoder_out_sequence_list.append(decoder_output_tensor)
    decoder_out_sequence = tf.stack(decoder_out_sequence_list, axis=1)
    # the regression loss compares encoder output + decoder outputs against trainY
    real_output_sequence = tf.concat([encoder_output_sequence, decoder_out_sequence], 1)
    net = tflearn.regression(real_output_sequence, optimizer='sgd', learning_rate=0.1, loss='mean_square')
    model = tflearn.DNN(net)
    return model
A reminder: as of TF 1.0, tf.pack has been renamed tf.stack; the source shows it is only a rename, with identical parameters and body. The source for TF's basic operations lives in the xxx_ops.py files under tensorflow/python/ops; tf.stack is one of the array ops.
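A quick check of the behavior under TF 1.x:

import tensorflow as tf

a = tf.constant([1, 2])
b = tf.constant([3, 4])
s0 = tf.stack([a, b], axis=0)  # shape (2, 2), stacked as rows
s1 = tf.stack([a, b], axis=1)  # shape (2, 2), stacked as columns
with tf.Session() as sess:
    print sess.run(s0)  # [[1 2] [3 4]]
    print sess.run(s1)  # [[1 3] [2 4]]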
As for training: my machine is weak, and the word2vec model stays resident in memory throughout, so the corpus has to be read in and trained on batch by batch.
def train(self):
    print "step1: start init ..."
    self.init_sys()
    model = self.model(feed_previous=False)
    print "step1 finished"
    print "step2: start train ..."
    with open(self.source_file, 'r') as source:
        batch_num = self.total_pair_num / self.batch_size
        print "batch_num=", batch_num
        for j in range(batch_num):
            # read and vectorize one batch of the corpus at a time
            (question_seq, anwser_seq) = self.read_source_file(source)
            self.question_seq = question_seq
            self.anwser_seq = anwser_seq
            question_seq_len = len(self.question_seq)
            anwser_seq_len = len(self.anwser_seq)
            self.seq_len = min(question_seq_len, anwser_seq_len)
            trainXY, trainY = self.get_batches()
            print "XY=", trainXY.shape
            print "Y=", trainY.shape
            model.fit(trainXY, trainY, n_epoch=5, snapshot_epoch=False, batch_size=20)
    model.save("out/model")
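A driver to kick off training would mirror the predict driver further below; the training-file path and pair count here are placeholders of mine:

seq2seq = cb_seq2seq(max_seq_len=16,
                     word_vec_dim=200,
                     vector_file='source/w2v_dialogue_neg15_20170628.bin',
                     source_file='source/ass_cut_pair_train',  # placeholder path
                     batch_size=100,
                     total_pair_num=10000,                     # placeholder count
                     phrase='train')
seq2seq.train()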
Loading the model:
def load(self):
    self.init_sys()
    with open(self.source_file, 'r') as source:
        (question_seq, anwser_seq) = self.read_source_file(source)
        self.question_seq = question_seq
        self.anwser_seq = anwser_seq
        self.seq_len = len(self.question_seq)
    # rebuild the graph with feed_previous=True so the decoder feeds on its own output
    model = self.model(feed_previous=True)
    model.load('out/model')
    return model
Find the words closest to the predicted sentence vectors and print them:
seq2seq = cb_seq2seq(max_seq_len=16,
                     word_vec_dim=200,
                     vector_file='source/w2v_dialogue_neg15_20170628.bin',
                     source_file='source/ass_cut_pair_test',
                     batch_size=100,
                     total_pair_num=100,
                     phrase='predict')
model = seq2seq.load()
testXY, testY = seq2seq.get_batches()
predict = model.predict(testXY)
distance_calc = cb_word2vec_distance.word2vec_distance(word_vector_dict=seq2seq.word_vector_dict)
for sample in predict:
    print "predict answer:"
    # skip the first output step (the encoder/GO slot) and decode each vector
    for v in sample[1:]:
        (matched_word, max_cos) = distance_calc.distance(v)
        print matched_word, max_cos
The word-distance computation:
import sys
import math
import numpy as np

class word2vec_distance(object):
    def __init__(self, word_vector_dict):
        self.word_vector_dict = word_vector_dict

    def vector_len(self, vector):
        # Euclidean norm: sqrt(v . v)
        vector = np.array(vector)
        square_len = np.dot(vector, vector.transpose())
        return math.sqrt(square_len)

    def vector_cosine(self, vector1, vector2):
        if len(vector1) != len(vector2):
            sys.exit(1)
        v1_sqrtlen = self.vector_len(vector1)
        v2_sqrtlen = self.vector_len(vector2)
        v1 = np.array(vector1)
        v2_transpose = np.array(vector2).transpose()
        value = np.dot(v1, v2_transpose)
        cosine = value / (float(v1_sqrtlen) * float(v2_sqrtlen))
        return cosine

    def distance(self, vector):
        # brute-force nearest neighbour: scan the whole vocabulary by cosine similarity
        max_cos = -10000
        matched_word = ''
        for word in self.word_vector_dict:
            v = self.word_vector_dict[word]
            v_cos = self.vector_cosine(vector, v)
            if v_cos > max_cos:
                matched_word = word
                max_cos = v_cos
        return (matched_word, max_cos)
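The loop in distance() does one Python-level dot product per vocabulary word. As a possible speed-up (my sketch, not part of the original code), the vocabulary can be packed into a single matrix once, so each scan becomes one numpy matrix-vector product:

import numpy as np

class word2vec_distance_fast(object):
    """Same nearest-word lookup, vectorized over the vocabulary."""
    def __init__(self, word_vector_dict):
        self.words = list(word_vector_dict.keys())
        self.matrix = np.array([word_vector_dict[w] for w in self.words])
        self.norms = np.linalg.norm(self.matrix, axis=1)

    def distance(self, vector):
        v = np.array(vector)
        # cosine of v against every row at once; epsilon guards against zero vectors
        cos = self.matrix.dot(v) / (self.norms * np.linalg.norm(v) + 1e-12)
        best = np.argmax(cos)
        return (self.words[best], cos[best])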