1 聊点什么好呢?
2 那我们随便聊聊吧
3 你是什么人?
4 我是智能客服
5 有人在吗
6 小宝一直会在这里诚心为您服务
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size, tokenizer=None, normalize_digits=True):
    """Create a vocabulary file from a data file, unless it already exists.

    Every line of ``data_path`` is tokenized and the tokens are counted. The
    vocabulary is ``_START_VOCAB`` followed by the tokens in descending order
    of frequency, cut to ``max_vocabulary_size`` entries, and is written to
    ``vocabulary_path`` with one token per line.

    Args:
        vocabulary_path: path of the vocabulary file to create.
        data_path: path of the data file to read, one sentence per line.
        max_vocabulary_size: maximum number of entries kept in the vocabulary.
        tokenizer: callable used to split a line into tokens; when falsy,
            ``basic_tokenizer`` is used instead.
        normalize_digits: if true, ``_DIGIT_RE`` matches inside each token are
            replaced by ``b"0"`` before counting.
    """
    if gfile.Exists(vocabulary_path):
        # An existing vocabulary file is never rebuilt.
        return

    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    split = tokenizer if tokenizer else basic_tokenizer
    vocab = {}
    with gfile.GFile(data_path, mode="rb") as f:
        for counter, line in enumerate(f, start=1):
            if counter % 100000 == 0:
                print("processing line %d" % counter)
            line = tf.compat.as_bytes(line)
            for w in split(line):
                word = _DIGIT_RE.sub(b"0", w) if normalize_digits else w
                # Count occurrences; a first sighting starts at 1 and later
                # ones increment (the original lost its ``else:`` branch and
                # reset every count to 1).
                vocab[word] = vocab.get(word, 0) + 1

    # Most frequent tokens first, after the reserved start symbols.
    vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
    vocab_list = vocab_list[:max_vocabulary_size]
    with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for w in vocab_list:
            vocab_file.write(w + b"\n")
根据统计的词频和字典,我们为聊天语料建立Token Id。比如“聊点什么好呢”这句话,分词后根据每个词在词表中的位置(例如[“聊”:0,“点”:1,“什么”:2,“好”:3,“呢”:4]),可以表示为[0,1,2,3,4]。
def data_to_token_ids(data_path, target_path, vocabulary_path, tokenizer=None, normalize_digits=True):
    """Tokenize a data file and write it out as lines of token ids.

    Nothing is done when ``target_path`` already exists. Otherwise each line
    of ``data_path`` is converted with ``sentence_to_token_ids`` and written
    to ``target_path`` as space-separated ids, one sentence per line.

    Args:
        data_path (str): path of the data file, one sentence per line.
        target_path (str): path of the token-ids file to create.
        vocabulary_path (str): path of the vocabulary file.
        tokenizer: function used to tokenize each sentence; if None,
            basic_tokenizer will be used.
        normalize_digits (bool): if True, all digits are replaced by 0.
    """
    if gfile.Exists(target_path):
        return

    print("正在对位于 {} 的数据进行分词".format(data_path))
    # initialize_vocabulary is unpacked into (vocab, rev_vocab) elsewhere in
    # this file; only the token -> id mapping is needed here.
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
        with gfile.GFile(target_path, mode="w") as tokens_file:
            for counter, line in enumerate(data_file, start=1):
                try:
                    # Drop undecodable bytes instead of failing on them.
                    line = line.decode('utf8', 'ignore')
                except Exception as e:
                    # Best effort: report the problem and keep the raw line.
                    print(e, line)
                if counter % 100000 == 0:
                    print("正在对第 {} 行进行分词".format(counter))
                token_ids = sentence_to_token_ids(tf.compat.as_bytes(line), vocab, tokenizer, normalize_digits)
                tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def single_cell():
    """Build one recurrent cell of width ``size``.

    Returns a BasicLSTMCell when ``use_lstm`` is true and a GRUCell otherwise.
    (The original conditional had the two branches swapped and lived in a
    mistyped duplicate helper, ``single_cel1``.)
    """
    if use_lstm:
        return tf.contrib.rnn.BasicLSTMCell(size)
    return tf.contrib.rnn.GRUCell(size)


# Stack the cells only when more than one layer is requested; previously the
# single-cell assignment was unconditionally overwritten by the stacked one.
if num_layers > 1:
    cell = tf.contrib.rnn.MultiRNNCell([single_cell() for _ in range(num_layers)])
else:
    cell = single_cell()
# The seq2seq function: we use embedding for the input and attention.
def seq2seq_f(encoder_inputs, decoder_inputs, feed_previous):
    """Run the embedding-attention seq2seq model on one set of inputs.

    Args:
        encoder_inputs: encoder inputs, forwarded unchanged.
        decoder_inputs: decoder inputs, forwarded unchanged.
        feed_previous: forwarded as ``feed_previous`` to
            ``embedding_attention_seq2seq``.

    Returns:
        Whatever ``tf_seq2seq.embedding_attention_seq2seq`` returns.
    """
    # cell, the vocabulary sizes, size, output_projection and dtype are taken
    # from the enclosing scope.
    return tf_seq2seq.embedding_attention_seq2seq(
        encoder_inputs,
        decoder_inputs,
        cell,
        num_encoder_symbols=source_vocab_size,
        num_decoder_symbols=target_vocab_size,
        embedding_size=size,
        output_projection=output_projection,
        feed_previous=feed_previous,
        dtype=dtype,
    )
# Training outputs and losses, if forward_only:
# NOTE(review): the comment above appears to have absorbed an
# `if forward_only:` statement, and no matching `else:` is visible before the
# second model_with_buckets call below - confirm against the original file.
# Both calls assign the per-bucket outputs, losses and encoder state; in the
# visible text they differ only in the last argument given to seq2seq_f
# (True here, False in the second call).
self.outputs, self.losses, self.encoder_state = tf_seq2seq.model_with_buckets(
# NOTE(review): the remaining arguments and the closing parenthesis of this
# call are missing from this excerpt.
lambda x, y: seq2seq_f(x, y, True),
# If we use output projection, we need to project outputs for decoding.
if output_projection is not None:
for b in xrange(len(buckets)):
# Each output of bucket b is multiplied by output_projection[0] and then
# offset by output_projection[1].
self.outputs[b] = [
tf.matmul(output, output_projection[0]) + output_projection[1] for output in self.outputs[b]
# NOTE(review): the closing `]` of the list above is missing from this excerpt.
self.outputs, self.losses, self.encoder_state = tf_seq2seq.model_with_buckets(
# NOTE(review): as with the first call, the remaining arguments and the
# closing parenthesis are not visible here.
lambda x, y: seq2seq_f(x, y, False),
def det_train(args):
# Training entry point: prepares the dialog data, creates the model inside a
# TensorFlow session, reads the dev/train sets into buckets, then loops
# forever, sampling a bucket and running one training step per iteration.
# NOTE(review): indentation was lost in this excerpt and the function is cut
# off at its last line; several expressions below look garbled and are
# flagged individually - confirm each against the original file.
print("Preparing dialog data in to", args.model_name, args.data_dir)
train_data, dev_data, _ = data_utils.prepare_dialog_data(args.data_dir, args.vocab_size)
if args.reinforce_learn:
# NOTE(review): the right-hand side of this assignment is missing; the
# trailing comment suggests a batch size of one - confirm.
args.batch_size = # is decode one sentence at a time
# NOTE(review): `*` is probably a garbled `=` (keyword argument) - confirm.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction*args.gpu_usage)
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
# Create model,
# NOTE(review): the message below looks garbled (possibly lost format
# placeholders) - confirm.
print("Creating id layers of hd units.")
# NOTE(review): `args.forward_only-False` is presumably two arguments,
# `args, forward_only=False` - confirm against create_model's signature.
model = seq2seq_model_utils.create_model(sess, args.forward_only-False)
# Read data into buckets and compute their sizes,
print("Reading development and training data (limit: %d)," % args.max_train_data_size)
# NOTE(review): `args.buckets*args.rev_model` is presumably two separate
# arguments (compare the train_set call on the next line) - confirm.
dev_set = data_utils.read_data(dev_data, args.buckets*args.rev_model)
train_set = data_utils.read_data(train_data, args.buckets, args.max_train_data_size, args.rev_model)
#Tev mode
# Number of training examples in each bucket, and their total.
train_bucket_sizes = [len(train_set[b]) for b in range(len(args.buckets))]
train_total_size = float(sum(train_bucket_sizes))
# Cumulative share of the training data up to and including bucket i; used
# below to pick a bucket from a uniform random number.
train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size for i in range(len(train_bucket_sizes))]
# This is the training loop
step_time, loss = 0.0, 0.0 # accumulators for step time and loss, both starting at zero
previous_losses = [] # to keep track of the losses in every epoch
# Load vocabularies
# NOTE(review): the file name "rocabid.%d" looks garbled - confirm.
vocab_path = os.path.join(args.data_dir, "rocabid.%d" % args.vocab_size)
vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
while True:
random_number = np.random.random() # random number between 0 and 1
# Smallest bucket index whose cumulative share exceeds the random number.
bucket_id = min([i for i in range(len(train_buckets_scale)) if train_buckets_scale[i] > random_number]) # find the bucket id based on the random number
# Get a batch and make a step
start_time = time.time() # record the start time of this batch
encoder_inputs, decoder_inputs, target_weights = model.get_batch(train_set, bucket_id) # get a batch from the selected bucket id
if args.reinforce_learn:
step_loss = model.step_rf(args, sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, rev_vocab) # make a step using the reinforcement learning loss function
# NOTE(review): an `else:` appears to be missing here; as written, the next
# assignment would overwrite the step_rf result - confirm.
step_loss = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id=bucket_id, forward_only=False) # make a step using the default loss function
# update the loss and current step after each batch/step finishes (at the end of this loop)
# NOTE(review): the statement below is truncated mid-expression; the rest of
# the loop body is not visible in this excerpt.
loss += step_loss / (time.time() - start
在预测模块,对应生成对话,我们需要利用Beam Search来寻找最优解。通过对Beam Size的控制可以保证输出语句的多样性。此外我们也可以加入强化学习,对于不同的机器人回答进行及时的人工反馈,通过Reinforcement Learning不断优化模型。
Get output logits for the sentence
beams, now_beams, results = [(1.0, 0.0, i'eos': 0.0, 'dec inp': decoder_inputs, 'prob': 1.0, 'prob_ts': 1.0, 'prob_t': 4.0))]. []. [
Adjusted probability
all_prob_ts = model_step(encoder_inputs, cand['dec_inp'], dptr, target_weights, bucket_id) if args.antilm else None
all_prob_t = model_step(dummy_encoder_inputs, cand['dec_inp'], dptr, target_weights, bucket_id)
Normal seg2seg
if debug:
print(' '.join([dict_lookup(rev_vocab, w) for w in cand['dec_inp']]))
if cand[eos']:
results += [(prob, 0, cand)]
Adjusted probability
all_prob_ts = model_step(encoder_inputs, cand['dec_inp'], dptr, target_weights, bucket_id) if args.antilm else None
all_prob_t = model_step(dummy_encoder_inputs, cand['dec_inp'], dptr, target_weights, bucket_id)
Adjusted probability
all_prob_ts = model_step(encoder_inputs, cand['dec_inp'], dptr, target_weights, bucket_id) if args.antilm else None
all_prob_t = model_step(dummy_encoder_inputs, cand['dec_inp'], dptr, target_weights, bucket_id)
Adjusted probability
all_prob_ts = model_step(encoder_inputs, cand['dec_inp'], dptr, target_weights, bucket_id) if args.antilm else None
all_prob_t = model_step(dummy_encoder_inputs, cand['dec_inp'], dptr, target_weights, bucket_id)
all_prob_ts = model_step(encoder_inputs, cand['dec_inp'], dptr, target_weights, bucket_id) if args.antilm else None
all_prob_t = model_step(dummy_encoder_inputs, cand['dec_inp'], dptr, target_weights, bucket_id)
all_prob_ts = model_step(encoder_inputs, cand['dec_inp'], dptr, target_weights, bucket_id) if args.antilm else None
all_prob_t = model_step(dummy_encoder_inputs, cand['dec_inp'], dptr, target_weights, bucket_id)
all_prob = all_prob_ts - args.antilm * all_prob_t #+ args.n_bonus * dptr + random() * 1e-50
all_prob = all_prob_ts - args.antilm * all_prob_t
if args.n_bonus != 0:
all_prob += args.n_bonus * dptr
Suppress copy-cat (respond the same as input)
if dptr < len(input_token_ids):
all_prob[input_token_ids[dptr]] = all_prob[input_token_ids[dptr]] * 0.01
if return_raw:
return all_prob, all_prob_ts, all_prob_t # beam search
for c in np.argsort(all_prob)[::-1][:args.beam_size]:
new_cand "
gos dec_inp" (c - data_utils.EOS_ID), [(np.array([c]) if i -- (dptr+1) else k)
for i, k in enumerate(cand['dec_inp'])]
prob_ts cand['prob_ts *all_prob_ts[c]
prob prob cand['prob _ cand['prob ] * all_prob t[c]
new_cand = (new_cand['prob'], random(). new_cand) # stuff a randon to prevent comparing new_cand
if len (new_beams) < args.beam_size:
heapq. heappush(new_beams, new cand)
elif (new cando[0] > new _beams[0][0]):
heapq. heapreplace(new _beams, new _cand)
except Exception as e:
print("[Error]', e)
print(" ----[new _beams]-- ")
print("-ines _cand]\n", new _cand) -\n". new _beams)
results += new _cands # flush last cands post-process results res _cands
for prob, _ in sorted(results, reverse=True):
cand['dec _inp']l- res _cands. append(cand) join([dict _lookup(rev _vocab. w) for w in cand['dec _inp']l]) retugn res _cands[:args. beam _size]