小姜机器人、python、tensorflow、chatbot、dialog、fuzzywuzzy、搜索、检索式、生成式、聊天、闲聊、对话、问答、多轮、单轮、开放、封闭、任务、垂直等等关键词,描述的都是自动问答领域。这一领域已经有了比较广泛的应用,常见的有智能手机助手、聊天机器人、智能客服等。
检索式chatbot,在我看来,非常适合那些个一问一答的领域,尤其是那些垂直型业务,需要比较精准回答的领域,比如说客服哇、政务哇什么的。这些应用,一般会粗分为业务类和闲聊类。
检索类chatbot,主要方法便是预先准备好问答数据,然后,用户来了问题请求,再检索出最合适的答案,设定好阈值,给用户返回就好。
虽然看起来比较简单,效果也不是最好,但以我工作中实验的真实语料来看,也没有比那些花里胡哨的算法类方案差特别多,凑合着用还行吧。
常用的检索手段,比如说lucene、ES(elasticsearch)、编辑距离匹配、关键词匹配等等。
我自己实现了一个小demo,依赖的是fuzzywuzzy,用它来计算模糊匹配得分,效果嘛,勉勉强强吧。
方案的大致过程:
用户输入-----> 粗筛选(选出那些与用户输入有相同字符的) -----> 精筛选(按相同字符数排序,最多保留前5000条候选,再用fuzzywuzzy计算相似度,选top1)
是不是超简单,确实比较low,但实验效果还行吧。如果还有别的匹配方式,不妨也加进去,防止给出的回答一点儿也不相关。
代码地址如下:
https://github.com/yongzhuo/nlp_xiaojiang/blob/master/ChatBot/chatbot_search/chatbot_fuzzy.py
具体代码如下,但是需要语料,需要的可以直接下github里边的项目吧:
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/4 10:00
# @author :Mo
# @function :
from conf.path_config import chicken_and_gossip_path
from utils.text_tools import txtRead, txtWrite
from conf.path_config import projectdir
from fuzzywuzzy import process
from fuzzywuzzy import fuzz
import pickle
import time
import re
def count_same_char(x1, x2):
    """Count the characters of ``x1`` that also occur in ``x2``.

    Each position of ``x1`` is tested on its own, so a character that is
    repeated in ``x1`` is counted once per occurrence.

    Args:
        x1: Iterable of characters, e.g. the user query.
        x2: Container supporting ``in``, e.g. a candidate question.

    Returns:
        The number of items of ``x1`` found in ``x2`` (``0`` if none).
    """
    # Count directly instead of building a throwaway list just to take len().
    return sum(1 for char in x1 if char in x2)
def fuzzy_re(user_input, collection):
    """Rank the items of ``collection`` that contain the query as a subsequence.

    The query characters must appear in an item in the same order, with any
    text allowed in between. Synonyms or a single differing character are not
    matched.

    Args:
        user_input: Query string. The characters ``.``, ``*`` and ``?`` are
            removed from it before matching.
        collection: Iterable of candidate strings.

    Returns:
        ``None`` when no item contains every query character. Otherwise the
        matching items sorted by (length of the matched span, start position
        of the match, item text), shortest and earliest match first. An empty
        query matches every item with a zero-length span.
    """
    # These three characters have always been dropped from the query; keep that.
    query = user_input.replace('.', '').replace('*', '').replace('?', '')

    # Cheap pre-filter: an item can only match if it contains every query char.
    candidates = [item for item in collection
                  if all(char in item for char in query)]
    if not candidates:
        return None

    # Converts 'djm' to 'd.*?j.*?m'. Each character is escaped so that the
    # remaining regex metacharacters ('(', '[', '+', '\\', ...) match
    # literally instead of producing an invalid or unintended pattern.
    pattern = '.*?'.join(re.escape(char) for char in query)
    regex = re.compile(pattern)

    suggestions = []
    for item in candidates:
        match = regex.search(item)
        if match:
            suggestions.append((len(match.group()), match.start(), item))
    return [item for _, _, item in sorted(suggestions)]
def fuzzy_fuzzywuzzy(fuzz, user_input, collection):
    """Shortlist candidates by shared characters, then rank them with fuzzywuzzy.

    Slower than the regex matcher, but tolerates candidates whose characters
    differ from the query.

    Args:
        fuzz: Object providing the ``token_set_ratio`` scorer (the
            ``fuzzywuzzy.fuzz`` module in this file).
        user_input: Query string.
        collection: Iterable of candidate strings.

    Returns:
        ``None`` when no candidate shares a character with the query,
        otherwise whatever ``process.extract`` returns for the shortlist
        (at most 20 entries).
    """
    # Keep every candidate sharing at least one character with the query.
    # dict.fromkeys de-duplicates while preserving corpus order, so ties in
    # the sort below are resolved the same way on every run (a set is not
    # ordered and made the shortlist vary between runs).
    candidates = list(dict.fromkeys(
        item for item in collection
        if any(char in item for char in user_input)
    ))
    if not candidates:
        return None

    # Candidates sharing the most characters with the query come first.
    scored = [(item, count_same_char(user_input, item)) for item in candidates]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    shortlist = scored[:500]

    # NOTE(review): the choices handed to fuzzywuzzy are (question, count)
    # tuples, exactly as before; how non-string choices are scored depends on
    # the fuzzywuzzy version - confirm, or pass the question strings only.
    return process.extract(user_input, shortlist,
                           scorer=fuzz.token_set_ratio, limit=20)
def fuzzy_fuzzywuzzy_list(fuzz, user_input, qa_list, collection, topn=50,
                          max_candidates=5000):
    """Pick the ``topn`` QA entries that best match ``user_input``.

    Works in two stages: a cheap coarse filter that ranks questions by how
    many query characters they contain, then fuzzywuzzy scoring on the best
    ``max_candidates`` of them.

    Args:
        fuzz: Object providing the ``token_set_ratio`` scorer (the
            ``fuzzywuzzy.fuzz`` module in this file).
        user_input: Query string.
        qa_list: Sequence of QA entries; ``qa_list[i]`` must be the entry
            whose question is ``collection[i]``.
        collection: Iterable of question strings, aligned with ``qa_list``.
        topn: Maximum number of results requested from ``process.extract``.
        max_candidates: Maximum number of coarse-filter survivors passed on
            to fuzzywuzzy.

    Returns:
        An empty list when no question shares a character with the query,
        otherwise the result of ``process.extract`` over the shortlisted
        ``qa_list`` entries.
    """
    # A list, not a set: a character repeated in the query counts each time.
    query_chars = list(user_input)

    # Coarse filter: (index, shared-character count) for every question that
    # shares at least one character with the query.
    overlaps = []
    for index, question in enumerate(collection):
        shared = sum(1 for char in query_chars if char in question)
        if shared > 0:
            overlaps.append((index, shared))

    if not overlaps:
        # Nothing to score; skip fuzzywuzzy entirely.
        return []

    # Highest overlap first. The sort is stable, so entries with the same
    # count keep their corpus order. Ranking every count (rather than only
    # the running maxima seen while scanning) makes the shortlist independent
    # of the order in which questions appear in the corpus.
    overlaps.sort(key=lambda pair: pair[1], reverse=True)
    candidates = [qa_list[index] for index, _ in overlaps[:max_candidates]]

    return process.extract(user_input, candidates,
                           scorer=fuzz.token_set_ratio, limit=topn)
# [fuzz.WRatio, fuzz.QRatio,
# fuzz.token_set_ratio, fuzz.token_sort_ratio,
# fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio,
# fuzz.UWRatio, fuzz.UQRatio]
def _print_reply(matches):
    """Print the bot's answer for the best match, then all recommendations.

    ``matches`` is the list returned by ``fuzzy_fuzzywuzzy_list``. The text
    of the best match is read from ``matches[0][0]`` and split on tabs; the
    second field is printed as the answer.
    """
    fields = matches[0][0].split("\t") if matches else []
    if len(fields) > 1:
        print("小姜机器人: " + fields[1].strip())
    else:
        # No candidate at all, or the best line has no answer column:
        # say so instead of crashing with an IndexError.
        print("小姜机器人: " + "抱歉, 我还不知道怎么回答这个问题")
    print("推荐结果: ")
    print(matches)


if __name__ == '__main__':
    # Each corpus line is split on tabs; the first field is used as the question.
    qa_list = txtRead(chicken_and_gossip_path)
    questions = [qa.strip().split("\t")[0] for qa in qa_list]
    print("read questions ok!")

    # One canned query as a smoke test before entering the interactive loop.
    sen = "你谁呀"
    print("你问: " + sen)
    _print_reply(fuzzy_fuzzywuzzy_list(fuzz, sen, qa_list, questions, topn=5))

    while True:
        print("你问: ")
        try:
            ques = input()
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C ends the chat without a traceback.
            break
        _print_reply(fuzzy_fuzzywuzzy_list(fuzz, ques, qa_list, questions, topn=5))
希望对你有所帮助