With the rapid development of AI, many companies have replaced human customer service with intelligent Q&A bots. There are many ways to implement such a system; this article walks through a simple intelligent Q&A bot I built earlier, which uses a Naive Bayes model to classify questions and fuzzy matching to retrieve similar ones.
1.1 Overall Pipeline Design
The overall pipeline, shown in the flowchart below, consists of four stages: data preprocessing, model training, result mapping, and answer matching. Data preprocessing covers collecting the corpus, filtering it, and converting it into the required format. The processed corpus then trains a Naive Bayes classifier; before training, each entry is preprocessed and transformed (stop-word removal, word segmentation, and TF-IDF computation), and the resulting TF-IDF vectors are fed into the Naive Bayes model. Because my corpus is fairly simple, the default parameters already give good classification results. The result-mapping stage maps the predicted class to a predefined category (which can be used, for example, to drive page redirects in a script). Answer matching applies fuzzy matching to the user's question to find similar known questions and return their answers; a condensed sketch of the whole pipeline follows.
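Before the full programs below, here is a condensed, self-contained sketch of the pipeline just described. The categories and questions in it are made-up placeholders, not the article's actual corpus, and it uses TfidfVectorizer as a shorthand for the CountVectorizer + TfidfTransformer pair used later:

import jieba
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# 1. Preprocessing: questions are segmented with jieba (stop-word removal omitted here)
corpus = ['如何 办理 社保卡', '社保卡 丢失 怎么办', '今天 天气 怎么样', '什么 电影 好看']
labels = ['人社信息', '人社信息', '娱乐聊天', '娱乐聊天']

# 2. Training: TF-IDF features feed a Naive Bayes classifier
vectorizer = TfidfVectorizer()
model = MultinomialNB()
model.fit(vectorizer.fit_transform(corpus), labels)

# 3. Result mapping: classify a new question into one of the predefined categories
query = ' '.join(jieba.cut('社保卡怎么办理'))
print(model.predict(vectorizer.transform([query]))[0])  # -> '人社信息'

# 4. Answer matching: fuzzy-match the query against the known questions
print(max(corpus, key=lambda q: fuzz.ratio(query, q)))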
1.2 Corpus Collection
The collected corpus is shown in the figures below: the first column is the category to classify, and the second column is the corresponding question. The corpus in this article is split into a social-security (人社) information corpus and a casual-chat corpus; a few made-up rows illustrating the format follow the two figures.
Social-security information corpus:
Casual-chat corpus:
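For illustration, rows in the classification corpus (the znwd_corpus.csv read below) take a two-column form like the following; these sample rows are invented, and the actual header and entries may differ:

类别,问题
人社信息,如何申领社保卡?
人社信息,社保卡丢失了如何补办?
娱乐聊天,你叫什么名字?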
1.3 Main Programs
The GUI is built with the tkinter toolkit; the rum_main.py program is as follows:
#!/usr/bin/env python3
# _*_ coding:utf-8 _*_
from tkinter import *
import time
from speech_test import *
'''
Message-sending function:
1. Append the current timestamp to the Text widget in the <message list> frame;
2. Fetch the text from the <message input> frame and append it to the message list;
3. Clear the <message input> frame.
'''
def msgsend():
    msg = '我:' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()) + '\n'
    txt_msglist.insert(END, msg, 'green')  # append the timestamp
    query = txt_msgsend.get('0.0', END)  # fetch the user's question
    print(query)
    result = main(query)  # feed the question into the Q&A model
    print('result:', result)
    txt_msglist.insert(END, txt_msgsend.get('0.0', END))  # append the sent message to the message list
    txt_msglist.insert(END, '\n')
    txt_msgsend.delete('0.0', END)  # clear the input box
    robot = '小Y:' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()) + '\n'
    txt_msglist.insert(END, robot, 'red')
    txt_msglist.insert(END, result + '\n')
'''Cancel sending: simply clear the message input box'''
def cancel():
    txt_msgsend.delete('0.0', END)
'''Bind the Up key to message sending'''
def msgsendEvent(event):
    if event.keysym == 'Up':
        msgsend()
tk = Tk()
tk.title('聊天窗口')
'''Create the frames'''
f_msglist = Frame(height=300, width=300)  # <message list> frame
f_msgsend = Frame(height=300, width=300)  # <message input> frame
f_floor = Frame(height=100, width=300)  # <button> frame
f_right = Frame(height=700, width=100)  # <picture> frame
'''Create the widgets'''
txt_msglist = Text(f_msglist)  # Text widget in the message list frame
txt_msglist.tag_config('green', foreground='blue')  # tag for the user's timestamp
txt_msglist.tag_config('red', foreground='red')  # tag for the bot's timestamp
txt_msgsend = Text(f_msgsend)  # Text widget in the message input frame
txt_show = Text(f_msglist)  # extra Text widget in the message list frame (unused)
txt_show.tag_config('red', foreground='red')
txt_showsend = Text(f_msgsend)  # extra Text widget in the message input frame (unused)
txt_msgsend.bind('<Up>', msgsendEvent)  # bind the Up key to message sending
'''txt_right = Text(f_right)  # Text widget for the picture frame'''
button_send = Button(f_floor, text='Send', command=msgsend)  # Send button bound to msgsend
button_cancel = Button(f_floor, text='Cancel', command=cancel)  # Cancel button bound to cancel
'''Lay out the frames'''
f_msglist.grid(row=0, column=0)  # message list frame
f_msgsend.grid(row=1, column=0)  # message input frame
f_floor.grid(row=2, column=0)  # button frame
f_right.grid(row=0, column=1, rowspan=3)  # picture frame
txt_msglist.grid()  # load the message list Text widget
txt_msgsend.grid()  # load the message input Text widget
button_send.grid(row=0, column=0, sticky=W)  # load the Send button
button_cancel.grid(row=0, column=1, sticky=W)  # load the Cancel button
tk.mainloop()
The Q&A bot itself lives in speech_test.py:
#-*- coding:utf-8 -*-
import logging
logging.getLogger("requests").setLevel(logging.WARNING)
import csv
import jieba
import pickle
from fuzzywuzzy import fuzz
import math
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import lil_matrix
import jieba.posseg as pseg
import sys
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from speech_recognition import *
import warnings
warnings.filterwarnings("ignore")
def load_label_url():
    # Map each category name to its URL/ID from znwd_label_url.csv
    with open('znwd_label_url.csv', 'r', encoding='utf-8') as f:
        name_id = {}
        label_url = csv.reader(f)
        header = next(label_url)  # skip the header row
        for power_name_id in label_url:
            name_id[power_name_id[0]] = power_name_id[1]
    return name_id
def load_cut_save(filename, load=False):
    # Segment every question with jieba, drop stop words and digits,
    # and optionally pickle the result to disk for later runs
    jieba.load_userdict('UserDefined_words.txt')
    corpus = []
    label = []
    with open(filename, 'rt', encoding='utf-8') as f:
        data_corpus = csv.reader(f)
        header = next(data_corpus)  # skip the header row
        for words in data_corpus:
            word = jieba.cut(words[1])
            doc = []
            for x in word:
                if x not in stop_words and not x.isdigit():
                    doc.append(x)
            corpus.append(' '.join(doc))
            label.append(words[0])
    if load:
        with open('corpus.oj', 'wb') as f:
            pickle.dump(corpus, f)
        with open('label.oj', 'wb') as f:
            pickle.dump(label, f)
    return corpus, label
def train_model():
    # Train a Naive Bayes classifier on TF-IDF features of the corpus
    with open('corpus.oj', 'rb') as f_corpus:
        corpus = pickle.load(f_corpus)
    with open('label.oj', 'rb') as f_label:
        label = pickle.load(f_label, encoding='bytes')
    vectorizer = CountVectorizer(min_df=1)
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    words_frequency = vectorizer.fit_transform(corpus)  # raw term counts (a refit; the result above could be reused)
    word = vectorizer.get_feature_names()  # vocabulary list (newer scikit-learn versions use get_feature_names_out())
    # Persist the vocabulary and term frequencies so new queries can be
    # vectorized consistently at prediction time
    saved = input_tfidf(vectorizer.vocabulary_, sparse.csc_matrix(words_frequency), len(corpus))
    model = MultinomialNB()
    model.fit(tfidf, label)
    with open('model.oj', 'wb') as f_model:
        pickle.dump(model, f_model)
    with open('idf.oj', 'wb') as f_idf:
        pickle.dump(saved, f_idf)
    return model, tfidf, label
class input_tfidf(object):
    # Rebuilds the TF-IDF vector of a new query against the training vocabulary
    def __init__(self, feature_index, frequency, docs):
        self.feature_index = feature_index  # word -> column index (vectorizer.vocabulary_)
        self.frequency = frequency  # sparse term-frequency matrix of the corpus
        self.docs = docs  # number of training documents
        self.len = len(feature_index)

    def key_count(self, input_words):
        # Count how often each segmented word occurs in the query
        keys = jieba.cut(input_words)
        count = {}
        for key in keys:
            count[key] = count.get(key, 0) + 1
        return count

    def getTdidf(self, input_words):
        count = self.key_count(input_words)
        result = lil_matrix((1, self.len))
        frequency = sparse.csc_matrix(self.frequency)
        for x in count:
            word = self.feature_index.get(x)
            if word is not None and word >= 0:
                word_frequency = frequency.getcol(word)
                feature_docs = word_frequency.sum()
                # Smoothed TF-IDF: tf * (ln((N + 1) / (df + 1)) + 1),
                # with df approximated by the term's total corpus frequency
                tfidf = count.get(x) * (math.log((self.docs + 1) / (feature_docs + 1)) + 1)
                result[0, word] = tfidf
        return result
def model_predict(input_str):
    # Vectorize the query, predict its category, and accept the prediction
    # only when the classifier is reasonably confident
    with open('idf.oj', 'rb') as f:
        idf = pickle.load(f)
    with open('model.oj', 'rb') as f:
        model = pickle.load(f)
    tfidf = idf.getTdidf(input_str)
    classifiction = model.predict(tfidf)
    prob = model.predict_proba(tfidf).max()
    name_id = load_label_url()
    if prob >= 0.5:  # confidence threshold
        answer1 = str(classifiction[0], 'utf-8')
    else:
        answer1 = None
    return answer1
def similarity(input_questions):
    # Load the preprocessed Q&A corpus and fuzzy-match the user's question,
    # returning up to the three most similar questions with their answers
    with open('corpus_1233.oj', 'rb') as f:
        corpus = pickle.load(f, encoding='bytes')
    with open('question_1233.oj', 'rb') as f:
        question = pickle.load(f, encoding='bytes')
    with open('answer_1233.oj', 'rb') as f:
        answer = pickle.load(f, encoding='bytes')
    text = {}
    answer2 = []
    for key, value in enumerate(corpus):
        score = fuzz.ratio(input_questions, value)
        if score > 40:  # keep only reasonably similar questions
            text[key] = score
    if text:
        train = sorted(text.items(), key=lambda d: d[1], reverse=True)
        for i in range(min(3, len(train))):
            an = {"question": question[train[i][0]], "answer": answer[train[i][0]]}
            answer2.append(an)
    else:
        # No similar question found in the corpus
        answer2.append({"question": None, "answer": None})
    return answer2
def get_greeting(input_questions, question, answer):
    # Fuzzy-match the query against the greeting corpus; return the answer
    # of the best match above the threshold, else None
    text = {}
    for key, value in enumerate(question):
        score = fuzz.ratio(input_questions, value)
        if score > 60:
            text[key] = score
    if len(text) > 0:
        train = sorted(text.items(), key=lambda d: d[1], reverse=True)
        answer3 = answer[train[0][0]]
    else:
        answer3 = None
    return answer3
def sim(doc):
    # Strip stop words from the query, then search for similar questions
    input_questions = ''
    input_words = jieba.cut(doc)
    for x in input_words:
        if x not in stop_words:
            input_questions += x
    answer2 = similarity(input_questions)
    return answer2
def ans_show(returnSet):
    # Assemble the reply: a greeting answer wins outright; otherwise report
    # the classified category plus similar questions; otherwise similar
    # questions alone; otherwise a fallback message
    if returnSet[2] is not None:
        ans = "%s" % returnSet[2]
    elif returnSet[0] is not None:
        ans = "您的问题属于<%s>专栏\n" % returnSet[0]
        ans1 = ""
        if returnSet[1][0]['question'] is not None:
            ans1 = "小Y还知道其他一些问题例如:\n"
            ans2 = ""
            for i in range(len(returnSet[1])):
                ans2 = ans2 + "%d、" % (i + 1) + " 问题:%s\n" % str(returnSet[1][i]['question'], 'utf-8') + " 答案:%s" % str(returnSet[1][i]['answer'], 'utf-8')
            ans1 = ans1 + ans2
        ans = ans + ans1
    elif returnSet[1][0]['question'] is not None:
        ans1 = "小Y知道相似的问题:\n"
        ans2 = ""
        for i in range(len(returnSet[1])):
            ans2 = ans2 + "%d、" % (i + 1) + " 问题:%s\n" % str(returnSet[1][i]['question'], 'utf-8') + " 答案:%s" % str(returnSet[1][i]['answer'], 'utf-8')
        ans = ans1 + ans2
    else:
        ans = "您问的问题太过深奥,Mike才疏学浅暂时无法为您解答,待我读书破万卷后成为您的百科机器人"
    return ans
# Module-level setup: load stop words and the greeting corpus, and
# preprocess the main corpus (runs once on import)
with open('stop_words.txt', 'rb') as f:
    stop_words = f.read().splitlines()
question_greeting = []
answer_greeting = []
with open("greeting.csv", 'r', encoding='utf-8') as f:
    greeting = csv.reader(f)
    header = next(greeting)  # skip the header row
    for words in greeting:
        question_greeting.append(words[0])
        answer_greeting.append(words[1])
filename = 'znwd_corpus.csv'
corpus, label = load_cut_save(filename, load=False)
def main(question):
    # Entry point called by the GUI: try a greeting match first; if none,
    # fall back to classification plus fuzzy question matching
    if question is not None:
        query = question  # may raise an error here when speech recognition fails
        print("我 > %s" % query)
        answer3 = get_greeting(query, question_greeting, answer_greeting)
        if answer3 is None:
            answer1 = model_predict(query)
            answer2 = sim(query)
        else:
            answer1 = None
            answer2 = None
        ans = [answer1, answer2, answer3]
        result = ans_show(ans)
    else:
        result = "输入有误请重新输入!"
        query = None
    return result
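For quick testing without the GUI, speech_test.py can also be driven from a console loop like the one below (a sketch; it assumes corpus.oj/label.oj have been written via load_cut_save(filename, load=True) and that train_model() has been run once to produce model.oj and idf.oj):

if __name__ == '__main__':
    # Minimal console loop for exercising the bot without tkinter
    while True:
        q = input('我 > ')
        if q in ('quit', 'exit'):
            break
        print('小Y > %s' % main(q))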
With that, a simple intelligent Q&A bot is complete. To have it answer more content, extend the corpus, which broadens the bot's chat range accordingly. The program uses the pickle package to serialize the preprocessed corpus to disk, so subsequent runs can skip preprocessing and save time; after modifying the corpus, the serialized corpus files and the model file must be regenerated and overwritten. The load-or-rebuild pattern is sketched below.
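A minimal sketch of that pattern, reusing load_cut_save from speech_test.py (the cache file names follow the ones used above):

import os
import pickle

if os.path.exists('corpus.oj') and os.path.exists('label.oj'):
    # Later runs: load the cached, preprocessed corpus and skip segmentation
    with open('corpus.oj', 'rb') as f:
        corpus = pickle.load(f)
    with open('label.oj', 'rb') as f:
        label = pickle.load(f)
else:
    # First run (or after the corpus changes): preprocess and cache to disk
    corpus, label = load_cut_save('znwd_corpus.csv', load=True)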
The final result is shown below. Since my corpus mainly contains social-security-card data, the system effectively serves as a social-security information Q&A bot.
Project source code: https://github.com/liangjunAI/chatting_robot