主题:基于天津大学基本信息构建知识库,并实现基于模板匹配的知识库问答系统
目的:实践内容包括知识获取、知识表示、知识存储、知识管理、知识问答以及推理。通过实现一个知识库问答系统,将本课程所学的理论知识以及实践操作结合,并融会贯通到实际应用中。
所需软件以及编程环境:python3、Apache Jena Fuseki
本文选用的数据包括智算学部所有硕士生导师的信息、所有专业的信息、天津大学所有院系的信息、天津大学所有职能部门的信息,本文主要以智算学部所有导师的信息为例。
从数据所在的网址中获取请求URL和请求方式,在Chrome浏览器中打开网页,在网页中右击,选择“检查”,在Network中的Doc下面可以找到,如图所示:
然后在pycharm中来编写抓取数据的代码。
第一步:获取要爬取的网页
# Address of the graduate-supervisor directory page to crawl
url = 'http://cic.tju.edu.cn/jyjx/yjsjy/yjsdsml.htm'
# Download the page with an HTTP GET request
strhtml = requests.get(url)
# Decode the response body as UTF-8
strhtml.encoding = 'utf-8'
第二步:从Elements中找到要爬取的内容所在的位置
找到网页中对应的块,然后提取出块中的信息。
以智算学部所有导师的信息为例,需要通过教师名字上的超链接进入教师的个人网页,然后爬取数据。这里先爬取所有教师个人网页的链接并保存在list中(去除重复链接),再逐个访问这些链接,获取每位教师的信息。
def get_teacher_id(url):
    """Return the distinct link targets found in the directory page at *url*.

    :param url: address of the page listing the supervisors
    :return: list of ``href`` values in first-seen order, duplicates removed
    """
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    anchors = soup.select('#vsb_content > table > tbody > tr > td > a')
    hrefs = [anchor.get('href') for anchor in anchors]
    # dict keys keep insertion order, so this removes repeats without reordering
    return list(dict.fromkeys(hrefs))
第三步:因为并不是所有教师主页中都包含相同格式的内容,而且在后续将数据转为RDF时,需要按照标签来进行转换。因此我们设定每位老师需要爬取的信息,提取具有相同内容的教师的信息,若某位教师主页内容与我们设定的不同,我们则跳过这位老师。最后将所有爬取到的教师的信息以json格式保存到json文档中。
# Crawl every teacher's profile page and save the parsed records to a JSON file.
# `teacher_url` (list of page links), `final` (result list) and `getinfo` are
# defined in the complete listing.
for page_url in teacher_url:
    print(page_url)
    response = requests.get(page_url)
    response.encoding = 'utf-8'
    # Parse each page once and reuse the result; calling getinfo() a second
    # time just to append its value repeated all the parsing work.
    teacher_info = getinfo(page_url, response.text)
    if teacher_info == 0:
        continue  # page layout differs from the expected one: skip this teacher
    final.append(teacher_info)
with open(os.path.join('E:/kg', 'teacher.json'), 'w', encoding='utf-8') as opt_file:
    json.dump(final, opt_file, ensure_ascii=False)  # keep Chinese text readable in the file
部分结果如图所示:
完整代码如下:
# -*- coding: utf-8 -*-
import re
import requests
from bs4 import BeautifulSoup
import json
import os
def get_teacher_id(url):
    """Return the distinct link targets found in the directory page at *url*.

    :param url: address of the page listing the supervisors
    :return: list of ``href`` values in first-seen order, duplicates removed
    """
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    anchors = soup.select('#vsb_content > table > tbody > tr > td > a')
    hrefs = [anchor.get('href') for anchor in anchors]
    # dict keys keep insertion order, so this removes repeats without reordering
    return list(dict.fromkeys(hrefs))
def getinfo(teacher_id, teacher_info_html):
    """Parse one teacher's profile page into a flat record.

    The second ``<p>`` under ``.v_news_content`` is expected to hold all the
    labelled fields, in the fixed order listed in ``labels`` below.

    :param teacher_id: value stored under the ``'id'`` key of the record
    :param teacher_info_html: HTML text of the teacher's profile page
    :return: dict with the keys ``id, name, title, faculty, course, type,
        email, field, direction, homepage`` (``course`` and ``type`` are
        lists), or ``0`` when the paragraph is missing, empty, or lacks any
        of the expected labels
    """
    # Labels in the order they must occur; the text between two consecutive
    # labels is the value of the first one.
    labels = ("姓名:", "职称:", "所在系别:", "主讲课程:", "导师类型:",
              "电子邮件:", "研究领域:", "研究方向:", "个人主页:")
    keys = ('name', 'title', 'faculty', 'course', 'type',
            'email', 'field', 'direction', 'homepage')
    # Fields that may list several items separated by punctuation.
    list_valued = ('course', 'type')

    teacher_soup = BeautifulSoup(teacher_info_html, 'html.parser')
    paragraphs = teacher_soup.select('.v_news_content p')[1:2]
    raw_text = paragraphs[0].text.strip() if paragraphs else ''
    if not raw_text:
        print("no information")
        return 0

    # Remove every whitespace character so labels and values are contiguous.
    remainder = "".join(raw_text.split())
    values = []
    for position, label in enumerate(labels):
        parts = remainder.split(label)
        if len(parts) == 1:
            # Label not present: the page does not follow the expected layout.
            print("no information")
            return 0
        if position:
            # Text in front of this label is the value of the previous label
            # (whatever precedes the very first label is discarded).
            values.append(parts[0])
        remainder = parts[1]
    values.append(remainder)  # everything after the last label is the homepage

    teacher_info_json = {'id': teacher_id}
    for key, value in zip(keys, values):
        if key in list_valued:
            value = re.split('[、,/。]', value)
        teacher_info_json[key] = value
    return teacher_info_json
if __name__ == '__main__':
    # Crawl the supervisor directory, parse every profile page and dump the
    # records to E:/kg/teacher.json.
    from urllib.parse import urljoin

    try:
        final = []
        url = 'http://cic.tju.edu.cn/jyjx/yjsjy/yjsdsml.htm'
        teacher_urls = get_teacher_id(url)
        # This entry is excluded from the crawl; guard the removal so the
        # script does not crash if the link is no longer listed.
        excluded = '../../info/1067/1147.htm'
        if excluded in teacher_urls:
            teacher_urls.remove(excluded)
        print(teacher_urls)
        print(len(teacher_urls))
        for teacher_url in teacher_urls:
            if not teacher_url:
                continue  # anchor without an href attribute
            print(teacher_url)
            # Resolve relative links against the directory page; absolute
            # links are returned unchanged by urljoin.
            response = requests.get(urljoin(url, teacher_url))
            response.encoding = 'utf-8'
            # Parse once and reuse the result instead of calling getinfo twice.
            teacher_info = getinfo(teacher_url, response.text)
            if teacher_info == 0:
                continue
            final.append(teacher_info)
        # 'E:kg' (no slash) is relative to the current directory of drive E:,
        # so use the absolute 'E:/kg' like the excerpt earlier in the article.
        # NOTE(review): the conversion script reads E:/kg/triple/teacher.json;
        # copy the file there or align the two paths.
        with open(os.path.join('E:/kg', 'teacher.json'), 'w', encoding='utf-8') as opt_file:
            json.dump(final, opt_file, ensure_ascii=False)  # keep Chinese text readable
    except requests.exceptions.ConnectionError:
        print('Handle Exception')
然后将数据转为RDF数据,供后面进行SPARQL语句查询。
第一步:定义三元组的格式,如代码所示,其中‘%05d’会被教师的顺序所代替,“%s”会被原始数据代替
teacher_id = " \"%s\" ."
name = " \"%s\" ."
title = " \"%s\" ."
faculty = " \"%s\" ."
course = " \"%s\" ."
type = " \"%s\" ."
email = " \"%s\" ."
field = " \"%s\" ."
direction = " \"%s\" ."
homepage = " \"%s\" ."
第二步:逐行读取数据,然后按照定义好的三元组,将数据存为实体关系三元组。以教师——主讲课程和教师——类型三元组为例,每名教师可能教授几门课程,每门课程都要和对应的教师建立实体关系三元组,每名教师可能有多种类型,每种类型都要和对应的教师建立实体关系三元组
# Fragment of the per-teacher loop (index `i`): one triple per taught course.
for course_name in load_dict[i]['course']:
    course_str = course % (i + 1, course_name)
    print(course_str)
    triples.append(course_str)
    triples_sum += 1
# Likewise one triple per supervisor type.
for type_name in load_dict[i]['type']:
    type_str = type % (i + 1, type_name)
    print(type_str)
    triples.append(type_str)
    triples_sum += 1
第三步:在后续操作中,需要对问题文本进行分词、词性标注,为了避免教师姓名分词出现错误,要提前制作教师姓名词性字典,代码如下所示
# Output file for the teacher-name dictionary; the complete listing writes one
# "<name> nr" line per teacher into it.
file_3 = open('E:/kg/triple/teachers_name.txt', 'w', encoding='UTF-8')
完整代码如下:
#!/usr/bin/env python
#encoding=utf-8
import random
import sys
import os
import json
# N-Triples templates: "%05d" receives the teacher's sequence number and "%s"
# the (escaped) value.
# NOTE(review): the <...> IRIs of these templates were lost when the article
# was extracted (only the literal part survived, so `template % (i+1, value)`
# raised TypeError). The namespace below is a placeholder; it must be identical
# to the IRI bound to the `school:` prefix in the SPARQL queries.
SCHOOL_NS = "http://kg.course/school/"


def _template(predicate):
    """Return the triple template linking a teacher to *predicate*."""
    return "<" + SCHOOL_NS + "teacher_%05d> <" + SCHOOL_NS + predicate + "> \"%s\" ."


def _nt_escape(value):
    """Escape *value* so it is a valid N-Triples string literal body."""
    return (str(value)
            .replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
            .replace("\r", "\\r"))


# Predicates teacher_name/title/course/type/direction/homepage are the ones the
# SPARQL queries use; teacher_id/faculty/email/field follow the same pattern
# (presumed, confirm against the original data).
teacher_id = _template("teacher_id")
name = _template("teacher_name")
title = _template("teacher_title")
faculty = _template("teacher_faculty")
course = _template("teacher_course")
type = _template("teacher_type")  # name kept for compatibility; shadows the builtin
email = _template("teacher_email")
field = _template("teacher_field")
direction = _template("teacher_direction")
homepage = _template("teacher_homepage")

with open(os.path.join('E:/kg/triple', 'teacher.json'), 'r', encoding='utf-8') as load_f:
    load_dict = json.load(load_f)
print(len(load_dict))

triples = []
# The context manager guarantees the name dictionary is flushed and closed.
with open('E:/kg/triple/teachers_name.txt', 'w', encoding='UTF-8') as file_3:
    for number, record in enumerate(load_dict, start=1):
        triples.append(teacher_id % (number, _nt_escape(record['id'])))
        triples.append(name % (number, _nt_escape(record['name'])))
        # One "<name> nr" line per teacher: user-dictionary entry so the
        # segmenter keeps the name as a single person-name token.
        file_3.write(record['name'] + ' ' + 'nr' + '\n')
        triples.append(title % (number, _nt_escape(record['title'])))
        triples.append(faculty % (number, _nt_escape(record['faculty'])))
        # A teacher may teach several courses and hold several supervisor
        # types; each one becomes its own triple.
        for course_name in record['course']:
            triples.append(course % (number, _nt_escape(course_name)))
        for type_name in record['type']:
            triples.append(type % (number, _nt_escape(type_name)))
        triples.append(email % (number, _nt_escape(record['email'])))
        triples.append(field % (number, _nt_escape(record['field'])))
        triples.append(direction % (number, _nt_escape(record['direction'])))
        triples.append(homepage % (number, _nt_escape(record['homepage'])))

triples_sum = len(triples)
# The original name began with a stray quote character ("'E:/kg/..."), which is
# not a valid path.
filename = "E:/kg/triple/teacher_%d_triples.nt" % triples_sum
with open(filename, "w", encoding='utf-8') as fd:
    fd.write("\n".join(triples))
首先从官网下载Jena Fuseki,解压到指定位置安装。
然后启动cmd,进入Jena Fuseki所在的位置。然后启动Jena Fuseki,并创建数据库名称,命令如下图所示:
然后从浏览器中输入localhost:3030,进入Jena Fuseki。从dataset中选择刚刚建立的数据库testds,并将创建的RDF数据上传到数据库中。上传成功后,如下图所示
接下来就是设计问答系统
第一步:要通过SPARQLWrapper包来连接数据库
# Client for the local Fuseki server (port 3030), dataset "testds"
sparql_base = SPARQLWrapper("http://localhost:3030/testds")
第二步:设计SPARQL查询语句的模板
# SPARQL templates
# NOTE(review): the IRI after "PREFIX school:" was lost when the article was
# extracted, which left the declaration syntactically invalid. The IRI below is
# a placeholder; it must be the namespace used for the predicates in the
# uploaded RDF data.
SPARQL_PREAMBLE = u"""
PREFIX school: <http://kg.course/school/>
"""
# SELECT DISTINCT query; doubled braces are literal braces for str.format
SPARQL_TEM = (
    u"{preamble}\n"
    u"SELECT DISTINCT {select} WHERE {{\n"
    u"{expression}\n"
    u"}}\n"
)
# Counting query: COUNT({select}) is bound to the variable {count}
SPARQL_TEM_count = (
    u"{preamble}\n"
    u"SELECT (COUNT({select}) AS {count}) WHERE {{\n"
    u"{expression}\n"
    u"}}\n"
)
# Yes/no query
SPARQL_ASK_TEM = (
    u"{preamble}\n"
    u"ASK WHERE{{\n"
    u"{expression}\n"
    u"}}\n"
)
第三步:设计正则匹配
首先要对问句列表中的问句进行分词处理,为了避免教师姓名、学院、专业、职能部门名称分词有误,因此导入外部字典,代码如下所示:
# Load the external user dictionary so that the names listed in all_name.txt
# are kept as single tokens during segmentation
jieba.load_userdict("all_name.txt")
分词代码如下所示:
# Built-in demo questions
default_questions = [
    "天津大学有哪些学院?",
    "化工学院的简介是什么?",
    "化工学院电话?",
    "化工学院的网址是什么?",
    "天津大学有哪些职能部门?",
    "研究生院的介绍是什么?",
    "研究生院的电话是什么?",
    "研究生院的网址是什么?",
    "智算学部有哪些专业?",
    "教授类型有多少老师?",
    "老师类型有哪些?",
    "硕士生导师类型有哪些老师?",
    "硕士生导师类型有多少老师?",
    "动画专业的专业培养是什么?",
    "计算机专业的考研就业情况?",
    "王晓飞老师主讲了哪些课?",
    "王晓飞老师主讲了几门课?",
    "王晓飞老师的研究方向是什么?",
    "王晓飞老师是博士生导师吗?",
    "王晓飞老师的个人主页是什么?"
]
questions = list(default_questions)
# Segment and POS-tag every question; each (word, flag) pair becomes a Word
# whose token is the UTF-8 encoded text.
seg_lists = [
    [Word(word.encode("utf-8"), flag) for word, flag in pseg.cut(question)]
    for question in questions
]
然后设置关键词,使正则匹配时可以根据关键词匹配到正确的问题
# Keyword predicates from which the matching rules are assembled
# Accepted spellings of "master's supervisor"
tutor_type_master = (W("硕士生导师") | W("硕导")| W("硕士导师")| W("硕士生"))
# Accepted spellings of "doctoral supervisor"
tutor_type_PhD = (W("博士生导师") | W("博导")| W("博士导师")| W("博士生"))
# Any word whose POS tag is "nr" or "x"
teacher = (W(pos = "nr") | W(pos = "x"))
# Interrogatives: "who" / "which"
whose = (W("谁") | W("哪些"))
# Quantity words: "how many"
quantity = (W("多少") | W("几") | W("几门"))
# Institution kinds: college / administrative office
institution = (W("学院")|W("职能部门"))
# Any word tagged "nr"
# NOTE(review): presumably institution names are tagged nr in all_name.txt - confirm
college = (W(pos="nr"))
# Institution attributes: introduction / phone / website
attribute = (W("简介")|W("电话")|W("网址")|W("介绍"))
teacher_title=(W("老师"))
class_1=(W('类型'))
teacher_title_name=(W("教授"))
college_1=(W("智算学部"))
# Majors recognised by the major-related rules
major=(W('计算机专业')|W('动画专业')|W('软件工程专业'))
development=(W('培养'))
work=(W('考研')|W('就业'))
direction = (W("方向") | W("研究方向"))
page = (W("个人主页") | W("主页"))
接下来编写正则匹配规则。以第一个Rule为例,condition表示当问句中依次出现关键词whose和institution时(前面可以有任意词),就调用how_many_institution_question这个函数来生成查询语句
# Matching rules: each Rule pairs a word-level pattern (condition) with the
# function (action) that builds the SPARQL query for the matched words
rules = [
    # Which institutions (colleges or administrative offices) are there?
    Rule(condition = Star(Any(), greedy=False) + whose + institution, action=how_many_institution_question),
    # What is the phone / introduction / website of an institution?
    Rule(condition= college + Star(Any(), greedy=False) + attribute, action=what_attribute_institution_question),
    # Which teachers have a given supervisor type?
    Rule(condition = tutor_type_master + Star(Any(), greedy = False) + whose, action = who_is_master_tutor_question),
    # How many teachers have a given supervisor type?
    Rule(condition = tutor_type_master + Star(Any(), greedy = False) + quantity, action = how_many_teachers_are_master_tutor_question),
    # Which teacher titles exist?
    Rule(condition = teacher_title + Star(Any(), greedy = False)+ class_1,action=teacher_title_question),
    # How many teachers hold the title "professor"?
    Rule(condition = teacher_title_name + Star(Any(), greedy=False)+quantity,action=how_many_professor_question),
    # Which majors does the College of Intelligence and Computing offer?
    Rule(condition = college_1 + Star(Any(), greedy = False)+ whose,action=which_majors_in_cal_question),
    # What is the training programme of a major?
    Rule(condition = major + Star(Any(), greedy = False)+ development,action=what_development_question),
    # What are the postgraduate / employment prospects of a major?
    Rule(condition = major + Star(Any(), greedy = False)+ work,action=how_work_question),
    # Which courses does a teacher teach?
    Rule(condition=teacher + Star(Any(), greedy=False) + whose, action=what_courses_teacher_question),
    # How many courses does a teacher teach?
    Rule(condition=teacher + Star(Any(), greedy=False) + quantity, action=how_many_courses_teacher_question),
    # What is a teacher's research direction?
    Rule(condition=teacher + Star(Any(), greedy=False) + direction, action=what_direction_teacher_question),
    # Is a teacher a doctoral supervisor?
    Rule(condition=teacher + Star(Any(), greedy=False) + tutor_type_PhD, action=teacher_is_PhD_tutor_question),
    # What is a teacher's homepage?
    Rule(condition=teacher + Star(Any(), greedy=False) + page, action=what_homepage_teacher_question)
]
编写查询函数,部分查询函数如下所示:
# What is a teacher's research direction?
def what_direction_teacher_question(x):
    """Build the SELECT query for the research direction of the first person in *x*.

    :param x: list of matched Word objects
    :return: SPARQL query string, or None when no word is tagged ``nr``
    """
    person = next((w.token.decode("utf-8") for w in x if w.pos == "nr"), None)
    if person is None:
        return None
    pattern = (u"?teacherid school:teacher_name \"{person}\". "
               u"?teacherid school:teacher_direction ?x0.")
    return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select=u"?x0",
                             expression=INDENT + pattern.format(person=person))
# Is a teacher a doctoral supervisor?
def teacher_is_PhD_tutor_question(x):
    """Build the ASK query testing whether the first person in *x* has type "博导".

    :param x: list of matched Word objects
    :return: SPARQL query string, or None when no word is tagged ``nr``
    """
    person = next((w.token.decode("utf-8") for w in x if w.pos == "nr"), None)
    if person is None:
        return None
    pattern = (u"?teacherid school:teacher_name \"{person}\". "
               u"?teacherid school:teacher_type \"博导\".")
    return SPARQL_ASK_TEM.format(preamble=SPARQL_PREAMBLE,
                                 expression=INDENT + pattern.format(person=person))
# What is a teacher's homepage?
def what_homepage_teacher_question(x):
    """Build the SELECT query for the homepage of the first person in *x*.

    :param x: list of matched Word objects
    :return: SPARQL query string, or None when no word is tagged ``nr``
    """
    person = next((w.token.decode("utf-8") for w in x if w.pos == "nr"), None)
    if person is None:
        return None
    pattern = (u"?teacherid school:teacher_name \"{person}\". "
               u"?teacherid school:teacher_homepage ?x0.")
    return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select=u"?x0",
                             expression=INDENT + pattern.format(person=person))
完整代码如下:
# coding: utf-8
# standard import
import re
from refo import finditer, Predicate, Star, Any
import jieba.posseg as pseg
from jieba import suggest_freq
import jieba
from SPARQLWrapper import SPARQLWrapper, JSON
import io
import importlib,sys
importlib.reload(sys)
# Load the external user dictionary so that the names listed in all_name.txt
# are kept as single tokens during segmentation
jieba.load_userdict("all_name.txt")
# Client for the local Fuseki server (port 3030), dataset "testds"
sparql_base = SPARQLWrapper("http://localhost:3030/testds")
# SPARQL config
# NOTE(review): the IRI after "PREFIX school:" was lost when the article was
# extracted, which left the declaration syntactically invalid. The IRI below is
# a placeholder; it must be the namespace used for the predicates in the
# uploaded RDF data.
SPARQL_PREAMBLE = u"""
PREFIX school: <http://kg.course/school/>
"""
# SELECT DISTINCT query; doubled braces are literal braces for str.format
SPARQL_TEM = (
    u"{preamble}\n"
    u"SELECT DISTINCT {select} WHERE {{\n"
    u"{expression}\n"
    u"}}\n"
)
# Counting query: COUNT({select}) is bound to the variable {count}
SPARQL_TEM_count = (
    u"{preamble}\n"
    u"SELECT (COUNT({select}) AS {count}) WHERE {{\n"
    u"{expression}\n"
    u"}}\n"
)
# Yes/no query
SPARQL_ASK_TEM = (
    u"{preamble}\n"
    u"ASK WHERE{{\n"
    u"{expression}\n"
    u"}}\n"
)
# Prefix put in front of every graph pattern inside the braces
INDENT = " "
class Word(object):
    """A segmented token together with its part-of-speech tag."""

    def __init__(self, token, pos):
        self.token, self.pos = token, pos
class W(Predicate):
    """Predicate matching a Word by regular expressions on its token and POS tag."""

    def __init__(self, token=".*", pos=".*"):
        # "$" is appended so each pattern has to reach the end of the string
        # (re.match already anchors the start).
        self.token, self.pos = (re.compile(pattern + "$") for pattern in (token, pos))
        super(W, self).__init__(self.match)

    def match(self, word):
        """Return a truthy match object only when both token and POS match."""
        token_hit = self.token.match(word.token.decode('utf-8'))
        pos_hit = self.pos.match(word.pos)
        return pos_hit if token_hit else token_hit
class Rule(object):
    """Pair a word-level pattern with the function that turns matches into a query."""

    def __init__(self, condition=None, action=None):
        assert condition and action
        self.condition, self.action = condition, action

    def apply(self, sentence):
        """Run the action on all words of *sentence* covered by the condition.

        :param sentence: list of Word objects
        :return: whatever ``self.action`` returns for the collected words
        """
        # finditer yields every match of the condition; span() gives the
        # start/end indices of the matched words inside the sentence.
        matches = [
            word
            for found in finditer(self.condition, sentence)
            for word in sentence[slice(*found.span())]
        ]
        if __name__ == '__main__':
            print("----------applying %s----------" % self.action.__name__)
        return self.action(matches)
# Which institutions are there?
def how_many_institution_question(x):
    """Build the SELECT query listing all colleges or all administrative offices.

    :param x: list of matched Word objects
    :return: SPARQL query string, or None when *x* holds neither "学院" nor "职能部门"
    """
    kind_by_keyword = {"学院": "college", "职能部门": "office"}
    for w in x:
        kind = kind_by_keyword.get(w.token.decode("utf-8"))
        if kind is None:
            continue
        pattern = "?id school:{institution}_name ?x0".format(institution=kind)
        return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select="?x0",
                                 expression=INDENT + pattern)
    return None
# What is a given attribute of an institution?
def what_attribute_institution_question(x):
    """Build the SELECT query for one attribute of a college or office.

    :param x: list of matched Word objects
    :return: SPARQL query string, or None unless *x* contains both a known
        institution name and an attribute keyword
    """
    colleges = frozenset((
        "机械工程学院", "精密仪器与光电子工程学院", "电气自动化与信息工程学院",
        "微电子学院", "建筑工程学院", "建筑学院", "化工学院", "环境科学与工程学院",
        "管理与经济学部", "马克思主义学院", "理学院", "生命科学学院", "医学部",
        "教育学院", "国际教育学院",
    ))
    offices = frozenset((
        "纪委、监察室", "研究生院", "党委离退休工作处", "发展战略研究中心",
        "党委教师工作部、人事处", "科学技术发展研究院(医科建设办公室)",
        "国际合作与交流处、港澳台事务办公室", "审计处", "保卫处",
    ))
    attribute_by_keyword = {
        "简介": "introduction",
        "介绍": "introduction",
        "电话": "phone",
        "网址": "network",
    }
    name = kind = attribute = None
    for w in x:
        token = w.token.decode("utf-8")
        if token in colleges:
            name, kind = token, "college"
        if token in offices:
            name, kind = token, "office"
        attribute = attribute_by_keyword.get(token, attribute)
        # Emit the query as soon as both the institution and the attribute are known.
        if name is not None and attribute is not None:
            pattern = "?id school:{institution}_name \"{name}\".?id school:{institution}_{attribute} ?x0.".format(
                institution=kind, attribute=attribute, name=name)
            return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select="?x0",
                                     expression=INDENT + pattern)
    return None
# What is the training programme of a major?
def what_development_question(x):
    """Build the SELECT query for the cultivation text of the first major in *x*.

    :param x: list of matched Word objects
    :return: SPARQL query string, or None when *x* names no known major
    """
    majors = ("动画专业", "计算机专业", "软件工程专业")
    major = next((t for t in (w.token.decode("utf-8") for w in x) if t in majors), None)
    if major is None:
        return None
    pattern = "?projectid school:project_name \"{type}\". ?projectid school:project_cultivation ?x0.".format(type=major)
    return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select="?x0",
                             expression=INDENT + pattern)
# What are the postgraduate / employment prospects of a major?
def how_work_question(x):
    """Build the SELECT query for the work text of the first major in *x*.

    :param x: list of matched Word objects
    :return: SPARQL query string, or None when *x* names no known major
    """
    majors = ("计算机专业", "动画专业", "软件工程专业")
    major = next((t for t in (w.token.decode("utf-8") for w in x) if t in majors), None)
    if major is None:
        return None
    pattern = "?projectid school:project_name \"{type}\". ?projectid school:project_work ?x0.".format(type=major)
    return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select="?x0",
                             expression=INDENT + pattern)
# Which majors does the College of Intelligence and Computing offer?
def which_majors_in_cal_question(x):
    """Build the SELECT query listing every project (major) name.

    :param x: list of matched Word objects
    :return: SPARQL query string, or None when *x* holds neither trigger word
    """
    triggers = ("智算学部", "哪些")
    if not any(w.token.decode("utf-8") in triggers for w in x):
        return None
    pattern = "?projectid school:project_name ?x0"
    return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select="?x0",
                             expression=INDENT + pattern)
# Which teacher titles exist?
def teacher_title_question(x):
    """Build the SELECT query listing the distinct teacher titles.

    :param x: list of matched Word objects
    :return: SPARQL query string, or None when *x* holds neither trigger word
    """
    triggers = ("老师", "类型")
    if not any(w.token.decode("utf-8") in triggers for w in x):
        return None
    pattern = "?teacherid school:teacher_title ?x0."
    return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select="?x0",
                             expression=INDENT + pattern)
# How many teachers hold the title "professor"?
def how_many_professor_question(x):
    """Build the COUNT query over teachers whose title is "教授".

    :param x: list of matched Word objects
    :return: SPARQL query string, or None when *x* holds neither trigger word
    """
    triggers = ("教授", "多少")
    if not any(w.token.decode("utf-8") in triggers for w in x):
        return None
    pattern = "?teacherid school:teacher_title \"教授\". ?teacherid school:teacher_name ?teacher."
    return SPARQL_TEM_count.format(preamble=SPARQL_PREAMBLE, select="?teacher",
                                   count="?x0", expression=INDENT + pattern)
# Which teachers are master's supervisors?
def who_is_master_tutor_question(x):
    """Build the SELECT query listing the names of all master's supervisors.

    :param x: list of matched Word objects
    :return: SPARQL query string, or None when *x* contains none of the
        master's-supervisor keywords
    """
    # Every spelling the matching rule accepts for "master's supervisor".
    master_keywords = ("硕士生导师", "硕导", "硕士导师", "硕士生")
    if not any(w.token.decode("utf-8") in master_keywords for w in x):
        return None
    # Always query the canonical label. The previous code interpolated the
    # matched token into "{type}导师", so when the first hit was "哪些" (e.g. the
    # segmenter produced "硕士生导师" as one token) it asked for the
    # non-existent type "哪些导师".
    # NOTE(review): assumes the data stores the type as "硕士生导师" - confirm.
    pattern = "?x school:teacher_type \"硕士生导师\". ?x school:teacher_name ?x0."
    return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select="?x0",
                             expression=INDENT + pattern)
# How many teachers are master's supervisors?
def how_many_teachers_are_master_tutor_question(x):
    """Build the COUNT query over all master's supervisors.

    :param x: list of matched Word objects
    :return: SPARQL query string, or None when *x* contains none of the
        master's-supervisor keywords
    """
    # Every spelling the matching rule accepts for "master's supervisor".
    master_keywords = ("硕士生导师", "硕导", "硕士导师", "硕士生")
    if not any(w.token.decode("utf-8") in master_keywords for w in x):
        return None
    # Always query the canonical label. The previous code interpolated the
    # matched token into "{type}导师", so when the first hit was "多少" it
    # counted the non-existent type "多少导师".
    # NOTE(review): assumes the data stores the type as "硕士生导师" - confirm.
    pattern = "?teachers school:teacher_type \"硕士生导师\"."
    return SPARQL_TEM_count.format(preamble=SPARQL_PREAMBLE, select="?teachers",
                                   count="?x0", expression=INDENT + pattern)
def _teacher_expression(x, tail):
    """Return the indented graph pattern for the first person in *x*, or None.

    The pattern binds ``?teacherid`` to the teacher whose name is the first
    word tagged ``nr`` and is followed by *tail*, the question-specific part.
    """
    for w in x:
        if w.pos == "nr":
            head = u"?teacherid school:teacher_name \"{person}\". ".format(
                person=w.token.decode("utf-8"))
            return INDENT + head + tail
    return None


# Which courses does a teacher teach?
def what_courses_teacher_question(x):
    """Build the SELECT query for the courses taught by the first person in *x*.

    :param x: list of matched Word objects
    :return: SPARQL query string, or None when no word is tagged ``nr``
    """
    expression = _teacher_expression(x, u"?teacherid school:teacher_course ?x0.")
    if expression is None:
        return None
    return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select=u"?x0", expression=expression)


# How many courses does a teacher teach?
def how_many_courses_teacher_question(x):
    """Build the COUNT query over the courses taught by the first person in *x*.

    :param x: list of matched Word objects
    :return: SPARQL query string, or None when no word is tagged ``nr``
    """
    expression = _teacher_expression(x, u"?teacherid school:teacher_course ?courses.")
    if expression is None:
        return None
    return SPARQL_TEM_count.format(preamble=SPARQL_PREAMBLE, select=u"?courses",
                                   count=u"?x0", expression=expression)


# What is a teacher's research direction?
def what_direction_teacher_question(x):
    """Build the SELECT query for the research direction of the first person in *x*.

    :param x: list of matched Word objects
    :return: SPARQL query string, or None when no word is tagged ``nr``
    """
    expression = _teacher_expression(x, u"?teacherid school:teacher_direction ?x0.")
    if expression is None:
        return None
    return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select=u"?x0", expression=expression)


# Is a teacher a doctoral supervisor?
def teacher_is_PhD_tutor_question(x):
    """Build the ASK query testing whether the first person in *x* has type "博导".

    :param x: list of matched Word objects
    :return: SPARQL query string, or None when no word is tagged ``nr``
    """
    expression = _teacher_expression(x, u"?teacherid school:teacher_type \"博导\".")
    if expression is None:
        return None
    return SPARQL_ASK_TEM.format(preamble=SPARQL_PREAMBLE, expression=expression)


# What is a teacher's homepage?
def what_homepage_teacher_question(x):
    """Build the SELECT query for the homepage of the first person in *x*.

    :param x: list of matched Word objects
    :return: SPARQL query string, or None when no word is tagged ``nr``
    """
    expression = _teacher_expression(x, u"?teacherid school:teacher_homepage ?x0.")
    if expression is None:
        return None
    return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select=u"?x0", expression=expression)
def encode(s):
    """Return the code point of each character of *s* in binary, space-separated."""
    return ' '.join(format(ord(ch), 'b') for ch in s)
if __name__ == "__main__":
    # Demo driver: tokenize the built-in questions, apply every rule to each
    # question, run the generated SPARQL queries and log everything to
    # result.txt.
    default_questions = [
        "天津大学有哪些学院?",
        "化工学院的简介是什么?",
        "化工学院电话?",
        "化工学院的网址是什么?",
        "天津大学有哪些职能部门?",
        "研究生院的介绍是什么?",
        "研究生院的电话是什么?",
        "研究生院的网址是什么?",
        "智算学部有哪些专业?",
        "教授类型有多少老师?",
        "老师类型有哪些?",
        "硕士生导师类型有哪些老师?",
        "硕士生导师类型有多少老师?",
        "动画专业的专业培养是什么?",
        "计算机专业的考研就业情况?",
        "王晓飞老师主讲了哪些课?",
        "王晓飞老师主讲了几门课?",
        "王晓飞老师的研究方向是什么?",
        "王晓飞老师是博士生导师吗?",
        "王晓飞老师的个人主页是什么?"
    ]
    questions = default_questions[0:]
    seg_lists = []
    # Segment and POS-tag each question; every (word, flag) pair becomes a Word
    # whose token is the UTF-8 encoded text.
    for question in questions:
        words = pseg.cut(question)
        seg_list = [Word(word.encode("utf-8"), flag) for word, flag in words]
        seg_lists.append(seg_list)

    # Keyword predicates from which the matching rules are assembled
    tutor_type_master = (W("硕士生导师") | W("硕导") | W("硕士导师") | W("硕士生"))
    tutor_type_PhD = (W("博士生导师") | W("博导") | W("博士导师") | W("博士生"))
    teacher = (W(pos="nr") | W(pos="x"))
    whose = (W("谁") | W("哪些"))
    quantity = (W("多少") | W("几") | W("几门"))
    institution = (W("学院") | W("职能部门"))
    college = (W(pos="nr"))
    attribute = (W("简介") | W("电话") | W("网址") | W("介绍"))
    teacher_title = (W("老师"))
    class_1 = (W('类型'))
    teacher_title_name = (W("教授"))
    college_1 = (W("智算学部"))
    major = (W('计算机专业') | W('动画专业') | W('软件工程专业'))
    development = (W('培养'))
    work = (W('考研') | W('就业'))
    direction = (W("方向") | W("研究方向"))
    page = (W("个人主页") | W("主页"))

    # Matching rules: pattern over words -> function building the SPARQL query
    rules = [
        # Which institutions (colleges or administrative offices) are there?
        Rule(condition=Star(Any(), greedy=False) + whose + institution, action=how_many_institution_question),
        # What is the phone / introduction / website of an institution?
        Rule(condition=college + Star(Any(), greedy=False) + attribute, action=what_attribute_institution_question),
        # Which teachers have a given supervisor type?
        Rule(condition=tutor_type_master + Star(Any(), greedy=False) + whose, action=who_is_master_tutor_question),
        # How many teachers have a given supervisor type?
        Rule(condition=tutor_type_master + Star(Any(), greedy=False) + quantity, action=how_many_teachers_are_master_tutor_question),
        # Which teacher titles exist?
        Rule(condition=teacher_title + Star(Any(), greedy=False) + class_1, action=teacher_title_question),
        # How many teachers hold the title "professor"?
        Rule(condition=teacher_title_name + Star(Any(), greedy=False) + quantity, action=how_many_professor_question),
        # Which majors does the College of Intelligence and Computing offer?
        Rule(condition=college_1 + Star(Any(), greedy=False) + whose, action=which_majors_in_cal_question),
        # What is the training programme of a major?
        Rule(condition=major + Star(Any(), greedy=False) + development, action=what_development_question),
        # What are the postgraduate / employment prospects of a major?
        Rule(condition=major + Star(Any(), greedy=False) + work, action=how_work_question),
        # Which courses does a teacher teach?
        Rule(condition=teacher + Star(Any(), greedy=False) + whose, action=what_courses_teacher_question),
        # How many courses does a teacher teach?
        Rule(condition=teacher + Star(Any(), greedy=False) + quantity, action=how_many_courses_teacher_question),
        # What is a teacher's research direction?
        Rule(condition=teacher + Star(Any(), greedy=False) + direction, action=what_direction_teacher_question),
        # Is a teacher a doctoral supervisor?
        Rule(condition=teacher + Star(Any(), greedy=False) + tutor_type_PhD, action=teacher_is_PhD_tutor_question),
        # What is a teacher's homepage?
        Rule(condition=teacher + Star(Any(), greedy=False) + page, action=what_homepage_teacher_question)
    ]

    # The context manager closes result.txt even if a query raises; the
    # original handle was never closed.
    with open('result.txt', 'w', encoding='UTF-8') as file_3:
        # matching and querying
        for seg in seg_lists:
            # Echo the segmented question, one token per line, and log it.
            question = [str(s.token, encoding='utf-8') for s in seg]
            for token in question:
                print(token)
            file_3.write(''.join(question))
            print()

            for rule in rules:
                query = rule.apply(seg)
                if not query:
                    continue  # this rule produced no query for the question
                print(query)
                file_3.write(query + '\n')

                sparql_base.setQuery(query)
                sparql_base.setReturnFormat(JSON)
                results = sparql_base.query().convert()

                if "results" in results:
                    # SELECT / COUNT query: one binding per answer row.
                    bindings = results["results"]["bindings"]
                    if not bindings:
                        print("No answer found :(")
                        print('\n')
                        continue
                    for result in bindings:
                        print("Result: ", result["x0"]["value"])
                        file_3.write("Result: " + result["x0"]["value"] + '\n')
                    print('\n')
                    file_3.write('\n')
                else:
                    # ASK query: the response carries a single boolean.
                    print("Result: ", results["boolean"])
                    answer = "True" if str(results["boolean"]) == "True" else "False"
                    file_3.write(u"Result: " + answer + '\n')
其余的数据处理过程类似,希望可以与大家互相交流~~
参考资料:
天津大学《知识工程》课程;
https://blog.csdn.net/u010744489/article/details/105923730。