今天上线发现自己竟然涨粉了,也给了我更大的动力将这一方面继续记录下去,这里是对另外一个项目代码的解读,个人认为是对前面连续几篇中文医疗知识图谱的解读的一个补充,有着拨云见日的作用。
项目来源是GitHub上面刘老师做的一个基于医疗知识图谱的问答机器人,本文主要关注点放在建立知识图谱这一侧。这个项目还将数据集也开源了,放在dict和data文件夹下,让我觉得真的很难得,得给老师一个star!项目地址:https://github.com/liuhuanyong/QASystemOnMedicalKG
然后我们开始对刘老师项目内与知识图谱构建方向有关的代码进行一个解读。
目录
data_spider.py
build_data.py
MedicalGraph类:
collect_medical():
get_inspect():
max_cut.py
CutWords类:
load_words():
max_forward_cut():
max_backward_cut():
max_biward_cut():
结语
首先是数据获取阶段,解读刘老师的爬虫项目。
import urllib.request
import urllib.parse
from lxml import etree
import pymongo
import re
class CrimeSpider:
def __init__(self):
self.conn = pymongo.MongoClient()
self.db = self.conn['medical']
self.col = self.db['data']
'''根据url,请求html'''
def get_html(self, url):
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/51.0.2704.63 Safari/537.36'}
req = urllib.request.Request(url=url, headers=headers)
res = urllib.request.urlopen(req)
html = res.read().decode('gbk')
return html
'''url解析'''
def url_parser(self, content):
selector = etree.HTML(content)
urls = ['http://www.anliguan.com' + i for i in selector.xpath('//h2[@class="item-title"]/a/@href')]
return urls
'''测试'''
def spider_main(self):
for page in range(1, 11000):
try:
basic_url = 'http://jib.xywy.com/il_sii/gaishu/%s.htm'%page
cause_url = 'http://jib.xywy.com/il_sii/cause/%s.htm'%page
prevent_url = 'http://jib.xywy.com/il_sii/prevent/%s.htm'%page
symptom_url = 'http://jib.xywy.com/il_sii/symptom/%s.htm'%page
inspect_url = 'http://jib.xywy.com/il_sii/inspect/%s.htm'%page
treat_url = 'http://jib.xywy.com/il_sii/treat/%s.htm'%page
food_url = 'http://jib.xywy.com/il_sii/food/%s.htm'%page
drug_url = 'http://jib.xywy.com/il_sii/drug/%s.htm'%page
data = {}
data['url'] = basic_url
data['basic_info'] = self.basicinfo_spider(basic_url)
data['cause_info'] = self.common_spider(cause_url)
data['prevent_info'] = self.common_spider(prevent_url)
data['symptom_info'] = self.symptom_spider(symptom_url)
data['inspect_info'] = self.inspect_spider(inspect_url)
data['treat_info'] = self.treat_spider(treat_url)
data['food_info'] = self.food_spider(food_url)
data['drug_info'] = self.drug_spider(drug_url)
print(page, basic_url)
self.col.insert(data)
except Exception as e:
print(e, page)
return
'''基本信息解析'''
def basicinfo_spider(self, url):
html = self.get_html(url)
selector = etree.HTML(html)
title = selector.xpath('//title/text()')[0]
category = selector.xpath('//div[@class="wrap mt10 nav-bar"]/a/text()')
desc = selector.xpath('//div[@class="jib-articl-con jib-lh-articl"]/p/text()')
ps = selector.xpath('//div[@class="mt20 articl-know"]/p')
infobox = []
for p in ps:
info = p.xpath('string(.)').replace('\r','').replace('\n','').replace('\xa0', '').replace(' ', '').replace('\t','')
infobox.append(info)
basic_data = {}
basic_data['category'] = category
basic_data['name'] = title.split('的简介')[0]
basic_data['desc'] = desc
basic_data['attributes'] = infobox
return basic_data
'''treat_infobox治疗解析'''
def treat_spider(self, url):
html = self.get_html(url)
selector = etree.HTML(html)
ps = selector.xpath('//div[starts-with(@class,"mt20 articl-know")]/p')
infobox = []
for p in ps:
info = p.xpath('string(.)').replace('\r','').replace('\n','').replace('\xa0', '').replace(' ', '').replace('\t','')
infobox.append(info)
return infobox
'''treat_infobox治疗解析'''
def drug_spider(self, url):
html = self.get_html(url)
selector = etree.HTML(html)
drugs = [i.replace('\n','').replace('\t', '').replace(' ','') for i in selector.xpath('//div[@class="fl drug-pic-rec mr30"]/p/a/text()')]
return drugs
'''food治疗解析'''
def food_spider(self, url):
html = self.get_html(url)
selector = etree.HTML(html)
divs = selector.xpath('//div[@class="diet-img clearfix mt20"]')
try:
food_data = {}
food_data['good'] = divs[0].xpath('./div/p/text()')
food_data['bad'] = divs[1].xpath('./div/p/text()')
food_data['recommand'] = divs[2].xpath('./div/p/text()')
except:
return {}
return food_data
'''症状信息解析'''
def symptom_spider(self, url):
html = self.get_html(url)
selector = etree.HTML(html)
symptoms = selector.xpath('//a[@class="gre" ]/text()')
ps = selector.xpath('//p')
detail = []
for p in ps:
info = p.xpath('string(.)').replace('\r','').replace('\n','').replace('\xa0', '').replace(' ', '').replace('\t','')
detail.append(info)
symptoms_data = {}
symptoms_data['symptoms'] = symptoms
symptoms_data['symptoms_detail'] = detail
return symptoms, detail
'''检查信息解析'''
def inspect_spider(self, url):
html = self.get_html(url)
selector = etree.HTML(html)
inspects = selector.xpath('//li[@class="check-item"]/a/@href')
return inspects
'''通用解析模块'''
def common_spider(self, url):
html = self.get_html(url)
selector = etree.HTML(html)
ps = selector.xpath('//p')
infobox = []
for p in ps:
info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ','').replace('\t', '')
if info:
infobox.append(info)
return '\n'.join(infobox)
'''检查项抓取模块'''
def inspect_crawl(self):
for page in range(1, 3685):
try:
url = 'http://jck.xywy.com/jc_%s.html'%page
html = self.get_html(url)
data = {}
data['url']= url
data['html'] = html
self.db['jc'].insert(data)
print(url)
except Exception as e:
print(e)
# Script entry: build the spider (this opens the MongoDB client) and crawl the
# examination pages. These statements are not guarded by
# `if __name__ == '__main__'`, so they also run when the module is imported.
handler = CrimeSpider()
handler.inspect_crawl()
在这请忽略类的名称叫CrimeSpider,这是因为刘老师之前还做了一个完整的和司法案件有关的知识图谱,加上爬虫之间大同小异,改一改就能爬取另外一个网站。对于该爬虫文件我不做过多解读,详细情况可以参照本人另一篇博客:《Python爬虫编程基础 Python入门+数据分析 实训笔记分享》(https://blog.csdn.net/chen_nnn/article/details/122979611)。该笔记是笔者通过哔哩哔哩弹幕网学习爬虫编程基础时整理的实训笔记,视频来源:https://www.bilibili.com/video/BV12E411A7ZQ
这里我们主要关注一下,刘老师爬取的网站的情况,从代码中可以看出数据的原网站是寻医问药网的疾病百科。
可以看到该网站的疾病百科从HTML的角度来看,结构较为清晰,比较适合爬取,之后的数据处理的工作量可以大大减少。爬虫将所有与该疾病相关的信息都进行爬取和存储,最终一共爬取了8807条和疾病有关的数据,里面的数据存储的结构如下。
该文件是将爬虫爬取到的数据进行规整,实现上图所示的结构。
import pymongo
from lxml import etree
import os
from max_cut import *
class MedicalGraph:
    """Normalises crawled disease documents from MongoDB into cleaned records."""

    def __init__(self):
        """Open MongoDB, load the stop-word list and build the key mapping."""
        # MongoClient() is called without user name or password.
        self.conn = pymongo.MongoClient()
        # dirname() is separator-agnostic; splitting on '/' yields an empty
        # (i.e. relative) directory on Windows and for files directly under '/'.
        cur_dir = os.path.dirname(os.path.abspath(__file__))
        self.db = self.conn['medical']
        self.col = self.db['data']
        # NOTE(review): first_name.txt is assumed to be UTF-8 encoded - confirm.
        with open(os.path.join(cur_dir, 'first_name.txt'), encoding='utf-8') as f:
            first_words = [line.strip() for line in f]
        alphabets = list('abcdefghijklmnopqrstuvwxyz')
        nums = list('1234567890')
        # A symptom whose first character is in this list is discarded
        # by collect_medical.
        self.stop_words = first_words + alphabets + nums
        # Chinese field label -> English key used in the stored record.
        self.key_dict = {
            '医保疾病': 'yibao_status',
            '患病比例': 'get_prob',
            '易感人群': 'easy_get',
            '传染方式': 'get_way',
            '就诊科室': 'cure_department',
            '治疗方式': 'cure_way',
            '治疗周期': 'cure_lasttime',
            '治愈率': 'cured_prob',
            '药品明细': 'drug_detail',
            '药品推荐': 'recommand_drug',
            '推荐': 'recommand_eat',
            '忌食': 'not_eat',
            '宜食': 'do_eat',
            '症状': 'symptom',
            '检查': 'check',
            '成因': 'cause',
            '预防措施': 'prevent',
            '所属类别': 'category',
            '简介': 'desc',
            '名称': 'name',
            '常用药品': 'common_drug',
            '治疗费用': 'cost_money',
            '并发症': 'acompany',
        }
        self.cuter = CutWords()
pymongo是Python操作MongoDB数据库的驱动库。首先按照刘老师注释所言,建立一个无用户名密码的连接,然后定义一个变量cur_dir,用来保存当前脚本所在的目录:os.path.abspath(__file__)获取的是当前执行脚本的完整路径(只有以脚本方式执行的时候__file__才存在,如果在交互模式或者终端(terminal)的解释器中直接运行,会报没有__file__这个错误),再去掉路径的最后一段(文件名)就得到了所在目录。然后读取该目录下first_name.txt中的内容,构建一个first_words变量(os.path.join的作用是连接两个或更多的路径名组件:1.组件之间会自动补上路径分隔符'/';2.如果有一个组件是一个绝对路径,则在它之前的所有组件均会被舍弃;3.如果最后一个组件为空,则生成的路径以一个'/'分隔符结尾)。
但是这个first_name.txt里面的内容却并没有在文件中给出,所以我们也不知道是以一个怎样的逻辑,但是我们有了最后构建好的json文件,所以这部分我们就当做背景知识的学习。
def collect_medical(self):
    """Normalise every crawled document and insert it into ``db['medical']``.

    For each document from ``self.col``: build a record keyed by the Chinese
    field labels, map the labels to English keys through ``self.key_dict``
    (labels without a mapping are dropped), then insert the result. Documents
    with an empty name are skipped. Insert failures are printed and do not
    stop the run.
    """
    strip_keys = ('yibao_status', 'get_prob', 'easy_get', 'get_way', 'cure_lasttime', 'cured_prob')
    split_keys = ('cure_department', 'cure_way', 'common_drug')
    count = 0
    for item in self.col.find():
        basic_info = item['basic_info']
        name = basic_info['name']
        if not name:
            continue

        data = {}
        # Basic information.
        data['名称'] = name
        data['简介'] = '\n'.join(basic_info['desc']).replace('\r\n\t', '').replace('\r\n\n\n', '').replace(' ', '').replace('\r\n', '\n')
        data['所属类别'] = basic_info['category']
        # Cause and prevention.
        data['预防措施'] = item['prevent_info']
        data['成因'] = item['cause_info']
        # Symptoms: element 0 of symptom_info is the symptom-name list.
        # Empty strings are skipped so that indexing s[0] cannot raise.
        data['症状'] = list({s for s in item['symptom_info'][0]
                           if s and s[0] not in self.stop_words})
        # "label:value" attributes; anything not splitting into two is ignored.
        # NOTE(review): only the ASCII colon is recognised here - confirm the
        # crawled attribute text does not use the full-width colon.
        for attr in basic_info['attributes']:
            attr_pair = attr.split(':')
            if len(attr_pair) == 2:
                data[attr_pair[0]] = attr_pair[1]
        # Examinations: resolve each crawled URL to a name, dropping misses.
        jcs = []
        for inspect in item['inspect_info']:
            jc_name = self.get_inspect(inspect)
            if jc_name:
                jcs.append(jc_name)
        data['检查'] = jcs
        # Food (an empty dict means the page had no diet sections).
        food_info = item['food_info']
        if food_info:
            data['宜食'] = food_info['good']
            data['忌食'] = food_info['bad']
            data['推荐'] = food_info['recommand']
        # Drugs: the recommended name is the text inside the last parentheses.
        drug_info = item['drug_info']
        data['药品推荐'] = list({d.split('(')[-1].replace(')', '') for d in drug_info})
        data['药品明细'] = drug_info

        data_modify = {}
        for attr, value in data.items():
            attr_en = self.key_dict.get(attr)
            if not attr_en:
                continue
            if attr_en in strip_keys:
                value = value.replace(' ', '').replace('\t', '')
            elif attr_en in split_keys:
                value = [part for part in value.split(' ') if part]
            elif attr_en == 'acompany':
                # Segment the complication text and keep multi-character words.
                value = [w for w in self.cuter.max_biward_cut(value) if len(w) > 1]
            data_modify[attr_en] = value

        try:
            # insert_one replaces Collection.insert, which newer PyMongo lacks.
            self.db['medical'].insert_one(data_modify)
            count += 1
            print(count)
        except Exception as e:
            print(e)
这里的find()是pymongo中Collection(集合)对象的方法,而不是字符串的str.find():不带参数调用时,self.col.find()返回一个游标(Cursor),对它进行遍历就可以依次取出集合中的每一个文档。取出的每个文档item以及其中的basic_info都以字典的形式储存数据。
由于在该for循环中,定义了data字典,之后将有关该疾病的各种信息以键值对的形式存储到字典当中。首先是名称、简介信息(需要对其做一些修正然后才能保存)、疾病类别、预防措施、成因、症状。然后对于其他信息格式如XXX:XXX也同样进行存储,在冒号之后还有并列的情况之后处理。对于inspects中包含多个项目,在data['检查']下以列表的形式存储。最后是食物和药品。将这一切都存储到data中去后,在最后我们对data的格式进行最后一次修正,使用之前设定好的英文名。然后将其保存到数据库当中。
def get_inspect(self, url):
    """Return the examination name stored for *url* in ``db['jc']``, or ``''``.

    ``''`` is returned both when no document matches and when the matching
    document has no ``name`` field; the caller drops empty names.
    """
    res = self.db['jc'].find_one({'url': url})
    if not res:
        return ''
    # .get avoids a KeyError for documents that carry no 'name' field.
    return res.get('name', '')
该函数在上一个函数当中使用,目的是根据检查项的url在jc集合中查到该检查项的名称。find_one()方法的作用是返回满足指定查询条件的一个文档;如果有多个文档满足查询,则返回按自然顺序(natural order,通常反映文档在磁盘上的存储顺序)排在最前面的那一个,对于固定集合(capped collection),自然顺序与插入顺序相同;如果没有文档满足查询,在pymongo中该方法返回None。
class CutWords:
    """Dictionary-based word segmenter."""

    def __init__(self):
        """Load the word list and its longest word length from ``./disease.txt``."""
        loaded = self.load_words('./disease.txt')
        self.word_dict = loaded[0]
        self.max_wordlen = loaded[1]
开始的定义很简单,就是加载保存疾病名称的文本文档。
def load_words(self, dict_path):
    """Read one word per line from *dict_path*.

    Returns ``(words, max_len)``: every non-blank line (stripped) in file
    order, and the length of the longest word. Every word is kept; only
    ``max_len`` is conditional on a word being longer than the ones before.
    """
    words = []
    max_len = 0
    # NOTE(review): the dictionary file is assumed to be UTF-8 - confirm.
    with open(dict_path, encoding='utf-8') as f:
        for line in f:
            wd = line.strip()
            if not wd:
                continue
            max_len = max(max_len, len(wd))
            words.append(wd)
    return words, max_len
list() 方法用于将元组或字符串转换为列表,所以这里是想要构建一个空列表。然后定义最大字长变量,在字典遍历的过程中,不断将疾病名称保存到words中,同时更新最大字长。
def max_forward_cut(self, sent):
    """Segment *sent* left to right, preferring the longest dictionary word.

    At each position, candidates of length ``self.max_wordlen`` down to 1 are
    looked up in ``self.word_dict``; the first hit is emitted. When nothing
    matches, the single character at that position is emitted instead.
    Returns the list of pieces in sentence order.
    """
    pieces = []
    pos = 0
    total = len(sent)
    while pos < total:
        # Fallback when no dictionary word starts here: one character.
        step = 1
        piece = sent[pos]
        for width in range(self.max_wordlen, 0, -1):
            candidate = sent[pos: pos + width]
            if candidate in self.word_dict:
                step = width
                piece = candidate
                break
        pieces.append(piece)
        pos += step
    return pieces
该函数实现的是正向最大匹配(Forward Maximum Matching)。从左向右取待切分汉语句的m个字符作为匹配字段,m为词典中最长词条的字符数(即max_wordlen)。查找词典并进行匹配:若匹配成功,则将这个匹配字段作为一个词切分出来,加入cutlist,并把index向后移动相应的长度;若匹配不成功,则将字段长度减一继续匹配。如果直到长度为1都没有匹配成功,就把当前位置的单个字符作为一个词切分出来,index后移一位。如此循环直到处理完整个句子,最后返回切分结果列表cutlist。
def max_backward_cut(self, sent):
    """Segment *sent* right to left, preferring the longest dictionary word.

    At each end position, candidates of length ``self.max_wordlen`` down to 1
    that end there are looked up in ``self.word_dict``; the first hit is
    emitted. When nothing matches, the single character before the position
    is emitted. Returns the pieces in sentence order.
    """
    cutlist = []
    index = len(sent)
    while index > 0:
        # Fallback when no dictionary word ends here: one character.
        step = 1
        word = sent[index - 1]
        # Cap the candidate length at `index` so the slice start never goes
        # negative; a negative start wraps around, can match a mere suffix,
        # and would then skip the unprocessed prefix of the sentence.
        for size in range(min(self.max_wordlen, index), 0, -1):
            cand_word = sent[index - size: index]
            if cand_word in self.word_dict:
                word = cand_word
                step = size
                break
        cutlist.append(word)
        index -= step
    # Pieces were collected from the end of the sentence; restore the order.
    return cutlist[::-1]
该函数实现的是逆向最大匹配(Backward Maximum Matching)。从右向左取待切分汉语句的m个字符作为匹配字段,m为词典中最长词条的字符数。查找词典并进行匹配。逻辑和前面的正向最大匹配类似,只不过是从句尾向句首扫描:如果某个位置匹配失败,则把剩余部分的最后一个字符单独切分出来;由于切分结果是从后往前收集的,最后需要用cutlist[::-1]将列表反转后再返回。
def max_biward_cut(self, sent):
    """Pick the better of the forward and backward segmentations of *sent*.

    The segmentation with fewer pieces wins. On a tie in piece count, the
    backward result is returned only when the forward result has strictly
    more single-character pieces; otherwise the forward result is returned.
    """
    forward = self.max_forward_cut(sent)
    backward = self.max_backward_cut(sent)

    def single_chars(pieces):
        # Number of one-character pieces in a segmentation.
        return sum(1 for piece in pieces if len(piece) == 1)

    if len(forward) != len(backward):
        return forward if len(forward) < len(backward) else backward
    if single_chars(forward) > single_chars(backward):
        return backward
    return forward
双向最大匹配法是将正向最大匹配法得到的分词结果和逆向最大匹配法得到的结果进行比较,从而决定正确的分词方法。运用启发式规则:1.如果正反向分词结果词数不同,则取分词数量较少的那个。 2.如果分词结果词数相同 a.分词结果相同,就说明没有歧义,可返回任意一个。 b.分词结果不同,返回其中单字较少的那个。函数中包含的compute_single()函数用来计算分词结果列表当中单个字符的个数。
到此为止将刘老师项目当中对于数据预处理部分的代码解读完成,之后再更新对于知识图谱构建的相关部分。