Continued from Cai Ge Learns Knowledge Graphs (via the "QA System Based on a Medical Knowledge Graph") (Part 2).
This is the project's file structure. The diagram is borrowed from a detailed source-code walkthrough of the medical-KG QA system, lightly modified to match the current contents.
├── QASystemOnMedicalKG
│   ├── data
│   │   └── medical.json          # knowledge data
│   ├── dict
│   │   ├── check.txt             # diagnostic-check entity dictionary
│   │   ├── deny.txt              # negation-word dictionary
│   │   ├── department.txt        # medical-department entity dictionary
│   │   ├── disease.txt           # disease entity dictionary
│   │   ├── drug.txt              # drug entity dictionary
│   │   ├── food.txt              # food entity dictionary
│   │   ├── producer.txt          # on-sale drug dictionary
│   │   └── symptom.txt           # symptom entity dictionary
│   ├── document                  # documentation
│   ├── img                       # images
│   ├── prepare_data
│   │   ├── build_data.py         # database-manipulation script
│   │   ├── data_spider.py        # data-collection (crawler) script
│   │   └── max_cut.py            # dictionary-based maximum forward/backward matching
│   ├── answer_search.py          # answer lookup and reply
│   ├── build_medicalgraph.py     # imports the structured JSON data into Neo4j
│   ├── chatbot_graph.py          # QA program entry script
│   ├── question_classifier.py    # question-type classification script
│   └── question_parser.py        # question-parsing script
Let's go through the files one by one.
1. data\medical.json
This file holds the pre-collected disease knowledge data. Let's open it and take a look. Create a new file openjson.py with the following contents:
with open(r'C:\QASystemOnMedicalKG\data\medical.json', 'r', encoding='utf8') as js:
    for js_data in js:
        print(js_data)
Run it, and you can see the data in the JSON file looks like this:
{ "_id" : { "$oid" : "5bb57901831b973a137e614d" },
"name" : "病毒性肠炎",
"desc" : "病毒性肠炎(viralgastroenteritis)又称病毒性腹泻......。",
"category" : [ "疾病百科", "内科", "消化内科" ],
"prevent" : "及早发现和隔离病人......。",
"cause" : "......但多数肠粘膜细胞尚正常。肠绒毛上皮细胞内空泡变性,内质网中有多量轮状病毒颗粒。",
"symptom" : [ "恶心与呕吐", "驻站医", "发烧", "腹泻", "腹痛", "慢性腹痛" ],
"yibao_status" : "否",
"get_prob" : "0.001%",
"easy_get" : "无特定人群",
"get_way" : "无传染性",
"acompany" : [ "缺铁性贫血" ],
"cure_department" : [ "内科", "消化内科" ],
"cure_way" : [ "药物治疗", "康复治疗" ],
"cure_lasttime" : "7-14天",
"cured_prob" : "85%-95%",
"common_drug" : [ "盐酸左氧氟沙星胶囊", "依托红霉素片" ],
"cost_money" : "根据不同医院,收费标准不一致,市三甲医院约(1000——5000元)",
"check" : [ "便常规", "纤维肠镜", "小肠镜检查", "红细胞计数(RBC)", "细菌学检验", "粪酸碱度", "血常规", "粪细菌培养", "血小板计数(PLT)" ],
"do_eat" : [ "鸭蛋", "鸡蛋", "鸡肉", "芝麻" ],
"not_eat" : [ "杏仁", "腐竹", "白扁豆", "沙丁鱼" ],
"recommand_eat" : [ "冬瓜粒杂锦汤", "土豆肉末粥", "丁香酸梅汤" ],
"recommand_drug" : [ "司帕沙星片", "清泻丸", "复方黄连素片", "枯草杆菌二联活菌肠溶胶囊", "盐酸左氧氟沙星胶囊", "司帕沙星分散片",..... "SP", "依托红霉素片", "苦木注射液", "氧氟沙星片" ],
"drug_detail" : [ "联邦左福康盐酸左氧氟沙星胶(盐酸左氧氟沙星胶囊)", "广东华南依托红霉素片(依托红霉素片)", "桂林三金复方红根草片(复方红根草片)", ........"万年青苦木注射液(苦木注射液)", "惠州九惠炎宁颗粒(炎宁颗粒)", "浙江得恩德氧氟沙星片(氧氟沙星片)", "吉林跨海生化止痢宁片(止痢宁片)" ] }
{......}
{......}
We can infer that each disease is one dict-style record, each record holds 24 key-value pairs, and the meaning of each key can mostly be guessed from its name.
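To check that guess, here is a minimal sketch of mine (assuming the file stores one JSON object per line, which is what the line-by-line read above suggests) that parses every record and counts its keys:
import json

with open(r'C:\QASystemOnMedicalKG\data\medical.json', 'r', encoding='utf8') as f:
    for line in f:
        record = json.loads(line)           # one disease record per line
        print(record['name'], len(record))  # disease name and its number of key-value pairs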
2. The text files in the dict folder are the various entity dictionaries, plus one negation-word list, deny.txt. You can open them one by one to take a look; they appear to hold one entity per line, as the loading sketch below assumes.
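For instance, a quick sketch (my assumption: one entity per line) that loads one of the dictionaries into a set:
with open(r'C:\QASystemOnMedicalKG\dict\disease.txt', encoding='utf8') as f:
    diseases = set(line.strip() for line in f if line.strip())
print(len(diseases))  # number of disease entities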
3. prepare_data\data_spider.py, the data-collection script. The spider_main() method stores the crawled disease information in the 'data' collection of the 'medical' database; the inspect_crawl() method stores the URL and HTML source of each inspection-item page in the 'jc' collection of the same database.
Here is the source code with my annotations:
#!/usr/bin/env python3
# coding: utf-8
# File: data_spider.py
# Author: lhy
# Date: 18-10-3
import urllib.request  # crawler tooling
import urllib.parse    # crawler tooling
from lxml import etree  # HTML parsing
import pymongo  # MongoDB is one of the most popular NoSQL databases; it stores BSON (JSON-like) documents. Install it and start the service before running this script.
import re

'''Crime-case collection based on the judicial net'''  # this docstring is almost certainly a copy-paste leftover

class CrimeSpider:
    def __init__(self):
        self.conn = pymongo.MongoClient()  # open the database connection
        self.db = self.conn['medical']     # get the 'medical' database
        self.col = self.db['data']         # get the 'data' collection

    '''Request the HTML for a given url'''
    # fetch a page's HTML source
    def get_html(self, url):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/51.0.2704.63 Safari/537.36'}
        req = urllib.request.Request(url=url, headers=headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode('gbk')  # the site is GBK-encoded
        return html

    '''url parsing'''
    # Presumably meant to follow links and collect every case page, but http://www.anliguan.com looks like the wrong site. The method appears broken and is never used anyway.
    def url_parser(self, content):
        selector = etree.HTML(content)
        urls = ['http://www.anliguan.com' + i for i in selector.xpath('//h2[@class="item-title"]/a/@href')]
        return urls

    '''test'''
    # crawl the disease pages and write the contents into the database
    def spider_main(self):
        for page in range(1, 11000):
            try:
                basic_url = 'http://jib.xywy.com/il_sii/gaishu/%s.htm'%page     # disease overview page (this site's URL scheme is a bit odd)
                cause_url = 'http://jib.xywy.com/il_sii/cause/%s.htm'%page      # cause
                prevent_url = 'http://jib.xywy.com/il_sii/prevent/%s.htm'%page  # prevention
                symptom_url = 'http://jib.xywy.com/il_sii/symptom/%s.htm'%page  # symptoms
                inspect_url = 'http://jib.xywy.com/il_sii/inspect/%s.htm'%page  # inspection methods
                treat_url = 'http://jib.xywy.com/il_sii/treat/%s.htm'%page      # treatment
                food_url = 'http://jib.xywy.com/il_sii/food/%s.htm'%page        # diet
                drug_url = 'http://jib.xywy.com/il_sii/drug/%s.htm'%page        # well-rated drugs
                data = {}  # pack everything below into one dict
                data['url'] = basic_url  # overview URL, str
                data['basic_info'] = self.basicinfo_spider(basic_url)  # basic info, dict: {'category': categories, 'name': disease name, 'desc': description, 'attributes': [basic knowledge, treatment facts, friendly tips]}
                data['cause_info'] = self.common_spider(cause_url)  # cause, str: the paragraphs joined with '\n'
                data['prevent_info'] = self.common_spider(prevent_url)  # prevention, str: the paragraphs joined with '\n'
                data['symptom_info'] = self.symptom_spider(symptom_url)  # symptoms (arguably complications), tuple: (symptom list, [symptom paragraph 1, symptom paragraph 2, ...])
                data['inspect_info'] = self.inspect_spider(inspect_url)  # inspection-method URLs, list
                data['treat_info'] = self.treat_spider(treat_url)  # treatment overview, list: [department, treatment methods, treatment duration, cure rate, common drugs]
                data['food_info'] = self.food_spider(food_url)  # food, dict: {'good': foods to eat, 'bad': foods to avoid, 'recommand': recommended dishes}
                data['drug_info'] = self.drug_spider(drug_url)  # drug names, list
                print(page, basic_url)
                self.col.insert(data)  # write the assembled dict into the 'data' collection
            except Exception as e:
                print(e, page)
        return

    '''Basic-info parsing'''
    def basicinfo_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        title = selector.xpath('//title/text()')[0]  # page title
        category = selector.xpath('//div[@class="wrap mt10 nav-bar"]/a/text()')  # categories
        desc = selector.xpath('//div[@class="jib-articl-con jib-lh-articl"]/p/text()')  # description
        ps = selector.xpath('//div[@class="mt20 articl-know"]/p')  # [basic knowledge, treatment facts, friendly tips]
        infobox = []
        for p in ps:
            info = p.xpath('string(.)').replace('\r','').replace('\n','').replace('\xa0', '').replace(' ', '').replace('\t','')
            infobox.append(info)
        basic_data = {}
        basic_data['category'] = category
        basic_data['name'] = title.split('的简介')[0]  # strip the '的简介' suffix to get the disease name
        basic_data['desc'] = desc
        basic_data['attributes'] = infobox
        return basic_data

    '''treat_infobox treatment parsing'''
    def treat_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        ps = selector.xpath('//div[starts-with(@class,"mt20 articl-know")]/p')
        infobox = []  # [department, treatment methods, treatment duration, cure rate, common drugs]
        for p in ps:
            info = p.xpath('string(.)').replace('\r','').replace('\n','').replace('\xa0', '').replace(' ', '').replace('\t','')
            infobox.append(info)
        return infobox

    '''drug parsing (the original docstring repeats "treatment parsing" — another copy-paste slip)'''
    def drug_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        drugs = [i.replace('\n','').replace('\t', '').replace(' ','') for i in selector.xpath('//div[@class="fl drug-pic-rec mr30"]/p/a/text()')]
        return drugs  # the drug names

    '''food parsing'''
    def food_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        divs = selector.xpath('//div[@class="diet-img clearfix mt20"]')
        try:
            food_data = {}
            food_data['good'] = divs[0].xpath('./div/p/text()')       # foods to eat
            food_data['bad'] = divs[1].xpath('./div/p/text()')        # foods to avoid
            food_data['recommand'] = divs[2].xpath('./div/p/text()')  # recommended dishes
        except:
            return {}
        return food_data

    '''Symptom-info parsing'''
    def symptom_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        symptoms = selector.xpath('//a[@class="gre" ]/text()')  # symptoms
        ps = selector.xpath('//p')
        detail = []  # symptom paragraphs
        for p in ps:
            info = p.xpath('string(.)').replace('\r','').replace('\n','').replace('\xa0', '').replace(' ', '').replace('\t','')
            detail.append(info)
        symptoms_data = {}  # what is this dict for? it is built but never used — the method returns the tuple below
        symptoms_data['symptoms'] = symptoms
        symptoms_data['symptoms_detail'] = detail
        return symptoms, detail

    '''Inspection-info parsing'''
    def inspect_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        inspects = selector.xpath('//li[@class="check-item"]/a/@href')  # these look like inspection-page URLs
        return inspects

    '''Generic parsing module'''
    def common_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        ps = selector.xpath('//p')
        infobox = []  # cause/prevention paragraphs: [paragraph 1, paragraph 2, ...]
        for p in ps:
            info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ','').replace('\t', '')
            if info:
                infobox.append(info)
        return '\n'.join(infobox)

    '''Inspection-item crawling module'''
    def inspect_crawl(self):
        for page in range(1, 3685):
            try:
                url = 'http://jck.xywy.com/jc_%s.html'%page
                html = self.get_html(url)
                data = {}            # one inspection item
                data['url'] = url    # URL of the inspection page
                data['html'] = html  # HTML source of the inspection page
                self.db['jc'].insert(data)  # store it in the 'jc' collection
                print(url)
            except Exception as e:
                print(e)

handler = CrimeSpider()
handler.inspect_crawl()
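After both crawls finish, a quick sanity check (a sketch of mine, assuming MongoDB is running locally with default settings) is to count what landed in each collection:
import pymongo

conn = pymongo.MongoClient()
print(conn['medical']['data'].count_documents({}))  # crawled disease records
print(conn['medical']['jc'].count_documents({}))    # crawled inspection pages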
4. prepare_data\max_cut.py, dictionary-based maximum forward/backward matching.
The original comments are good enough that re-annotating it would add little; a concrete sketch of the idea follows below.
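Here is a minimal sketch of maximum forward matching (my own simplified version, not the repo's exact code). It greedily takes the longest dictionary word starting from the left; backward matching does the same from the right, and the max_biward_cut used later in build_data.py reconciles the two results:
def max_forward_cut(sent, word_dict, max_len=5):
    # try the longest candidate first, shrink until a dictionary word is found
    # (or fall back to a single character)
    words = []
    i = 0
    while i < len(sent):
        for j in range(min(i + max_len, len(sent)), i, -1):
            if sent[i:j] in word_dict or j == i + 1:
                words.append(sent[i:j])
                i = j
                break
    return words

print(max_forward_cut('肺气肿和支气管炎', {'肺气肿', '支气管', '支气管炎'}))
# -> ['肺气肿', '和', '支气管炎']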
5. prepare_data\build_data.py, the database-manipulation script. The collect_medical() method mostly just replaces the Chinese keys with English ones; the content barely changes. After it runs, the data should correspond to what we saw in data\medical.json (presumably exported from the resulting 'medical' collection), same format and content.
My annotations:
#!/usr/bin/env python3
# coding: utf-8
# File: build_data.py
# Author: lhy
# Date: 18-10-3
import pymongo
from lxml import etree  # HTML parsing
import os
from max_cut import *

class MedicalGraph:
    def __init__(self):
        self.conn = pymongo.MongoClient()  # connect to the database
        cur_dir = '/'.join(os.path.abspath(__file__).split('/')[:-1])  # directory of this file
        self.db = self.conn['medical']
        self.col = self.db['data']  # the crawled raw data collection
        first_words = [i.strip() for i in open(os.path.join(cur_dir, 'first_name.txt'))]  # what is 'first_name.txt' for? here it is just read in
        alphabets = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y', 'z']
        nums = ['1','2','3','4','5','6','7','8','9','0']
        self.stop_words = first_words + alphabets + nums  # a stop-word list? first_words plus letters and digits
        self.key_dict = {
            '医保疾病' : 'yibao_status',
            "患病比例" : "get_prob",
            "易感人群" : "easy_get",
            "传染方式" : "get_way",
            "就诊科室" : "cure_department",
            "治疗方式" : "cure_way",
            "治疗周期" : "cure_lasttime",
            "治愈率" : "cured_prob",
            '药品明细': 'drug_detail',
            '药品推荐': 'recommand_drug',
            '推荐': 'recommand_eat',
            '忌食': 'not_eat',
            '宜食': 'do_eat',
            '症状': 'symptom',
            '检查': 'check',
            '成因': 'cause',
            '预防措施': 'prevent',
            '所属类别': 'category',
            '简介': 'desc',
            '名称': 'name',
            '常用药品' : 'common_drug',
            '治疗费用': 'cost_money',
            '并发症': 'acompany'
        }
        self.cuter = CutWords()  # instance of the maximum forward/backward matching class

    def collect_medical(self):
        cates = []
        inspects = []
        count = 0
        for item in self.col.find():
            data = {}
            basic_info = item['basic_info']  # as noted earlier: {'category': categories, 'name': disease name, 'desc': description, 'attributes': [basic knowledge, treatment facts, friendly tips]}
            name = basic_info['name']  # disease name
            if not name:
                continue
            # basic info
            data['名称'] = name  # disease name
            data['简介'] = '\n'.join(basic_info['desc']).replace('\r\n\t', '').replace('\r\n\n\n','').replace(' ','').replace('\r\n','\n')  # description
            category = basic_info['category']
            data['所属类别'] = category  # disease categories
            cates += category
            inspect = item['inspect_info']
            inspects += inspect  # 'inspect_info': [inspection-method URL, ...]
            attributes = basic_info['attributes']  # [basic knowledge, treatment facts, friendly tips]
            # cause and prevention
            data['预防措施'] = item['prevent_info']  # prevention text
            data['成因'] = item['cause_info']  # cause text
            # symptoms (the repo comment says 'complications')
            data['症状'] = list(set([i for i in item["symptom_info"][0] if i[0] not in self.stop_words]))  # 'symptom_info' is the tuple (symptom list, symptom paragraphs); keep symptoms whose first character is not a stop word
            for attr in attributes:  # [basic knowledge, treatment facts, friendly tips]
                attr_pair = attr.split(':')  # split on the full-width colon used on the site
                if len(attr_pair) == 2:
                    key = attr_pair[0]
                    value = attr_pair[1]
                    data[key] = value  # this one loop fills in many keys at once; see the example below
            '''
            A concrete example of what attributes holds:
            医保疾病:否
            患病比例:0.5%
            易感人群:多见于小儿
            传染方式:呼吸道传播
            并发症:支气管肺炎 肺不张
            就诊科室:儿科 小儿内科
            治疗方式:药物治疗 支持性治疗
            治疗周期:1-2个月
            治愈率:98%
            常用药品:穿心莲内酯片 百咳静糖浆
            治疗费用:根据不同医院,收费标准不一致,市三甲医院约(1000-4000元)
            保持室内通风,衣物在阳光下曝晒。
            (the last line, the friendly tip, contains no colon, so len(attr_pair) != 2 and it is skipped)
            '''
            # inspections
            inspects = item['inspect_info']  # the inspection-method URLs again
            jcs = []
            for inspect in inspects:
                jc_name = self.get_inspect(inspect)
                if jc_name:
                    jcs.append(jc_name)
            data['检查'] = jcs  # presumably the inspection-item names
            # food
            food_info = item['food_info']
            if food_info:
                data['宜食'] = food_info['good']
                data['忌食'] = food_info['bad']
                data['推荐'] = food_info['recommand']
            # drugs
            drug_info = item['drug_info']
            data['药品推荐'] = list(set([i.split('(')[-1].replace(')','') for i in drug_info]))  # keep only the generic name inside the full-width parentheses
            data['药品明细'] = drug_info
            data_modify = {}
            for attr, value in data.items():  # walk the data dict built above
                attr_en = self.key_dict.get(attr)  # look up the English key, e.g. 'name'
                if attr_en:
                    data_modify[attr_en] = value  # store under the English key, e.g. 'name': '百日咳'
                    if attr_en in ['yibao_status', 'get_prob', 'easy_get', 'get_way', "cure_lasttime", "cured_prob"]:
                        data_modify[attr_en] = value.replace(' ','').replace('\t','')
                    elif attr_en in ['cure_department', 'cure_way', 'common_drug']:
                        data_modify[attr_en] = [i for i in value.split(' ') if i]
                    elif attr_en in ['acompany']:
                        acompany = [i for i in self.cuter.max_biward_cut(data_modify[attr_en]) if len(i) > 1]  # complications are split with maximum bidirectional matching
                        data_modify[attr_en] = acompany
            try:
                self.db['medical'].insert(data_modify)  # insert the dict into the 'medical' collection
                count += 1
                print(count)
            except Exception as e:
                print(e)
        return

    def get_inspect(self, url):  # url is an inspection-method URL
        res = self.db['jc'].find_one({'url':url})
        if not res:
            return ''
        else:
            return res['name']  # the inspection-item name for this URL? but the crawled 'jc' documents have no 'name' field — confusing at first

    def modify_jc(self):  # ah, this is the method that fills in the 'name' field
        for item in self.db['jc'].find():
            url = item['url']
            content = item['html']
            selector = etree.HTML(content)
            name = selector.xpath('//title/text()')[0].split('结果分析')[0]  # inspection-item name
            desc = selector.xpath('//meta[@name="description"]/@content')[0].replace('\r\n\t','')  # inspection description
            self.db['jc'].update({'url':url}, {'$set':{'name':name, 'desc':desc}})

if __name__ == '__main__':
    handler = MedicalGraph()
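Note that the __main__ block only constructs the object; nothing is actually called. Presumably (my inference, the script itself does not say) you would run the methods in this order, since get_inspect() depends on the 'name' field that modify_jc() fills in:
handler = MedicalGraph()
handler.modify_jc()        # first add 'name'/'desc' to every document in the 'jc' collection
handler.collect_medical()  # then rebuild the disease records under English keys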
To be continued in:
Cai Ge Learns Knowledge Graphs (via the "QA System Based on a Medical Knowledge Graph") (Part 4) (Code Analysis 2)