基于医疗知识图谱的项目构建学习总结(一)—项目构建环境搭建及爬取数据部分

由于工作需要,这里学习了中科院软件所刘焕勇老师在github上的开源项目,基于知识图谱的医药领域问答项目QABasedOnMedicaKnowledgeGraph。
原项目地址:https://github.com/liuhuanyong/QASystemOnMedicalKG
自己动手实现了环境的搭建,目前实践到爬虫部分,在此记录,欢迎大家提出意见。

首先是安装mongodb

参考博客mongodb安装及创建用户
按照文中的说明下载和配置mongodb,并启动服务,然后在浏览器中打开MongoDB的服务地址(默认为 http://localhost:27017 ),出现如下语句说明启动成功:

It looks like you are trying to access MongoDB over HTTP on the native driver port.

为了便于验证数据库是否建立成功,这里给出了几个常用的数据库语法:

show dbs 查看已有的数据库
use db_name 如果该数据库存在则进入;若不存在,则切换到名称为db_name的新数据库(首次写入数据时才会真正创建)
db.dropDatabase() 删除该数据库
db.jc.find() 查看数据库中jc表中的数据

然后是爬取数据部分

在症状的解析函数中,源代码对部分网页解析得到的结果是人名,因此我对其进行了修改,详见代码。运行之后,MongoDB中会自动建立相应的数据库,并存入爬取的数据。

# -*- coding: utf-8 -*-
import urllib.request
import urllib.parse
from lxml import etree
import pymongo


# Open the MongoDB connection. Once the MongoDB service is running, a database
# that does not exist yet is created automatically.
# The database is named 'test' so that the crawler writes where the
# build_data.py script and the mongoexport command in this post read from;
# with 'medical' here the builder would find empty collections.
conn = pymongo.MongoClient()
db = conn['test']
col = db['data']


# 爬取数据并解析
def get_html(url, timeout=30):
    """Download a page and return its body decoded as GBK text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the connection before giving up, so a
            single stalled request cannot hang the whole crawl.

    Returns:
        The response body decoded with the ``gbk`` codec.

    Raises:
        urllib.error.URLError: If the request fails or times out.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    headers = {'User-Agent':'Mozilla/5.0(Window Nt 10.0; WOW64) AppleWebKit/537.36 (KHTML, LIKE Gecko)'}
    req = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the response even when read/decode raises.
    with urllib.request.urlopen(req, timeout=timeout) as res:
        return res.read().decode('gbk')


def spider_main():
    """Crawl the disease pages on jib.xywy.com and store one document per page.

    For every page id in ``range(1, 11000)`` the section pages are fetched and
    parsed, and the combined record is inserted into the module-level ``col``
    collection. A page that fails is reported and skipped so the crawl
    continues with the next id.
    """
    for page in range(1, 11000):
        try:
            basic_url = 'http://jib.xywy.com/il_sii/gaishu/%s.htm'% page
            cause_url = 'http://jib.xywy.com/il_sii/cause/%s.htm' % page  # causes
            prevent_url = 'http://jib.xywy.com/il_sii/prevent/%s.htm' % page  # prevention
            symptom_url = 'http://jib.xywy.com/il_sii/symptom/%s.htm' % page  # symptoms
            inspect_url = 'http://jib.xywy.com/il_sii/inspect/%s.htm' % page  # inspection methods
            treat_url = 'http://jib.xywy.com/il_sii/treat/%s.htm' % page  # treatment
            food_url = 'http://jib.xywy.com/il_sii/food/%s.htm' % page  # diet
            drug_url = 'http://jib.xywy.com/il_sii/drug/%s.htm' % page  # recommended drugs
            data = {}
            data['url'] = basic_url
            data['basic_info'] = basicinfo_spider(basic_url)
            data['cause_info'] = common_spider(cause_url)
            data['prevent_info'] = common_spider(prevent_url)
            # Stored as 'symptom_info' (like the other '*_info' keys) because
            # that is the key MedicalGraph.collect_medical reads.
            data['symptom_info'] = symptom_spider(symptom_url)
            data['inspect_info'] = inspect_spider(inspect_url)
            data['treat_info'] = treat_sipder(treat_url)
            data['food_info'] = food_spider(food_url)
            data['drug_info'] = drug_spider(drug_url)
            print(page, basic_url)
            # Collection.insert() was removed in PyMongo 4; insert_one() is
            # the supported call for a single document.
            col.insert_one(data)
        except Exception as err:
            # Catch Exception rather than everything so Ctrl-C still stops the
            # crawl, and say which page failed and why.
            print('error', page, err)
    return


def basicinfo_spider(url):
    """Parse a disease overview page into its basic-information record.

    Args:
        url: Address of the overview page to fetch.

    Returns:
        A dict with the keys ``category`` (link texts inside the
        ``wrap mt10 nav-bar`` div), ``name`` (the page title up to the first
        '的简介'), ``desc`` (paragraph texts of the description div) and
        ``attributes`` (cleaned text of every paragraph in the
        ``mt20 articl-know`` divs).

    Raises:
        IndexError: If the page has no ``<title>`` text.
    """
    tree = etree.HTML(get_html(url))
    page_title = tree.xpath('//title/text()')[0]
    categories = tree.xpath('//div[@class="wrap mt10 nav-bar"]/a/text()')
    description = tree.xpath('//div[@class="jib-articl-con jib-lh-articl"]/p/text()')

    attributes = []
    for node in tree.xpath('//div[@class="mt20 articl-know"]/p'):
        text = node.xpath('string(.)')
        # Strip layout characters one after another, in this fixed order.
        for junk in ('\r', '\n', '\xa0', '   ', '\t'):
            text = text.replace(junk, '')
        attributes.append(text)

    return {
        'category': categories,
        'name': page_title.split('的简介')[0],
        'desc': description,
        'attributes': attributes,
    }

# 对网页分别进行解析

def treat_sipder(url):
    """Parse a treatment page into a list of cleaned paragraph texts.

    Args:
        url: Address of the treatment page to fetch.

    Returns:
        One string per ``<p>`` found directly under a div whose class starts
        with ``mt20 articl-know``, with layout characters removed.
    """
    tree = etree.HTML(get_html(url))
    paragraphs = []
    for node in tree.xpath('//div[starts-with(@class, "mt20 articl-know")]/p'):
        text = node.xpath('string(.)')
        # Strip layout characters one after another, in this fixed order.
        for junk in ('\r', '\n', '\xa0', '   ', '\t'):
            text = text.replace(junk, '')
        paragraphs.append(text)
    return paragraphs

def drug_spider(url):
    """Parse a drug page into the list of drug link texts.

    Args:
        url: Address of the drug page to fetch.

    Returns:
        The text of every link under ``div.fl.drug-pic-rec.mr30 > p``, with
        newlines, tabs and spaces removed.
    """
    tree = etree.HTML(get_html(url))
    names = []
    for raw in tree.xpath('//div[@class="fl drug-pic-rec mr30"]/p/a/text()'):
        for junk in ('\n', '\t', ' '):
            raw = raw.replace(junk, '')
        names.append(raw)
    return names


def food_spider(url):
    """Parse a diet page into its three food lists.

    Args:
        url: Address of the diet page to fetch.

    Returns:
        A dict with the keys ``good``, ``bad`` and ``recommand``, each holding
        the paragraph texts of the first, second and third
        ``diet-img clearfix mt20`` div respectively. An empty dict is returned
        when the page has fewer than three such divs.
    """
    html = get_html(url)
    selector = etree.HTML(html)
    divs = selector.xpath('//div[@class="diet-img clearfix mt20"]')
    # Test the one expected failure (missing blocks) explicitly instead of
    # hiding every possible error behind a bare ``except``.
    if len(divs) < 3:
        return {}
    return {
        'good': divs[0].xpath('./div/p/text()'),
        'bad': divs[1].xpath('./div/p/text()'),
        'recommand': divs[2].xpath('./div/p/text()'),
    }


def symptom_spider(url):
    """Parse a symptom page into symptom names and descriptive paragraphs.

    Args:
        url: Address of the symptom page to fetch.

    Returns:
        A ``(symptoms, detail)`` tuple: ``symptoms`` is the list of link texts
        inside the symptom spans, ``detail`` is the cleaned text of every
        ``<p>`` element on the page.
    """
    html = get_html(url)
    selector = etree.HTML(html)

    # The upstream XPath returned people's names on some pages, so it was
    # replaced with this one.
    symptoms = selector.xpath('//span[@class="db f12 lh240 mb15 "]/a/text()')
    detail = []
    for p in selector.xpath('//p'):
        info = p.xpath('string(.)')
        # Strip layout characters one after another, in this fixed order.
        for junk in ('\r', '\n', '\xa0', '   ', '\t'):
            info = info.replace(junk, '')
        detail.append(info)
    # The unused ``symptoms_data`` dict was dropped; the tuple is what gets
    # stored, and the builder reads the symptom list at index 0.
    return symptoms, detail

def inspect_spider(url):
    """Collect the links to inspection items from an inspection page.

    Some pages carry inspection links and some do not; a page without any
    yields an empty list.

    Args:
        url: Address of the inspection page to fetch.

    Returns:
        The ``href`` values of the links inside ``li.check-item`` elements.
    """
    tree = etree.HTML(get_html(url))
    return tree.xpath('//li[@class="check-item"]/a/@href')


def common_spider(url):
    """Generic parser: join the text of all non-empty paragraphs of a page.

    Args:
        url: Address of the page to fetch.

    Returns:
        The cleaned text of every ``<p>`` element that is non-empty after
        cleaning, joined with newlines.
    """
    tree = etree.HTML(get_html(url))
    kept = []
    for node in tree.xpath('//p'):
        text = node.xpath('string(.)')
        # Strip layout characters one after another, in this fixed order.
        for junk in ('\r', '\n', '\xa0', '   ', '\t'):
            text = text.replace(junk, '')
        if not text:
            continue
        kept.append(text)
    return '\n'.join(kept)


def inspect_crawl():
    """Download the raw inspection-item pages into the ``jc`` collection.

    For every page id in ``range(1, 3685)`` the page is fetched and stored as
    a ``{'url': ..., 'html': ...}`` document. A page that fails is reported
    and skipped.
    """
    for page in range(1, 3685):
        try:
            url = 'http://jck.xywy.com/jc_%s.html'%page
            html = get_html(url)
            data = {}
            data['url'] = url
            data['html'] = html
            # Collection.insert() was removed in PyMongo 4; insert_one() is
            # the supported call for a single document.
            db['jc'].insert_one(data)
        except Exception as e:
            print(e)
# Script entry: crawl the disease pages first, then download the raw
# inspection-item pages.
spider_main()
inspect_crawl()

接着将解析的数据存入数据库中

  1. 在这一部分代码中,需要将max_cut中disease.txt的路径修改为 ../dict/disease.txt,还需要在打开该文件时加入 encoding='utf-8',即 open(dict_path, encoding='utf-8'),不然会报编码错误。
  2. 在下边的导入代码中有first_name.txt文件,在源文件中并没有给出,我猜想应该是这个症状描述不包含疾病名称,因此我使用了disease.txt文件代替的。
  3. 执行程序的时候,必须要先执行这个函数modify_jc(),更新jc表的值,这样在执行函数collect_medical()的时候才不会报name错误。
#!/usr/bin/env python3
# coding: utf-8
# File: build_data.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-10-3
import pymongo
from lxml import etree
import os
from max_cut import *

class MedicalGraph:
    """Build cleaned disease records from the crawled MongoDB data.

    Raw documents are read from the ``data`` and ``jc`` collections of the
    ``test`` database and the cleaned records are written to ``medical``.
    """

    def __init__(self):
        """Connect to MongoDB and load the stop words and field-name mapping."""
        self.conn = pymongo.MongoClient()
        self.db = self.conn['test']
        self.col = self.db['data']
        # Read the dictionary inside ``with`` so the file handle is closed
        # instead of being leaked.
        with open('../dict/disease.txt', encoding='utf-8') as dict_file:
            first_words = [line.strip() for line in dict_file]
        alphabets = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y', 'z']
        nums = ['1','2','3','4','5','6','7','8','9','0']
        # A symptom is dropped when its first character equals one of these.
        self.stop_words = first_words + alphabets + nums
        # Maps the Chinese field labels to the keys of the output documents.
        self.key_dict = {
            '医保疾病' : 'yibao_status',
            "患病比例" : "get_prob",
            "易感人群" : "easy_get",
            "传染方式" : "get_way",
            "就诊科室" : "cure_department",
            "治疗方式" : "cure_way",
            "治疗周期" : "cure_lasttime",
            "治愈率" : "cured_prob",
            '药品明细': 'drug_detail',
            '药品推荐': 'recommand_drug',
            '推荐': 'recommand_eat',
            '忌食': 'not_eat',
            '宜食': 'do_eat',
            '症状': 'symptom',
            '检查': 'check',
            '成因': 'cause',
            '预防措施': 'prevent',
            '所属类别': 'category',
            '简介': 'desc',
            '名称': 'name',
            '常用药品' : 'common_drug',
            '治疗费用': 'cost_money',
            '并发症': 'acompany'
        }
        self.cuter = CutWords()

    def collect_medical(self):
        """Convert every crawled disease document and store it in ``medical``.

        Documents whose name is empty are skipped. Each remaining document is
        flattened into Chinese-labelled fields, the labels are translated via
        ``self.key_dict`` (unknown labels are dropped), a few fields are
        normalised, and the result is inserted into ``self.db['medical']``.
        The running count of stored documents is printed; an insert failure
        is printed and the loop continues.

        Raises:
            KeyError: If a document lacks an expected field, or if an
                inspection page has no ``name`` yet (run ``modify_jc`` first).
        """
        # Fields whose value is a string with spaces and tabs removed.
        text_fields = ('yibao_status', 'get_prob', 'easy_get', 'get_way', 'cure_lasttime', 'cured_prob')
        # Fields whose value is split on spaces into a list.
        list_fields = ('cure_department', 'cure_way', 'common_drug')
        # Build the set once: the list holds the whole disease dictionary and
        # is probed for every symptom.
        stop_words = set(self.stop_words)
        count = 0
        for item in self.col.find():
            basic_info = item['basic_info']
            name = basic_info['name']
            if not name:
                continue
            data = {}
            # Basic information
            data['名称'] = name
            data['简介'] = '\n'.join(basic_info['desc']).replace('\r\n\t', '').replace('\r\n\n\n','').replace(' ','').replace('\r\n','\n')
            data['所属类别'] = basic_info['category']
            # Causes and prevention
            data['预防措施'] = item['prevent_info']
            data['成因'] = item['cause_info']
            # Symptoms: the crawler in this post stored them under 'symptom',
            # the upstream one under 'symptom_info'; accept either.
            symptom_info = item['symptom_info'] if 'symptom_info' in item else item['symptom']
            # dict.fromkeys de-duplicates while keeping a reproducible order;
            # empty strings are skipped so s[0] cannot raise.
            data['症状'] = list(dict.fromkeys(
                s for s in symptom_info[0] if s and s[0] not in stop_words))
            for attr in basic_info['attributes']:
                attr_pair = attr.split(':')
                if len(attr_pair) == 2:
                    key, value = attr_pair
                    data[key] = value
            # Inspections: resolve each link to the stored inspection name.
            jcs = []
            for inspect in item['inspect_info']:
                jc_name = self.get_inspect(inspect)
                if jc_name:
                    jcs.append(jc_name)
            data['检查'] = jcs
            # Food
            food_info = item['food_info']
            if food_info:
                data['宜食'] = food_info['good']
                data['忌食'] = food_info['bad']
                data['推荐'] = food_info['recommand']
            # Drugs: the recommended name is the text inside the last '('.
            drug_info = item['drug_info']
            data['药品推荐'] = list(dict.fromkeys(
                i.split('(')[-1].replace(')','') for i in drug_info))
            data['药品明细'] = drug_info

            data_modify = {}
            for attr, value in data.items():
                attr_en = self.key_dict.get(attr)
                if not attr_en:
                    continue
                if attr_en in text_fields:
                    data_modify[attr_en] = value.replace(' ','').replace('\t','')
                elif attr_en in list_fields:
                    data_modify[attr_en] = [i for i in value.split(' ') if i]
                elif attr_en == 'acompany':
                    # Segment the complication text and keep multi-character words.
                    data_modify[attr_en] = [i for i in self.cuter.max_biward_cut(value) if len(i) > 1]
                else:
                    data_modify[attr_en] = value

            try:
                # Collection.insert() was removed in PyMongo 4.
                self.db['medical'].insert_one(data_modify)
                count += 1
                print(count)
            except Exception as e:
                print(e)

        return

    def get_inspect(self, url):
        """Return the name of the inspection page stored under ``url``.

        Args:
            url: The ``url`` field of a document in the ``jc`` collection.

        Returns:
            The document's ``name`` field, or ``''`` when no document matches.

        Raises:
            KeyError: If the document exists but has no ``name`` field.
        """
        res = self.db['jc'].find_one({'url':url})
        return res['name'] if res else ''

    def modify_jc(self):
        """Add ``name`` and ``desc`` fields to every stored inspection page.

        ``name`` is the page title up to the first '结果分析' and ``desc`` is
        the content of the description meta tag with '\\r\\n\\t' removed.

        Raises:
            IndexError: If a page has no title text or no description meta tag.
        """
        for item in self.db['jc'].find():
            selector = etree.HTML(item['html'])
            name = selector.xpath('//title/text()')[0].split('结果分析')[0]
            desc = selector.xpath('//meta[@name="description"]/@content')[0].replace('\r\n\t','')
            # Collection.update() was removed in PyMongo 4; update_one()
            # updates the first matching document, as update() did by default.
            self.db['jc'].update_one({'url': item['url']}, {'$set':{'name':name, 'desc':desc}})

if __name__ == '__main__':
    handler = MedicalGraph()
    # modify_jc() must run first: it writes the 'name' field on each 'jc'
    # document, which get_inspect() reads while collect_medical() runs.
    handler.modify_jc()
    handler.collect_medical()

完成之后,使用如下命令可以导出数据

mongoexport -d test -c medical -o medical.json

其中,-d test 是指明数据库,
-c medical 是指明要导出的集合(collection),
-o medical.json 是指明导出的文件名,可以在文件名前加上路径来指定保存位置

你可能感兴趣的:(mongodb,自然语言处理,知识图谱)