Python爬虫系列之微信小程序药品数据多线程爬取

Python爬虫系列之微信小程序药品数据多线程爬取

代码仅供学习交流，请勿用于非法用途。

小程序爬虫接单、app爬虫接单、网页爬虫接单、接口定制、网站开发、小程序开发 > 点击这里联系我们 <

微信请扫描下方二维码

（此处原为二维码图片）

一、准备数据库

-- Create the schema used by the crawler. IF NOT EXISTS lets this setup
-- script be run again without aborting on the first statement.
create database if not exists drugs_;

use drugs_;

-- Drug product details, one row per product; `kgId` (the drug id) is unique.
-- Every other column is stored as free text exactly as the crawler formats it.
create table if not exists `drug`(
	`id` int primary key auto_increment,
	`kgId` varchar(15) unique comment '药品id',
	`proprietaryName` text default null comment 'proprietaryName',
	`productSpecification` text default null comment 'productSpecification',
	`manufacturer` text default null comment 'manufacturer',
	`otc` text default null comment 'otc',
	`medInsurance` text default null comment 'medInsurance',
	`cw` text default null comment 'cw',
	`commonName` text default null comment 'commonName',
	`imageUrlList` text default null comment 'imageUrlList',
	`englishName` text default null comment 'englishName',
	`dosageForms` text default null comment 'dosageForms',
	`mainMaterial` text default null comment 'mainMaterial',
	`storage` text default null comment 'storage',
	`permissionNumber` text default null comment 'permissionNumber',
	`atcNameZh` text default null comment 'atcNameZh',
	`fda` text default null comment 'fda',
	`priceRange` text default null comment 'priceRange',
	`packagePrice` text default null comment 'packagePrice',
	`componentId` text default null comment 'componentId',
	-- The column comment previously read 'englishName' (copy-paste slip).
	`pedia` text default null comment 'pedia'
)engine=MyISAM charset=utf8;

-- Guide records, one row per guide; `key` is unique.
-- IF NOT EXISTS keeps the setup script re-runnable.
create table if not exists `guide`(
	`id` int primary key auto_increment,
	`key` varchar(60) unique comment 'key',
	`title` text default null comment 'title',
	`source` text default null comment 'source',
	`keyWords` text default null comment 'keyWords',
	`summary` text default null comment 'summary',
	`userProCompleted` text default null comment 'userProCompleted',
	`url` text default null comment 'url'
)engine=MyISAM charset=utf8;

-- Medical check (examination) records, one row per check; `key` is unique.
-- `check` is a reserved word in MySQL, so the table name must stay back-quoted.
-- IF NOT EXISTS keeps the setup script re-runnable.
create table if not exists `check`(
	`id` int primary key auto_increment,
	`key` varchar(60) unique comment 'key',
	`name` text default null comment 'name',
	`alias_name` text default null comment 'alias_name',
	`english_name` text default null comment 'english_name',
	`introduction` text default null comment 'introduction',
	`type` text default null comment 'type',
	`fasting` text default null comment 'fasting',
	`Indications` text default null comment 'Indications',
	`reference` text default null comment 'reference',
	`attention` text default null comment 'attention',
	`prompt` text default null comment 'prompt',
	`Specimen` text default null comment 'Specimen',
	`Clinical` text default null comment 'Clinical',
	`Inspection` text default null comment 'Inspection',
	`Adverse` text default null comment 'Adverse'
)engine=MyISAM charset=utf8;


-- Symptom records, one row per symptom; `key` is unique.
-- IF NOT EXISTS keeps the setup script re-runnable.
create table if not exists `symptom`(
	`id` int primary key auto_increment,
	`key` varchar(60) unique comment 'key',
	`name` text default null comment 'name',
	`introduction` text default null comment 'introduction',
	`Pathogeny` text default null comment 'Pathogeny',
	`diagnosis` text default null comment 'diagnosis',
	`Prevention` text default null comment 'Prevention'
)engine=MyISAM charset=utf8;


-- Disease records, one row per disease; `key` is unique.
-- IF NOT EXISTS keeps the setup script re-runnable.
create table if not exists `disease`(
	`id` int primary key auto_increment,
	`key` varchar(60) unique comment 'key',
	`name` text default null comment 'name',
	`alias_name` text default null comment 'alias_name',
	`english_name` text default null comment 'english_name',
	`Abbreviation` text default null comment 'Abbreviation',
	`ICD` text default null comment 'ICD',
	`Department` text default null comment 'Department',
	`introduction` text default null comment 'introduction',
	`Pathogeny` text default null comment 'Pathogeny',
	`Pathology` text default null comment 'Pathology',
	`historyKeyPoints` text default null comment 'historyKeyPoints',
	`keyPointsOfSymptoms` text default null comment 'keyPointsOfSymptoms',
	`keyPointsOfPhysicalExamination` text default null comment 'keyPointsOfPhysicalExamination',
	`Transfer` text default null comment 'Transfer',
	`laboratory` text default null comment 'laboratory',
	`Imaging` text default null comment 'Imaging',
	`Other` text default null comment 'Other',
	`clinical` text default null comment 'clinical',
	`stages` text default null comment 'stages',
	`diagnosis` text default null comment 'diagnosis',
	`principles` text default null comment 'principles',
	`generalTreatment` text default null comment 'generalTreatment',
	`Medication` text default null comment 'Medication',
	`surgicalTreatment` text default null comment 'surgicalTreatment',
	`otherTreatment` text default null comment 'otherTreatment',
	`commonComplications` text default null comment 'commonComplications',
	`prognosis` text default null comment 'prognosis',
	`followUp` text default null comment 'followUp',
	`Prevention` text default null comment 'Prevention'
)engine=MyISAM charset=utf8;



-- Instruction records, one row per instruction; `key` is unique.
-- IF NOT EXISTS keeps the setup script re-runnable.
create table if not exists `instruction`(
	`id` int primary key auto_increment,
	`key` varchar(60) unique comment 'key',
	`title` text default null comment 'title',
	`mainFunction` text default null comment 'mainFunction',
	`usageList` text default null comment 'usageList'
)engine=MyISAM charset=utf8;


-- Clinical case records, one row per case; `key` is unique.
-- `case` and `check` are reserved words in MySQL, so both must stay back-quoted.
-- IF NOT EXISTS keeps the setup script re-runnable.
create table if not exists `case`(
	`id` int primary key auto_increment,
	`key` varchar(60) unique comment 'key',
	`source` text default null comment 'source',
	`publishDate` text default null comment 'publishDate',
	`keywords` text default null comment 'keywords',
	`title` text default null comment 'title',
	`sex` text default null comment 'sex',
	`age` text default null comment 'age',
	`complaint` text default null comment 'complaint',
	`medicalHistory` text default null comment 'medicalHistory',
	`check` text default null comment 'check',
	`conclusion` text default null comment 'conclusion',
	`treatmentProcess` text default null comment 'treatmentProcess'
)engine=MyISAM charset=utf8;

二、代码实现

import requests
from queue import Queue
import json
import threading
import MySQLdb
import time
import re

'''
    @Author     :王磊
    @Date       :2019/9/19
    @Description:某微信小程序药品数据爬取
'''

# ----------------------------------------------------------------------
# Crawler settings
# ----------------------------------------------------------------------
mysql_user = "root"  # database account
mysql_password = "root"  # database password
mysql_database = "drugs_"  # database name
guide_pdf_path = "c:/users/it1002/Desktop/pds/"  # where PDFs of the guide module are stored
# ----------------------------------------------------------------------

# NOTE(review): presumably the access token sent with API requests; its use is
# not visible in this part of the file - confirm.
token = "eyJhbGciOiJIUzUxMiJ9.eyJhcHasaWNhdGlvbkFjY291bnRJbmZvIjp7ImlkIjo5NTkxNywiY2hhbm5lbElkIjoiMTEwMDQ5MDAwMCIsImluc3RpdHV0aW9uSWQiOiIxMjQ0NDQwMzAwMDAzMzEwMDAwMDAwIiwicm9sZSI6MSwic291cmNlIjoxLCJzZXNzaW9uVHlwZSI6IndlY2hhdCIsImlzQXV0b0xvZ2luIjpmYWxzZSwiY29tbW9uVXNlcklkIjpudWxsLCJwYXltZW50TGV2ZWwiOm51bGx9LCJleHAiOjE1ODM3MjkzMTR9.NLehOcnaVrB5ckxOJSEqQLlpWKVUutEDPabgJStUSHc_RL4GrWj48W3UX4Pdm3Ju4-ziNSGm8WhdPvK4hdEcrg"


class drugSpider(threading.Thread):
    """Worker thread that crawls drug products for ATC codes taken from a queue.

    For every ATC code the worker pages through the common-name list, then
    through the product list of each common name, fetches every product's
    detail and inserts it into the ``drug`` table.
    """

    # Keys read from resp['basicProperty'], in the column order of the insert.
    _BASIC_FIELDS = (
        'kgId', 'proprietaryName', 'productSpecification', 'manufacturer', 'otc',
        'medInsurance', 'cw', 'commonName', 'imageUrlList', 'englishName',
        'dosageForms', 'mainMaterial', 'storage', 'permissionNumber', 'atcNameZh',
        'fda', 'priceRange', 'packagePrice', 'componentId',
    )
    # Columns written by pipLine(); 'pedia' comes from resp['pedia'].
    _COLUMNS = _BASIC_FIELDS + ('pedia',)
    # Parameterized insert: values are bound by the driver, never interpolated.
    _INSERT_SQL = "insert into drug(%s) values(%s)" % (
        ", ".join("`%s`" % column for column in _COLUMNS),
        ", ".join(["%s"] * len(_COLUMNS)),
    )

    def __init__(self, atcCodeQueue, *args, **kwargs):
        """Store the queue of ATC codes; other arguments go to threading.Thread."""
        super(drugSpider, self).__init__(*args, **kwargs)
        self.atcCodeQueue = atcCodeQueue

    def getDrugs(self, key, page):
        """Fetch one page (10 entries) of products for a common-name key.

        Returns a tuple ``(list, hasMore)`` taken from the response's 'data'.
        """
        offset = (page - 1) * 10
        url = "https://med-askbob.pingan.com/pedia/drug/product/list?key=" + key + "&atcCode=&filterType=common_name&relationType=&specification=&forms=&offset=" + str(offset) + "&pageSize=10"
        resp = getHtml(url)['data']
        return (resp['list'], resp['hasMore'])

    def getDrugsList(self, atcCode, page):
        """Fetch one page (10 entries) of common names for an ATC code.

        Returns a tuple ``(list, hasMore)`` taken from the response's 'data'.
        """
        offset = (page - 1) * 10
        url = "https://med-askbob.pingan.com/pedia/drug/common/list?pageSize=10&offset=" + str(offset) + "&atcCode=" + atcCode
        resp = getHtml(url)['data']
        return (resp['list'], resp['hasMore'])

    def getDrugDetail(self, key):
        """Fetch a product's detail and flatten it into a dict of `drug` columns.

        Any field that cannot be read is stored as an empty string.
        """
        url = "https://med-askbob.pingan.com/pedia/drug/product/detail?key=" + key + "&relationType="
        resp = getHtml(url)['data']
        drugs = {}
        for field in self._BASIC_FIELDS:
            try:
                value = resp['basicProperty'][field]
                if field == 'imageUrlList':
                    # Stringified with single quotes turned into double quotes.
                    value = str(value).replace("'", "\"")
                elif field == 'packagePrice':
                    # Only string values are kept; anything else falls back to "".
                    value = value.replace("'", "\"")
                drugs[field] = value
            except Exception:
                drugs[field] = ""
        try:
            drugs['pedia'] = str(resp['pedia']).replace("'", "\"")
        except Exception:
            drugs['pedia'] = ""
        return drugs

    def pipLine(self, drug):
        """Insert one drug dict into the `drug` table.

        Errors are printed and swallowed so a single bad row does not stop the
        crawl. The connection is always closed.
        """
        conn = None
        try:
            conn = MySQLdb.connect(user=mysql_user, password=mysql_password, database=mysql_database, charset='utf8')
            cursor = conn.cursor()
            # str() keeps the stored text identical to the former '%s' formatting.
            params = tuple(str(drug[column]) for column in self._COLUMNS)
            cursor.execute(self._INSERT_SQL, params)
            conn.commit()
        except Exception as e:
            print(e)
        finally:
            if conn is not None:
                try:
                    conn.close()
                except Exception as e:
                    print(e)

    def _crawlProducts(self, key):
        """Page through every product of one common name and store each detail."""
        page = 1
        while True:
            products, hasMore = self.getDrugs(key, page)
            for product in products:
                self.pipLine(self.getDrugDetail(product['key']))
                time.sleep(5)  # pause 5 seconds after each stored product
            if not hasMore:
                break
            page += 1

    def _crawlAtcCode(self, atcCode):
        """Page through every common name of one ATC code.

        The outer and inner pagination keep separate state, so the outer loop
        advances to the next page whenever the common-name list has more.
        Any error abandons the current ATC code after printing it.
        """
        page = 1
        while True:
            try:
                commonNames, hasMore = self.getDrugsList(atcCode, page)
                for commonName in commonNames:
                    self._crawlProducts(commonName['key'])
            except Exception as e:
                print(e)
                break
            if not hasMore:
                break
            page += 1

    def run(self):
        """Process ATC codes until the queue is drained."""
        from queue import Empty  # local import: the file only imports Queue

        while True:
            try:
                # Non-blocking get: a worker must not hang when another worker
                # takes the last item between an empty() check and get().
                atcCode = self.atcCodeQueue.get_nowait()
            except Empty:
                break
            self._crawlAtcCode(atcCode)


class guideSpider(threading.Thread):
    """Worker thread that fetches guide details for keys taken from a queue.

    Each guide is stored in the ``guide`` table; when the guide has a URL the
    file behind it is downloaded as ``<key>.pdf`` under ``guide_pdf_path``.
    """

    # Column order of the insert into the `guide` table.
    _COLUMNS = ('key', 'title', 'source', 'keyWords', 'summary', 'userProCompleted', 'url')
    # Parameterized insert: values are bound by the driver, never interpolated.
    _INSERT_SQL = "insert into guide(%s) values(%s)" % (
        ", ".join("`%s`" % column for column in _COLUMNS),
        ", ".join(["%s"] * len(_COLUMNS)),
    )

    def __init__(self, keyQueue, *args, **kwargs):
        """Store the queue of guide keys; other arguments go to threading.Thread."""
        super(guideSpider, self).__init__(*args, **kwargs)
        self.keyQueue = keyQueue

    def parser(self, key):
        """Fetch one guide and return it as a dict of `guide` columns.

        Fields that cannot be read become empty strings. Text fields are
        stringified with single quotes replaced by double quotes; 'url' is
        kept as returned. A non-empty url triggers the PDF download.
        """
        url = "https://med-askbob.pingan.com/pedia/guide/detail?key=" + key
        resp = getHtml(url)
        guide = {}
        try:
            guide['key'] = key.replace("'", "\"")
        except Exception:
            guide['key'] = ""
        for field in ('title', 'source', 'keyWords', 'summary', 'userProCompleted'):
            try:
                guide[field] = str(resp['data'][field]).replace("'", "\"")
            except Exception:
                guide[field] = ""
        try:
            guide['url'] = resp['data']['url']
        except Exception:
            guide['url'] = ""
        if guide['url']:
            downLoadFile(guide['url'], guide_pdf_path + key + ".pdf")
        return guide

    def pipLine(self, guide):
        """Insert one guide dict into the `guide` table.

        Errors are printed and swallowed. The connection is always closed.
        """
        conn = None
        try:
            conn = MySQLdb.connect(user=mysql_user, password=mysql_password, database=mysql_database, charset='utf8')
            cursor = conn.cursor()
            # str() keeps the stored text identical to the former '%s' formatting.
            params = tuple(str(guide[column]) for column in self._COLUMNS)
            cursor.execute(self._INSERT_SQL, params)
            conn.commit()
        except Exception as e:
            print(e)
        finally:
            if conn is not None:
                try:
                    conn.close()
                except Exception as e:
                    print(e)

    def run(self):
        """Process guide keys until the queue is drained."""
        from queue import Empty  # local import: the file only imports Queue

        while True:
            try:
                # Non-blocking get avoids hanging when another worker takes
                # the last item between an empty() check and get().
                key = self.keyQueue.get_nowait()
            except Empty:
                break
            self.pipLine(self.parser(key))


class checkSpider(threading.Thread):
    """Worker thread that crawls medical checks for categories taken from a queue.

    For each category it loads the sub-category dictionary, fetches the detail
    of every check listed there and stores it in the ``check`` table.
    """

    # Column order of the insert into the `check` table.
    _COLUMNS = (
        'key', 'name', 'alias_name', 'english_name', 'introduction', 'type',
        'fasting', 'Indications', 'reference', 'attention', 'prompt',
        'Specimen', 'Clinical', 'Inspection', 'Adverse',
    )
    # Parameterized insert: values are bound by the driver, never interpolated.
    _INSERT_SQL = "insert into `check`(%s) values(%s)" % (
        ", ".join("`%s`" % column for column in _COLUMNS),
        ", ".join(["%s"] * len(_COLUMNS)),
    )
    # Section label used by the API -> column of the `check` table.
    _COLUMN_BY_LABEL = {
        '标准名称': 'name',
        '别名': 'alias_name',
        '英文名称': 'english_name',
        '简介': 'introduction',
        '分类': 'type',
        '是否空腹': 'fasting',
        '检查提示': 'prompt',
        '适应证': 'Indications',
        '参考值': 'reference',
        '注意事项': 'attention',
        '标本要求': 'Specimen',
        '临床意义': 'Clinical',
        '检查过程': 'Inspection',
        '不良反应': 'Adverse',
    }

    def __init__(self, keyQueue, *args, **kwargs):
        """Store the queue of category keys; other arguments go to threading.Thread."""
        super(checkSpider, self).__init__(*args, **kwargs)
        self.keyQueue = keyQueue

    def getCheckSubCategoryList(self, category):
        """Return the 'data' of the check dictionary for one category."""
        url = "https://med-askbob.pingan.com/pedia/check/dic?category=" + category
        return getHtml(url)['data']

    def pushCheck(self, check, key, value):
        """Store ``value`` in ``check`` under the column mapped from label ``key``.

        Single quotes in ``value`` are replaced by double quotes. Labels that
        are not in the mapping are ignored. Returns ``check``.
        """
        value = value.replace("'", "\"")
        # Non-string labels can never match; the guard also avoids hashing them.
        column = self._COLUMN_BY_LABEL.get(key) if isinstance(key, str) else None
        if column is not None:
            check[column] = value
        return check

    def getCheckDetail(self, key):
        """Fetch one check and return it as a dict of `check` columns.

        Every column starts as an empty string. Each section of the response
        is either a list of ``{'key', 'value'}`` entries, each stored under its
        own label, or an arbitrary value, stored stringified under the
        section's label.
        """
        url = "https://med-askbob.pingan.com/pedia/check/detail?key=" + key
        pedias = getHtml(url)['data']['pedia']
        check = {column: "" for column in self._COLUMNS}
        check['key'] = key
        for pedia in pedias:
            sectionKey = pedia['key']
            sectionValue = pedia['value']
            # Used whenever the section cannot be read as key/value entries.
            fallback = str(sectionValue).replace("'", "\"")
            try:
                # Multi-element values are walked element by element; anything
                # else is tried as a single entry.
                entries = sectionValue if len(sectionValue) > 1 else [sectionValue]
                for entry in entries:
                    try:
                        entryKey = entry['key']
                        entryValue = str(entry['value']).replace("'", "\"")
                    except Exception:
                        entryKey, entryValue = sectionKey, fallback
                    check = self.pushCheck(check, entryKey, entryValue)
            except Exception:
                # e.g. the section value has no len(): store it as a whole.
                check = self.pushCheck(check, sectionKey, fallback)
        return check

    def pipLine(self, check):
        """Insert one check dict into the `check` table.

        Errors are printed and swallowed. The connection is always closed.
        """
        conn = None
        try:
            conn = MySQLdb.connect(user=mysql_user, password=mysql_password, database=mysql_database, charset='utf8')
            cursor = conn.cursor()
            # str() keeps the stored text identical to the former '%s' formatting.
            params = tuple(str(check[column]) for column in self._COLUMNS)
            cursor.execute(self._INSERT_SQL, params)
            conn.commit()
        except Exception as e:
            print(e)
        finally:
            if conn is not None:
                try:
                    conn.close()
                except Exception as e:
                    print(e)

    def run(self):
        """Process category keys until the queue is drained."""
        from queue import Empty  # local import: the file only imports Queue

        while True:
            try:
                # Non-blocking get avoids hanging when another worker takes
                # the last item between an empty() check and get().
                key = self.keyQueue.get_nowait()
            except Empty:
                break
            subCategoryList = self.getCheckSubCategoryList(key)
            for subCategory in subCategoryList:
                for item in subCategoryList[subCategory]:
                    self.pipLine(self.getCheckDetail(item['key']))


class symptomSpider(threading.Thread):
    """Worker thread that fetches symptom details for keys taken from a queue
    and stores them in the ``symptom`` table."""

    # Column order of the insert into the `symptom` table.
    _COLUMNS = ('key', 'name', 'introduction', 'Pathogeny', 'diagnosis', 'Prevention')
    # Parameterized insert: values are bound by the driver, never interpolated.
    _INSERT_SQL = "insert into `symptom`(%s) values(%s)" % (
        ", ".join("`%s`" % column for column in _COLUMNS),
        ", ".join(["%s"] * len(_COLUMNS)),
    )

    def __init__(self, symptomKeyQueue, *args, **kwargs):
        """Store the queue of symptom keys; other arguments go to threading.Thread."""
        super(symptomSpider, self).__init__(*args, **kwargs)
        self.symptomKeyQueue = symptomKeyQueue

    @staticmethod
    def _clean(text):
        """Apply the text normalisation used for every symptom field."""
        # NOTE(review): as written, the two parenthesis replacements map a
        # character to itself; presumably they once converted full-width
        # parentheses - confirm against the original script.
        return text.replace("(", "(").replace(")", ")").replace("'", "\"")

    def getSymptomDetail(self, key):
        """Fetch one symptom and return it as a dict of `symptom` columns.

        Fields are read by position from the response's 'pedia' list; any
        field that cannot be read becomes an empty string.
        """
        url = "https://med-askbob.pingan.com/pedia/symptom/detail?key=" + key
        pedia = getHtml(url)['data']['pedia']
        symptom = {'key': key}
        # Each reader is evaluated lazily so one failure only blanks its own field.
        readers = (
            ('name', lambda: pedia[0]['value'][0]['value']),
            ('introduction', lambda: getValueStr(pedia[0]['value'][1]['value'])),
            ('Pathogeny', lambda: getValueStr(pedia[1]['value'])),
            ('diagnosis', lambda: getValueStr(pedia[2]['value'])),
            ('Prevention', lambda: getValueStr(pedia[3]['value'])),
        )
        for column, read in readers:
            try:
                symptom[column] = self._clean(read())
            except Exception:
                symptom[column] = ''
        return symptom

    def pipLine(self, symptom):
        """Insert one symptom dict into the `symptom` table.

        Errors are printed and swallowed. The connection is always closed.
        """
        conn = None
        try:
            conn = MySQLdb.connect(user=mysql_user, password=mysql_password, database=mysql_database, charset='utf8')
            cursor = conn.cursor()
            # str() keeps the stored text identical to the former '%s' formatting.
            params = tuple(str(symptom[column]) for column in self._COLUMNS)
            cursor.execute(self._INSERT_SQL, params)
            conn.commit()
        except Exception as e:
            print(e)
        finally:
            if conn is not None:
                try:
                    conn.close()
                except Exception as e:
                    print(e)

    def run(self):
        """Process symptom keys until the queue is drained.

        The former ``exit(0)`` after the first item ended the worker after a
        single symptom and has been removed.
        """
        from queue import Empty  # local import: the file only imports Queue

        while True:
            try:
                # Non-blocking get avoids hanging when another worker takes
                # the last item between an empty() check and get().
                symptomKey = self.symptomKeyQueue.get_nowait()
            except Empty:
                break
            self.pipLine(self.getSymptomDetail(symptomKey))


class diseaseSpider(threading.Thread):
    """Worker thread that fetches disease details for keys taken from a queue
    and stores them in the ``disease`` table."""

    # (column, path) pairs in insert order. A path (a, b, ...) addresses
    # pedia[a]['value'][b]['value']... in the detail response.
    _FIELD_PATHS = (
        ('name', (0, 0)),
        ('alias_name', (0, 1)),
        ('english_name', (0, 2)),
        ('Abbreviation', (0, 3)),
        ('ICD', (0, 4)),
        ('Department', (0, 5)),
        ('introduction', (0, 6)),
        ('Pathogeny', (1, 0)),
        ('Pathology', (1, 1)),
        ('historyKeyPoints', (2, 0)),
        ('keyPointsOfSymptoms', (2, 1)),
        ('keyPointsOfPhysicalExamination', (2, 2)),
        ('Transfer', (2, 3)),
        ('laboratory', (2, 4, 0)),
        ('Imaging', (2, 4, 1)),
        ('Other', (2, 4, 2)),
        ('clinical', (2, 5)),
        ('stages', (2, 6)),
        ('diagnosis', (3,)),
        ('principles', (4, 0)),
        ('generalTreatment', (4, 1)),
        ('Medication', (4, 2)),
        ('surgicalTreatment', (4, 3)),
        ('otherTreatment', (4, 4)),
        ('commonComplications', (5,)),
        ('prognosis', (6,)),
        ('followUp', (7,)),
        ('Prevention', (8,)),
    )
    # Column order of the insert into the `disease` table.
    _COLUMNS = ('key',) + tuple(column for column, _path in _FIELD_PATHS)
    # Parameterized insert: values are bound by the driver, never interpolated.
    _INSERT_SQL = "insert into `disease`(%s) values(%s)" % (
        ", ".join("`%s`" % column for column in _COLUMNS),
        ", ".join(["%s"] * len(_COLUMNS)),
    )

    def __init__(self, diseaseKeyQueue, *args, **kwargs):
        """Store the queue of disease keys; other arguments go to threading.Thread."""
        super(diseaseSpider, self).__init__(*args, **kwargs)
        self.diseaseKeyQueue = diseaseKeyQueue

    @staticmethod
    def _dig(pedia, path):
        """Follow ``path`` through nested ``[index]['value']`` steps."""
        node = pedia
        for index in path:
            node = node[index]['value']
        return node

    def getDiseaseDetail(self, key):
        """Fetch one disease and return it as a dict of `disease` columns.

        Fields are read by position from the response's 'pedia' list; any
        field that cannot be read becomes an empty string.
        """
        url = "https://med-askbob.pingan.com/pedia/disease/detail?key=" + key
        pedia = getHtml(url)['data']['pedia']
        disease = {'key': key}
        for column, path in self._FIELD_PATHS:
            try:
                value = self._dig(pedia, path)
                # 'name' is stored as returned; every other field goes
                # through getValueStr.
                disease[column] = value if column == 'name' else getValueStr(value)
            except Exception:
                disease[column] = ''
        return disease

    def pipLine(self, disease):
        """Insert one disease dict into the `disease` table.

        Errors are printed and swallowed. The connection is always closed.
        """
        conn = None
        try:
            conn = MySQLdb.connect(user=mysql_user, password=mysql_password, database=mysql_database, charset='utf8')
            cursor = conn.cursor()
            # str() keeps the stored text identical to the former '%s'
            # formatting; binding also makes quotes in the text safe.
            params = tuple(str(disease[column]) for column in self._COLUMNS)
            cursor.execute(self._INSERT_SQL, params)
            conn.commit()
        except Exception as e:
            print(e)
        finally:
            if conn is not None:
                try:
                    conn.close()
                except Exception as e:
                    print(e)

    def run(self):
        """Process disease keys until the queue is drained."""
        from queue import Empty  # local import: the file only imports Queue

        while True:
            try:
                # Non-blocking get avoids hanging when another worker takes
                # the last item between an empty() check and get().
                diseaseKey = self.diseaseKeyQueue.get_nowait()
            except Empty:
                break
            self.pipLine(self.getDiseaseDetail(diseaseKey))


class insrtuctionSpider(threading.Thread):
    """Worker thread that fetches instruction details for keys taken from a
    queue and stores them in the ``instruction`` table."""

    # Column order of the insert into the `instruction` table.
    _COLUMNS = ('key', 'title', 'mainFunction', 'usageList')
    # Parameterized insert: values are bound by the driver, never interpolated.
    _INSERT_SQL = "insert into `instruction`(%s) values(%s)" % (
        ", ".join("`%s`" % column for column in _COLUMNS),
        ", ".join(["%s"] * len(_COLUMNS)),
    )

    def __init__(self, insrtuctionKeyQueue, *args, **kwargs):
        """Store the queue of instruction keys; other arguments go to threading.Thread."""
        super(insrtuctionSpider, self).__init__(*args, **kwargs)
        self.insrtuctionKeyQueue = insrtuctionKeyQueue

    def getInstructionDetail(self, key):
        """Fetch one instruction and return it as a dict of `instruction` columns.

        'title' is kept as returned; 'mainFunction' and 'usageList' are
        stringified with single quotes replaced by double quotes. Fields that
        cannot be read become empty strings.
        """
        url = "https://med-askbob.pingan.com/pedia/oldu/detail?key=" + key + "&type=oldu"
        instructionResp = getHtml(url)['data']
        instruction = {'key': key}
        try:
            instruction['title'] = instructionResp['title']
        except Exception:
            instruction['title'] = ""
        for field in ('mainFunction', 'usageList'):
            try:
                instruction[field] = str(instructionResp[field]).replace("'", "\"")
            except Exception:
                instruction[field] = ""
        return instruction

    def pipLine(self, instruction):
        """Insert one instruction dict into the `instruction` table.

        Errors are printed and swallowed. The connection is always closed.
        """
        conn = None
        try:
            conn = MySQLdb.connect(user=mysql_user, password=mysql_password, database=mysql_database, charset='utf8')
            cursor = conn.cursor()
            # str() keeps the stored text identical to the former '%s' formatting.
            params = tuple(str(instruction[column]) for column in self._COLUMNS)
            cursor.execute(self._INSERT_SQL, params)
            conn.commit()
        except Exception as e:
            print(e)
        finally:
            if conn is not None:
                try:
                    conn.close()
                except Exception as e:
                    print(e)

    def run(self):
        """Process instruction keys until the queue is drained."""
        from queue import Empty  # local import: the file only imports Queue

        while True:
            try:
                # Non-blocking get avoids hanging when another worker takes
                # the last item between an empty() check and get().
                key = self.insrtuctionKeyQueue.get_nowait()
            except Empty:
                break
            self.pipLine(self.getInstructionDetail(key))


class caseSpider(threading.Thread):
    """Worker thread that pages through the case list and stores every case.

    ``run`` walks ``/pedia/case/list`` ten entries at a time, fetches each
    case's detail record and inserts it into the ``case`` table.
    """

    def __init__(self, *args, **kwargs):
        """All arguments are forwarded unchanged to ``threading.Thread``."""
        super(caseSpider, self).__init__(*args, **kwargs)

    @staticmethod
    def _dig(node, path, default=""):
        """Follow ``path`` (dict keys / list indexes) into ``node``; return ``default`` if a step is missing."""
        try:
            for step in path:
                node = node[step]
        except (KeyError, IndexError, TypeError):
            return default
        return node

    @staticmethod
    def _digQuoted(node, path):
        """Like ``_dig`` but stringify the hit and turn single quotes into double quotes; ``""`` on a miss."""
        missing = object()
        found = caseSpider._dig(node, path, missing)
        if found is missing:
            return ""
        return str(found).replace("\'", "\"")

    @staticmethod
    def _extractKeywords(caseDateil):
        """Return the stringified list of texts found between ``>`` and ``<`` in each keyword, or ``""``."""
        keywords = caseSpider._dig(caseDateil, ('keywords',), None)
        if keywords is None:
            return ""
        req = re.compile(r'>(.*?)<')
        keywords_ = []
        try:
            for keyword in keywords:
                try:
                    found = re.findall(req, keyword)
                except TypeError:
                    # Non-string keyword entry: skip it, keep the rest.
                    continue
                if found:
                    keywords_.append(found[0])
        except TypeError:
            # ``keywords`` itself is not iterable.
            return ""
        return str(keywords_).replace("\'", "\"")

    def pipLine(self, case):
        """Insert one case dict into the ``case`` table.

        Values are bound as query parameters, so quotes inside the scraped text
        can neither break the statement nor inject SQL. Errors are printed and
        swallowed (best effort); the connection is always closed.
        """
        conn = None
        try:
            conn = MySQLdb.connect(user=mysql_user, password=mysql_password, database=mysql_database, charset='utf8')
            cursor = conn.cursor()
            cursor.execute(
                "insert into `case`(`key`, source, publishDate, keywords, title, sex, age, complaint, medicalHistory, `check`, conclusion, treatmentProcess) "
                "values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                (case['key'], case['source'], case['publishDate'], case['keywords'], case['title'], case['sex'], case['age'], case['complaint'], case['medicalHistory'], case['check'], case['conclusion'], case['treatmentProcess'])
            )
            conn.commit()
        except Exception as e:
            print(e)
        finally:
            if conn is not None:
                try:
                    conn.close()
                except Exception as e:
                    print(e)

    def getCaseListTuple(self, page):
        """Fetch one page of the case list.

        :param page: 1-based page number (ten cases per page).
        :return: ``(list, hasMore)`` taken from the response's ``data``, or
            ``None`` when the response does not have that shape.
        """
        offset = str((int(page) - 1) * 10)
        url = "https://med-askbob.pingan.com/pedia/case/list?pageSize=10&offset=" + offset + "&allSecondDept=&dept="
        try:
            resp = getHtml(url)['data']
            return (resp['list'], resp['hasMore'])
        except (KeyError, IndexError, TypeError):
            return None

    def getCaseDetail(self, key):
        """Fetch one case and flatten it into the dict ``pipLine`` expects.

        :param key: case key appended to the detail URL.
        :return: dict with the twelve ``case`` columns; any value that cannot
            be located in the response is ``""``.
        """
        url = "https://med-askbob.pingan.com/pedia/case/detail?key=" + key + "&type=case"
        caseDateil = getHtml(url)['data']
        case = {'key': key}
        case['source'] = self._dig(caseDateil, ('source',))
        case['publishDate'] = self._dig(caseDateil, ('publishDate',))
        case['keywords'] = self._extractKeywords(caseDateil)
        case['title'] = self._dig(caseDateil, ('title',))
        # The remaining fields are addressed by position inside ``pedia``.
        case['sex'] = self._dig(caseDateil, ('pedia', 0, 'value', 0, 'value', 0, 'value'))
        case['age'] = self._dig(caseDateil, ('pedia', 0, 'value', 0, 'value', 1, 'value'))
        case['complaint'] = self._digQuoted(caseDateil, ('pedia', 0, 'value', 1, 'value'))
        case['medicalHistory'] = self._digQuoted(caseDateil, ('pedia', 0, 'value', 2, 'value'))
        case['check'] = self._digQuoted(caseDateil, ('pedia', 1, 'value', 0, 'value'))
        case['conclusion'] = self._digQuoted(caseDateil, ('pedia', 1, 'value', 1, 'value'))
        case['treatmentProcess'] = self._digQuoted(caseDateil, ('pedia', 2, 'value', 0, 'value'))
        return case

    def run(self):
        """Crawl every page of the case list, storing each case as it is fetched."""
        startPage = 1
        while True:
            caseListTuple = self.getCaseListTuple(startPage)
            if not caseListTuple:
                # An unusable page used to be re-requested forever in a tight
                # loop; stop instead.
                break
            caseList, hasMore = caseListTuple
            for case_ in caseList:
                case = self.getCaseDetail(case_['key'])
                self.pipLine(case)
            if not hasMore:
                break
            startPage += 1


# HTTP headers sent with every API request made through getHtml / postHtml.
headers = {}
headers["authentication"] = token
headers["User-Agent"] = "Mozilla/5.0 (iPhone; CPU iPhone OS 12_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/7.0.5(0x17000523) NetType/WIFI Language/zh_CN"
headers["Referer"] = "https://med-askbob.pingan.com/front_pedia/drugEntranceOne"


def downLoadFile(url, path):
    """Download ``url`` and write the response body to ``path``.

    Best effort: any failure is printed and swallowed, and nothing is raised.
    A response with an HTTP error status is treated as a failure, so an error
    page is never saved as if it were the requested file.

    :param url: address to download.
    :param path: destination file, opened in binary write mode.
    """
    try:
        resp = requests.get(url, timeout=10)
        # Reject 4xx/5xx before touching the destination file.
        resp.raise_for_status()
        with open(path, "wb") as f:
            f.write(resp.content)
    except Exception as e:
        print(e)


def getHtml(url, retries=None, delay=1.0):
    """GET ``url`` with the module-level ``headers`` and return the decoded JSON body.

    :param url: address to request.
    :param retries: maximum number of attempts. ``None`` (the default) keeps
        retrying until a request succeeds, as this function always has.
    :param delay: seconds to wait after a failed attempt, so a failing server
        is not hit in a tight loop. Pass ``0`` to retry immediately.
    :return: the response body parsed with ``json.loads``.
    :raises Exception: the last error, once ``retries`` attempts have failed.
    """
    import time  # local import: only needed on the failure path of this helper
    attempt = 0
    while True:
        attempt += 1
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            return json.loads(resp.content.decode("utf-8"))
        except Exception:
            if retries is not None and attempt >= retries:
                raise
            if delay:
                time.sleep(delay)


def postHtml(url, data, retries=None, delay=1.0):
    """POST ``data`` to ``url`` with the module-level ``headers`` and return the decoded JSON body.

    :param url: address to request.
    :param data: payload handed to ``requests.post`` as its ``data`` argument.
    :param retries: maximum number of attempts. ``None`` (the default) keeps
        retrying until a request succeeds, as this function always has.
    :param delay: seconds to wait after a failed attempt, so a failing server
        is not hit in a tight loop. Pass ``0`` to retry immediately.
    :return: the response body parsed with ``json.loads``.
    :raises Exception: the last error, once ``retries`` attempts have failed.
    """
    import time  # local import: only needed on the failure path of this helper
    attempt = 0
    while True:
        attempt += 1
        try:
            resp = requests.post(url, headers=headers, data=data, timeout=10)
            return json.loads(resp.content.decode("utf-8"))
        except Exception:
            if retries is not None and attempt >= retries:
                raise
            if delay:
                time.sleep(delay)


def getValueStr(values):
    """Join the ``'value'`` text of every item in ``values``, ending each with CRLF.

    Items whose ``'value'`` is missing, or cannot be concatenated with a
    string, are skipped.
    """
    pieces = []
    for item in values:
        try:
            pieces.append(item['value'] + "\r\n")
        except Exception:
            continue
    return "".join(pieces)


####################################################
# 药品模块函数库
####################################################


def getDrugCategoryList(url):
    """Return the ``atcCode`` of each category under ``data`` in the response for ``url``."""
    payload = getHtml(url)
    return [entry['atcCode'] for entry in payload['data']]


def getDrugCategoryQueue():
    """Build the queue of drug ATC codes: all type-X codes first, then all type-Z codes."""
    zCodes = getDrugCategoryList("https://med-askbob.pingan.com/pedia/drug/category?type=Z")
    xCodes = getDrugCategoryList("https://med-askbob.pingan.com/pedia/drug/category?type=X")
    atcCodeQueue = Queue(0)
    for code in xCodes + zCodes:
        atcCodeQueue.put(code)
    return atcCodeQueue


####################################################
# 指南模块函数库
####################################################
def getGuideCategoryList(page):
    """Fetch one page (ten entries) of the guide list.

    :param page: 1-based page number.
    :return: ``(list, hasMore)`` taken from the response's ``data``.
    """
    skipped = (page - 1) * 10
    url = "https://med-askbob.pingan.com/pedia/guide/list?pageSize=10&offset=" + str(skipped) + "&dept=&allSecondDept=&source=&type="
    payload = getHtml(url)['data']
    return payload['list'], payload['hasMore']


def getGuideKeyQueue():
    """Collect the key of every guide, across all pages, into a queue.

    Pages are requested through ``getGuideCategoryList`` starting at page 1
    until the service reports ``hasMore`` as false.

    :return: ``Queue`` of guide keys in listing order.
    """
    guideKeyQueue = Queue(0)
    startPage = 1
    while True:
        guideCategorys, hasMore = getGuideCategoryList(startPage)
        for guideCategory in guideCategorys:
            guideKeyQueue.put(guideCategory['key'])
        # Previously both branches did ``break``, so only page 1 was ever read.
        # An empty page also ends the walk, guarding against a ``hasMore`` flag
        # that never turns false.
        if not hasMore or not guideCategorys:
            break
        startPage += 1
    return guideKeyQueue


####################################################
# 检验检查函数库
####################################################
def getCheckCategoryList():
    """Return the ``data`` payload of the check-category endpoint."""
    response = getHtml("https://med-askbob.pingan.com/pedia/check/category")
    return response['data']


def getCheckKeyQueue():
    """Walk the check-category tree and queue the keys to crawl.

    For every second-level node, the keys of its children are queued; a
    second-level node without children contributes its own key. Top-level
    nodes without children contribute nothing.
    """
    categories = getCheckCategoryList()
    keyQueue = Queue(0)
    for top in categories:
        if not top['childList']:
            continue
        for middle in top['childList']:
            leaves = middle['childList']
            if not leaves:
                keyQueue.put(middle['key'])
                continue
            for leaf in leaves:
                keyQueue.put(leaf['key'])
    return keyQueue


####################################################
# 症状模块函数库
####################################################
def getSymptomList():
    """Return the ``data`` payload of the symptom dictionary endpoint."""
    response = getHtml("https://med-askbob.pingan.com/pedia/symptom/dic?dept=&allSecondDept=true")
    return response['data']


def getSymptomKeyQueue():
    """Queue the ``key`` of every symptom entry, group by group, from ``getSymptomList``."""
    symptomKeyQueue = Queue(0)
    groups = getSymptomList()
    # Lazily walk each group's entries in the order the mapping yields them.
    keys = (entry['key'] for groupName in groups for entry in groups[groupName])
    for key in keys:
        symptomKeyQueue.put(key)
    return symptomKeyQueue


####################################################
# 疾病函数库
####################################################
def getDiseaseList():
    """Return the ``data`` payload of the disease dictionary endpoint."""
    response = getHtml("https://med-askbob.pingan.com/pedia/disease/dic?dept=&allSecondDept=true")
    return response['data']


def getDiseaseKeyQueue():
    """Queue the ``key`` of every disease entry, group by group, from ``getDiseaseList``."""
    groups = getDiseaseList()
    diseaseKeyQueue = Queue(0)
    # Lazily walk each group's entries in the order the mapping yields them.
    keys = (entry['key'] for groupName in groups for entry in groups[groupName])
    for key in keys:
        diseaseKeyQueue.put(key)
    return diseaseKeyQueue


####################################################
# 超说明书函数库
####################################################
def getInstructionParCategoryList():
    """Return the ``data`` payload of the "oldu" category endpoint (the parent categories)."""
    response = getHtml("https://med-askbob.pingan.com/pedia/oldu/category")
    return response['data']


def getInstructionChilCategoryList(atcCode):
    """Return the ``data`` payload of the "oldu" list endpoint for the given ``atcCode``."""
    endpoint = "https://med-askbob.pingan.com/pedia/oldu/list?atcCode="
    response = getHtml(endpoint + atcCode)
    return response['data']


def getInsrtuctionKeyQueue(maxKeys=None):
    """Collect instruction keys from every parent category into a queue.

    Each parent category's ``atcCode`` is expanded through
    ``getInstructionChilCategoryList``; children without a ``key`` are skipped.

    :param maxKeys: optional cap on the number of keys to collect, useful for
        trial runs. ``None`` (the default) collects everything. This replaces
        a hard-coded test cap that stopped the crawl after six keys.
    :return: ``Queue`` of instruction keys.
    """
    insrtuctionKeyQueue = Queue(0)
    if maxKeys is not None and maxKeys <= 0:
        return insrtuctionKeyQueue
    nums = 0
    for par in getInstructionParCategoryList():
        for chil in getInstructionChilCategoryList(par['atcCode']):
            try:
                key = chil['key']
            except (KeyError, IndexError, TypeError):
                continue
            insrtuctionKeyQueue.put(key)
            nums += 1
            if maxKeys is not None and nums >= maxKeys:
                return insrtuctionKeyQueue
    return insrtuctionKeyQueue


def main():
    """Start the crawl: build each module's key queue, then start its spider thread.

    Modules are handled one after another; each queue is fully built before
    that module's spider is started.
    """
    def launch(spiderCls, workerCount, *spiderArgs):
        # Create and start ``workerCount`` threads of ``spiderCls``.
        for _ in range(workerCount):
            worker = spiderCls(*spiderArgs)
            worker.start()

    # Drug module
    atcCodeQueue = getDrugCategoryQueue()
    launch(drugSpider, 1, atcCodeQueue)
    # Guide module
    guideKeyQueue = getGuideKeyQueue()
    launch(guideSpider, 1, guideKeyQueue)
    # Check (lab test / examination) module
    checkKeyQueue = getCheckKeyQueue()
    launch(checkSpider, 1, checkKeyQueue)
    # Symptom module
    symptomKeyQueue = getSymptomKeyQueue()
    launch(symptomSpider, 1, symptomKeyQueue)
    # Disease module
    diseaseKeyQueue = getDiseaseKeyQueue()
    launch(diseaseSpider, 1, diseaseKeyQueue)
    # Instruction ("oldu") module
    insrtuctionKeyQueue = getInsrtuctionKeyQueue()
    launch(insrtuctionSpider, 1, insrtuctionKeyQueue)
    # Case module: pages through the list itself, so it takes no queue
    launch(caseSpider, 1)


# Start the crawl only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()

小程序爬虫接单、app爬虫接单、网页爬虫接单、接口定制、网站开发、小程序开发 > 点击这里联系我们 <

你可能感兴趣的:(Python)