# -*- coding: utf-8 -*-
import scrapy
import json
import re
from yiyao.items import YiyaoItem  # import the project Item class
# Item and pipelines definitions omitted here
class YiyaoSpiderSpider(scrapy.Spider):
    """Crawl drug price data from http://210.73.89.76.

    Flow: ``parse`` builds the first paginated drug-list query and hands it
    to ``parse_getId``; for every drug row ``parse_getId`` schedules a
    hospital/price query handled by ``parse_getPrice``, carrying the drug
    fields along in ``Request.meta``. Both list endpoints paginate the same
    way (``page`` / ``pageSize`` / ``Total``), so both callbacks end with
    the same "request next page" step.
    """

    name = "yiyao_spider"                   # spider name
    allowed_domains = ["210.73.89.76"]      # allowed domain (site is addressed by IP)
    start_urls = ['http://210.73.89.76/']   # seed URL list

    # Drug-row fields forwarded from parse_getId to parse_getPrice via meta.
    # Kept in one place so the producer and the consumer cannot drift apart.
    _DRUG_FIELDS = (
        "NAME_CHN", "TRADE_NAME", "DOSEAGE_FORM_NAME", "SPEC",
        "WRAP_NAME", "PERMIT_NUMBER", "STAND_RATE", "ORG_NAME",
    )

    @staticmethod
    def _page_info(response):
        """Return (data, total, page_num, page_size) from a paginated JSON response.

        Decodes the body exactly once (the original parsed it twice) and
        pulls the current ``page``/``pageSize`` back out of the request URL
        so the caller can decide whether another page remains.
        """
        payload = json.loads(response.body.decode("utf-8"))
        page_num = re.findall(r"page=(.*?)&", response.url)[0]
        page_size = re.findall(r"pageSize=(.*?)&", response.url)[0]
        return payload["Data"], payload["Total"], page_num, page_size

    @staticmethod
    def _next_page_url(url, page_num):
        """Return *url* with its ``page=`` parameter advanced by one."""
        return str(url).replace("page=" + page_num,
                                "page=" + str(int(page_num) + 1))

    def parse(self, response):
        """Build the initial drug-list query URL and schedule it."""
        pro_name = ''  # input("Please input name : ")
        pro_comp = ''  # input("Please input company : ")
        pro_numb = "药"  # permit-number keyword ("drug"); fixed URL parameter
        main_url = ("http://210.73.89.76/ServiceSelect/GetHosSelectList?BaseFlag="
                    "&OrgName=" + pro_comp
                    + "&PermitNumber=" + pro_numb
                    + "&ProductName=" + pro_name
                    + "&filter=&group=&page=1&pageSize=100&sort=")
        # callback is the function that will parse this request's response
        yield scrapy.Request(url=main_url, callback=self.parse_getId)

    def parse_getId(self, response):
        """Parse one page of the drug list; schedule a price query per drug."""
        main_data, main_count, page_num, page_size = self._page_info(response)
        for eveData in main_data:
            # str() guards the URL concatenation below: a numeric JSON ID
            # would otherwise raise TypeError on "..." + pro_id.
            pro_id = str(eveData["ID"])
            # With the drug id in hand, fetch its hospitals and prices.
            count_url = ("http://210.73.89.76/ServiceSelect/GridOrgInfoList"
                         "?OrgName=&OrgPrice=&ProductId=" + pro_id
                         + "&filter=&group=&page=1&pageSize=100&sort=")
            # meta carries the drug's fields to the price callback.
            main_desc = {"pro_id": pro_id}
            for key in self._DRUG_FIELDS:
                main_desc[key] = eveData[key]
            yield scrapy.Request(url=count_url, callback=self.parse_getPrice,
                                 meta=main_desc)
        # More rows than page * pageSize means at least one page remains.
        if main_count > int(page_num) * int(page_size):
            yield scrapy.Request(url=self._next_page_url(response.url, page_num),
                                 callback=self.parse_getId)

    def parse_getPrice(self, response):
        """Parse one page of hospital/price rows; yield one item per row.

        Each item combines this row's hospital fields with the drug fields
        forwarded in ``response.meta`` by ``parse_getId``.
        """
        price_data, main_count, page_num, page_size = self._page_info(response)
        for eveData in price_data:
            item = YiyaoItem()  # project Item holding one full record
            item['hos_id'] = eveData["ID"]
            item['hos_name'] = eveData["NAME"]
            item['hos_price'] = eveData["PRICE"]
            item['pro_id'] = response.meta["pro_id"]
            for key in self._DRUG_FIELDS:
                item[key] = response.meta[key]
            yield item
        # Same pagination rule as parse_getId; meta is re-sent unchanged so
        # later pages still know which drug they belong to.
        if main_count > int(page_num) * int(page_size):
            yield scrapy.Request(url=self._next_page_url(response.url, page_num),
                                 callback=self.parse_getPrice,
                                 meta=response.meta)
# Xu Zhaohui, April 17: crawled data from http://210.73.89.76/ServiceSelect/GetServiceSelectList#