# Spiders only. DM me if anything is unclear. I never write comments, that's just how I roll.
# -*- coding: utf-8 -*-
import json
import re
import time

import scrapy
from Spider.spider.tool.config import ConfigTool
from Spider.spider.item.eastmoneyitem import EastmoneyItem
from Spider.spider.tool.common_def import *
"""
__date__: 2020-06-15
__author__: luoshen
东方财富 (Eastmoney)
"""
class EastmoneySpider(scrapy.Spider):
"""策略报告、券商晨报、宏观研究、新股研报、个股研报、行业研报"""
name = 'eastmoney'
base_url = 'http://data.eastmoney.com'
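    # Per-spider settings: random User-Agent middleware plus the MongoDB reports pipeline.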
custom_settings = {
'DOWNLOADER_MIDDLEWARES': {
'spider.middleware.common.RandomUserAgent': 10,
# 'spider.middleware.common.ProxyMiddleWare': 20,
},
'ITEM_PIPELINES': {
'spider.pipeline.pipelines.ReportsMongoPipeline': 120,
},
# 'DOWNLOAD_DELAY': 0.5,
'LOG_LEVEL': 'WARNING'
}
cookies = {
"Cookie": "自己的cookie"
}
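    # cb= values for the report API's JSONP wrapper, one per report category:
    # clbg=策略报告, hgyj=宏观研究, qscb=券商晨报, ggyb=个股研报, xgyb=新股研报, hyyb=行业研报.
    # These appear to be callback names captured from the site's own requests.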
type = {
"clbg": "datatable7270063",
"hgyj": "datatable1062280",
"qscb": "datatable4594326",
"ggyb": "datatable6061074",
"xgyb": "datatable9433834",
"hyyb": "datatable3735394"
}
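    # Three JSONP list endpoints: /report/jg (strategy/macro/morning reports),
    # /report/list (single-stock and industry reports) and /report/newStockList
    # (IPO reports). beginTime is pinned to 2018-04-03 in every template.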
def start_requests(self):
start_url1 = "http://reportapi.eastmoney.com/report/jg?cb={cb}&pageSize=50&beginTime=2018-04-03&" \
"endTime={time}&pageNo={page}&fields=&qType={type}&orgCode=&author=&_={timestamp}"
start_url2 = "http://reportapi.eastmoney.com/report/list?cb={cb}&industryCode=*&pageSize=50&" \
"industry=*&rating=&ratingChange=&beginTime=2018-04-03&endTime={time}&pageNo={page}&" \
"fields=&qType={type}&orgCode=&code=*&rcode=&_={timestamp}"
start_url3 = "http://reportapi.eastmoney.com/report/newStockList?cb={cb}&pageSize=50&" \
"beginTime=2018-04-03&endTime={time}&pageNo={page}&fields=&qType={type}&_={timestamp}"
start_url4 = "http://reportapi.eastmoney.com/report/list?cb={cb}&industryCode=*&pageSize=50&" \
"industry=*&rating=*&ratingChange=*&beginTime=2018-04-03&endTime={time}&pageNo={page}&" \
"fields=&qType={type}&orgCode=&rcode=&_={timestamp}"
info = int(ConfigTool().get('global', 'increment'))
endTime = time.strftime('%Y-%m-%d', time.localtime())
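        # Millisecond timestamp for the _ cache-busting query parameter.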
timestamp = round(time.time() * 1000)
if info == 1:
            # 策略报告 (strategy reports)
for page in range(1, 4):
url = start_url1.format(cb=self.type['clbg'], time=endTime, page=page, type=str(2), timestamp=timestamp)
yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_one,
meta={"report_type": "策略报告"})
            # 宏观研究 (macro research)
for page in range(1, 3):
url = start_url1.format(cb=self.type['hgyj'], time=endTime, page=page, type=str(3), timestamp=timestamp)
yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_one,
meta={"report_type": "宏观研究"})
            # 券商晨报 (broker morning reports)
for page in range(1, 3):
url = start_url1.format(cb=self.type['qscb'], time=endTime, page=page, type=str(4), timestamp=timestamp)
yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_one,
meta={"report_type": "券商晨报"})
            # 个股研报 (single-stock reports)
for page in range(1, 11):
url = start_url2.format(cb=self.type['ggyb'], time=endTime, page=page, type=str(0), timestamp=timestamp)
yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_two,
meta={"report_type": "个股研报"})
            # 新股研报 (IPO reports)
for page in range(1, 2):
url = start_url3.format(cb=self.type['xgyb'], time=endTime, page=page, type=str(4), timestamp=timestamp)
yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_two,
meta={"report_type": "新股研报"})
            # 行业研报 (industry reports)
for page in range(1, 11):
url = start_url4.format(cb=self.type['hyyb'], time=endTime, page=page, type=str(1), timestamp=timestamp)
yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_three,
meta={"report_type": "行业研报"})
elif info == 0:
            # 策略报告 (strategy reports)
for page in range(1, 567):
url = start_url1.format(cb=self.type['clbg'], time=endTime, page=page, type=str(2), timestamp=timestamp)
yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_one,
meta={"report_type": "策略报告"})
            # 宏观研究 (macro research)
for page in range(1, 340):
url = start_url1.format(cb=self.type['hgyj'], time=endTime, page=page, type=str(3), timestamp=timestamp)
yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_one,
meta={"report_type": "宏观研究"})
            # 券商晨报 (broker morning reports)
for page in range(1, 416):
url = start_url1.format(cb=self.type['qscb'], time=endTime, page=page, type=str(4), timestamp=timestamp)
yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_one,
meta={"report_type": "券商晨报"})
            # 个股研报 (single-stock reports)
for page in range(1, 1317):
url = start_url2.format(cb=self.type['ggyb'], time=endTime, page=page, type=str(0), timestamp=timestamp)
                yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_two,
                                     meta={"report_type": "个股研报"})
            # 新股研报 (IPO reports)
for page in range(1, 21):
url = start_url3.format(cb=self.type['xgyb'], time=endTime, page=page, type=str(4), timestamp=timestamp)
                yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_two,
                                     meta={"report_type": "新股研报"})
            # 行业研报 (industry reports)
for page in range(1, 1403):
url = start_url4.format(cb=self.type['hyyb'], time=endTime, page=page, type=str(1), timestamp=timestamp)
yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_three,
meta={"report_type": "行业研报"}, dont_filter=True)
def parse_list_one(self, response):
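        """Parse /report/jg JSONP list pages (策略报告 / 宏观研究 / 券商晨报)."""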
        # Strip the JSONP callback wrapper and keep only the JSON payload.
        rsp = re.search(r'\{.*\}', response.text, re.S)
        if rsp:
            datas = json.loads(rsp.group()).get("data") or []
for data in datas:
try:
item = EastmoneyItem()
title = data.get("title")
item["title"] = deal_title(title)
pdate = data.get("publishDate", "")
item["pdate"] = pdate.split(" ")[0] if pdate else ""
item["companyCode"] = data.get("orgCode", "")
item["industry"] = []
item["author"] = data.get("researcher", "")
item["organ"] = data.get("orgName", "")
item["report_type"] = response.meta["report_type"]
encodeUrl = data.get("encodeUrl", "")
if encodeUrl:
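                        # Each category has its own detail-page template.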
if item['report_type'] == '策略报告':
detail_url = "http://data.eastmoney.com/report/zw_strategy.jshtml?encodeUrl={}".\
format(encodeUrl)
yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={"item": item},
dont_filter=True)
elif item['report_type'] == '宏观研究':
detail_url = "http://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={}". \
format(encodeUrl)
yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={"item": item},
dont_filter=True)
elif item['report_type'] == '券商晨报':
detail_url = "http://data.eastmoney.com/report/zw_brokerreport.jshtml?encodeUrl={}". \
format(encodeUrl)
yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={"item": item},
dont_filter=True)
                except Exception as e:
                    self.logger.warning('%s %s', e, response.url)
                    continue
def parse_list_two(self, response):
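        """Parse /report/list and /report/newStockList pages (个股研报 / 新股研报)."""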
        # Strip the JSONP callback wrapper and keep only the JSON payload.
        rsp = re.search(r'\{.*\}', response.text, re.S)
        if rsp:
            datas = json.loads(rsp.group()).get("data") or []
for data in datas:
try:
item = EastmoneyItem()
title = data.get("title")
item["title"] = deal_title(title)
pdate = data.get("publishDate", "")
item["pdate"] = pdate.split(' ')[0] if pdate else ''
item["industry"] = [data.get("indvInduName", "")]
item["author"] = data.get("researcher", "")
item["organ"] = data.get("orgName", "")
item['companyCode'] = data.get("orgCode", "")
item["infoCode"] = data.get("infoCode", "")
item["stockCode"] = data.get("stockCode", "")
item['report_type'] = response.meta['report_type']
encodeUrl = data.get("encodeUrl", "")
                    if encodeUrl:
                        # 个股研报 and 新股研报 share the same zw_stock detail page.
                        if item['report_type'] in ('个股研报', '新股研报'):
                            detail_url = "http://data.eastmoney.com/report/zw_stock.jshtml?encodeUrl={}". \
                                format(encodeUrl)
                            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={"item": item},
                                                 dont_filter=True)
                except Exception as e:
                    self.logger.warning('%s %s', e, response.url)
                    continue
def parse_list_three(self, response):
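        """Parse /report/list industry pages (行业研报)."""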
        # Strip the JSONP callback wrapper and keep only the JSON payload.
        rsp = re.search(r'\{.*\}', response.text, re.S)
        if rsp:
            datas = json.loads(rsp.group()).get("data") or []
for data in datas:
try:
item = EastmoneyItem()
title = data.get("title")
item["title"] = deal_title(title)
pdate = data.get("publishDate", "")
item["pdate"] = pdate.split(' ')[0] if pdate else ''
item["industry"] = [data.get("industryName", "")]
item["author"] = data.get("researcher", "")
item["organ"] = data.get("orgName", "")
item['companyCode'] = data.get("orgCode", "")
item["infoCode"] = data.get("infoCode", "")
item['report_type'] = response.meta['report_type']
encodeUrl = data.get("encodeUrl", "")
if encodeUrl:
detail_url = "http://data.eastmoney.com/report/zw_industry.jshtml?encodeUrl={}".format(encodeUrl)
yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item},
dont_filter=True)
                except Exception as e:
                    self.logger.warning('%s %s', e, response.url)
                    continue
def parse_detail(self, response):
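        """Pull the PDF link off a report detail page and yield the finished item."""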
try:
item = response.meta["item"]
item["src"] = response.url
item["sid"] = gen_sid(response.url)
item["enact"] = '东方财富'
item["download_status"] = 0
pdf = response.xpath('//a[text()="查看PDF原文"]/@href|//span[@class="to-link"]/a/@href').get()
item["pdf_url"] = parse_data(pdf, "")
item["spider_name"] = self.name
item["download_date"] = time.strftime("%Y-%m-%d", time.localtime())
yield item
        except Exception as e:
            self.logger.warning('%s %s', e, response.url)
# The middleware, item, and pipeline modules are not included; only the spider logic is posted here.