Scraping Eastmoney (东方财富) research reports with Scrapy

Only the spiders module is included here; message me if anything is unclear.

# -*- coding: utf-8 -*-
import json
import re
import time

import scrapy

from Spider.spider.tool.config import ConfigTool
from Spider.spider.item.eastmoneyitem import EastmoneyItem
# Provides the deal_title, gen_sid and parse_data helpers used below.
from Spider.spider.tool.common_def import *

"""
__date__: 2020-06-15
__author__: luoshen
Eastmoney (东方财富) research-report spider

"""


class EastmoneySpider(scrapy.Spider):
    """策略报告、券商晨报、宏观研究、新股研报、个股研报、行业研报"""
    name = 'eastmoney'
    base_url = 'http://data.eastmoney.com'
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'spider.middleware.common.RandomUserAgent': 10,
            # 'spider.middleware.common.ProxyMiddleWare': 20,
        },
        'ITEM_PIPELINES': {
            'spider.pipeline.pipelines.ReportsMongoPipeline': 120,
        },
        # 'DOWNLOAD_DELAY': 0.5,
        'LOG_LEVEL': 'WARNING'
    }
    # Replace with your own cookie string copied from the browser
    # (note: Scrapy's cookies argument takes {name: value} pairs).
    cookies = {
        "Cookie": "your cookie here"
    }
    # JSONP callback names per report category (clbg: strategy, hgyj: macro,
    # qscb: broker morning notes, ggyb: individual stocks, xgyb: IPOs, hyyb: industries).
    type = {
        "clbg": "datatable7270063",
        "hgyj": "datatable1062280",
        "qscb": "datatable4594326",
        "ggyb": "datatable6061074",
        "xgyb": "datatable9433834",
        "hyyb": "datatable3735394"
    }

    def start_requests(self):
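        # Four JSONP list endpoints: report/jg (strategy / macro / broker morning notes),
        # report/list (individual-stock and industry reports) and report/newStockList (IPOs).
        # {cb} is the JSONP callback name, {type} the category code and {timestamp} a cache-buster.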
        start_url1 = "http://reportapi.eastmoney.com/report/jg?cb={cb}&pageSize=50&beginTime=2018-04-03&" \
                    "endTime={time}&pageNo={page}&fields=&qType={type}&orgCode=&author=&_={timestamp}"
        start_url2 = "http://reportapi.eastmoney.com/report/list?cb={cb}&industryCode=*&pageSize=50&" \
                     "industry=*&rating=&ratingChange=&beginTime=2018-04-03&endTime={time}&pageNo={page}&" \
                     "fields=&qType={type}&orgCode=&code=*&rcode=&_={timestamp}"
        start_url3 = "http://reportapi.eastmoney.com/report/newStockList?cb={cb}&pageSize=50&" \
                     "beginTime=2018-04-03&endTime={time}&pageNo={page}&fields=&qType={type}&_={timestamp}"
        start_url4 = "http://reportapi.eastmoney.com/report/list?cb={cb}&industryCode=*&pageSize=50&" \
                     "industry=*&rating=*&ratingChange=*&beginTime=2018-04-03&endTime={time}&pageNo={page}&" \
                     "fields=&qType={type}&orgCode=&rcode=&_={timestamp}"
        # increment = 1 -> incremental crawl (first few pages per category);
        # increment = 0 -> full historical backfill.
        info = int(ConfigTool().get('global', 'increment'))
        endTime = time.strftime('%Y-%m-%d', time.localtime())
        timestamp = round(time.time() * 1000)
        if info == 1:
            # Strategy reports (策略报告)
            for page in range(1, 4):
                url = start_url1.format(cb=self.type['clbg'], time=endTime, page=page, type=str(2), timestamp=timestamp)
                yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_one,
                                     meta={"report_type": "策略报告"})
            # Macro research (宏观研究)
            for page in range(1, 3):
                url = start_url1.format(cb=self.type['hgyj'], time=endTime, page=page, type=str(3), timestamp=timestamp)
                yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_one,
                                     meta={"report_type": "宏观研究"})
            # Broker morning notes (券商晨报)
            for page in range(1, 3):
                url = start_url1.format(cb=self.type['qscb'], time=endTime, page=page, type=str(4), timestamp=timestamp)
                yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_one,
                                     meta={"report_type": "券商晨报"})
            # Individual-stock reports (个股研报)
            for page in range(1, 11):
                url = start_url2.format(cb=self.type['ggyb'], time=endTime, page=page, type=str(0), timestamp=timestamp)
                yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_two,
                                     meta={"report_type": "个股研报"})
            # IPO reports (新股研报)
            for page in range(1, 2):
                url = start_url3.format(cb=self.type['xgyb'], time=endTime, page=page, type=str(4), timestamp=timestamp)
                yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_two,
                                     meta={"report_type": "新股研报"})
            # Industry reports (行业研报)
            for page in range(1, 11):
                url = start_url4.format(cb=self.type['hyyb'], time=endTime, page=page, type=str(1), timestamp=timestamp)
                yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_three,
                                     meta={"report_type": "行业研报"})
        elif info == 0:
            # Strategy reports (策略报告)
            for page in range(1, 567):
                url = start_url1.format(cb=self.type['clbg'], time=endTime, page=page, type=str(2), timestamp=timestamp)
                yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_one,
                                     meta={"report_type": "策略报告"})
            # Macro research (宏观研究)
            for page in range(1, 340):
                url = start_url1.format(cb=self.type['hgyj'], time=endTime, page=page, type=str(3), timestamp=timestamp)
                yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_one,
                                     meta={"report_type": "宏观研究"})
            # Broker morning notes (券商晨报)
            for page in range(1, 416):
                url = start_url1.format(cb=self.type['qscb'], time=endTime, page=page, type=str(4), timestamp=timestamp)
                yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_one,
                                     meta={"report_type": "券商晨报"})
            # Individual-stock reports (个股研报)
            for page in range(1, 1317):
                url = start_url2.format(cb=self.type['ggyb'], time=endTime, page=page, type=str(0), timestamp=timestamp)
                yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_two,
                                     meta={"report_type": "个股研报"})
            # IPO reports (新股研报)
            for page in range(1, 21):
                url = start_url3.format(cb=self.type['xgyb'], time=endTime, page=page, type=str(4), timestamp=timestamp)
                yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_two,
                                     meta={"report_type": "新股研报"})
            # Industry reports (行业研报)
            for page in range(1, 1403):
                url = start_url4.format(cb=self.type['hyyb'], time=endTime, page=page, type=str(1), timestamp=timestamp)
                yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_list_three,
                                     meta={"report_type": "行业研报"}, dont_filter=True)

    def parse_list_one(self, response):
        # The API returns JSONP (callback({...})); strip the wrapper and parse the JSON body.
        rsp = re.search(r'\{.*\}', response.text)
        if rsp:
            datas = json.loads(rsp.group()).get("data") or []
            for data in datas:
                try:
                    item = EastmoneyItem()
                    title = data.get("title")
                    item["title"] = deal_title(title)
                    pdate = data.get("publishDate", "")
                    item["pdate"] = pdate.split(" ")[0] if pdate else ""
                    item["companyCode"] = data.get("orgCode", "")
                    item["industry"] = []
                    item["author"] = data.get("researcher", "")
                    item["organ"] = data.get("orgName", "")
                    item["report_type"] = response.meta["report_type"]
                    encodeUrl = data.get("encodeUrl", "")
                    if encodeUrl:
                        # Each category maps to its own detail-page template.
                        detail_pages = {
                            '策略报告': 'zw_strategy',
                            '宏观研究': 'zw_macresearch',
                            '券商晨报': 'zw_brokerreport',
                        }
                        page_name = detail_pages.get(item['report_type'])
                        if page_name:
                            detail_url = "http://data.eastmoney.com/report/{}.jshtml?encodeUrl={}".format(
                                page_name, encodeUrl)
                            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={"item": item},
                                                 dont_filter=True)
                except Exception as e:
                    self.logger.warning("%s %s %s", e, response.url, self.name)
                    continue

    def parse_list_two(self, response):
        # Same JSONP unwrapping as parse_list_one.
        rsp = re.search(r'\{.*\}', response.text)
        if rsp:
            datas = json.loads(rsp.group()).get("data") or []
            for data in datas:
                try:
                    item = EastmoneyItem()
                    title = data.get("title")
                    item["title"] = deal_title(title)
                    pdate = data.get("publishDate", "")
                    item["pdate"] = pdate.split(' ')[0] if pdate else ''
                    item["industry"] = [data.get("indvInduName", "")]
                    item["author"] = data.get("researcher", "")
                    item["organ"] = data.get("orgName", "")
                    item['companyCode'] = data.get("orgCode", "")
                    item["infoCode"] = data.get("infoCode", "")
                    item["stockCode"] = data.get("stockCode", "")
                    item['report_type'] = response.meta['report_type']
                    encodeUrl = data.get("encodeUrl", "")
                    # Both individual-stock and IPO reports use the zw_stock detail page.
                    if encodeUrl and item['report_type'] in ('个股研报', '新股研报'):
                        detail_url = "http://data.eastmoney.com/report/zw_stock.jshtml?encodeUrl={}".format(encodeUrl)
                        yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={"item": item},
                                             dont_filter=True)
                except Exception as e:
                    self.logger.warning("%s %s %s", e, response.url, self.name)
                    continue

    def parse_list_three(self, response):
        # Same JSONP unwrapping as parse_list_one.
        rsp = re.search(r'\{.*\}', response.text)
        if rsp:
            datas = json.loads(rsp.group()).get("data") or []
            for data in datas:
                try:
                    item = EastmoneyItem()
                    title = data.get("title")
                    item["title"] = deal_title(title)
                    pdate = data.get("publishDate", "")
                    item["pdate"] = pdate.split(' ')[0] if pdate else ''
                    item["industry"] = [data.get("industryName", "")]
                    item["author"] = data.get("researcher", "")
                    item["organ"] = data.get("orgName", "")
                    item['companyCode'] = data.get("orgCode", "")
                    item["infoCode"] = data.get("infoCode", "")
                    item['report_type'] = response.meta['report_type']
                    encodeUrl = data.get("encodeUrl", "")
                    if encodeUrl:
                        detail_url = "http://data.eastmoney.com/report/zw_industry.jshtml?encodeUrl={}".format(encodeUrl)
                        yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item},
                                             dont_filter=True)
                except Exception as e:
                    self.logger.warning("%s %s %s", e, response.url, self.name)
                    continue

    def parse_detail(self, response):
        try:
            item = response.meta["item"]
            item["src"] = response.url
            item["sid"] = gen_sid(response.url)
            item["enact"] = '东方财富'
            item["download_status"] = 0
            pdf = response.xpath('//a[text()="查看PDF原文"]/@href|//span[@class="to-link"]/a/@href').get()
            item["pdf_url"] = parse_data(pdf, "")
            item["spider_name"] = self.name
            item["download_date"] = time.strftime("%Y-%m-%d", time.localtime())
            yield item
        except Exception as e:
            self.logger.warning("%s %s %s", e, response.url, self.name)

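The increment switch read in start_requests comes from ConfigTool, which is not shown. Assuming it wraps a standard configparser INI file (the filename and exact layout here are guesses), the backing config might look like this:

[global]
; 1 = incremental crawl (recent pages only), 0 = full historical backfill
increment = 1

The helpers imported from common_def (deal_title, gen_sid, parse_data) are not shown either. A minimal sketch of what they might look like, inferred from how the spider uses them — these implementations are assumptions, not the original code:

import hashlib
import re


def deal_title(title):
    """Assumed: collapse runs of whitespace and strip the title."""
    return re.sub(r'\s+', ' ', title).strip() if title else ''


def gen_sid(url):
    """Assumed: derive a stable document id by hashing the detail-page URL."""
    return hashlib.md5(url.encode('utf-8')).hexdigest()


def parse_data(value, default):
    """Assumed: return value if truthy, otherwise the given default."""
    return value if value else default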
The middleware, item and pipeline modules are not included in the original post; only the spider logic is shown. Sketches of what they might look like are given below.
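EastmoneyItem can be reconstructed fairly safely, since the spider assigns a fixed set of keys. A sketch with the field list inferred from the assignments above:

import scrapy


class EastmoneyItem(scrapy.Item):
    title = scrapy.Field()
    pdate = scrapy.Field()
    companyCode = scrapy.Field()
    industry = scrapy.Field()
    author = scrapy.Field()
    organ = scrapy.Field()
    report_type = scrapy.Field()
    infoCode = scrapy.Field()
    stockCode = scrapy.Field()
    src = scrapy.Field()
    sid = scrapy.Field()
    enact = scrapy.Field()
    download_status = scrapy.Field()
    pdf_url = scrapy.Field()
    spider_name = scrapy.Field()
    download_date = scrapy.Field()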

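custom_settings also references a RandomUserAgent downloader middleware and a ReportsMongoPipeline. Minimal sketches of plausible implementations — the user-agent list, the Mongo URI, and the database/collection names are all assumptions:

import random

import pymongo


class RandomUserAgent:
    """Assumed: set a random User-Agent header on every outgoing request."""
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
    ]

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)


class ReportsMongoPipeline:
    """Assumed: upsert each report into MongoDB, keyed on the sid field."""

    def open_spider(self, spider):
        self.client = pymongo.MongoClient('mongodb://localhost:27017')  # assumed URI
        self.collection = self.client['spider']['reports']  # assumed db/collection

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.collection.update_one({'sid': item['sid']}, {'$set': dict(item)}, upsert=True)
        return item

With those modules in place, the spider runs with the standard command:

scrapy crawl eastmoney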