Crawling Toutiao hot-news data in a loop with the Scrapy framework

  • Crawling Toutiao data with the Scrapy framework; the project implements the following main features (a rough project layout is sketched after this list):
  • Storing the data in MongoDB
  • Downloading images
  • Randomly rotating the User-Agent
  • Hooking up an IP proxy pool
  • Sending a notification email
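All of the pieces below live in a standard Scrapy project (created with scrapy startproject toutiao_two); roughly the following layout is assumed, inferred from the import paths used throughout the post:

toutiao_two/
├── scrapy.cfg
└── toutiao_two/
    ├── items.py          # ToutiaoTwoItem definition
    ├── middlewares.py    # RandomUserAgent, ProxyMiddleware
    ├── pipelines.py      # ToutaioImagePipeline, ToutiaoTwoMongoPipeline
    ├── settings.py
    ├── emailsend.py      # EmailSend helper (added below)
    └── spiders/
        └── toutiao_spider.py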
1. First, press F12 to open the browser developer tools, as shown below:
  • Since Toutiao's data is loaded dynamically with JavaScript, we need to find the API endpoint that loads it and analyze it; the endpoint URL (used as start_url in the spider below) is the one shown in the screenshot.
  • First, look at the Headers information:


    (screenshot: request Headers)

    In the query string parameters we notice two parameters, as and cp. They are one of Toutiao's anti-scraping measures, so we need an algorithm to generate them (the one used here was found online and is implemented as get_as_cp() in the spider below).

  • Next, look at the response, i.e. the data that is returned:


    (screenshot: response body)

    We copy the data into an online JSON formatter to make it easier to analyze; a rough sketch of the resulting structure is reproduced below:


    (screenshot: formatted JSON data)
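The following is a reconstruction of the response shape based only on the fields the spider below actually reads; the values are placeholders and unrelated fields are omitted:

# Reconstructed shape, not a verbatim capture of the API response.
response_shape = {
    'data': [
        {
            'abstract': '...',
            'chinese_tag': '...',
            'title': '...',
            'source': '...',
            'image_list': [{'url': '//.../...'}],  # protocol-relative; the spider prepends 'https:'
        },
        # ...more news entries
    ],
    'next': {
        'max_behot_time': 1575443511,  # placeholder value; passed along to fetch the next page
    },
}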
  • From the formatted data we can see that the response contains the full information for each news item, and that the very end carries a max_behot_time parameter. By feeding this value into the next request we can crawl the news feed in a loop.
  • The spider code is as follows:
# -*- coding: utf-8 -*-
import scrapy
import json
import time
import hashlib
import random
import requests
from datetime import datetime
from ..emailsend import EmailSend
from toutiao_two.items import ToutiaoTwoItem


class ToutiaoSpiderSpider(scrapy.Spider):
    name = 'toutiao_spider'
    allowed_domains = ['www.toutiao.com']
    headers = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        'Host': 'www.toutiao.com',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Connection': 'keep-alive',
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    }
    cookies = {'tt_webid': '6722356446824613389'}
    start_url = 'https://www.toutiao.com/api/pc/feed/?category=news_hot&utm_source=toutiao&widen=1&max_behot_time='

    max_behot_time = '0'
    D = {'hot_time': '0'}

    def get_as_cp(self):  # builds the as and cp parameters; algorithm adapted from Toutiao's obfuscated JS file home_4abea46.js
        zz = {}
        now = round(time.time())
        print(now)  # current timestamp in seconds
        e = hex(int(now)).upper()[2:]  # hex() converts an integer to a hexadecimal string
        print('e:', e)
        a = hashlib.md5()  # md5 hash of the timestamp; hexdigest() gives the hex string
        print('a:', a)
        a.update(str(int(now)).encode('utf-8'))
        i = a.hexdigest().upper()
        print('i:', i)
        if len(e) != 8:
            zz = {'as': '479BB4B7254C150',
                  'cp': '7E0AC8874BB0985'}
            return zz

        n = i[:5]
        a = i[-5:]
        r = ''
        s = ''
        for i in range(5):
            s = s + n[i] + e[i]
        for j in range(5):
            r = r + e[j + 3] + a[j]
        zz = {
            'as': 'A1' + s + e[-3:],
            'cp': e[0:3] + r + 'E1'
        }
        print('zz:', zz)
        return zz

    def start_requests(self):
        self.start_time = datetime.now()  # record the start time; used again in closed()
        ascp = self.get_as_cp()
        yield scrapy.FormRequest(url=self.start_url + self.max_behot_time + '&max_behot_time_tmp=' + self.max_behot_time + '&tadrequire=true&as=' + ascp[
                'as'] + '&cp=' + ascp['cp'],
                                 method='GET',
                                 headers=self.headers,
                                 cookies=self.cookies,
                                 callback=self.parse,
                                 )

    def parse(self, response):
        json_result = json.loads(response.text)
        # if json_result is None:
        #     print(self.D['hot_time'], '=====')
        #     time.sleep(20)
        #     yield scrapy.FormRequest(
        #         url=self.start_url + self.D['hot_time'] + '&max_behot_time_tmp=' + self.D['hot_time'] + '&tadrequire=true&as=' +
        #             'A115DD5DE72AC29' + '&cp=' + '5DD7FA9C02D90E1',
        #         method='GET',
        #         headers=self.headers,
        #         cookies=self.cookies,
        #         callback=self.parse,
        #     )
        item = ToutiaoTwoItem()

        infos = json_result['data']
        for info in infos:
            image_url_list = []
            item['abstract'] = info['abstract'] if info.get('abstract') else ''
            item['chinese_tag'] = info['chinese_tag'] if info.get('chinese_tag') else ''
            item['title'] = info['title'] if info.get('title') else ''
            item['source'] = info['source'] if info.get('source') else ''
            image_urls = info['image_list'] if info.get('image_list') else []
            for image_url in image_urls:
                url = 'https:' + image_url['url']
                image_url_list.append(url)
            item['image_url'] = image_url_list
            yield item
        time.sleep(random.randint(1, 4))
        print(self.D['hot_time'])
        if json_result.get('next'):
            next = json_result['next']
            if next.get('max_behot_time'):
                max_behot_time = str(json_result['next']['max_behot_time'])
                self.D.update({'hot_time': max_behot_time})
                ascp = self.get_as_cp()
                yield scrapy.FormRequest(
                    url=self.start_url + max_behot_time + '&max_behot_time_tmp=' + max_behot_time + '&tadrequire=true&as=' +
                        str(ascp['as']) + '&cp=' + str(ascp['cp']),
                    method='GET',
                    headers=self.headers,
                    cookies=self.cookies,
                    callback=self.parse,
                    )

    def closed(self, reason):
        # called automatically when the spider closes
        email = EmailSend()
        # total crawl duration
        use_time = datetime.now() - self.start_time

        close_time = 'toutiao spider started at {}; finished at {}; duration: {}'.format(self.start_time, datetime.now(), use_time)
        content = 'Spider close reason: {}'.format(reason)
        email.send_text_email('sender@qq.com', 'recipient@qq.com', close_time, content)
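The spider imports ToutiaoTwoItem from the project's items.py, which the original post does not show. Based on the fields assigned above, it would look roughly like this (a minimal sketch):

import scrapy


class ToutiaoTwoItem(scrapy.Item):
    abstract = scrapy.Field()
    chinese_tag = scrapy.Field()
    title = scrapy.Field()
    source = scrapy.Field()
    image_url = scrapy.Field()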
To hook up the IP proxy pool, add the following code to middlewares.py and register it in settings:
# Imports needed at the top of middlewares.py for the two middlewares below
import logging
import random

import requests


# IP proxy pool middleware
class ProxyMiddleware():
    def __init__(self, proxy_url):
        self.logger = logging.getLogger(__name__)
        self.proxy_url = proxy_url

    def get_random_proxy(self):
        # fetch one proxy ("ip:port") from the proxy-pool service as plain text
        try:
            response = requests.get(self.proxy_url)
            if response.status_code == 200:
                proxy = response.text
                return proxy
        except requests.ConnectionError:
            return False

    def process_request(self, request, spider):
        # only switch to a proxy when the request is being retried
        if request.meta.get('retry_times'):
            proxy = self.get_random_proxy()
            if proxy:
                uri = 'https://{proxy}'.format(proxy=proxy)
                self.logger.debug('Using proxy ' + proxy)
                request.meta['proxy'] = uri

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(
            proxy_url=settings.get('PROXY_URL')
        )
  • settings
# endpoint that returns a usable proxy IP
PROXY_URL = 'http://localhost:5555/random'
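The middleware above simply does a GET against PROXY_URL and uses the plain-text body as the proxy, so any service that returns one "ip:port" per request will work. A minimal sketch of such an endpoint (hypothetical; Flask is used purely for illustration and the proxy addresses are placeholders):

import random

from flask import Flask

app = Flask(__name__)

# Placeholder list; a real pool would hold freshly validated proxies.
PROXIES = ['1.2.3.4:8080', '5.6.7.8:3128']


@app.route('/random')
def random_proxy():
    # Return a single "ip:port" string as plain text, which is exactly what
    # ProxyMiddleware.get_random_proxy() expects.
    return random.choice(PROXIES)


if __name__ == '__main__':
    app.run(port=5555)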
Set up random User-Agent rotation, also in middlewares.py, and configure it in settings:
# Randomly rotate the User-Agent
class RandomUserAgent(object):
    """Randomly rotate user agents based on a list of predefined ones"""

    def __init__(self, agents):
        self.agents = agents

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.getlist('USER_AGENTS'))

    def process_request(self, request, spider):
        # pick a random UA from the USER_AGENTS list configured in settings
        request.headers.setdefault('User-Agent', random.choice(self.agents))
  • settings
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]

DOWNLOADER_MIDDLEWARES = {
   'toutiao_two.middlewares.RandomUserAgent': 543,
   'toutiao_two.middlewares.ProxyMiddleware': 550,
}
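To confirm the rotation is actually working, one option is to log the User-Agent that was sent for each response; this is a small snippet to drop into any spider callback, not part of the original project:

    def parse(self, response):
        # response.request carries the headers that were actually sent for this request
        self.logger.debug('UA used: %s', response.request.headers.get('User-Agent'))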
Implement image downloading in pipelines.py:
import copy

import pymongo
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


# Image download pipeline
class ToutaioImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        # name each file after the last segment of its URL
        url = request.url
        file_name = url.split('/')[-1] + '.jpg'
        return file_name

    def item_completed(self, results, item, info):
        # drop the item if none of its images were downloaded successfully
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image download failed')
        return item

    def get_media_requests(self, item, info):
        # schedule a download request for every image URL collected by the spider
        for image_url in item['image_url']:
            yield Request(image_url)
  • Configure the image storage path in settings (Scrapy's ImagesPipeline also requires the Pillow library to be installed):
IMAGES_STORE = './images'
  • Data storage is likewise implemented in pipelines.py:
# mongodb
class ToutiaoTwoMongoPipeline():
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        """
        Problem: large numbers of duplicate records appeared after inserting into the database.
        Fix: before inserting, deep-copy the item and operate on the copy, so the item
        cannot be mutated while the insert is still pending.
        In code: copy.deepcopy(item) in process_item() -> requires the copy module.
        """
        asynItem = copy.deepcopy(item)
        infos = {'chinese_tag': asynItem['chinese_tag'], 'title': asynItem['title'], 'source': asynItem['source'],
                 'image_url': asynItem['image_url']}
        self.db.toutiao.insert_one(infos)  # insert_one replaces the deprecated Collection.insert
        return item
  • Configure the database connection in settings:
ITEM_PIPELINES = {
    # run the image pipeline first so items whose images fail to download
    # are dropped before they reach MongoDB
    'toutiao_two.pipelines.ToutaioImagePipeline': 300,
    'toutiao_two.pipelines.ToutiaoTwoMongoPipeline': 301,
}

MONGO_URI = 'localhost'
MONGO_DB = 'scrapy_toutiao'
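A quick way to confirm items are landing in MongoDB after a run (a sketch, assuming the local connection configured above):

import pymongo

client = pymongo.MongoClient('localhost')
db = client['scrapy_toutiao']
print(db.toutiao.count_documents({}))       # number of stored news items
print(db.toutiao.find_one({}, {'_id': 0}))  # one sample document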
To implement email notification, create a new file emailsend.py in the same directory as settings.py with the following content:
#-*- coding: utf-8 -*-
'''
---------------

Description of this file

:author: Luopeng
:date created: 2019-12-04
:python version: 3.6

---------------
'''
import smtplib
from email.mime.text import MIMEText

import logging


class EmailSend(object):
    def __init__(self):
        self.logging = logging.getLogger('EmailSend')
        self.email_host = 'smtp.qq.com'
        self.email_port = 465
        self.email_pass = '*********'  # your own SMTP authorization code

    def send_text_email(self, from_addr, to_addrs, subject, content):
        message_text = MIMEText(content, 'plain', 'utf8')
        message_text['From'] = from_addr
        message_text['To'] = to_addrs
        message_text['Subject'] = subject

        try:
            # connect to the mail server while creating the client object
            client = smtplib.SMTP_SSL(host=self.email_host, port=self.email_port)
            login_result = client.login(from_addr, self.email_pass)
            if login_result and login_result[0] == 235:
                print('Login successful')
                client.sendmail(from_addr, to_addrs, message_text.as_string())
                print('Email sent successfully')
            else:
                print('Email sending failed:', login_result[0], login_result[1])
        except Exception as e:
            self.logging.error('Failed to connect to the mail server: {}'.format(e))

    def send_image_email(self):
        pass

    def send_word_email(self):
        pass

    def send_video_email(self):
        pass
  • For how these methods are called, see the closed() method of the spider above.
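A quick standalone test of the class, run from the project root (the addresses are placeholders, and email_pass in EmailSend must hold a valid QQ-mail authorization code first):

from toutiao_two.emailsend import EmailSend

email = EmailSend()
email.send_text_email('sender@qq.com', 'recipient@qq.com', 'test subject', 'test body')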
