Scrapy: generic random download delay, IP proxies, and User-Agent rotation

  • Directory structure

[Image: project directory structure]

  • main.py
# -*- coding:utf-8 -*-

from scrapy import cmdline

# equivalent to running `scrapy crawl test` from the project root;
# lets the crawl be started (and debugged) from an IDE
cmdline.execute('scrapy crawl test'.split())

  • settings.py

# -*- coding: utf-8 -*-

BOT_NAME = 'mytest'

SPIDER_MODULES = ['mytest.spiders']
NEWSPIDER_MODULE = 'mytest.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'mytest (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Upper bound (in seconds) for the random download delay,
# read by RandomDelayMiddleware below
RANDOM_DELAY = 2

DOWNLOADER_MIDDLEWARES = {
   # 'mytest.middlewares.MytestDownloaderMiddleware': 543,
   # distinct priorities make the execution order explicit
   'mytest.middlewares.RandomDelayMiddleware': 100,
   'mytest.middlewares.UserAgentMiddleware': 110,
   'mytest.middlewares.ProxyMiddleware': 120,
}
# Proxy IP pool (free proxies expire quickly; replace with live ones)
PROXIES = ['http://47.94.230.42:9999', 'http://117.87.177.58:9000',
           'http://125.73.220.18:49128', 'http://117.191.11.72:8080']

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Enabled so the MySQL demo pipeline below actually receives items
ITEM_PIPELINES = {
   'mytest.pipelines.MytestPipeline': 300,
}
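Scrapy also ships its own delay randomization, so the custom RandomDelayMiddleware shown later is optional; a rough built-in equivalent using nothing but settings would be:

DOWNLOAD_DELAY = 2
# on by default: each wait is drawn from 0.5 * to 1.5 * DOWNLOAD_DELAY
RANDOMIZE_DOWNLOAD_DELAY = True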

  • pipelines.py

# -*- coding: utf-8 -*-

import pymysql

class MytestPipeline(object):
    def open_spider(self, spider):
        # open one MySQL connection for the lifetime of the spider
        self.conn = pymysql.connect(
            host='192.168.186.128',
            user='root',
            password='root',
            db='python',
            charset='utf8'
        )


    def process_item(self, item, spider):
        cursor = self.conn.cursor()
        # demo query: execute() returns the number of matched rows
        count = cursor.execute('select title,comment from goods where comment<=5')
        print("Fetched %d rows:" % count)

        for i in range(count):
            # fetch and print one row of the result set
            result = cursor.fetchone()
            print(result)

        # close the cursor; the connection stays open until close_spider
        cursor.close()
        return item

    def close_spider(self,spider):
        self.conn.close()
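The SELECT in process_item assumes a goods table with title and comment columns already exists in the python database; a minimal setup sketch (the column types are guesses, not taken from the post) could be:

import pymysql

conn = pymysql.connect(host='192.168.186.128', user='root',
                       password='root', db='python', charset='utf8')
cursor = conn.cursor()
# hypothetical schema matching the pipeline's query
cursor.execute(
    'CREATE TABLE IF NOT EXISTS goods ('
    'id INT PRIMARY KEY AUTO_INCREMENT, '
    'title VARCHAR(255), '
    'comment INT)'
)
conn.commit()
cursor.close()
conn.close()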

  • middlewares.py

# -*- coding: utf-8 -*-

from fake_useragent import UserAgent
import time, random, logging


class UserAgentMiddleware(object):
    def __init__(self):
        # build the fake_useragent database once, not on every request
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # only set the header if the request does not already carry one
        request.headers.setdefault(b'User-Agent', self.ua.random)


class RandomDelayMiddleware(object):
    def __init__(self, delay):
        self.delay = delay

    @classmethod
    def from_crawler(cls, crawler):
        # read RANDOM_DELAY straight from the project settings (default 10)
        delay = crawler.settings.get("RANDOM_DELAY", 10)
        if not isinstance(delay, int):
            raise ValueError("RANDOM_DELAY must be an int")
        return cls(delay)

    def process_request(self, request, spider):
        # sleep 0..RANDOM_DELAY seconds; note that time.sleep blocks the
        # Twisted reactor, so this throttles all concurrent requests
        delay = random.randint(0, self.delay)
        logging.debug("### random delay: %s s ###" % delay)
        time.sleep(delay)


class ProxyMiddleware(object):
    '''Attach a randomly chosen proxy from the PROXIES setting to each request.'''

    def __init__(self, ip):
        self.ip = ip

    @classmethod
    def from_crawler(cls, crawler):
        return cls(ip=crawler.settings.get('PROXIES'))

    def process_request(self, request, spider):
        # skip quietly when the PROXIES setting is missing or empty,
        # instead of swallowing every exception with a bare except
        if self.ip:
            request.meta['proxy'] = random.choice(self.ip)
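Free proxies like the ones in PROXIES die quickly, so it helps to check them by hand before a crawl; a small standalone sketch using the requests library (not part of this project) against httpbin, the same endpoint the test spider below uses:

import requests

PROXIES = ['http://47.94.230.42:9999', 'http://117.87.177.58:9000']

for proxy in PROXIES:
    try:
        # httpbin echoes the origin IP, so a live proxy reports its own address
        resp = requests.get('http://httpbin.org/get',
                            proxies={'http': proxy}, timeout=5)
        print(proxy, '->', resp.json()['origin'])
    except requests.RequestException as exc:
        print(proxy, 'failed:', exc)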

  • test.py

# -*- coding: utf-8 -*-
import scrapy
from mytest.items import MytestItem

class TestSpider(scrapy.Spider):
    name = 'test'
    # allowed_domains = ['test.com']
    # start_urls = ['https://www.baidu.com/']

    def start_requests(self):
        '''Hit httpbin to verify that the proxy and User-Agent are applied.'''
        url = 'http://httpbin.org/get'

        # httpbin.org/get echoes the request headers and origin IP;
        # raise the range to send more than one test request
        for i in range(1):
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        item = MytestItem()
        item['name'] = response.text
        # print(response.text)
        yield item

        # print(response.request.headers)
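The spider imports MytestItem from mytest/items.py, which the post does not list; a minimal definition consistent with the single name field used above would be:

# -*- coding: utf-8 -*-
import scrapy

class MytestItem(scrapy.Item):
    # holds the raw response body yielded by TestSpider.parse
    name = scrapy.Field()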
