User-Agent
Cookies
IP
Selenium
1.User-Agent
settings.py文件中添加几个UA
# Pool of User-Agent strings; the User-Agent middleware picks one at random
# for each request.
USERAGENT = [
    'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
]
middlewares.py中设置User-Agent中间件
从settings.py文件中取值有两种方式
方法一:
class UserAgentDownloadMiddlerware(object):
    """Downloader middleware that sets a random User-Agent on each request.

    The candidate strings are read once from the ``USERAGENT`` setting when
    the middleware is built through :meth:`from_crawler`.
    """

    def __init__(self, User_Agents):
        # A missing setting arrives as None; normalise it to an empty pool so
        # process_request only has to test for emptiness.
        self.User_Agents = User_Agents or []

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's ``USERAGENT`` setting."""
        User_Agents = crawler.settings['USERAGENT']
        return cls(User_Agents)

    def process_request(self, request, spider):
        """Set a randomly chosen User-Agent header on ``request``.

        Every request passes through this method before it is handed to the
        downloader.

        :param request: the outgoing request; its ``headers`` are updated in place
        :param spider: the spider that issued the request (unused)
        :return: None, so the request continues through the middleware chain
        """
        import random

        if not self.User_Agents:
            # Nothing to choose from: leave the request's headers untouched.
            return None
        random_ua = random.choice(self.User_Agents)
        if random_ua:
            request.headers['User-Agent'] = random_ua
        return None
方法二:
class UserAgentDownloadMiddlerware(object):
    """Downloader middleware that sets a random User-Agent on each request.

    The ``USERAGENT`` pool is looked up from ``spider.settings`` on every
    request, so no constructor or ``from_crawler`` hook is needed.
    """

    def process_request(self, request, spider):
        """Set a randomly chosen User-Agent header on ``request``.

        :param request: the outgoing request; its ``headers`` are updated in place
        :param spider: the running spider; its settings supply ``USERAGENT``
        :return: None, so the request continues through the middleware chain
        """
        import random

        User_Agent = spider.settings.get('USERAGENT')
        if not User_Agent:
            # Setting missing or empty: random.choice would raise, so skip.
            return None
        random_ua = random.choice(User_Agent)
        if random_ua:
            request.headers['User-Agent'] = random_ua
        return None
方法三:
class UserAgentDownloadMiddlerware(object):
    """Downloader middleware that sets a random User-Agent from ``fake_useragent``."""

    # Generator object, created lazily on the first request and then reused.
    _user_agent = None

    def process_request(self, request, spider):
        """Set a random User-Agent header on ``request``.

        :param request: the outgoing request; its ``headers`` are updated in place
        :param spider: the spider that issued the request (unused)
        :return: None, so the request continues through the middleware chain
        """
        if self._user_agent is None:
            # Imported here so the module still loads when the optional
            # fake_useragent package is not installed.
            from fake_useragent import UserAgent

            # Build the generator once instead of once per request.
            self._user_agent = UserAgent()

        random_ua = self._user_agent.random
        if random_ua:
            print('经过了下载中间件', random_ua)
            request.headers['User-Agent'] = random_ua
        return None
2.IP代理中间件
在settings.py文件中模拟一个代理池
# Simulated proxy pool. 'ip' is the proxy's host:port; 'pwd' holds the
# 'user:password' credentials, or None when the proxy needs none.
PROXIES = [
    {'ip': '127.0.0.1:6379', 'pwd': 'zwz:1234'},  # with username and password
    {'ip': '127.0.0.1:6372', 'pwd': None},  # without username and password
    {'ip': '127.0.0.1:6373', 'pwd': None},
    {'ip': '127.0.0.1:6370', 'pwd': None},
]
middlewares.py中设置代理中间件
class ProxyDownloadMiddlerware(object):
    """Downloader middleware that routes each request through a random proxy.

    Proxies come from the ``PROXIES`` setting: a list of dicts with an
    ``'ip'`` entry (``host:port``) and a ``'pwd'`` entry (``user:password``
    credentials, or None when the proxy needs none).
    """

    def process_request(self, request, spider):
        """Pick a random proxy and attach it (plus credentials) to ``request``.

        :param request: the outgoing request; ``meta['proxy']`` is set and,
            when the proxy has credentials, so is the ``Proxy-Authorization``
            header
        :param spider: the running spider; its settings supply ``PROXIES``
        :return: None, so the request continues through the middleware chain
        """
        import base64
        import random

        proxies = spider.settings.get('PROXIES')
        if not proxies:
            # No pool configured: leave the request untouched.
            return None

        proxy_rm = random.choice(proxies)

        credentials = proxy_rm.get('pwd')
        if credentials:
            # Basic proxy authentication: base64-encode "user:password".
            base64_pwd = base64.b64encode(credentials.encode('utf-8')).decode('utf-8')
            request.headers['Proxy-Authorization'] = 'Basic ' + base64_pwd

        proxy_url = proxy_rm['ip']
        if '://' not in proxy_url:
            # Scrapy documents meta['proxy'] as a full URL such as
            # http://host:port, so add a scheme to bare host:port entries.
            proxy_url = 'http://' + proxy_url
        request.meta['proxy'] = proxy_url
        return None
3.Cookie中间件
在settings.py文件中模拟一个Cookie池
# Simulated cookie pool: each entry is one complete cookie dict. The values
# are placeholders.
COOKIES = [
    {'cookie1': 'xxxx'},
    {'cookie1': 'xxxx'},
    {'cookie1': 'xxxx'},
    {'cookie1': 'xxxx'},
    {'cookie1': 'xxxx'},
]
middlewares.py中设置cookie中间件
import random
class RandomCookiesMiddleware(object):
    """Downloader middleware that attaches a random cookie set to each request."""

    def process_request(self, request, spider):
        """Replace ``request.cookies`` with a random entry of ``COOKIES``.

        :param request: the outgoing request; its ``cookies`` attribute is
            overwritten when a non-empty entry is picked
        :param spider: the running spider; its settings supply ``COOKIES``
        :return: None, so the request continues through the middleware chain
        """
        cookies = spider.settings.get('COOKIES')
        if not cookies:
            # Setting missing or empty: random.choice would raise, so skip.
            return None
        # Pick one cookie set at random.
        cookie = random.choice(cookies)
        if cookie:
            request.cookies = cookie
        return None
4.Selenium获取动态的网页爬取
在需要爬取动态页面的spider文件里添加浏览器驱动:网页有动态加载的,也有静态的,所以驱动只放在需要爬取动态页面的spider里
import scrapy
from selenium import webdriver
class TestSpider(scrapy.Spider):
    """Example spider whose pages are fetched through a Selenium driver.

    The spider owns the browser driver as ``self.driver``.
    """

    name = 'test'
    allowed_domains = ['baidu.com']
    start_urls = ['http://www.baidu.com/']

    def __init__(self, *args, **kwargs):
        super(TestSpider, self).__init__(*args, **kwargs)
        # Create the browser driver per spider instance rather than in the
        # class body, so merely importing this module does not launch Firefox.
        # NOTE(review): executable_path is expected to point at the
        # geckodriver binary, and newer Selenium releases replace this
        # argument with a Service object -- confirm against the installed
        # Selenium version.
        self.driver = webdriver.Firefox(
            executable_path='/home/zwz/Desktop/浏览器驱动/geckodriver/'
        )
        # Abort page loads that take longer than 10 seconds.
        self.driver.set_page_load_timeout(10)

    def parse(self, response):
        """Print the response status and the headers of the originating request."""
        print(response.status, response.request.headers)
middlewares.py中设置Selenium获取动态的网页爬取
#scrapy并不支持动态加载网页的爬取
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
class SeleniumDownloadMiddlerWare(object):
    """Downloader middleware that fetches pages with the spider's Selenium driver.

    Only requests from the spider named ``'test'`` are handled; the driver is
    read from ``spider.driver``. Requests from any other spider pass through
    untouched.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and register its shutdown handler."""
        s = cls()
        # Listen for the spider_closed signal so the browser is shut down
        # together with the spider.
        crawler.signals.connect(s.close, signal=signals.spider_closed)
        return s

    def close(self, spider):
        """Shut down the spider's browser when the spider closes."""
        driver = getattr(spider, 'driver', None)
        if driver is None:
            # The middleware may be enabled for spiders that own no driver.
            return None
        import time

        time.sleep(5)
        # quit() ends the whole WebDriver session; close() would only close
        # the current window and can leave the driver process running.
        driver.quit()
        return None

    def process_request(self, request, spider):
        """Load ``request.url`` in the browser and return the rendered page.

        :param request: the outgoing request
        :param spider: the running spider; must expose ``driver`` when its
            name is ``'test'``
        :return: an ``HtmlResponse`` with status 200 and the page source, an
            ``HtmlResponse`` with status 408 and an empty body when the page
            load times out, or None when the request is not handled here
        """
        if spider.name != 'test':
            return None

        url = request.url
        if not url:
            return None

        try:
            spider.driver.get(url)
            pageSource = spider.driver.page_source
        except TimeoutException:
            print('请求超时', url)
            return HtmlResponse(
                url=url,
                status=408,
                body=b'',
                request=request,
            )

        if not pageSource:
            # Nothing rendered: returning None hands the request on to the
            # rest of the download chain.
            return None

        return HtmlResponse(
            url=url,
            status=200,
            body=pageSource.encode('utf-8'),
            # The body is encoded as UTF-8 above, so declare it explicitly
            # rather than letting the response guess from the page's own
            # charset declaration.
            encoding='utf-8',
            request=request,
        )
最后别忘记设置和激活下载中间件
在settings.py文件中
# Enable the custom downloader middlewares; the number is each one's order
# value.
DOWNLOADER_MIDDLEWARES = {
    # 'downloadmiddlerware.middlewares.DownloadmiddlerwareDownloaderMiddleware': 543,
    'downloadmiddlerware.middlewares.UserAgentDownloadMiddlerware': 543,
    'downloadmiddlerware.middlewares.ProxyDownloadMiddlerware': 544,
    'downloadmiddlerware.middlewares.RandomCookiesMiddleware': 545,
    'downloadmiddlerware.middlewares.SeleniumDownloadMiddlerWare': 546,
}
关于爬虫的断点爬取:
scrapy crawl 爬虫名称 -s JOBDIR=crawls/爬虫名称
requests.queue : 保存的请求的任务队列
requests.seen : 保存的是指纹
spider.state : 爬虫运行的状态
scrapy settings.py设置文件(相关参数)
项目名称
# Project name.
BOT_NAME = 'downloadmiddlerware'
# Modules in which Scrapy looks for spiders.
SPIDER_MODULES = ['downloadmiddlerware.spiders']
# Module in which newly generated spider files are created.
NEWSPIDER_MODULE = 'downloadmiddlerware.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Can also be set to a browser's User-Agent string to imitate browser requests.
#USER_AGENT = 'downloadmiddlerware (+http://www.yourdomain.com)'
# Obey robots.txt rules
# Whether to honour robots.txt. The settings.py generated by
# `scrapy startproject` sets this to True; it is switched off here.
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# This is the downloader's overall concurrency limit.
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# Maximum number of concurrent requests per website (default: 8)
CONCURRENT_REQUESTS_PER_DOMAIN = 16
# Maximum number of concurrent requests per IP (default: 0)
# When this is non-zero:
#   1. CONCURRENT_REQUESTS_PER_DOMAIN has no effect; the concurrency limit
#      applies per IP instead of per website.
#   2. DOWNLOAD_DELAY is likewise applied per IP instead of per website.
CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# Whether cookies are sent with requests (default: True)
# NOTE(review): with cookies disabled here, cookies assigned by a custom
# cookie middleware are presumably not sent either -- confirm before relying
# on one.
COOKIES_ENABLED = False
# COOKIES_DEBUG: trace cookies (default: False)
COOKIES_DEBUG = True
# Logging settings
LOG_FILE = 'xxx.log'
# Must be exactly one level name: CRITICAL, ERROR, WARNING, INFO or DEBUG.
LOG_LEVEL = 'DEBUG'
# Disable Telnet Console (enabled by default)
# The telnet console is a terminal extension.
TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# Default headers sent with every request (do not put cookies here).
DEFAULT_REQUEST_HEADERS = {
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Configure and activate spider middlewares.
#SPIDER_MIDDLEWARES = {
#    'downloadmiddlerware.middlewares.DownloadmiddlerwareSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# Configure and activate downloader middlewares (the number is the priority).
DOWNLOADER_MIDDLEWARES = {
    # 'downloadmiddlerware.middlewares.DownloadmiddlerwareDownloaderMiddleware': 543,
    # 'downloadmiddlerware.middlewares.UserAgentDownloadMiddlerware':543,
    'downloadmiddlerware.middlewares.SeleniumDownloadMiddlerWare': 543,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# Extension configuration.
EXTENSIONS = {
    'scrapy.extensions.telnet.TelnetConsole': None,
}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Configure and activate item pipelines (the number is the priority).
ITEM_PIPELINES = {
    'downloadmiddlerware.pipelines.DownloadmiddlerwarePipeline': 300,
}
# AutoThrottle extension: makes the delay between one request and the next
# vary instead of being fixed.
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay (default: 5 seconds)
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# Debug mode (default: False, i.e. off)
AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
# Cache expiration time in seconds; 0 (the default) means entries never expire.
HTTPCACHE_EXPIRATION_SECS = 0
# Directory in which the cache is stored.
HTTPCACHE_DIR = 'httpcache'
# HTTP status codes whose responses are ignored by the cache.
HTTPCACHE_IGNORE_HTTP_CODES = []
# Class that implements the cache storage.
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'