The cookies argument of scrapy.Request() must be a dict, so a raw cookie string has to be converted first
def start_requests(self):
    cookie_str = "GUID=**702477247"
    cookie_dic = {}
    cookie_lst = cookie_str.split("; ")
    for it in cookie_lst:
        if "https://" in it:
            # the value itself contains "=", so only replace the first one
            it_cop = it.replace("=", "|", 1)
            k, v = it_cop.split("|")
            cookie_dic[k.strip()] = v.strip()
        else:
            k, v = it.split("=")
            cookie_dic[k.strip()] = v.strip()

    head = {
        "Referer": "https://user.17k.com/www/bookshelf/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }

    yield scrapy.Request(
        url=self.start_urls[0],
        headers=head,
        cookies=cookie_dic
    )
scrapy.FormRequest() can submit a POST request; after the login succeeds, Scrapy's cookie middleware keeps the session cookies for the follow-up requests
def start_requests(self):
    url = "https://passport.17k.com/ck/user/login"
    yield scrapy.FormRequest(
        url=url,
        formdata={
            "loginName": "***",
            "password": "***"
        },
        callback=self.parse
    )

def parse(self, response, **kwargs):
    yield scrapy.Request(
        url=self.start_urls[0],
        callback=self.get_shujia
    )

def get_shujia(self, resp):
    print(resp.text)
- DownloaderMiddleware
  The downloader middleware sits between the engine and the downloader. After the engine receives a request object it hands it to the downloader, and the downloader middlewares hook into that step.
- SpiderMiddleware
  The spider middleware sits between the engine and the spider.
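A spider middleware follows the same template shape; a minimal sketch (the class name mirrors the project naming, and only the two most commonly customised hooks are shown):

class XiaoshuoSpiderMiddleware:
    def process_spider_input(self, response, spider):
        # Called for each response before it reaches the spider callback.
        # Return None to continue, or raise an exception to abort.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the items/requests the spider callback yields.
        for i in result:
            yield i

The downloader-middleware template that Scrapy generates looks like this: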
class XiaoshuoDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        '''
        :param request: the current request
        :param response: the response to that request
        :param spider: the spider that sent the request
        :return: the response passed on to the next middleware
        '''
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
process_request() is called automatically just before the engine hands the request to the downloader.
param request: the current request
param spider: the spider that issued the request
Note that the return value of process_request is constrained (a sketch of the Response case follows this list):
- If it returns None, the request is not intercepted and continues through the remaining middlewares.
- If it returns a Request, the remaining middlewares are skipped and the request goes back to the engine, which hands it to the scheduler again.
- If it returns a Response, the remaining middlewares are skipped and the response goes to the engine, which passes it to the spider for data extraction.
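A quick sketch of the third case, returning a ready-made Response from process_request (the blocked-domain check is just an illustrative condition):

from scrapy.http import HtmlResponse

def process_request(self, request, spider):
    if "blocked.example.com" in request.url:   # hypothetical condition
        # Returning a Response skips the downloader and the remaining
        # middlewares; the engine passes it straight to the spider.
        return HtmlResponse(url=request.url, status=200,
                            body=b"<html></html>", request=request)
    return None   # otherwise continue down the middleware chain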
# The default USER_AGENT in the settings file:
# USER_AGENT = "xiaoshuo (+http://www.yourdomain.com)"
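The import below expects a USER_AGENT_LIST in settings.py; it is not generated by Scrapy, so it has to be added by hand, for example (the UA strings are only samples):

# settings.py
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
]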
# imports
from xiaoshuo.settings import USER_AGENT_LIST
from random import choice

def process_request(self, request, spider):
    # Pick a random User-Agent for every request that passes through
    # the downloader middleware.
    UA = choice(USER_AGENT_LIST)
    request.headers['User-Agent'] = UA
    return None
# Enable the middleware in the settings file
DOWNLOADER_MIDDLEWARES = {
    "xiaoshuo.middlewares.XiaoshuoDownloaderMiddleware": 543,
}
Proxy provider used here: https://www.kuaidaili.com/
Free proxies
# Configure a proxy list in the settings file
PROXY_IP_LIST = [
    "27.154.6.110:20714",
    "115.219.1.53:20231"
]
# imports
from xiaoshuo.settings import PROXY_IP_LIST
from random import choice

def process_request(self, request, spider):
    # Attach a randomly chosen proxy to every outgoing request
    ip = choice(PROXY_IP_LIST)
    request.meta['proxy'] = "https://" + ip
    return None
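Free proxies die often, so it can help to also implement process_exception in the same middleware; a sketch that swaps in another proxy and resubmits the request (returning a Request stops the exception chain, as noted above):

def process_exception(self, request, exception, spider):
    # The download failed (e.g. a dead proxy): pick another proxy and
    # hand the request back to the engine so it gets scheduled again.
    request.meta['proxy'] = "https://" + choice(PROXY_IP_LIST)
    request.dont_filter = True   # let it pass the duplicate filter again
    return request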
Setting up a tunnel proxy
# Kuaidaili's website provides the source code of a Scrapy middleware for this
from w3lib.http import basic_auth_header

def process_request(self, request, spider):
    # Route every request through the tunnel proxy
    proxy = "tps138.kdlapi.com:15919"
    request.meta['proxy'] = f"http://{proxy}"
    # username/password generated for the tunnel
    request.headers['Proxy-Authorization'] = basic_auth_header('user', 'pwd')
    request.headers["Connection"] = "close"
Logging in with selenium to get cookies
The highest-priority built-in downloader middleware sits at 100.
To step in before the built-in middlewares and effectively replace them, set our middleware's priority to 99:
DOWNLOADER_MIDDLEWARES = {
    "xiaoshuo.middlewares.XiaoshuoDownloaderMiddleware": 99,
}
def process_request(self, request, spider):
    # Every request comes through here.
    # Decide whether this request should be handled by selenium.
    # If so, drive selenium and return a response built from the page source.
    ...
Create a new request.py file to define the selenium request
from scrapy import Request

class SeleniumRequest(Request):
    # Custom selenium request that simply subclasses scrapy's Request;
    # it only serves as a marker the middleware can check with isinstance().
    pass
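In the spider, pages that need browser rendering are then requested with SeleniumRequest instead of scrapy.Request; a sketch (the import path assumes request.py lives inside the xiaoshuo package):

from xiaoshuo.request import SeleniumRequest

def start_requests(self):
    yield SeleniumRequest(
        url=self.start_urls[0],    # page that needs JS rendering
        callback=self.parse
    )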
In the middleware, check whether the request is a SeleniumRequest
# imports
from scrapy.http.response.html import HtmlResponse
from selenium.webdriver import Chrome
from xiaoshuo.request import SeleniumRequest   # the request.py created above

def process_request(self, request, spider):
    # Every request comes through here.
    # Only SeleniumRequest instances are rendered in the browser;
    # everything else continues down the normal download path.
    if isinstance(request, SeleniumRequest):
        self.web.get(request.url)
        page_source = self.web.page_source
        # Wrap the rendered page source in a response object
        return HtmlResponse(
            url=request.url,
            status=200,
            body=page_source,
            request=request,
            encoding='utf-8'
        )
    else:
        return None

def spider_opened(self, spider):
    # Start one Chrome instance when the spider opens
    self.web = Chrome()
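If the point of the login is the cookies themselves (as the section title says), the driver can hand them back after the login flow; a sketch of a helper (get_login_cookies is a made-up name) that converts them into the dict form scrapy.Request expects:

def get_login_cookies(self):
    # selenium returns a list of {'name': ..., 'value': ..., ...} dicts;
    # flatten it into the {name: value} dict scrapy.Request(cookies=...) wants
    return {c['name']: c['value'] for c in self.web.get_cookies()}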
Hooking custom steps into the middleware
# Modify the from_crawler method
@classmethod
def from_crawler(cls, crawler):
    # This method is used by Scrapy to create your spiders.
    s = cls()
    # connect the custom step to the moment it should run
    crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
    return s
'''
Signals Scrapy emits (the moments a handler can be connected to)
engine_started = object()
engine_stopped = object()
spider_opened = object()
spider_idle = object()
spider_closed = object()
spider_error = object()
request_scheduled = object()
request_dropped = object()
request_reached_downloader = object()
request_left_downloader = object()
response_received = object()
response_downloaded = object()
headers_received = object()
bytes_received = object()
item_scraped = object()
item_dropped = object()
item_error = object()
feed_slot_closed = object()
feed_exporter_closed = object()
'''
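For example, connecting spider_closed the same way lets the middleware shut down the Chrome instance it opened in spider_opened (a sketch):

# inside from_crawler:
#     crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)

def spider_closed(self, spider):
    self.web.quit()   # close the browser when the crawl finishes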