Scrapy 模拟登录:发送 POST 请求
方式1(通过formdata参数)
import scrapy
import re
class PostSpider(scrapy.Spider):
    """Log in to GitHub by POSTing a hand-built form to ``/session``.

    ``parse`` reads the hidden inputs of the login page and submits them
    together with the credentials; ``parse_login`` inspects the page that
    comes back from that POST.
    """

    name = 'post'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):
        """Build the login form from the login page and submit it.

        Args:
            response: The downloaded ``https://github.com/login`` page.

        Yields:
            A ``scrapy.FormRequest`` that POSTs the form to
            ``https://github.com/session`` and routes the reply to
            ``parse_login``.
        """
        authenticity_token = response.xpath(
            '//input[@name="authenticity_token"]/@value').get()
        utf8 = response.xpath('//input[@name="utf8"]/@value').get()
        commit = response.xpath('//input[@name="commit"]/@value').get()

        candidate_fields = dict(
            login='xxxxxxx',      # placeholder account name
            password='xxxxxx',    # placeholder password
            authenticity_token=authenticity_token,
            utf8=utf8,
            commit=commit,
        )
        # .get() returns None when an input is absent from the page, and
        # FormRequest cannot url-encode None, so drop the missing fields
        # instead of letting request construction raise.
        post_data = {
            field: value
            for field, value in candidate_fields.items()
            if value is not None
        }

        yield scrapy.FormRequest(
            url='https://github.com/session',
            formdata=post_data,
            callback=self.parse_login,
        )

    def parse_login(self, response):
        """Print every match of the marker pattern in the post-login page.

        Args:
            response: The page returned by the login POST.
        """
        # NOTE(review): r'xxxxx' is a placeholder; presumably it is meant to
        # be replaced by text that only appears when logged in - confirm.
        res = re.findall(r'xxxxx', response.text)
        print(res)
方式2(通过scrapy提供的scrapy.FormRequest.from_response)【推荐使用】
import scrapy
import re
class Post2Spider(scrapy.Spider):
    """Log in to GitHub with ``scrapy.FormRequest.from_response``.

    ``from_response`` pre-fills the request from the form found in the login
    page, so only the account name and password are supplied here.

    The credentials are class attributes so they can be overridden per run
    with spider arguments, e.g.
    ``scrapy crawl post2 -a login=NAME -a password=SECRET``.
    """

    name = 'post2'
    allowed_domains = ['github.com']
    # Request the login page over HTTPS directly rather than via http://.
    start_urls = ['https://github.com/login']

    # SECURITY NOTE(review): these look like real credentials committed to
    # source. They are kept only as backward-compatible defaults; rotate them
    # and pass the values with ``-a`` instead.
    login = 'leadingme163'
    password = 'lhw200915'

    def parse(self, response):
        """Submit the login form found in the login page.

        Args:
            response: The downloaded GitHub login page.

        Yields:
            A ``scrapy.FormRequest`` built from the page's form with the
            ``login`` and ``password`` fields filled in; the reply is routed
            to ``parse_login``.
        """
        yield scrapy.FormRequest.from_response(
            response=response,
            formdata={
                'login': self.login,
                'password': self.password,
            },
            callback=self.parse_login,
        )

    def parse_login(self, response):
        """Print every occurrence of the account name in the post-login page.

        Args:
            response: The page returned by the login POST.
        """
        # re.escape keeps the account name literal even if an overridden
        # value contains regex metacharacters.
        res = re.findall(re.escape(self.login), response.text)
        print(res)
cookies模拟登录
def start_requests(self):
    """Issue the first request with a pre-captured browser cookie string.

    The raw ``Cookie`` header text is split into a name -> value dict and
    attached to a request for ``self.start_urls[0]``.

    Yields:
        A ``scrapy.Request`` for the first start URL, carrying the parsed
        cookies, with ``self.parse`` as its callback.
    """
    # SECURITY NOTE(review): this is a captured session cookie string kept
    # verbatim from the original; such values should not live in source.
    raw_cookies = "INTVER=1; _uuid=963801D3-642F-80E0-A13F-25B9141DFC4F22352infoc; buvid3=0ABF0CD2-090B-45DA-B77E-D971B09D3E56155831infoc; sid=lrwt9x6b; DedeUserID=383674827; DedeUserID__ckMd5=bf62095a3dcf76d2; SESSDATA=b0f5c591%2C1585285180%2Ccf74a321; bili_jct=de9d42f026a33fd2e2e9c896b7b29c0d; CURRENT_FNVAL=16; rpdid=|(u)mmYkJmYk0J'ul)kYmJkRR; LIVE_BUVID=AUTO2715826932452584; im_notify_type_383674827=0; bp_t_offset_383674827=360630163123384638; CURRENT_QUALITY=32"

    cookies = {}
    for pair in raw_cookies.split(";"):
        # strip() removes the space that follows each ';', which would
        # otherwise become part of the cookie name; partition() splits on
        # the first '=' only, so values containing '=' stay intact.
        key, sep, value = pair.strip().partition("=")
        if sep and key:
            cookies[key] = value

    yield scrapy.Request(
        url=self.start_urls[0],
        callback=self.parse,
        cookies=cookies,
    )


# Backward-compatible alias for the original, misspelt method name.
start_resquests = start_requests
def parse(self, response):
    """Write the string form of the received response to standard output.

    Args:
        response: The object handed to this callback; it is only printed.
    """
    rendered = str(response)
    print(rendered)
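关于 session 登录与 CSRF(Cross-Site Request Forgery,跨站请求伪造):基于 session 的登录依赖 cookie 维持登录状态,因此站点通常会在表单中加入隐藏的 token(例如上文 GitHub 登录表单中的 authenticity_token)来防御 CSRF,模拟登录时必须一并提交该 token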
CSRF 攻击之所以能够成功,是因为黑客可以完全伪造用户的请求,该请求中所有的用户验证信息都是存在于 cookie 中,
因此黑客可以在不知道这些验证信息的情况下直接利用用户自己的 cookie 来通过安全验证
防范CSRF攻击
在业界目前防御 CSRF 攻击主要有三种策略:验证 HTTP Referer 字段;在请求地址中添加 token 并验证;
在 HTTP 头中自定义属性并验证