本文章代码仅供学习使用,如有侵权请联系作者删除,多谢。
主要通过一个 Scrapy 爬虫，理解如何登录网站，并使用登录后的 cookies 继续爬取。
登录的用户名和密码在代码中用 xxxx 表示。
# -*- coding: utf-8 -*-
import os

import scrapy
# `scrapy.spider` is the deprecated (and later removed) location of these
# classes; `scrapy.spiders` is the supported module path.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http.request import Request
from scrapy.linkextractors import LinkExtractor


def add_cookie(r, response=None):
    """Return a copy of request ``r`` that is bound to cookie jar ``1``.

    Used as the ``process_request`` hook of the crawl rules so that every
    followed link is sent with the same ``cookiejar`` meta value as the
    login request.

    Args:
        r: The request built by the crawl rule for an extracted link.
        response: The response the link was extracted from. Unused; it is
            accepted (with a default) so the hook works both when it is
            called with one argument and when it is called with
            ``(request, response)``.

    Returns:
        A new request equal to ``r`` except that ``meta['cookiejar']`` is 1.
    """
    # Build a fresh meta dict instead of mutating the incoming request.
    meta = dict(r.meta, cookiejar=1)
    return r.replace(meta=meta)


class ExampleSpider(CrawlSpider):
    """Crawl spider that logs in to ehire.51job.com, then follows links.

    Flow: fetch the login page, copy its hidden form inputs, post them with
    the (placeholder) credentials to the login endpoint, then hand the
    post-login response to the crawl rules.
    """

    name = "example1"

    rules = (
        # Links containing 'ResumeViewFolder' are parsed by
        # `parse_one_candidate` and also followed.
        Rule(
            LinkExtractor(allow='ResumeViewFolder'),
            process_request=add_cookie,
            callback='parse_one_candidate',
            follow=True,
        ),
        # Other links matching the site pattern are only followed.
        # NOTE(review): the dots in this pattern are unescaped regex
        # wildcards -- confirm whether a literal match was intended.
        Rule(
            LinkExtractor(allow='ehire.51job.com',),
            process_request=add_cookie,
            follow=True,
        ),
    )

    def start_requests(self):
        """Start the crawl by requesting the login page."""
        yield Request(
            'http://ehire.51job.com/MainLogin.aspx',
            callback=self.parse_login_page,
        )

    def parse_login_page(self, response):
        """Build and return the login form submission.

        Reads the ``value`` attribute of each hidden input on the login
        page (looked up by element id), adds the account fields and the
        fields derived from the hidden inputs, and posts everything to the
        login endpoint using cookie jar ``1``.

        Args:
            response: The downloaded login page.

        Returns:
            A one-element list holding the login ``FormRequest``; its
            response is handled by `login_in`.
        """
        hidden_input_ids = [
            'hidLangType',
            'hidAccessKey',
            'hidEhireGuid',
            'hidRetUrl',
            'fksc',
            '__VIEWSTATE',
        ]

        formdata = {}
        for key in hidden_input_ids:
            css_value = "#" + key + "::attr(value)"
            try:
                field_value = response.css(css_value).extract()[0]
            except Exception as e:
                # Deliberately best-effort: a missing input is reported and
                # submitted as an empty string rather than aborting login.
                print("cookies value err", css_value, e)
                formdata[key] = ''
            else:
                formdata[key] = field_value

        # Account fields; 'xxxx' are placeholders for the real credentials.
        formdata['txtMemberNameCN'] = "xxxx"
        formdata['txtUserNameCN'] = 'xxxx'
        formdata['txtPasswordCN'] = 'xxxx'
        formdata['ctmName'] = "xxxx"
        formdata['userName'] = 'xxxx'
        formdata['password'] = 'xxxx'
        formdata['checkCode'] = ''

        # Fields that repeat values taken from the hidden inputs above.
        formdata['oldAccessKey'] = formdata['hidAccessKey']
        formdata['langtype'] = formdata['hidLangType']
        formdata['isRememberMe'] = 'false'
        formdata['sc'] = formdata['fksc']
        formdata['ec'] = formdata['hidEhireGuid']
        formdata['returl'] = ''
        formdata['referrurl'] = ''

        return [
            scrapy.FormRequest(
                "https://ehirelogin.51job.com/Member/UserLogin.aspx?",
                formdata=formdata,
                meta={'cookiejar': 1},
                callback=self.login_in,
            )
        ]

    def login_in(self, response):
        """Handle the login response and start rule-based crawling.

        Saves the response body to disk for inspection, then yields the
        requests that the crawl rules extract from this response.

        Args:
            response: The response to the login form submission.

        Yields:
            Requests produced by the spider's crawl rules.
        """
        self.recored2file(response)
        # The login response arrives on a custom callback, so it is passed
        # to the rule machinery explicitly here.
        for request in self._requests_to_follow(response):
            yield request

    def recored2file(self, response):
        """Write the raw response body to ``./login.html``.

        Args:
            response: The response whose body is saved.
        """
        with open('./login.html', 'wb') as f:
            f.write(response.body)

    def parse_one_candidate(self, response):
        """Callback for 'ResumeViewFolder' pages; currently does nothing."""
        pass