Scrapy爬虫实战(四)——登录51job并使用cookies进行爬取

本文章代码仅供学习使用,如有侵权请联系作者删除,多谢。


本文主要通过一个Scrapy爬虫,说明如何登录网站,并使用登录后的cookies继续爬取。

登录所用的用户名和密码在代码中用xxxx表示。

# -*- coding: utf-8 -*-
import os

import scrapy

from scrapy.spider import CrawlSpider, Rule
from scrapy.http.request import Request
from scrapy.linkextractors import LinkExtractor

def add_cookie(r, response=None):
    """Return a copy of request *r* that is bound to cookie jar 1.

    Intended as a ``process_request`` hook for crawl rules, so that every
    extracted link is fetched with the same cookie jar as the login request.

    Args:
        r: The request to tag; it must expose ``meta`` and ``replace()``.
        response: The response the request was extracted from. Unused; it is
            accepted so the hook works whether it is called with
            ``(request)`` or with ``(request, response)``.

    Returns:
        A new request whose ``meta`` carries ``cookiejar=1`` in addition to
        the original entries. The ``meta`` of *r* itself is left untouched.
    """
    # Build a fresh dict rather than mutating the incoming request's meta.
    meta = dict(r.meta)
    meta['cookiejar'] = 1
    return r.replace(meta=meta)

class ExampleSpider(CrawlSpider):
    """Log in to the 51job eHire site, then crawl it with the login cookies.

    Flow: fetch the login page, copy its hidden form fields, POST them
    together with the credentials to the login endpoint, then follow the
    links matched by ``rules``. Every request is issued with
    ``meta['cookiejar'] = 1`` so they all share one cookie session.

    The credentials below are ``'xxxx'`` placeholders and must be replaced
    with real values before the spider can log in.
    """

    name = "example1"
    rules = (
        # Links containing 'ResumeViewFolder' are parsed as candidates and
        # also followed further.
        Rule(
            LinkExtractor(allow='ResumeViewFolder'),
            process_request=add_cookie,
            callback='parse_one_candidate',
            follow=True,
        ),
        # Any other link on the eHire host is only followed. The dots are
        # escaped because ``allow`` is a regular expression.
        Rule(
            LinkExtractor(allow=r'ehire\.51job\.com'),
            process_request=add_cookie,
            follow=True,
        ),
    )

    # Ids of the login-page elements whose ``value`` attribute is copied
    # into the login POST.
    login_page_fields = (
        'hidLangType',
        'hidAccessKey',
        'hidEhireGuid',
        'hidRetUrl',
        'fksc',
        '__VIEWSTATE',
    )

    def start_requests(self):
        """Start the crawl by requesting the login page."""
        # Use the same cookie jar as the login POST and the rule requests, so
        # cookies set by the login page are sent along with the login itself.
        yield Request('http://ehire.51job.com/MainLogin.aspx',
                      meta={'cookiejar': 1},
                      callback=self.parse_login_page)

    def parse_login_page(self, response):
        """Build and return the login POST from the login page.

        Args:
            response: The response for the login page.

        Returns:
            A one-element list holding the ``FormRequest`` that submits the
            login form; its response is handled by ``login_in``.
        """
        form_data = {}
        for field in self.login_page_fields:
            selector = "#" + field + "::attr(value)"
            value = response.css(selector).extract_first()
            if value is None:
                # A missing field is submitted as an empty string.
                self.logger.warning(
                    "login page has no value for %s (selector %s)",
                    field, selector)
                value = ''
            form_data[field] = value

        # Placeholder credentials -- replace with a real account.
        form_data['txtMemberNameCN'] = "xxxx"
        form_data['txtUserNameCN'] = 'xxxx'
        form_data['txtPasswordCN'] = 'xxxx'
        form_data['ctmName'] = "xxxx"
        form_data['userName'] = 'xxxx'
        form_data['password'] = 'xxxx'
        form_data['checkCode'] = ''
        # These fields repeat values already scraped from the login page.
        form_data['oldAccessKey'] = form_data['hidAccessKey']
        form_data['langtype'] = form_data['hidLangType']
        form_data['isRememberMe'] = 'false'
        form_data['sc'] = form_data['fksc']
        form_data['ec'] = form_data['hidEhireGuid']
        form_data['returl'] = ''
        form_data['referrurl'] = ''

        return [
            scrapy.FormRequest("https://ehirelogin.51job.com/Member/UserLogin.aspx?",
                               formdata=form_data,
                               meta={'cookiejar': 1},
                               callback=self.login_in)
        ]

    def login_in(self, response):
        """Handle the login response and continue crawling from it.

        Saves the response body to disk for inspection, then yields the
        requests produced by ``_requests_to_follow`` for this response.
        """
        self.recored2file(response)
        # NOTE: _requests_to_follow is a private CrawlSpider method.
        for request in self._requests_to_follow(response):
            yield request

    def recored2file(self, response):
        """Write the raw body of *response* to ``./login.html``."""
        with open('./login.html', 'wb') as f:
            f.write(response.body)

    def parse_one_candidate(self, response):
        """Callback for 'ResumeViewFolder' pages; not implemented yet."""
        pass

你可能感兴趣的:(Python)