爬虫练手:使用scrapy模拟登录豆瓣(有验证码)并获取登录后信息

python版本:3.5

douban_self_info.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request,FormRequest
import urllib.request

class DbSpider(scrapy.Spider):
    """Log into douban.com (solving a captcha interactively when one is
    shown) and scrape the logged-in user's personal page.

    Flow: start_requests -> parse (submit login form) -> next (read title).
    """

    name = "db"
    allowed_domains = ["douban.com"]
    # start_urls is intentionally unused; start_requests drives the crawl.

    # Desktop Chrome UA so douban serves the regular login page.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"}

    def start_requests(self):
        """Fetch the login page; the cookiejar meta key keeps one session
        so the login cookies survive across the follow-up requests."""
        return [Request("https://accounts.douban.com/login",
                        callback=self.parse,
                        meta={"cookiejar": 1})]

    def parse(self, response):
        """Fill in and submit the login form found on the login page.

        If a captcha image is present, download it locally and ask the
        operator to type the solution on stdin before submitting.
        """
        # Fields shared by both branches; the captcha solution is added
        # only when a captcha is actually shown (removes the duplicated
        # dict the two branches used to carry).
        data = {
            "form_email": "xxxxx",
            "form_password": "xxxxx",
            "redir": "https://www.douban.com/people/82984134/",  # page to land on after login
        }

        captcha = response.xpath("//img[@id='captcha_image']/@src").extract()
        if captcha:
            print("此时有验证码")
            localpath = "E:/pictest/captchar.jpg"
            # NOTE(review): urlretrieve opens its own connection and does not
            # share the scrapy session cookies — confirm douban serves the
            # same captcha image to it.
            urllib.request.urlretrieve(captcha[0], filename=localpath)
            print("请查看本地验证码图片并输入验证码")
            # input() already returns str; no conversion needed.
            data["captcha-solution"] = input()
        else:
            print("此时没有验证码")

        print("登录中。。。。。。")
        # from_response picks up the hidden form fields (tokens) for us;
        # pass the cookiejar through so the login cookies stick.
        return [FormRequest.from_response(response,
                                          meta={"cookiejar": response.meta["cookiejar"]},
                                          headers=self.headers,
                                          formdata=data,
                                          callback=self.next,
                                          )]

    def next(self, response):
        """Post-login callback: print the title of the personal page."""
        print("此时已经登录完成并爬取了个人中心的数据")
        title = response.xpath("/html/head/title/text()").extract()
        # Guard the indexing: a failed login (or unexpected page) yields an
        # empty list and the old title[0] raised IndexError.
        if title:
            print(title[0])

你可能感兴趣的:(爬虫练手:使用scrapy模拟登录豆瓣(有验证码)并获取登录后信息)