使用 Scrapy 模拟登录并爬取 51CTO 博客文章

a51cto.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import FormRequest
from cto.items import CtospiderItem

class CtoSpider(scrapy.Spider):
    """Log in to 51cto.com and scrape a blog's article list.

    Flow: fetch the login page, submit the login form, then walk the
    paginated article list and yield one ``CtospiderItem`` per article.

    The credentials default to the ``'****'`` placeholders and can be
    overridden with spider arguments, e.g.
    ``scrapy crawl 51cto -a username=me -a password=secret -o cto.csv``.
    """

    name = '51cto'
    allowed_domains = ['51cto.com']

    def start_requests(self):
        """Yield the request for the login page, bound to cookie jar 1."""
        urls = ['http://home.51cto.com/index']
        for url in urls:
            yield scrapy.Request(url, callback=self.cto_login, meta={'cookiejar': 1})

    def cto_login(self, response):
        """Submit the login form found on the login page.

        Args:
            response: the login page response; its ``cookiejar`` meta key is
                forwarded so the session cookies stay in the same jar.

        Yields:
            A ``FormRequest`` whose response is handled by ``after_login``.
        """
        # Read the CSRF token from the hidden input.
        # NOTE(review): the input *name* matched here looks like a token
        # value rather than a stable field name -- confirm against the page.
        csrf = response.xpath("//input[@name='d1g0Smlta3o7DE0kJiU8OQM3WTMjXhtDJCp8JC0qADhPH2YbGT5dHw==']/@value").extract_first()
        # NOTE(review): advertising 'br' requires brotli support in the
        # Scrapy environment to decode such responses -- confirm.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://blog.51cto.com',
            'Content-Type': 'application/x-www-form-urlencoded',
        }
        formdata = {
            # Form values must be strings, so the 0 is quoted. This flag
            # controls "remember me" (auto-login for 10 days).
            'LoginForm[rememberMe]': '0',
            'LoginForm[username]': getattr(self, 'username', '****'),
            'LoginForm[password]': getattr(self, 'password', '****'),
        }
        # A None value in formdata makes from_response drop the field, which
        # would discard the token the form itself carries. Only override
        # _csrf when a value was actually extracted.
        if csrf is not None:
            formdata['_csrf'] = csrf
        # Logger call kept for debugging:
        # self.logger.info("获取csrf值为 %s" % csrf)
        yield FormRequest.from_response(
            response,
            url='https://blog.51cto.com/linuxliu?type=1',
            headers=headers,
            meta={'cookiejar': response.meta['cookiejar']},
            formdata=formdata,
            callback=self.after_login,
            dont_click=True,
        )

    def after_login(self, response):
        """Check the login result and scrape one page of the article list.

        This method is also the callback for every following page, so the
        login check is logged once per page.

        Args:
            response: a blog article-list page.

        Yields:
            ``CtospiderItem`` objects for each article that has a link, then
            a request for the next page when one exists.
        """
        # Text of the links used to detect the logged-in user name.
        home_page = response.xpath("//a[@class='con']/text()").extract()
        if 'wx5c789cd76c3af' in home_page:
            self.logger.info('我的博客')
        else:
            self.logger.error('登录失败')

        for entry in response.css("ul.artical-list li"):
            title_url = entry.css("a.tit::attr(href)").extract_first()
            if title_url is None:
                # List entries without an article link cannot form an item.
                continue
            title = entry.css("a.tit::text").extract_first(default='').strip()

            # A fresh item per article; reusing one object would make every
            # yielded item share the same data.
            item = CtospiderItem()
            item['title_url'] = title_url
            item['title'] = title
            # Markdown link syntax "[name](url)", so the exported name is
            # clickable when rendered.
            item['fullname'] = '[{}]({})'.format(title, title_url)
            yield item

        # Follow pagination.
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            yield scrapy.Request(
                # The href may be relative; resolve it against this page.
                response.urljoin(next_page),
                callback=self.after_login,
                # The cookiejar key is not inherited by new requests, so it
                # is passed on to keep using the logged-in session.
                meta={'cookiejar': response.meta.get('cookiejar')},
            )
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class CtospiderItem(scrapy.Item):
    """Item holding one article entry scraped from the blog article list."""

    # Article title text.
    title = scrapy.Field()
    # Article link, taken from the title anchor's href attribute.
    title_url = scrapy.Field()
    # Markdown-style link built as "[title](title_url)".
    fullname = scrapy.Field()

 

 

scrapy crawl 51cto -o cto.csv

你可能感兴趣的:(爬虫)