python scrapy 模拟登录(使用selenium自动登录)

常用

1、scrapy startproject si

放chromedriver.exe到 si/si文件夹里

2、vi settings.py
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
ROBOTSTXT_OBEY = False
COOKIES_ENABLED = True
DOWNLOADER_MIDDLEWARES = {
'si.middlewares.LoginscrapyDownloaderMiddleware': 543,
}
3、vi middlewares.py
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
import os,sys
from PIL import Image
import time
import tesserocr
import requests
class LoginscrapyDownloaderMiddleware(object):

@classmethod
def from_crawler(cls, crawler):
    """Scrapy factory hook: build the middleware and subscribe to spider_opened."""
    middleware = cls()
    crawler.signals.connect(middleware.spider_opened,
                            signal=signals.spider_opened)
    return middleware

def yzm(self, path):
    """OCR the captcha image at *path* and return its first 5 characters.

    The image is upscaled, converted to greyscale and binarised with a
    fixed threshold before being handed to tesserocr.
    """
    ima = Image.open(path)
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
    # filter and has existed since Pillow 2.7, so this stays compatible.
    image = ima.resize((480, 200), Image.LANCZOS)
    image = image.convert('L')
    # Binarisation threshold: pixels darker than this map to 0 (black),
    # the rest to 1 (white) -- mapping light pixels to 1 keeps the
    # background white instead of turning the whole image black.
    threshold = 160
    table = [0 if i < threshold else 1 for i in range(256)]
    image = image.point(table, "1")
    return tesserocr.image_to_text(image)[:5]  # first 5 chars of the OCR result

def dl(self, browser, username, password):
    """Fill in the login form, solve the captcha, and submit.

    Returns the error text displayed by the page when login failed,
    or None on (apparent) success.
    """
    un = browser.find_element_by_id("username")
    un.clear()
    un.send_keys(username)
    pw = browser.find_element_by_id("password")
    pw.clear()
    pw.send_keys(password)
    # Remove screenshots left over from a previous attempt.
    for stale in ("Login_page.png", "Verification.png"):
        if os.path.exists(stale):
            os.remove(stale)
    browser.save_screenshot('Login_page.png')
    photo = Image.open('Login_page.png')
    # Hard-coded captcha region; assumes the 1200x900 window size set by
    # the caller -- TODO confirm if the page layout ever changes.
    box = (980, 378, 1076, 417)
    photo.crop(box).save('Verification.png')
    yz = browser.find_element_by_id("jcaptcha")
    captcha_text = self.yzm("Verification.png")
    print(captcha_text)
    yz.send_keys(captcha_text)
    lg = browser.find_element_by_id("login-btn")
    lg.click()
    time.sleep(1)
    try:
        err = browser.find_element_by_class_name("err-vaild")
        return err.text.strip()
    except Exception:
        # No error element found: treat the login as successful.
        # (The original bare ``except:`` would also have swallowed
        # KeyboardInterrupt/SystemExit.)
        return None

def process_request(self, request, spider):
    """Handle requests for the ``login`` spider.

    The login URL is driven through a real Chrome browser (so the
    captcha can be solved from a screenshot); every other URL is fetched
    with ``requests`` reusing the cookies captured after login.
    Returns an HtmlResponse so Scrapy skips its own downloader, or None
    to let requests from other spiders pass through untouched.
    """
    if spider.name != "login":
        return None
    if "login" in request.url:
        chromedriver = "chromedriver.exe"
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        # NOTE(review): ``chrome_options=`` is deprecated in Selenium 4
        # (use ``options=``); kept here to match the Selenium 3 style
        # API (find_element_by_*) used throughout this file.
        spider.driver = webdriver.Chrome(chromedriver, chrome_options=options)
        # The captcha crop box in dl() depends on this window size.
        spider.driver.set_window_size(1200, 900)
        spider.driver.get(request.url)
        username = "yourname"
        password = "yourpass"
        # Captcha OCR is unreliable: retry up to 10 more times.
        err = self.dl(spider.driver, username, password)
        attempts = 0
        while err is not None and attempts < 10:
            attempts += 1
            err = self.dl(spider.driver, username, password)
        if err is not None:
            print("无法登录,用户名或密码或验证码错误!需要重新执行程序.")
            sys.exit()
        time.sleep(2)
        # Keep the session cookies for subsequent non-login requests.
        spider.cookies = spider.driver.get_cookies()
        #spider.driver.close()
        return HtmlResponse(url=spider.driver.current_url,  # post-login URL
                            body=spider.driver.page_source,  # page HTML
                            encoding='utf-8')
    # Non-login URL: replay the captured cookies through requests.
    req = requests.session()  # session
    for cookie in spider.cookies:
        req.cookies.set(cookie['name'], cookie["value"])
    req.headers.clear()  # drop requests' default headers
    newpage = req.get(request.url)
    return HtmlResponse(url=request.url,       # current URL
                        body=newpage.text,     # page source
                        encoding="utf-8")      # hand the page back to Scrapy

4、vi si/si/main.py
from scrapy.cmdline import execute
import sys
import os

# Make the project package importable, then launch the crawler exactly as
# if "scrapy crawl login" had been typed on the command line.
# Bug fixes vs. the original: ``file`` is an undefined name (must be
# ``__file__``), and argv[0] was misspelled 'scarpy'.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(['scrapy', 'crawl', 'login'])
5、vi si/si/spiders/login.py

# -*- coding: utf-8 -*-

import scrapy

class LoginSpider(scrapy.Spider):
    """Spider whose requests are intercepted by the Selenium login middleware."""
    name = 'login'
    allowed_domains = ['chinanetcenter']
    start_urls = ['https://portal.chinanetcenter.com/cas/login?service=https%3A%2F%2Fsi.chinanetcenter.com%2Fr_sec_login&appcode=serviceinsight'
    ,'https://si.chinanetcenter.com/']

    def __init__(self):
        super().__init__()
        # Bug fix: the original assigned to *local* variables, so the
        # ``driver``/``cookies`` attributes that the downloader middleware
        # reads and writes were never created on the instance.
        self.driver = None
        self.cookies = None

    def parse(self, response):
        # Placeholder callback: just show which page was fetched.
        print(response.url)

你可能感兴趣的:(python scrapy 模拟登录(使用selenium自动登录))