利用Puppeteer实现验证码网站登录

Puppeteer

puppeteer是由Google官方推出的一个node库,可以启动Chromium浏览器模拟人为操作,类似于PhantomJS。这为爬虫和自动化测试提供了便利。

流程

登录流程很简单:启动puppeteer打开目标网站,获取验证码,再输入账号、密码和验证码并提交登录。

利用Puppeteer实现验证码网站登录_第1张图片

架构

由于puppeteer是基于node的库,可以利用node做一个http服务。

1. 调用方发起http请求,puppeteer创建page实例打开目标网站,返回验证码等信息

2. 新生成的page会维护在一个对象池中,并由一个watcher监控page是否打开超时

3. 调用方传递登录账号验证码等信息发起登录请求

4. 从puppeteer对象池中拿出上一次的page实例,执行登录

利用Puppeteer实现验证码网站登录_第2张图片

示例

模拟登录浙江税务局(示例为简化实现:只使用单个全局PageObject实例,未包含上文提到的对象池和watcher)

puppeteer对象:

const puppeteer = require('puppeteer')

/**
 * Owns one Puppeteer browser and one page opened in that browser.
 *
 * `init()` launches the browser and opens the page; `close()` shuts the
 * browser down and clears both references.
 */
class PageObject {
    /**
     * @param {object} [options]
     * @param {boolean} [options.headless] - Forwarded to `puppeteer.launch`.
     * @param {string[]} [options.args] - Extra launch arguments; defaults to `[]`.
     * @param {string} [options.executablePath] - Forwarded to `puppeteer.launch`.
     */
    constructor(options = {}) {
        this.headless = options.headless;
        this.args = options.args || [];
        this.executablePath = options.executablePath;

        this.browser = null;
        this.page = null;
    }

    /**
     * Launches a browser and opens a new page in it.
     *
     * Any browser left over from an earlier `init()` is closed first, so
     * calling `init()` repeatedly does not leak browser processes.
     *
     * @returns {Promise<void>}
     */
    async init() {
        await this.close();

        this.browser = await puppeteer.launch({
            headless: this.headless,
            executablePath: this.executablePath,
            ignoreHTTPSErrors: true,
            args: this.args
        });

        try {
            this.page = await this.browser.newPage();
        } catch (err) {
            // Do not keep a browser around that has no usable page.
            await this.close();
            throw err;
        }
    }

    /**
     * Closes the browser, if one is open, and resets `browser` and `page`
     * to `null`. Safe to call before `init()` and safe to call twice.
     *
     * @returns {Promise<void>}
     */
    async close() {
        const browser = this.browser;

        // Clear the references first so they are dropped even if close() rejects.
        this.browser = null;
        this.page = null;

        if (browser) {
            await browser.close();
        }
    }
}

module.exports = PageObject

脚本:

const express = require('express');
const router = express.Router();
// Browser executable handed to PageObject as `executablePath`.
const puppeteer_path = '/app/chrome/Chromium';
const PageObject = require('pageobject');

// The single PageObject shared by every route in this router.
// Declared explicitly so it stays module-scoped instead of leaking as an
// implicit global (an undeclared assignment also throws in strict mode).
const pageObject = new PageObject({
    executablePath: puppeteer_path,
    headless: false
});

/**
 * GET /login - opens the target login page in a fresh browser and responds
 * with `{ img }`, the captcha image from that page encoded as base64.
 *
 * The browser is intentionally left open afterwards: POST /submit types the
 * credentials into this same page, so closing it here would break that step.
 * Any failure is forwarded to Express through `next(err)`.
 *
 * NOTE(review): all requests share one `pageObject`, so concurrent /login
 * calls replace each other's page.
 */
router.get('/login', async (req, res, next) => {
    // Upper bound for receiving the captcha image once the form is visible.
    const captchaTimeoutMs = 30000;

    try {
        // Release the browser kept open by a previous /login before replacing it.
        if (pageObject.browser) {
            await pageObject.close();
        }
        await pageObject.init();

        const page = pageObject.page;
        await page.setJavaScriptEnabled(true);
        await page.setViewport({
            width: 1024,
            height: 768
        });

        // Subscribe before navigating so the captcha response cannot be missed,
        // and expose its body as a promise instead of a variable that may
        // still be unset when the response is built.
        const captcha = new Promise((resolve, reject) => {
            page.on('response', (response) => {
                if (response.url().includes('kaptcha.jpg')) {
                    response.buffer().then(resolve, reject);
                }
            });
        });
        // Marks an early rejection as handled; it is still surfaced by the race below.
        captcha.catch(() => {});

        await page.goto('http://etax.zjtax.gov.cn/dzswj/user/login.html');
        await page.waitForSelector('#username');

        let timer;
        const timedOut = new Promise((resolve, reject) => {
            timer = setTimeout(() => {
                reject(new Error('captcha image was not received in time'));
            }, captchaTimeoutMs);
        });

        let bufferImg;
        try {
            bufferImg = await Promise.race([captcha, timedOut]);
        } finally {
            clearTimeout(timer);
        }

        res.send({
            img: bufferImg.toString('base64')
        });
    } catch (err) {
        next(err);
    }
});

/**
 * POST /submit - types the credentials into the page opened by GET /login
 * and clicks the login button.
 *
 * Reads `loginName`, `loginPwd` and `loginCode` from `req.body`.
 * Responds with:
 *   - `{ status: 'success' }` when no `.bootbox-body` element was present and
 *     the login button was clicked;
 *   - `{ status: 'fail', message }` when the element was present (its text is
 *     returned), when a field is missing (HTTP 400), or when no page is open
 *     (HTTP 409).
 * Any other failure is forwarded to Express through `next(err)`.
 *
 * NOTE(review): as in the original flow, the dialog check runs BEFORE the
 * click, so 'success' only means the form was submitted, not that the site
 * accepted the login. Confirm whether the check should follow the click.
 */
router.post('/submit', async (req, res, next) => {
    try {
        const page = pageObject.page;
        if (!page) {
            res.status(409).send({
                status: 'fail',
                message: 'no login page is open; call /login first'
            });
            return;
        }

        const { loginName, loginPwd, loginCode } = req.body || {};
        const fields = [loginName, loginPwd, loginCode];
        if (fields.some((value) => typeof value !== 'string' || value === '')) {
            res.status(400).send({
                status: 'fail',
                message: 'loginName, loginPwd and loginCode are required'
            });
            return;
        }

        await page.type('#username', loginName);
        await page.type('#password', loginPwd);
        await page.type('#vcode', loginCode);
        await delay(1000);

        // Return plain text instead of the DOM node: nodes are not serializable,
        // so comparing the evaluated node against null is unreliable.
        // `.bootbox-body` is presumably the site's message dialog - TODO confirm.
        const dialogText = await page.evaluate(() => {
            const dialog = document.querySelector('.bootbox-body');
            return dialog === null ? null : dialog.textContent;
        });

        if (dialogText !== null) {
            // Always answer, otherwise the caller waits forever.
            res.send({
                status: 'fail',
                message: dialogText
            });
            return;
        }

        // Click before answering so a failed click is reported, not lost.
        await page.click('#loginBtn1');
        res.send({
            status: 'success'
        });
    } catch (err) {
        next(err);
    }
});

/**
 * Returns a promise that resolves (with no value) after `timeout` milliseconds.
 *
 * @param {number} timeout - Delay in milliseconds, passed to `setTimeout`.
 * @returns {Promise<void>}
 */
function delay(timeout) {
    const schedule = function (done) {
        setTimeout(done, timeout);
    };
    return new Promise(schedule);
}

module.exports = router;

将上面的router挂载到express应用并启动Node.js服务,就可以对外提供http服务了。

你可能感兴趣的:(爬虫)