Puppeteer拦截某条url并返回其响应内容(场景和方法) API RequestInterception拦截器的使用

page.setRequestInterception(true)拦截器的使用方法和场景

先附上 Puppeteer 的 API 文档链接:https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md

实用场景(没错,就是实用):比如我用 Puppeteer 模拟访问某个网页,但只想抓取其中某一条 url 的 response 内容;或者我需要截图、生成 PDF,但只要文字内容,这时就可以把图片类型的请求过滤掉。

使用的 API:对应的文档链接是 https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md#class-request

主要是class: Request 和 class: Response 两大块相结合

官方例子参考1:

// Switch interception on so the 'request' handler below gets to decide how
// each request is answered.
await page.setRequestInterception(true);

/**
 * Answers an intercepted request with a canned plain-text 404 reply.
 * @param interceptedRequest - the request object passed to the 'request' event.
 */
function replyNotFound(interceptedRequest) {
  const cannedReply = {
    status: 404,
    contentType: 'text/plain',
    body: 'Not Found!'
  };
  interceptedRequest.respond(cannedReply);
}

page.on('request', replyNotFound);

实际使用的例子参考1:

'use strict';

const puppeteer = require('puppeteer');

// Example 1: open a page with request interception enabled, log every failed
// request, and inspect the Response of the main navigation.
(async () => {
    // When launch() rejects there is no browser instance to close, so the
    // error is simply left to propagate to the .catch() at the bottom.
    const browser = await puppeteer.launch({
        ignoreHTTPSErrors: true,
        headless: false,
        args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });
    try {
        const page = await browser.newPage();

        // Interception and listeners are installed BEFORE navigating so they
        // actually observe the requests made by goto().
        await page.setRequestInterception(true);
        page.on('request', request => {
            // With interception on, a request stays paused until it is
            // continued, aborted or responded to.
            request.continue().catch(err => console.log(err));
        });
        page.on('requestfailed', request => {
            // failure() can be null, so guard before reading errorText.
            const failure = request.failure();
            console.log(request.url() + ' ' + (failure ? failure.errorText : 'unknown error'));
        });

        const response = await page.goto('http://www.google.com');
        console.log(response);

        // Response accessors (all are methods):
        console.log(response.ok());      // true when the status code is 200-299
        console.log(response.status());  // HTTP status code
        console.log(response.headers()); // HTTP headers object
        // text() resolves with the body as a string. response.json() is only
        // valid for JSON bodies; this page is HTML, so json() would reject.
        const body = await response.text();
        console.log(body.length);
    } finally {
        // Always release the browser, even when navigation throws.
        await browser.close();
    }
})().catch(err => console.log(err));

实际参考例子2:

'use strict';

const puppeteer = require('puppeteer');
// Example 2: block every image request, let everything else through, then
// take a full-page screenshot of the (image-free) page.
(async () => {
    // Declared outside the try so the finally block can close it.
    let browser;
    try {
        browser = await puppeteer.launch({
            ignoreHTTPSErrors: true,
            headless: false,
            args: ['--no-sandbox', '--disable-setuid-sandbox'],
        });
        const page = await browser.newPage();
        await page.setRequestInterception(true);

        // Counts how many image requests have been aborted so far.
        let num = 0;
        // page.on() only registers a listener; there is nothing to await.
        page.on('request', request => {
            if (request.resourceType() === 'image') {
                // Image request: log it and abort. No response exists yet at
                // this point, so only the request itself can be inspected.
                console.log(num + "image: ");
                console.log(request.url());
                num++;
                request.abort().catch(err => console.log(err));
            } else {
                // Any other resource type goes through untouched. A canned
                // reply could be served here with request.respond({...}).
                console.log("continue")
                request.continue().catch(err => console.log(err));
            }
        });

        await page.goto('https://news.google.com/news/');
        await page.screenshot({path: 'news.png', fullPage: true});
    } catch (e) {
        console.log(e);
    } finally {
        // Close the browser on both success and failure; it is undefined
        // only when launch() itself failed.
        if (browser) {
            await browser.close();
        }
    }

})();

代码里的注释已经比较清楚,也很容易理解。以上是两个比较常用的例子,接下来是实战中更加常用、更实用的例子。

实际参考例子3(重点):

/**
 * Waits for the response to one specific URL and resolves with its body text.
 *
 * Every request seen on the page is logged and continued, so this helper
 * expects request interception to be enabled on `page` by the caller
 * (NOTE: not enforced here - confirm at the call site).
 *
 * @param {Object} page - Puppeteer page (any emitter of 'request' / 'response' events).
 * @param {string} [targetUrl='https://test.do'] - Exact URL whose response body is wanted.
 * @returns {Promise<string>} Resolves with the body text of the first matching
 *     response; rejects if reading that body fails.
 */
function getResponseMsg(page, targetUrl = 'https://test.do') {
    return new Promise((resolve, reject) => {
        // Registered once, up front. Adding it inside the 'request' handler
        // would stack a new listener each time the target URL is requested.
        const onResponse = response => {
            if (response.url() !== targetUrl) {
                return;
            }
            // First match wins: detach so the listener does not leak.
            const detach = page.off || page.removeListener;
            detach.call(page, 'response', onResponse);

            const req = response.request();
            console.log("Response 的:" + req.method(), response.status(), req.url());
            // Forward both outcomes; a failed body read must not hang the caller.
            response.text().then(resolve, reject);
        };
        page.on('response', onResponse);

        // This listener stays attached on purpose: with interception enabled,
        // requests that nobody continues would stall.
        page.on('request', request => {
            console.log(request.url());
            if (request.url() === targetUrl) {
                console.log("拦截到了这条url然后就该请求了");
            } else {
                console.log("continue");
            }
            // continue() may return a promise; log its failure instead of
            // leaving an unhandled rejection.
            Promise.resolve(request.continue()).catch(err => console.log(err));
        });
    });
}

稍微解释一下上面这个例子:就是拦截指定的 url,拿到它的响应内容,然后返回。代码比较清晰,就不多赘述了。全是爬坑干货,欢迎一起爬坑。

你可能感兴趣的:(Node,Puppeteer,爬虫)