最近公司需要写爬虫,就用到了 puppeteer,使用的时候对它进行了封装,分享给大家~
```javascript
// 官方文档 https://github.com/puppeteer/puppeteer
const puppeteer = require("puppeteer");
const cheerio = require("cheerio");
const chalk = require("chalk");
// Shorthand for console output used by the helper methods.
const log = console.log;
// Crawler-wide settings and runtime state, stored on `global`.
Object.assign(global, {
  // Number of browser instances to launch.
  MAX_WSE: 1,
  // One { browserWSEndpoint, runNumber } record per launched browser.
  WSE_LIST: [],
  // Map of url -> timestamp (ms) of the most recent request for that url.
  URL_LIST: {},
  // Number of seconds during which the same url is not requested again.
  WAIT_URL_DATE: 10,
  // Maximum number of tabs per browser.
  MAX_TAG: 20,
  // Launch state: 0 = never started, 1 = Pupp() in progress, 2 = Pupp() finished.
  REQUEST_BOOL: 0,
});
module.exports = {
/**
* 启动浏览器
* @return {*}
*/
async Pupp() {
global.REQUEST_BOOL = 1;
for (var i = 0; i < MAX_WSE; i++) {
if (global.WSE_LIST[i]) {
continue;
}
const browser = await puppeteer.launch({
headless: false,
ignoreDefaultArgs: ["--enable-automation"],
args: [
"--no-sandbox",
"--start-maximized",
"--disable-features=site-per-process",
"--disable-web-security",
"--disable-features=IsolateOrigins,site-per-process",
],
timeout: 0,
});
var browserWSEndpoint = await browser.wsEndpoint();
var browserArr = {
browserWSEndpoint: browserWSEndpoint,
runNumber: (runNumber = 0),
};
await global.WSE_LIST.push(browserArr);
}
global.REQUEST_BOOL = 2;
return;
},
/**
* 打开页面
* @return {*}
*/
async Page() {
// 启动浏览器
if (global.REQUEST_BOOL != 1) {
await this.Pupp();
} else {
return false;
}
try {
// 如果有打开的浏览器就随机挑选一个
if (global.WSE_LIST) {
var tmp = Math.floor(Math.random() * global.MAX_WSE);
var browserWSEndpoint = global.WSE_LIST[tmp].browserWSEndpoint;
} else {
var browserResult = await this.ctx.helper.Page();
}
// 连接已经打开的浏览器窗口
global.WSE_LIST[tmp].runNumber += 1;
var browserResult = await puppeteer.connect({ browserWSEndpoint });
} catch (error) {
// 页面错误删除错误浏览器
await global.WSE_LIST.splice(tmp, 1);
// 启动浏览器 将新的浏览器对象返回
var browserResult = await this.ctx.helper.Page();
}
let pageList = await browserResult.pages();
await pageList.filter((item) => {
if (item.url() == "about:blank" && pageList.length > 1) {
item.close();
}
});
if (pageList.length > global.MAX_TAG) {
return false;
}
return browserResult;
},
/**
*打开一个url地址
* @param {*} url 打开的地址
* @param {*} conf
* @param {*} waitFor 等待的时间
* @param {*} page
* @returns
*/
async goto(url, conf = {}, waitFor = 0, page = null) {
// N秒内不请求重复的url地址
var url_time = new Date().getTime() - global.URL_LIST[url];
log(global.URL_LIST);
if (
(url_time && url_time > global.WAIT_URL_DATE * 1000) ||
!global.URL_LIST[url]
) {
global.URL_LIST[url] = new Date().getTime();
// 选择一个浏览器
if (!page) {
var browserResult = await this.ctx.helper.Page();
var page = await browserResult.newPage();
}
try {
// 设置窗口最大化;
let currentScreen = await page.evaluate(() => {
return {
width: window.screen.availWidth,
height: window.screen.availHeight,
};
});
await page.setViewport(currentScreen);
// 设置浏览器用户信息
await page.setUserAgent(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
);
console.clear();
await page.goto(url, conf);
await page.waitFor(waitFor * 1000);
log(chalk.green("链接打开成功"));
return page;
} catch (error) {
delete global.URL_LIST[url];
await page.close();
log(chalk.red("链接打开失败"));
return false;
}
} else {
log(chalk.red("已经在请求的url地址"));
// return "当前url已在"+global.WAIT_URL_DATE+"秒内请求";
return false;
}
},
/**
* 获取浏览器cookie
* @param {*} page
*/
async getCookie(page) {
let cookie = await page.evaluate(() => document.cookie);
console.log("cookie :" + cookie);
},
/**
* 设置浏览器cookie
* @param {*} page
* @param {*} cookie
*/
async setCookie(page, cookie, domain = "/") {
let cookies = cookie.split(";").map((pair) => {
let name = pair.trim().slice(0, pair.trim().indexOf("="));
let value = pair.trim().slice(pair.trim().indexOf("=") + 1);
return { name, value, domain };
});
await Promise.all(
cookies.map((pair) => {
return page.setCookie(pair);
})
);
},
/**
*解析化html
* @param {*} page
*/
async JqueryHtml(page) {
var $ = await page
.evaluate(function () {
return document.body.innerHTML;
})
.then(function (html) {
let $ = cheerio.load(html);
return $;
});
return $;
},
/**
* @param {执行等待} page
* @param {*} time
*/
async waitFor(page, time = 1000) {
try {
await page.waitFor(time);
} catch (error) {
log(error);
await page.close();
}
},
/**
* @param {执行标签等待} page
* @param {*} time
*/
async waitForSelector(page, select) {
try {
await page.waitForSelector(select);
} catch (error) {
await page.close();
log(error);
}
},
/**
* 请求拦截
* @return {*}
*/
async RequestInterception(page, Type = "image,css,script,stylesheet") {
await page.setRequestInterception(true);
page.on("request", (interceptedRequest) => {
if (Type.indexOf(interceptedRequest.resourceType()) == -1) {
interceptedRequest.abort();
} else {
interceptedRequest.continue();
}
});
},
};
```