说明:将pdf首页和pdf内容分开爬取,最后合并到一起
准备:node的16以上版本,创建一个app.js文件
//安装插件 npm i puppeteer pdf-lib(fs 为 Node 内置模块,无需安装)
const puppeteer = require('puppeteer');
const fs = require('fs');
const { PDFDocument } = require('pdf-lib');
/**
 * Script entry point.
 *
 * Launches a headless browser, renders the cover and the content of the
 * target site as two PDFs in parallel, then merges them (cover page first,
 * followed by every content page) and writes the result to `合并.pdf`.
 *
 * The browser is always closed, even when one of the renders fails, and any
 * failure is reported on stderr with a non-zero exit code instead of being
 * left as a floating promise.
 */
(async () => {
  // Start the default timer so the console.timeEnd() below has something to report.
  console.time();
  const browser = await puppeteer.launch({
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
    ignoreHTTPSErrors: true,
    headless: true,
  });

  let coverBytes;
  let contentBytes;
  try {
    [coverBytes, contentBytes] = await Promise.all([
      cover(browser, '需要爬的网站'),
      content(browser, '需要爬的网站'),
    ]);
  } finally {
    // Release the browser process whether or not rendering succeeded.
    await browser.close();
  }
  console.timeEnd();

  const pdfDoc = await PDFDocument.create();

  // Only the first page of the cover PDF is kept.
  const coverDoc = await PDFDocument.load(coverBytes);
  const [coverPage] = await pdfDoc.copyPages(coverDoc, [0]);
  pdfDoc.addPage(coverPage);

  // Append every page of the content PDF after the cover.
  const reportDoc = await PDFDocument.load(contentBytes);
  const reportPages = await pdfDoc.copyPages(reportDoc, reportDoc.getPageIndices());
  for (const reportPage of reportPages) {
    pdfDoc.addPage(reportPage);
  }

  const pdfBytes = await pdfDoc.save();
  fs.writeFileSync(`合并.pdf`, pdfBytes);
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
/**
 * Renders the cover: page 1 of `url` as an A4 PDF with no header/footer and
 * zero margins. The PDF is also written to `首页.pdf` via the `path` option.
 *
 * @param {object} browser - Puppeteer browser; only `newPage()` is used.
 * @param {string} url - Address of the page to render.
 * @returns {Promise<*>} Resolves with the value returned by `page.pdf()`;
 *   rejects if opening, navigating or printing the page fails.
 */
const cover = async (browser, url) => {
  const page = await browser.newPage();
  try {
    // timeout: 0 disables the navigation timeout; the call waits until the
    // network is idle however long that takes.
    await page.goto(`${url}`, { timeout: 0, waitUntil: 'networkidle0' });
    return await page.pdf({
      path: '首页.pdf',
      printBackground: true,
      preferCSSPageSize: false,
      displayHeaderFooter: false,
      landscape: false,
      scale: 1,
      pageRanges: '1',
      format: 'A4',
      margin: {
        top: '0cm',
        bottom: '0cm',
        left: '0cm',
        right: '0cm',
      },
    });
  } finally {
    // Always release the tab, including when navigation or printing throws.
    await page.close();
  }
};
/**
 * Renders the report body: every page of `url` as an A4 PDF with
 * header/footer enabled (both templates are empty strings) and 1cm top/bottom
 * margins. The PDF is also written to `内容.pdf` via the `path` option.
 *
 * @param {object} browser - Puppeteer browser; only `newPage()` is used.
 * @param {string} url - Address of the page to render.
 * @returns {Promise<*>} Resolves with the value returned by `page.pdf()`;
 *   rejects if opening, navigating or printing the page fails.
 */
const content = async (browser, url) => {
  const page = await browser.newPage();
  try {
    // timeout: 0 disables the navigation timeout; the call waits until the
    // network is idle however long that takes.
    await page.goto(`${url}`, { timeout: 0, waitUntil: 'networkidle0' });
    // Await the render here so the tab is only closed after the PDF exists.
    return await page.pdf({
      path: '内容.pdf',
      printBackground: true,
      preferCSSPageSize: false,
      displayHeaderFooter: true,
      landscape: false,
      pageRanges: '1-',
      format: 'A4',
      margin: {
        top: '1cm',
        bottom: '1cm',
        left: '0cm',
        right: '0cm',
      },
      headerTemplate: '',
      footerTemplate: '',
    });
  } finally {
    // Always release the tab, including when navigation or printing throws.
    await page.close();
  }
};
在当前存放app.js的文件夹中打开终端,执行node app.js即可
说明:Linux 需要安装 Chrome 浏览器,并配置字体包
准备:node的16以上版本,创建一个app.js文件
//安装插件 npm i puppeteer pdf-lib express(fs 为 Node 内置模块,无需安装)
const express = require('express');
const app = express();
const puppeteer = require('puppeteer');
const fs = require('fs');
const { PDFDocument } = require('pdf-lib');
// const pdf = require('pdf-parse');
// const { Readable } = require("stream");
const port = 4000
// cors处理跨域
// var cors = require('cors')
// var app = express()
// var corsOptions = {
// origin: 'http://www.sina.com', //只有该域名可以访问
// optionsSuccessStatus: 200
// }
// app.get('/products', cors(corsOptions), function (req, res, next) {
// res.json({msg: '只有前面的sina.com域名可以访问'})
// })
/**
 * GET /reportPDF?id=...&token=...
 *
 * Renders the report cover and the report body as two PDFs in parallel,
 * merges them (first cover page, then every body page) and responds with the
 * merged document as `application/pdf`.
 *
 * Responds with `{ code: -1, msg }` when `id`/`token` are missing or when
 * rendering/merging fails.
 *
 * How the inputs are used:
 *  - `id`    : forwarded to the rendered page, which uses it to request its data.
 *  - `token` : forwarded because the rendered page requires a login.
 *  - `Origin` request header: used as the base address of the page to render,
 *    because the request is expected to come from the same site that hosts
 *    the report page.
 */
app.get('/reportPDF', async (req, res) => {
  // '*' lets any origin call this endpoint; cookies cannot be sent with this
  // setting. Replace it with a concrete origin to restrict access.
  res.header('Access-Control-Allow-Origin', '*');

  const { id, token } = req.query || {};
  if (!id || !token) {
    res.send({
      code: -1,
      msg: "缺少参数"
    });
    return;
  }

  // NOTE(review): the Origin header is supplied by the client, so the server
  // will navigate to whatever host the caller names. Consider validating it
  // against an allow-list. If the header is absent, navigation fails and the
  // generic error response below is returned.
  const origin = req.headers.origin;

  try {
    // executablePath must be replaced with the real Chrome binary path,
    // e.g. /data/node/chrome-linux/chrome
    const browser = await puppeteer.launch({
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
      ignoreHTTPSErrors: true,
      headless: true,
      executablePath: 'xxxxx',
    });

    let coverBytes;
    let contentBytes;
    try {
      // Awaited (not a detached .then) so failures reach the catch below.
      [coverBytes, contentBytes] = await Promise.all([
        cover(browser, id, token, origin),
        content(browser, id, token, origin),
      ]);
    } finally {
      // Release the browser process whether or not rendering succeeded.
      await browser.close();
    }

    const pdfDoc = await PDFDocument.create();

    // Only the first page of the cover PDF is kept.
    const coverDoc = await PDFDocument.load(coverBytes);
    const [coverPage] = await pdfDoc.copyPages(coverDoc, [0]);
    pdfDoc.addPage(coverPage);

    // Append every page of the body PDF after the cover.
    const reportDoc = await PDFDocument.load(contentBytes);
    const reportPages = await pdfDoc.copyPages(reportDoc, reportDoc.getPageIndices());
    for (const reportPage of reportPages) {
      pdfDoc.addPage(reportPage);
    }

    const pdfBytes = await pdfDoc.save();
    res.set({
      'Content-Type': 'application/pdf',
    });
    res.send(Buffer.from(pdfBytes));
  } catch (e) {
    // Log first: the previous code referenced an undefined `logger`.
    console.error(e);
    // Avoid a second send if the failure happened after the response started.
    if (!res.headersSent) {
      res.send({
        code: -1,
        msg: '爬取错误',
      });
    }
  }
});
// Start the HTTP server on the configured port and log once it is listening.
const onListening = () => {
  console.log('运行成功');
};
app.listen(port, onListening);
// Process-level error handling.

// An unhandled promise rejection terminates the process immediately.
// The reason is logged first so the exit is diagnosable.
process.on('unhandledRejection', (error) => {
  console.error('unhandledRejection', error);
  process.exit(1);
});

// An uncaught exception is logged and the process is terminated after a
// 30-second delay rather than immediately.
process.on('uncaughtException', (err) => {
  console.error('uncaughtException', err);
  const killTimer = setTimeout(() => {
    process.exit(1);
  }, 30000);
  // unref: the timer alone must not keep the event loop alive.
  killTimer.unref();
});
/**
 * Renders the report cover: page 1 of
 * `${url}/pia/pia-pdf?type=hideContent&id=…&token=…` as an A4 PDF with no
 * header/footer and zero margins. The PDF is also written to `首页.pdf` via
 * the `path` option.
 *
 * NOTE(review): the fixed output path is shared by every call, so concurrent
 * requests overwrite the same file; the returned bytes are unaffected.
 *
 * @param {object} browser - Puppeteer browser; only `newPage()` is used.
 * @param {string} id - Report id, passed to the page as a query parameter.
 * @param {string} token - Login token, passed to the page as a query parameter.
 * @param {string} url - Base address (origin) of the site hosting the page.
 * @returns {Promise<*>} Resolves with the value returned by `page.pdf()`;
 *   rejects if opening, navigating or printing the page fails.
 */
const cover = async (browser, id, token, url) => {
  const page = await browser.newPage();
  try {
    // Encode the values so characters such as & = + # cannot alter the query.
    const target = `${url}/pia/pia-pdf?type=hideContent&id=${encodeURIComponent(id)}&token=${encodeURIComponent(token)}`;
    // timeout: 0 disables the navigation timeout.
    await page.goto(target, { timeout: 0, waitUntil: 'networkidle0' });
    return await page.pdf({
      path: '首页.pdf',
      printBackground: true,
      preferCSSPageSize: false,
      displayHeaderFooter: false,
      landscape: false,
      scale: 1,
      pageRanges: '1',
      format: 'A4',
      margin: {
        top: '0cm',
        bottom: '0cm',
        left: '0cm',
        right: '0cm',
      },
    });
  } finally {
    // Always release the tab, including when navigation or printing throws.
    await page.close();
  }
};
/**
 * Renders the report body: every page of
 * `${url}/pia/pia-pdf?type=hideCover&id=…&token=…` as an A4 PDF with
 * header/footer enabled and 1cm top/bottom margins. The PDF is also written
 * to `内容.pdf` via the `path` option.
 *
 * NOTE(review): the fixed output path is shared by every call, so concurrent
 * requests overwrite the same file; the returned bytes are unaffected.
 *
 * @param {object} browser - Puppeteer browser; only `newPage()` is used.
 * @param {string} id - Report id, passed to the page as a query parameter.
 * @param {string} token - Login token, passed to the page as a query parameter.
 * @param {string} url - Base address (origin) of the site hosting the page.
 * @returns {Promise<*>} Resolves with the value returned by `page.pdf()`;
 *   rejects if opening, navigating or printing the page fails.
 */
const content = async (browser, id, token, url) => {
  const page = await browser.newPage();
  try {
    // Encode the values so characters such as & = + # cannot alter the query.
    const target = `${url}/pia/pia-pdf?type=hideCover&id=${encodeURIComponent(id)}&token=${encodeURIComponent(token)}`;
    // timeout: 0 disables the navigation timeout.
    await page.goto(target, { timeout: 0, waitUntil: 'networkidle0' });
    // The render must finish BEFORE the tab is closed; previously the page
    // was closed while page.pdf() was still in flight.
    return await page.pdf({
      path: '内容.pdf',
      printBackground: true,
      preferCSSPageSize: false,
      displayHeaderFooter: true,
      landscape: false,
      pageRanges: '1-',
      format: 'A4',
      margin: {
        top: '1cm',
        bottom: '1cm',
        left: '0cm',
        right: '0cm',
      },
      headerTemplate: '',
      // NOTE(review): the trailing " /" suggests page-number markup may have
      // been lost from this template — confirm against the original source.
      footerTemplate: '本文档为内部保密资料,涉及信息不得对外展示 /',
    });
  } finally {
    // Always release the tab, including when navigation or printing throws.
    await page.close();
  }
};
// 可用来获取 pdf的信息
// pdf(pdfBytes).then(function(data) {
// console.log(data)
// // let toc ={}, page;
// const pagePattern = /Page [0-9]+\/[0-9]+/;
// const topicPattern = /Title: [A-Za-z 0-9]+/;
// const lines = data.text.split('\n');
// // lines.forEach((chunk, i, lines) => {
// // console.log(chunk,lines)
// // if(chunk.match(pagePattern)) {
// // page = chunk
// // }
// // if(chunk.match(topicPattern) && !toc[chunk]) {
// // toc[chunk] = page
// // }
// // });
// // console.log(toc); // Use this object to fill in values for your table of content
// });
导出遇到的问题可参考
https://zhaoqize.github.io/puppeteer-api-zh_CN/#?product=Puppeteer&version=v17.1.3&show=api-class-page
https://blog.csdn.net/qq_41000891/article/details/119914508