在node服务中使用puppeteer导出pdf

本地测试

说明:将pdf首页和pdf内容分开爬取,最后合并到一起
准备:node的16以上版本,创建一个app.js文件

// 安装依赖:npm i puppeteer pdf-lib(fs 是 Node.js 内置模块,无需安装)
const puppeteer = require('puppeteer');
const fs = require('fs');
const { PDFDocument } = require('pdf-lib');

/**
 * Entry point of the local test script.
 *
 * Launches one headless browser, renders the cover and the content of the
 * report as two separate PDFs in parallel, then merges them with pdf-lib into
 * `合并.pdf` (cover page first, every content page after it).
 *
 * Any failure is logged and turns into a non-zero exit code instead of an
 * unhandled promise rejection, and the browser is always closed.
 */
(async () => {
  // Start the default timer so the `console.timeEnd()` below has something to report.
  console.time();
  let browser;
  try {
    browser = await puppeteer.launch({
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
      ignoreHTTPSErrors: true,
      headless: true,
    });

    // `cover` and `content` are `const` bindings declared further down in this
    // file; they are initialized by the time execution resumes after the
    // `await` above, so they must not be referenced before it.
    const [coverBytes, contentBytes] = await Promise.all([
      cover(browser, '需要爬的网站'),
      content(browser, '需要爬的网站'),
    ]);

    const pdfDoc = await PDFDocument.create();

    // Only the first page of the cover document is used.
    const coverDoc = await PDFDocument.load(coverBytes);
    const [coverPage] = await pdfDoc.copyPages(coverDoc, [0]);
    pdfDoc.addPage(coverPage);

    // Append every page of the content document in its original order.
    const reportDoc = await PDFDocument.load(contentBytes);
    const reportPages = await pdfDoc.copyPages(reportDoc, reportDoc.getPageIndices());
    for (const reportPage of reportPages) {
      pdfDoc.addPage(reportPage);
    }

    const pdfBytes = await pdfDoc.save();
    fs.writeFileSync(`合并.pdf`, pdfBytes);
  } catch (err) {
    console.error('PDF export failed:', err);
    process.exitCode = 1;
  } finally {
    if (browser) {
      // A failing close must not mask the original error or leave a rejection unhandled.
      await browser.close().catch((closeErr) => {
        console.error('Failed to close the browser:', closeErr);
      });
    }
    console.timeEnd();
  }
})();
/**
 * Renders the cover (first page only) of `url` as an A4 PDF without header or
 * footer and with zero margins.
 *
 * The PDF is also written to `首页.pdf` through the `path` option of
 * `page.pdf()`.
 *
 * @param {object} browser - Puppeteer browser; only `newPage()` is used.
 * @param {string} url - Address of the page to print.
 * @returns {Promise<*>} Resolves with whatever `page.pdf()` resolves with (the PDF bytes).
 *   Rejects if opening the page, navigating or printing fails.
 */
const cover = async (browser, url) => {
  const page = await browser.newPage();
  try {
    // `timeout: 0` disables the navigation timeout; the call waits until the
    // network has gone idle, however long that takes.
    await page.goto(`${url}`, { timeout: 0, waitUntil: 'networkidle0' });
    return await page.pdf({
      path: '首页.pdf',
      printBackground: true,
      preferCSSPageSize: false,
      displayHeaderFooter: false,
      landscape: false,
      scale: 1,
      pageRanges: '1',
      format: 'A4',
      margin: {
        top: '0cm',
        bottom: '0cm',
        left: '0cm',
        right: '0cm',
      },
    });
  } finally {
    // Release the tab whether printing succeeded or not.
    await page.close();
  }
};
/**
 * Renders every page of `url` as an A4 PDF with header and footer enabled and
 * 1cm top/bottom margins to leave room for them.
 *
 * The PDF is also written to `内容.pdf` through the `path` option of
 * `page.pdf()`.
 *
 * @param {object} browser - Puppeteer browser; only `newPage()` is used.
 * @param {string} url - Address of the page to print.
 * @returns {Promise<*>} Resolves with whatever `page.pdf()` resolves with (the PDF bytes).
 *   Rejects if opening the page, navigating or printing fails.
 */
const content = async (browser, url) => {
  const page = await browser.newPage();
  try {
    // `timeout: 0` disables the navigation timeout.
    await page.goto(`${url}`, { timeout: 0, waitUntil: 'networkidle0' });
    return await page.pdf({
      path: '内容.pdf',
      printBackground: true,
      preferCSSPageSize: false,
      displayHeaderFooter: true,
      landscape: false,
      pageRanges: '1-',
      format: 'A4',
      margin: {
        top: '1cm',
        bottom: '1cm',
        left: '0cm',
        right: '0cm',
      },
      // NOTE(review): the original template markup was lost (only a line break
      // was left inside the quotes, which is not valid JavaScript). Empty
      // placeholder elements are used here — replace them with the real
      // header/footer HTML.
      headerTemplate: '<span></span>',
      footerTemplate: '<span></span>',
    });
  } finally {
    // Close the tab only after printing has finished (or failed).
    await page.close();
  }
};

在当前存放app.js的文件夹中打开终端,执行node app.js即可

通过linux部署node服务,并通过node服务接口爬取

说明:Linux 上需要安装 Chrome 浏览器,并配置字体包
准备:node的16以上版本,创建一个app.js文件

// 安装依赖:npm i puppeteer pdf-lib express(fs 是 Node.js 内置模块,无需安装)
const express = require('express');
const app = express();
const puppeteer = require('puppeteer');
const fs = require('fs');
const { PDFDocument } = require('pdf-lib');
// const pdf = require('pdf-parse');
// const { Readable } = require("stream");
const port = 4000
// cors处理跨域
// var cors = require('cors')
// var app = express()
// var corsOptions = {
//   origin: 'http://www.sina.com', //只有该域名可以访问
//   optionsSuccessStatus: 200 
// }
// app.get('/products', cors(corsOptions), function (req, res, next) {
//   res.json({msg: '只有前面的sina.com域名可以访问'})
// })

/**
 * GET /reportPDF?id=<id>&token=<token>
 *
 * Renders the cover and the content of a report page in a headless browser,
 * merges both PDFs (cover page first) and responds with the merged document as
 * `application/pdf`.
 *
 * - `id` is forwarded to the rendered page, which uses it to request its data.
 * - `token` is forwarded because the rendered page requires a login.
 * - The `Origin` request header is used as the base address of the page to
 *   render, because the request is issued from the same site that hosts it.
 *
 * On missing input or on a rendering failure the response is a JSON body
 * `{ code: -1, msg }`.
 */
app.get('/reportPDF', async (req, res) => {
  // Allows any origin to call this endpoint; cookies cannot be sent with '*'.
  res.header('Access-Control-Allow-Origin', '*');
  // To restrict access, send a single allowed origin instead of '*', and add
  // 'Access-Control-Allow-Headers' / 'Access-Control-Allow-Methods' as needed.

  const { id, token } = req.query || {};
  // NOTE(review): `Origin` is controlled by the client and decides which host
  // the headless browser visits (with the token in the URL). Consider checking
  // it against an allow-list before going to production.
  const { origin } = req.headers;

  // Without an origin there is no address to render, so it is rejected like a
  // missing query parameter.
  if (!id || !token || !origin) {
    res.send({
      code: -1,
      msg: "缺少参数"
    });
    return;
  }

  let browser;
  try {
    // `executablePath` must point at the Chrome binary, e.g. /data/node/chrome-linux/chrome
    browser = await puppeteer.launch({
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
      ignoreHTTPSErrors: true,
      headless: true,
      executablePath: 'xxxxx',
    });

    // Awaited so that a rendering failure reaches the `catch` below instead of
    // becoming an unhandled rejection.
    const [coverBytes, contentBytes] = await Promise.all([
      cover(browser, id, token, origin),
      content(browser, id, token, origin),
    ]);

    const pdfDoc = await PDFDocument.create();

    // Only the first page of the cover document is used.
    const coverDoc = await PDFDocument.load(coverBytes);
    const [coverPage] = await pdfDoc.copyPages(coverDoc, [0]);
    pdfDoc.addPage(coverPage);

    // Append every page of the content document in its original order.
    const reportDoc = await PDFDocument.load(contentBytes);
    const reportPages = await pdfDoc.copyPages(reportDoc, reportDoc.getPageIndices());
    for (const reportPage of reportPages) {
      pdfDoc.addPage(reportPage);
    }

    const pdfBytes = await pdfDoc.save();
    res.set({
      'Content-Type': 'application/pdf',
    });
    res.send(Buffer.from(pdfBytes));
  } catch (e) {
    console.error('reportPDF failed:', e);
    // Do not try to send a second response if the PDF was already sent.
    if (!res.headersSent) {
      res.send({
        code: -1,
        msg: '爬取错误',
      });
    }
  } finally {
    if (browser) {
      // Always release the browser; a failing close is logged, not rethrown.
      await browser.close().catch((closeErr) => {
        console.error('Failed to close the browser:', closeErr);
      });
    }
  }
});
// Start the HTTP server on `port`; the callback logs a success message once
// the server is listening.
app.listen(port, function onListening() {
  console.log('运行成功');
});
// Process-level error handling.

// A promise rejection nobody handled: log the reason, then exit with a
// non-zero code.
process.on('unhandledRejection', (error) => {
  console.error('Unhandled promise rejection:', error);
  process.exit(1);
});

// An exception nobody caught: log it and exit after a 30 second grace period.
process.on('uncaughtException', (err) => {
  console.error('Uncaught exception:', err);
  const killTimer = setTimeout(() => {
    process.exit(1);
  }, 30000);
  // The timer alone must not keep the process alive.
  killTimer.unref();
});

/**
 * Renders the cover (first page only) of the report page as an A4 PDF without
 * header or footer and with zero margins.
 *
 * The page opened is `${url}/pia/pia-pdf?type=hideContent&id=...&token=...`.
 * The PDF is also written to `首页.pdf` through the `path` option of
 * `page.pdf()`.
 *
 * @param {object} browser - Puppeteer browser; only `newPage()` is used.
 * @param {string} id - Report id passed to the page as the `id` query parameter.
 * @param {string} token - Token passed to the page as the `token` query parameter.
 * @param {string} url - Base address (scheme + host) of the site hosting the page.
 * @returns {Promise<*>} Resolves with whatever `page.pdf()` resolves with (the PDF bytes).
 *   Rejects if opening the page, navigating or printing fails.
 */
const cover = async (browser, id, token, url) => {
  // Encode the values so characters such as `&`, `+` or `=` cannot corrupt the query string.
  const target = `${url}/pia/pia-pdf?type=hideContent&id=${encodeURIComponent(id)}&token=${encodeURIComponent(token)}`;
  const page = await browser.newPage();
  try {
    // `timeout: 0` disables the navigation timeout.
    await page.goto(target, { timeout: 0, waitUntil: 'networkidle0' });
    return await page.pdf({
      path: '首页.pdf',
      printBackground: true,
      preferCSSPageSize: false,
      displayHeaderFooter: false,
      landscape: false,
      scale: 1,
      pageRanges: '1',
      format: 'A4',
      margin: {
        top: '0cm',
        bottom: '0cm',
        left: '0cm',
        right: '0cm',
      },
    });
  } finally {
    // Release the tab whether printing succeeded or not.
    await page.close();
  }
};
/**
 * Renders every page of the report content as an A4 PDF with header and footer
 * enabled and 1cm top/bottom margins to leave room for them.
 *
 * The page opened is `${url}/pia/pia-pdf?type=hideCover&id=...&token=...`.
 * The PDF is also written to `内容.pdf` through the `path` option of
 * `page.pdf()`.
 *
 * @param {object} browser - Puppeteer browser; only `newPage()` is used.
 * @param {string} id - Report id passed to the page as the `id` query parameter.
 * @param {string} token - Token passed to the page as the `token` query parameter.
 * @param {string} url - Base address (scheme + host) of the site hosting the page.
 * @returns {Promise<*>} Resolves with whatever `page.pdf()` resolves with (the PDF bytes).
 *   Rejects if opening the page, navigating or printing fails.
 */
const content = async (browser, id, token, url) => {
  // Encode the values so characters such as `&`, `+` or `=` cannot corrupt the query string.
  const target = `${url}/pia/pia-pdf?type=hideCover&id=${encodeURIComponent(id)}&token=${encodeURIComponent(token)}`;
  const page = await browser.newPage();
  try {
    // `timeout: 0` disables the navigation timeout.
    await page.goto(target, { timeout: 0, waitUntil: 'networkidle0' });
    // The PDF must be awaited here: closing the page while it is still being
    // printed would abort the print.
    return await page.pdf({
      path: '内容.pdf',
      printBackground: true,
      preferCSSPageSize: false,
      displayHeaderFooter: true,
      landscape: false,
      pageRanges: '1-',
      format: 'A4',
      margin: {
        top: '1cm',
        bottom: '1cm',
        left: '0cm',
        right: '0cm',
      },
      // NOTE(review): the original template markup was lost; only the footer
      // text and a "/" survived. The elements, the page-number spans and the
      // inline style below are a reconstruction — confirm against the intended
      // design.
      headerTemplate: '<span></span>',
      footerTemplate:
        '<div style="width: 100%; font-size: 10px; text-align: center;">' +
        '本文档为内部保密资料,涉及信息不得对外展示 ' +
        '<span class="pageNumber"></span>/<span class="totalPages"></span>' +
        '</div>',
    });
  } finally {
    // Release the tab only after printing has finished (or failed).
    await page.close();
  }
};

// Optional: pdf-parse can be used to inspect the generated PDF, e.g. to build
// a table of contents.
// pdf(pdfBytes).then(function (data) {
//   console.log(data)
//   // let toc = {}, page;
//   const pagePattern = /Page [0-9]+\/[0-9]+/;
//   const topicPattern = /Title: [A-Za-z 0-9]+/;
//   const lines = data.text.split('\n');
//   // lines.forEach((chunk, i, lines) => {
//   //   console.log(chunk, lines)
//   //   if (chunk.match(pagePattern)) {
//   //     page = chunk
//   //   }
//   //   if (chunk.match(topicPattern) && !toc[chunk]) {
//   //     toc[chunk] = page
//   //   }
//   // });
//   // console.log(toc); // Use this object to fill in values for your table of content
// });

导出遇到的问题可参考
https://zhaoqize.github.io/puppeteer-api-zh_CN/#?product=Puppeteer&version=v17.1.3&show=api-class-page
https://blog.csdn.net/qq_41000891/article/details/119914508

你可能感兴趣的:(javascript,前端,vue.js)