使用 Puppeteer 下载网页

const puppeteer = require("puppeteer");
const fs = require("fs-extra");
const jschardet = require("jschardet"); // 判断文件编码
const url = "https://www.yibaixun.com/";

/**
 * Mirrors the page at `url` to disk.
 *
 * Steps:
 *   1. Launch headless Chromium (profile stored in ./data) and open a page.
 *   2. Save the body of every network response that comes from the site
 *      itself or from its OSS host under ./www/, using the URL path as the
 *      relative file name.
 *   3. Take a screenshot (example.png) and write the rendered DOM to
 *      ./index.html.
 *   4. Wait for all pending response writes, then close the browser.
 *
 * Failures while saving a single response are logged and skipped so that one
 * bad resource does not abort the whole download. Any other error is logged
 * and reported through a non-zero exit code.
 */
(async () => {
  // Directory that receives one file per captured network response.
  const outputDir = "./www/";

  // Only responses served from these origins are written to disk. Everything
  // else (third-party scripts, data: URLs, ...) is ignored instead of crashing
  // the handler.
  const savedOrigins = new Set([
    new URL(url).origin,
    "https://oss.yibaixun.com",
  ]);

  // Detected encodings that are reported on the console.
  const loggedEncodings = new Set([
    "UTF-8",
    "ascii",
    "ISO-8859-2",
    "windows-1252",
  ]);

  /**
   * Maps a response URL to a file path relative to `outputDir`.
   *
   * @param {string} responseUrl - Absolute URL of the response.
   * @returns {string|null} Relative path, or null when the response must not
   *   be saved (unparsable URL, foreign origin, suspicious path).
   */
  const toRelativePath = (responseUrl) => {
    let parsed;
    try {
      parsed = new URL(responseUrl);
    } catch (err) {
      return null;
    }
    if (!savedOrigins.has(parsed.origin)) {
      return null;
    }

    // `pathname` excludes the query string, so "app.js?v=1" is stored as
    // "app.js" and can be served back under its real name.
    let relativePath = parsed.pathname.replace(/^\/+/, "");

    // A directory-like URL (including the site root) has no file name of its
    // own; store it as that directory's index document.
    if (relativePath === "" || relativePath.endsWith("/")) {
      relativePath += "index.html";
    }

    // The URL parser already resolves dot segments; this is a cheap extra
    // guard against ever writing outside `outputDir`.
    if (relativePath.split("/").includes("..")) {
      return null;
    }
    return relativePath;
  };

  /**
   * Reads one response body, logs its detected encoding and stores it under
   * `outputDir`. Never rejects: errors are logged and the response skipped.
   *
   * @param {object} response - Puppeteer HTTP response.
   * @returns {Promise<void>}
   */
  const saveResponse = async (response) => {
    const responseUrl = response.url();
    try {
      // Redirect responses carry no body; response.buffer() rejects for them.
      const status = response.status();
      if (status >= 300 && status < 400) {
        return;
      }

      const body = await response.buffer();
      const encoding = jschardet.detect(body).encoding;
      if (loggedEncodings.has(encoding)) {
        console.log(`${encoding}=`, responseUrl);
      }

      const relativePath = toRelativePath(responseUrl);
      if (relativePath === null) {
        return;
      }
      await fs.outputFile(`${outputDir}${relativePath}`, body);
    } catch (err) {
      console.error(`Failed to save ${responseUrl}:`, err.message);
    }
  };

  const browser = await puppeteer.launch({
    headless: true,
    userDataDir: "./data",
  });

  try {
    const page = await browser.newPage();

    // Response event: keep every save promise so that none of them is still
    // reading a body when the browser gets closed.
    const pendingSaves = [];
    page.on("response", (response) => {
      pendingSaves.push(saveResponse(response));
    });

    await page.goto(url);
    await page.screenshot({ path: "example.png" });

    // Save the rendered HTML document.
    const html = await page.content();
    await Promise.all(pendingSaves);
    await fs.outputFile("./index.html", html);
  } finally {
    // Always release the browser process, even when navigation fails.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

你可能感兴趣的:(js,爬虫)