Node.js 爬取视频初探

话不多说,直接上代码:

var cheerio = require("cheerio");
var fs = require('fs');
var async = require("async");
const superagent = require('superagent')
const request = require('request')
const mkdirp = require('mkdirp')
const path = require('path')
// Crawler configuration.
const options = {
    uri: 'http://xxxxx', // index page of the video site to crawl
    dirfile: './output/', // root directory where downloads are saved
    downLimit: 2 // maximum number of videos downloaded in parallel
};
// Shared accumulators filled in by down(): entries scraped from the index
// page, and video links scraped from the detail pages.
const prolist = [];
const videolist = [];

// Start the crawl. The returned promise is handled explicitly so a failure is
// reported with a non-zero exit code instead of becoming an unhandled rejection.
down(options.uri).catch((err) => {
    console.error(err);
    process.exitCode = 1;
});
/**
 * Requests a page with superagent and parses the body with cheerio.
 *
 * @param {string} url - Address of the page to request.
 * @param {Object} [headers] - Extra request headers, applied one by one.
 * @returns {Promise<Function>} Resolves with the cheerio `$` for the page.
 *   Rejects when the request fails or the status code is not 200, so callers
 *   are never left waiting on a promise that cannot settle.
 */
function fetchPage(url, headers = {}) {
    return new Promise((resolve, reject) => {
        let req = superagent.get(url);
        for (const [name, value] of Object.entries(headers)) {
            req = req.set(name, value);
        }
        req.end((error, response) => {
            if (error) {
                reject(error);
                return;
            }
            if (response.statusCode !== 200) {
                reject(new Error(`Unexpected status ${response.statusCode} for ${url}`));
                return;
            }
            resolve(cheerio.load(response.text, {
                normalizeWhitespace: true,
                decodeEntities: false
            }));
        });
    });
}

/**
 * Crawls the site: reads the index page, visits every entry's detail page to
 * collect video links, then downloads each entry's videos into that entry's
 * own directory.
 *
 * Entries are appended to the shared `prolist` and video links to the shared
 * `videolist`. A page that cannot be fetched is logged and skipped.
 *
 * @param {string} url - Address of the index page.
 * @returns {Promise<void>}
 */
async function down(url) {
    // Index page: one card per entry.
    let $index;
    try {
        $index = await fetchPage(url);
    } catch (err) {
        console.error(`列表页获取失败:${url}`, err);
        return;
    }

    const projects = [];
    // The selector was taken from the target site's markup (inspect the page elements).
    $index(".comapny-card.bg-fff.div-animationone").each((i, obj) => {
        projects.push({
            dir: $index(obj).find("h6").text(),
            title: $index(obj).find("a").attr("title"),
            url: $index(obj).find("a").attr("href")
        });
    });
    prolist.push(...projects);

    // The detail pages have to be requested with browser-like headers.
    const detailHeaders = {
        "Connection": "keep-alive",
        "Content-Length": 0,
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Mobile Safari/537.36"
    };

    // Detail pages: collect the video links of each entry separately, so every
    // video is later saved under the directory of the entry it belongs to.
    const jobs = [];
    for (const project of projects) {
        try {
            await mkdir(project.dir);
            const $detail = await fetchPage(project.url, detailHeaders);
            const videos = [];
            $detail("#List1_1 .video_name1").each((i, obj) => {
                videos.push({
                    title: $detail(obj).attr("title"),
                    url: $detail(obj).attr("rel")
                });
            });
            videolist.push(...videos);
            if (videos.length) {
                jobs.push({ dir: project.dir, videos });
            }
        } catch (err) {
            // One broken entry must not stop the remaining ones.
            console.error(`详情页处理失败:${project.url}`, err);
        }
    }

    await sleep(2000);

    // Download the videos, one entry at a time.
    for (const job of jobs) {
        await downliu(job.dir, job.videos, function () {
            console.log('下载结束');
        });
    }
}

/**
 * Creates the directory in which an entry's videos are saved.
 *
 * The directory is `options.dirfile + title`; missing parent directories are
 * created as well. Nothing is created when the directory already exists.
 *
 * @param {string} title - Directory name, appended to `options.dirfile`.
 * @returns {Promise<void>} Resolves once the directory exists, so callers can
 *   `await` it; rejects with the file-system error if creation fails.
 */
async function mkdir(title) {
    console.log('创建目录:%s', title);
    const target = options.dirfile + title;
    if (fs.existsSync(target)) {
        return;
    }
    await fs.promises.mkdir(target, { recursive: true });
    // Logged only after the directory was really created.
    console.log(`目录:${title} 创建成功`);
}

/**
 * Pauses an async flow for a while.
 *
 * @param {number} duration - Delay in milliseconds, passed to setTimeout.
 * @returns {Promise<void>} Promise fulfilled when the timer fires.
 */
function sleep(duration) {
    const startTimer = (wake) => {
        setTimeout(wake, duration);
    };
    return new Promise(startTimer);
}

/**
 * Downloads a list of videos into `options.dirfile + dir`, running at most
 * `options.downLimit` downloads at the same time.
 *
 * A failed download is logged and skipped; it does not abort the others.
 *
 * @param {string} dir - Sub-directory (below `options.dirfile`) to save into.
 * @param {Array<{title: string, url: string}>} links - Videos to download.
 * @param {Function} [callback] - Called once after every item was processed.
 * @returns {Promise<void>} Resolves after `callback` has been invoked, so the
 *   caller can `await` the whole batch.
 */
function downliu(dir, links, callback) {
  console.log(`发现${links.length}个视频,准备开始下载...`);
  return new Promise((resolve) => {
    async.eachLimit(links, options.downLimit, (video, next) => {
      // File name: the video title without spaces.
      const fileName = path.basename(video.title).replace(/ /g, '');
      const toPath = path.join(options.dirfile + dir, fileName);
      console.log(`开始下载视频:${fileName},保存到:${dir}`);

      // async throws when an iteratee calls back twice, and a stream can
      // report both an error and a finish, so each item is settled only once.
      let settled = false;
      const settle = (err) => {
        if (settled) {
          return;
        }
        settled = true;
        if (err) {
          console.error(`视频下载失败:${video.url}`, err);
        } else {
          console.log(`视频下载成功:${video.url}`);
        }
        next();
      };

      const file = fs.createWriteStream(toPath + ".mp4");
      file.on('error', settle);
      file.on('finish', () => settle());

      request(encodeURI(video.url)).on('error', (err) => {
        // The pipe does not end the file when the request fails; close it here.
        file.destroy();
        settle(err);
      }).pipe(file);
    }, (err) => {
      if (typeof callback === 'function') {
        callback(err);
      }
      resolve();
    });
  });
}

你可能感兴趣的:(nodejs 爬视频初探)