Node.js 爬取语文备课大师课件

代码目录结构如下：

目录.png

bin/downloadfile.js:


var fs = require('fs');
var request = require('request');
// var async = require("async");
var exec = require('child_process').exec;
var cheerio = require('cheerio');
var config = require('../conf/config');

//加载网页解析库
var Classparse = require("../lib/classparse");

var url = config["downloadSrc"][0];
var dst = config["downloadDir"][0];
// The parser instance gets its own name so it no longer overwrites the
// constructor binding it was created from.
var classParser = new Classparse(url);

// Only the catalog entry at this position is crawled; every other entry of
// the catalog list is skipped.
var TARGET_CATALOG_INDEX = 40;

/**
 * Fetch all download links of one catalog entry, create its target
 * directory and start one download per link.
 *
 * @param {{href: string, title: string}} link - catalog entry to crawl
 * @param {number} position - index of the entry in the catalog list; used as
 *   the prefix of the directory name
 * @returns {Promise} settles once the downloads have been started
 */
function crawlCatalogEntry(link, position) {
    return classParser.getDownloadlinkArray(link).then(function(downlist) {
        // An entry without downloadable files has no dirTitle to name the
        // directory after, so there is nothing to do for it.
        if (!downlist || downlist.length === 0) {
            console.error("No downloadable files found for catalog #" + position + ": " + link.title);
            return;
        }

        var newDir = dst + '/' + position + '、' + downlist[0].dirTitle;
        mkdir(newDir);
        downlist.forEach(function(entry) {
            classParser.superagentDown(entry.href, newDir, entry.title);
        });
    });
}

// mkdir() reports failure with a non-zero value; nothing is crawled then.
if (mkdir(dst) === 0) {
    console.log("开启帕尼尼计划...向语文备课大师进军！");

    // Fetch the catalog list first, then crawl the selected entry.
    classParser.getClassCatalogList().then(function(links) {
        if (TARGET_CATALOG_INDEX >= links.length) {
            return;
        }
        // Returning the promise routes failures of the inner request to the
        // catch handler below instead of leaving them unhandled.
        return crawlCatalogEntry(links[TARGET_CATALOG_INDEX], TARGET_CATALOG_INDEX);
    }).catch(function(e) {
        if (e) console.log(e);
    });
}


/**
 * Create the download directory, including missing parent directories.
 *
 * The directory is created synchronously through the fs module, so it exists
 * by the time this function returns, and the path is never passed through a
 * shell.
 *
 * @param {string} dst - directory path; relative paths are resolved against
 *   the current working directory
 * @returns {number} 0 when the directory exists afterwards, 1 when dst is
 *   empty or the directory could not be created
 */
function mkdir(dst) {
    if (!dst) {
        console.error("dst目录为空！");
        return 1;
    }

    try {
        // Remember whether the path was already there so the success message
        // is only printed when something was actually created.
        var existed = fs.existsSync(dst);
        fs.mkdirSync(dst, { recursive: true });
        if (!existed) {
            console.log("mkdir " + dst + " sucess.");
        }
        return 0;
    } catch (e) {
        console.error("mkdir " + dst + " failed: " + e.message);
        return 1;
    }
}

bin/mkdir.sh ：

#!/bin/bash
# Create the directory given as the first argument, including missing parents.
# Exit status: 0 when the directory was created, 1 when the path already
# exists, 255 when no argument was given, otherwise the status of mkdir.
dst=$1

# The caller starts this script with `sh`, so the checks stick to POSIX test
# syntax (-z instead of ==) and an exit status inside the 0-255 range.
if [ -z "${dst}" ]; then
  exit 255
fi

# Quoting keeps paths with whitespace or glob characters in one piece.
if [ ! -e "${dst}" ]; then
  mkdir -p -- "${dst}"
  ret=$?
else
  ret=1
fi

exit "${ret}"

conf/config.js :

// Crawler settings. Each setting is a list; bin/downloadfile.js reads the
// first element of each.
var conf = {};

// Pages the crawl can start from.
conf.downloadSrc = [
    "http://www.xiexingcun.com/xy8/List/List_992.html",
    "http://data.xiexingcun.com/G/List/List_7380.htm"
];

// Directories that receive the downloaded files.
conf.downloadDir = [
    "../resultClass/class"
];

module.exports = conf;

lib/classparse.js :

var request = require('request');
var cheerio = require('cheerio');
var exec = require('child_process').exec;
var iconv = require('iconv-lite');
var fs = require('fs');
var superagent = require('superagent');

/**
 * Scraper bound to one catalog page.
 *
 * @param {string} url - address of the catalog page
 */
function Classparse(url) {
  // CSS selectors handed to cheerio: the first one matches the anchors of the
  // catalog entries, the second one the anchors on a catalog sub-page.
  var catalogAnchors = ".left_tdbg1 td p:nth-child(1) a";
  var downloadAnchors = ".main_tdbgall .listA";

  this.url = url;
  this.trEles = catalogAnchors;
  this.downlistTreeEles = downloadAnchors;
}

/**
 * Request the catalog page (this.url) and collect the links to its sub-pages.
 *
 * The body is fetched as raw bytes and decoded as gb2312. The first three
 * anchors matched by this.trEles are skipped, as are anchors without an href.
 * Spaces and line breaks are stripped from every href and title.
 *
 * @returns {Promise<Array<{href: string, title: string}>>} resolves with the
 *   sub-page links; rejects with an Error when the request fails, the status
 *   is not 200, or the page cannot be decoded/parsed
 */
Classparse.prototype.getClassCatalogList = function(){
    var _this = this;
    return new Promise(function(resolve, reject){
        request.get({url:_this.url,encoding:null},function(e, r, html){
            // Check the outcome before touching the body: on a failed request
            // there is no body to decode.
            if (e || !r || r.statusCode !== 200) {
                var failure = e || new Error("Unexpected HTTP status " + (r && r.statusCode) + " for " + _this.url);
                console.error("Error: " + failure);
                reject(failure);
                return;
            }

            // Exceptions thrown inside this callback would not reach the
            // promise on their own, so they are turned into a rejection here.
            try {
                var $ = cheerio.load(iconv.decode(html, 'gb2312'), {decodeEntities: false}),
                    links = [];

                $(_this.trEles).each(function(index, item){
                    var rawHref = $(item).attr('href');
                    if (index > 2 && rawHref) {
                        links.push({
                            'href': rawHref.replace(/[ \r\n]/g, ''),
                            'title': $(item).text().replace(/[ \r\n]/g, '')
                        });
                    }
                });
                resolve(links);
            } catch (parseError) {
                console.error("Error: " + parseError);
                reject(parseError);
            }
        });
    });
};

/**
 * Request one catalog sub-page and collect the download URL of every file
 * listed on it.
 *
 * Only anchors whose href starts with "/" are used (others are absolute links
 * such as http://www.xiexingcun.com/centre.html). A path like
 * /D/HTML/98877.htm is rewritten to
 * http://data.xiexingcun.com/D/abShowSoftDown.asp?UrlID=1&SoftID=98877.
 * Anchors without an href, or with a path too short to rewrite, are skipped.
 *
 * @param {{href: string, title: string}} item - catalog entry; href is the
 *   sub-page address, title is copied into every result as dirTitle
 * @returns {Promise<Array<{dirTitle: string, href: string, title: string}>>}
 *   resolves with the download links; rejects with an Error when the request
 *   fails, the status is not 200, or the page cannot be decoded/parsed
 */
Classparse.prototype.getDownloadlinkArray = function(item){
    var _this = this;
    return new Promise(function(resolve, reject){
        request.get({url: item.href, encoding:null}, function(e, r, html){
            // Check the outcome before touching the body: on a failed request
            // there is no body to decode.
            if (e || !r || r.statusCode !== 200) {
                var failure = e || new Error("Unexpected HTTP status " + (r && r.statusCode) + " for " + item.href);
                console.error("Error: " + failure);
                reject(failure);
                return;
            }

            // Exceptions thrown inside this callback would not reach the
            // promise on their own, so they are turned into a rejection here.
            try {
                var $ = cheerio.load(iconv.decode(html, 'gb2312'), {decodeEntities: false}),
                    links = [];

                console.log(item.title);
                $(_this.downlistTreeEles).each(function(index, childitem){
                    var rawHref = $(childitem).attr('href');
                    if (!rawHref) {
                        return;
                    }
                    var path = rawHref.replace(/[ \r\n]/g, '');
                    if (!/^\//.test(path)) {
                        return;
                    }

                    // "/D/HTML/98877.htm" -> ["", "D", "HTML", "98877.htm"]
                    var segments = path.split('/');
                    if (segments.length < 4) {
                        return;
                    }

                    links.push({
                        'dirTitle': item.title,
                        'href': 'http://data.xiexingcun.com/' + segments[1] + '/' + 'abShowSoftDown.asp?UrlID=1&SoftID=' + segments[3].split('.')[0],
                        // Spaces, parentheses and line breaks are removed from the file title.
                        'title': $(childitem).text().replace(/[ ()\r\n]/g, '')
                    });
                });

                resolve(links);
            } catch (parseError) {
                console.error("Error: " + parseError);
                reject(parseError);
            }
        });
    });
};

/**
 * Download a file with the request library and save it as <name>.zip.
 *
 * @param {string} link - download URL
 * @param {string} saveDir - directory the file is written to
 * @param {string} name - file name without extension
 * @returns {undefined}
 */
Classparse.prototype.testdown = function(link, saveDir, name){
    var cmDir = saveDir + "/" + name + '.zip';
    var stream = fs.createWriteStream(cmDir);

    // An 'error' event without a listener would terminate the process.
    stream.on('error', function(e){
        console.error("download err: " + e);
    });

    request(link)
        .on('error', function(e){
            console.error("download err: " + e);
            stream.end();
        })
        .pipe(stream);
};

/**
 * Download a file with superagent.
 *
 * A first request is made only to read the response headers and derive the
 * file extension; a second request streams the body into
 * <saveDir>/<name><extension>. Both requests carry the same random
 * X-Forwarded-For address.
 *
 * Extension rules: ".zip" for content-type application/x-zip-compressed,
 * otherwise the extension found in the content-disposition header, otherwise
 * ".rar".
 *
 * @param {string} link - download URL
 * @param {string} saveDir - directory the file is written to
 * @param {string} name - file name without extension
 * @returns {undefined}
 */
Classparse.prototype.superagentDown = function(link, saveDir, name){
    var cmDir = saveDir + "/" + name;

    // An IPv4 address has four octets.
    var octets = [];
    for (var i = 0; i < 4; i++) {
        octets.push(this.getRandomNum(1, 254));
    }
    var ip = octets.join('.');

    superagent
          .get(link)
          .set('X-Forwarded-For',ip)
          .end(function(err, sres){
              // Without a usable response there are no headers to pick the
              // extension from, and an error page must not be saved as a file.
              if (err || !sres) {
                  console.error("download err: #" + name + "# " + err);
                  return;
              }

              var type = '.rar';
              var disposition = sres.headers['content-disposition'];
              if (sres.headers['content-type'] === 'application/x-zip-compressed') {
                  type = '.zip';
              } else if (disposition) {
                  // Take the extension that ends the file name, ignoring a
                  // closing quote and any parameter that follows, so that
                  // filename="a.b.ppt" yields ".ppt".
                  var extMatch = /\.([A-Za-z0-9]+)["']?\s*(?:;|$)/.exec(disposition);
                  if (extMatch) {
                      type = '.' + extMatch[1];
                  }
              }

              var stream = fs.createWriteStream(cmDir + type);
              // Report success only once the data has been flushed to disk.
              stream.on('finish', function(){
                  console.log("下载文件: #" + name + '#  成功！');
              });
              stream.on('error', function(e){
                  console.error("download err: #" + name + "# " + e);
              });

              superagent
                    .get(link)
                    .set('X-Forwarded-For',ip)
                    .on('error', function(e){
                        console.error("download err: #" + name + "# " + e);
                    })
                    .pipe(stream);
          });
};

/**
 * Download a file by running curl and save it as <saveDir>/<name>.doc.
 *
 * curl is started directly with an argument list instead of through a shell
 * command line, so quotes or other shell metacharacters in the file name or
 * URL cannot alter the command.
 *
 * @param {string} link - download URL
 * @param {string} saveDir - directory the file is written to
 * @param {string} name - file name without extension
 * @returns {undefined}
 */
Classparse.prototype.download = function(link, saveDir, name){
    var execFile = require('child_process').execFile;

    // An IPv4 address has four octets.
    var octets = [];
    for (var i = 0; i < 4; i++) {
        octets.push(this.getRandomNum(1, 254));
    }
    var ip = octets.join('.');

    var target = saveDir + "/" + name + ".doc";
    // Logged in the same shape as the equivalent shell command.
    console.log("curl -H 'X-Forwarded-For:" + ip + "' -o '" + target + "' '" + link + "'");

    execFile('curl', ['-H', 'X-Forwarded-For:' + ip, '-o', target, link], function(e){
        if (!e) {
            console.log("下载文件: #" + name + '#  成功！');
        }else {
            console.log("download err: " + e);
        }
    });
};

/**
 * Pick a pseudo-random number from a range.
 *
 * For integer bounds the result is an integer with min <= result < max: the
 * upper bound itself is never returned.
 *
 * @param {number} min - lower bound (inclusive)
 * @param {number} max - upper bound (exclusive)
 * @returns {number} floor of a uniformly drawn value in [min, max)
 */
Classparse.prototype.getRandomNum = function(min, max){
    var span = max - min;
    var sample = min + Math.random() * span;
    return Math.floor(sample);
};

module.exports = Classparse;

lib/common.js :(暂可不用)

var exec = require('child_process').exec;

/**
 * Set up prototype inheritance between two constructor functions.
 *
 * Child.prototype is replaced by a fresh object that inherits from
 * Parent.prototype, so the Parent constructor itself is never invoked.
 * Methods must be added to Child.prototype after this call.
 *
 * @param {Function} Child - constructor that should inherit
 * @param {Function} Parent - constructor whose prototype is inherited
 * @returns {undefined}
 */
function extend(Child,Parent){
  // The constructor reference belongs on the prototype, where instances look
  // it up; it is kept non-enumerable like the built-in one.
  Child.prototype = Object.create(Parent.prototype, {
    constructor: {
      value: Child,
      writable: true,
      configurable: true,
      enumerable: false
    }
  });
  // Shortcut to the parent's prototype for calling overridden methods.
  Child.uber = Parent.prototype;
}

/**
 * CommonParser: shared template for site-specific parsers.
 *
 * @param {string} url - address to parse; per the original author's note it
 *   is built from a prefix plus the search term
 */
function CommonParser(url) {
  this.url = url;
}

/**
 * Collect information about the matching search results.
 * Empty placeholder: concrete parsers supply the logic.
 */
CommonParser.prototype.getMatchResultInfo = function () {
  // intentionally empty
};

/**
 * Obtain the download link from a detail page.
 * Empty placeholder: concrete parsers supply the logic.
 */
CommonParser.prototype.getLink = function () {
  // intentionally empty
};

/**
 * Download a file by running wget and save it as <dst>/<name>.
 *
 * wget is started directly with an argument list instead of through a shell
 * command line; a "&" in the URL or a space in the file name would otherwise
 * split the command.
 *
 * @param {string} link - download URL
 * @param {string} dst - directory the file is written to
 * @param {string} name - file name
 * @returns {undefined}
 */
CommonParser.prototype.download = function(link,dst,name){
  var execFile = require('child_process').execFile;
  var target = dst + "/" + name;

  // Logged in the same shape as the equivalent shell command.
  console.log("wget " + link + " -O " + target);
  execFile("wget", [link, "-O", target], function(e){
    if(e)console.error("Error: " + e);
  });
};

exports.extend = extend;
exports.CommonParser = CommonParser;
exports.exec = exec;

//运行代码（需在 bin 目录下执行，因为脚本按当前工作目录查找 mkdir.sh 和下载目录）：
node downloadfile

//下载后的文件保存在 resultClass/class 目录中（即 conf/config.js 里 downloadDir 指定的位置）

Node.js 爬取语文备课大师课件

你可能感兴趣的:(Node.js 爬取语文备课大师课件)