代码目录结构如下:
bin/downloadfile.js:
var fs = require('fs');
var request = require('request');
// var async = require("async");
var exec = require('child_process').exec;
var cheerio = require('cheerio');
var config = require('../conf/config');
//加载网页解析库
var Classparse = require("../lib/classparse");
var url = config["downloadSrc"][0];
var dst = config["downloadDir"][0];
// The parser instance gets its own name so the Classparse constructor is not overwritten.
var parser = new Classparse(url);

// NOTE(review): only the catalog entry at this index is downloaded. This looks like a
// leftover debugging filter; set it to null to process every catalog entry.
var ONLY_CATALOG_INDEX = 40;

/**
 * Download every file linked from one catalog entry into its own sub-directory.
 *
 * @param {{href: string, title: string}} link - catalog entry returned by getClassCatalogList
 * @param {number} i - position of the entry in the catalog, used as directory-name prefix
 * @returns {Promise} settles once all downloads of this entry have been started
 */
function downloadCatalog(link, i) {
    return parser.getDownloadlinkArray(link).then(function(downlist) {
        // An empty list has no dirTitle to read and nothing to download.
        if (!downlist || downlist.length === 0) {
            console.log("no download links found for catalog #" + i);
            return;
        }
        var newDir = dst + '/' + i + '、' + downlist[0].dirTitle;
        mkdir(newDir);
        downlist.forEach(function(entry) {
            parser.superagentDown(entry.href, newDir, entry.title);
        });
    }).catch(function(e) {
        // Without this handler a failing catalog page would be an unhandled rejection.
        if (e) console.log(e);
    });
}

/**
 * Entry point: create the target directory, fetch the catalog list, then
 * start the downloads for the selected catalog entries.
 */
function main() {
    // Create the destination directory; give up if no directory is configured.
    if (mkdir(dst)) {
        return;
    }
    console.log("开启帕尼尼计划...向语文备课大师进军!");
    // Fetch the catalog list first, then the download links of each selected entry.
    parser.getClassCatalogList().then(function(links) {
        links.forEach(function(link, i) {
            if (ONLY_CATALOG_INDEX !== null && i !== ONLY_CATALOG_INDEX) {
                return;
            }
            downloadCatalog(link, i);
        });
    }).catch(function(e) {
        if (e) console.log(e);
    });
}

main();
/**
 * Create the download directory (including missing parent directories).
 *
 * The directory is created synchronously through the fs module, so it exists
 * as soon as this function returns and the path is never passed through a shell.
 *
 * @param {string} dst - directory path to create
 * @returns {number} 0 when the directory exists afterwards, 1 when dst is empty
 *                   or the directory could not be created
 */
function mkdir(dst) {
    if (!dst) {
        console.error("dst目录为空!");
        return 1;
    }
    try {
        // Only report a creation when the path was not there before.
        if (!fs.existsSync(dst)) {
            fs.mkdirSync(dst, { recursive: true });
            console.log("mkdir " + dst + " success.");
        }
        return 0;
    } catch (e) {
        console.error("mkdir " + dst + " failed: " + e.message);
        return 1;
    }
}
bin/mkdir.sh :
#!/bin/bash
# Create the directory given as $1, including missing parent directories.
# Exit status: 0   - directory was created
#              1   - the path already exists
#              255 - no directory argument was given
#              any other value is the exit status of mkdir itself
dst=$1
# -z is POSIX, so the check also works when the script is run through `sh`
# (which is how downloadfile.js invokes it); `==` inside [ ] is a bash extension.
if [ -z "${dst}" ]; then
    # `exit -1` is not a valid status for POSIX shells; 255 is what bash maps it to.
    exit 255
fi
# Quote the path so names containing spaces or glob characters stay one argument.
if [ ! -e "${dst}" ]; then
    mkdir -p -- "${dst}"
    ret=$?
else
    ret=1
fi
exit ${ret}
conf/config.js :
// Settings for the crawler; loaded by bin/downloadfile.js via require('../conf/config').
var conf = {
// Catalog pages to crawl. bin/downloadfile.js reads only the first entry (index 0).
downloadSrc: [
"http://www.xiexingcun.com/xy8/List/List_992.html",
"http://data.xiexingcun.com/G/List/List_7380.htm"
],
// Directories that receive the downloaded files. bin/downloadfile.js reads only
// the first entry (index 0).
downloadDir: [
"../resultClass/class"
]
};
module.exports = conf;
lib/classparse.js :
var request = require('request');
var cheerio = require('cheerio');
var exec = require('child_process').exec;
var iconv = require('iconv-lite');
var fs = require('fs');
var superagent = require('superagent');
/**
 * Page parser for the courseware site.
 *
 * @param {string} url - address of the home page that lists the catalog
 */
function Classparse(url) {
    // Address requested by getClassCatalogList.
    this.url = url;
    // CSS selector used by getClassCatalogList to find the catalog anchors.
    this.trEles = ".left_tdbg1 td p:nth-child(1) a";
    // CSS selector used by getDownloadlinkArray to find the document anchors.
    this.downlistTreeEles = ".main_tdbgall .listA";
}
/**
 * Request the home page (this.url) and collect the catalog links found on it.
 *
 * The page is decoded as gb2312 before parsing. The first three anchors matched
 * by this.trEles are skipped (presumably navigation links rather than catalog
 * entries - confirm against the live page).
 *
 * @returns {Promise<Array<{href: string, title: string}>>} resolves with the
 *          catalog entries; rejects with an Error when the request fails, the
 *          status is not 200, or the page cannot be parsed
 */
Classparse.prototype.getClassCatalogList = function(){
    var _this = this;
    return new Promise(function(resolve, reject){
        request.get({url:_this.url,encoding:null},function(e, r, html){
            // Check the outcome before touching the body: on a failed request
            // html is undefined and decoding it would throw inside the callback.
            if (e || !r || r.statusCode !== 200) {
                var err = e || new Error('Unexpected HTTP status ' + (r && r.statusCode) + ' for ' + _this.url);
                console.error("Error: " + err);
                reject(err);
                return;
            }
            try {
                var $ = cheerio.load(iconv.decode(html, 'gb2312'), {decodeEntities: false});
                var links = [];
                $(_this.trEles).each(function(index, item){
                    if (index <= 2) {
                        return;
                    }
                    var rawHref = $(item).attr('href');
                    // Anchors without an href cannot be followed.
                    if (!rawHref) {
                        return;
                    }
                    links.push({
                        'href': rawHref.replace(/[ ]/g, '').replace(/[\r\n]/g, ''),
                        'title': $(item).text().replace(/[ ]/g, '').replace(/[\r\n]/g, '')
                    });
                });
                resolve(links);
            } catch (parseErr) {
                console.error("Error: " + parseErr);
                reject(parseErr);
            }
        });
    });
};
/**
 * Fetch the page of one catalog entry (a course) and build the download URL of
 * every document listed on it.
 *
 * @param {{href: string, title: string}} item - catalog entry; item.href is the
 *        page to request and item.title becomes dirTitle of every result
 * @returns {Promise<Array<{dirTitle: string, href: string, title: string}>>}
 *          resolves with the download entries; rejects with an Error when the
 *          request fails, the status is not 200, or the page cannot be parsed
 */
Classparse.prototype.getDownloadlinkArray = function(item){
    var _this = this;
    // Site-relative document path such as /D/HTML/98877.htm:
    // group 1 is the first path segment ("D"), group 2 the numeric-looking id ("98877").
    var documentPath = /^\/([^\/]+)\/[^\/]*\/([^\/.]+)/;
    return new Promise(function(resolve, reject){
        request.get({url: item.href, encoding:null}, function(e, r, html){
            // Check the outcome before decoding: html is undefined on a failed request.
            if (e || !r || r.statusCode !== 200) {
                var err = e || new Error('Unexpected HTTP status ' + (r && r.statusCode) + ' for ' + item.href);
                console.error("Error: " + err);
                reject(err);
                return;
            }
            try {
                var $ = cheerio.load(iconv.decode(html, 'gb2312'), {decodeEntities: false});
                var links = [];
                console.log(item.title);
                $(_this.downlistTreeEles).each(function(index, childitem){
                    var href = ($(childitem).attr('href') || '').replace(/[ \r\n]/g, '');
                    var title = $(childitem).text().replace(/[ ( )\r\n]/g, '');
                    // Only site-relative paths are documents; absolute links such as
                    // http://www.xiexingcun.com/centre.html are skipped, and so are
                    // relative paths too short to contain a document id.
                    var parts = documentPath.exec(href);
                    if (!parts) {
                        return;
                    }
                    // /D/HTML/98877.htm becomes
                    // http://data.xiexingcun.com/D/abShowSoftDown.asp?UrlID=1&SoftID=98877
                    links.push({
                        'dirTitle': item.title,
                        'href': 'http://data.xiexingcun.com/' + parts[1] + '/' + 'abShowSoftDown.asp?UrlID=1&SoftID=' + parts[2],
                        'title': title
                    });
                });
                resolve(links);
            } catch (parseErr) {
                console.error("Error: " + parseErr);
                reject(parseErr);
            }
        });
    });
};
/**
 * Download a file with the request library and save it as "<name>.zip".
 *
 * @param {string} link - download URL
 * @param {string} saveDir - directory the file is written to
 * @param {string} name - file name without extension
 * @returns {undefined}
 */
Classparse.prototype.testdown = function(link, saveDir, name){
    var cmDir = saveDir + "/" + name + '.zip';
    var stream = fs.createWriteStream(cmDir);
    // An unhandled 'error' event on either stream would terminate the process.
    stream.on('error', function(err){
        console.error("download err: " + err);
    });
    request(link)
        .on('error', function(err){
            console.error("download err: " + err);
            stream.end();
        })
        .pipe(stream);
};
/**
 * Download a file with superagent.
 *
 * A first request is issued only to inspect the response headers and pick the
 * file extension; a second request then streams the body into the target file.
 * Both requests carry a random X-Forwarded-For address.
 *
 * @param {string} link - download URL
 * @param {string} saveDir - directory the file is written to
 * @param {string} name - file name without extension
 * @returns {undefined}
 */
Classparse.prototype.superagentDown = function(link, saveDir, name){
    var cmDir = saveDir + "/" + name;
    // An IPv4 address needs four octets.
    var ip = [
        this.getRandomNum(1, 254),
        this.getRandomNum(1, 254),
        this.getRandomNum(1, 254),
        this.getRandomNum(1, 254)
    ].join('.');
    superagent
        .get(link)
        .set('X-Forwarded-For', ip)
        .end(function(err, sres){
            if (!sres) {
                if (err) console.error("download err: " + err);
                return;
            }
            var type = '.rar';
            var disposition = sres.headers['content-disposition'];
            if (sres.headers['content-type'] === 'application/x-zip-compressed') {
                type = '.zip';
            } else if (disposition) {
                // Take the extension after the LAST dot of the file name and drop a
                // closing quote, e.g. attachment; filename="a.v2.doc" -> ".doc".
                var ext = /\.([A-Za-z0-9]+)["']?\s*(?:;|$)/.exec(disposition);
                if (ext) {
                    type = '.' + ext[1];
                }
            }
            // With the file type known, stream the body into the file.
            var stream = fs.createWriteStream(cmDir + type);
            stream.on('finish', function(){
                // Report success only after all data has been written.
                console.log("下载文件: #" + name + '# 成功!');
            });
            stream.on('error', function(streamErr){
                console.error("download err: " + streamErr);
            });
            superagent.get(link).set('X-Forwarded-For', ip).pipe(stream);
        });
};
/**
 * Download a file by running curl through child_process.exec and save it as
 * "<name>.doc". The request carries a random X-Forwarded-For address.
 *
 * @param {string} link - download URL
 * @param {string} saveDir - directory the file is written to
 * @param {string} name - file name without extension
 * @returns {undefined}
 */
Classparse.prototype.download = function(link, saveDir, name){
    // Wrap a value in single quotes for the shell, escaping embedded single quotes,
    // so page-derived titles and URLs cannot break out of their argument.
    function shellQuote(value) {
        return "'" + String(value).replace(/'/g, "'\\''") + "'";
    }
    // An IPv4 address needs four octets.
    var ip = [
        this.getRandomNum(1, 254),
        this.getRandomNum(1, 254),
        this.getRandomNum(1, 254),
        this.getRandomNum(1, 254)
    ].join('.');
    var cmd = "curl -H " + shellQuote("X-Forwarded-For:" + ip) +
        " -o " + shellQuote(saveDir + "/" + name + ".doc") +
        " " + shellQuote(link);
    console.log(cmd);
    exec(cmd, function(e,stdout,stderr){
        if (!e) {
            console.log("下载文件: #" + name + '# 成功!');
        }else {
            console.log("download err: " + e);
        }
    });
};
/**
 * Pick a random integer from the half-open range [min, max).
 *
 * @param {number} min - lower bound (inclusive)
 * @param {number} max - upper bound (exclusive)
 * @returns {number} random integer
 */
Classparse.prototype.getRandomNum = function(min, max){
    var span = max - min;
    var offset = Math.random() * span;
    return Math.floor(min + offset);
};
// The module exports the Classparse constructor itself; callers create a parser with `new`.
module.exports = Classparse;
lib/common.js :(暂可不用)
var exec = require('child_process').exec;
/**
 * Set up prototype inheritance between two constructor functions.
 *
 * After the call, instances of Child inherit from Parent.prototype without
 * Parent's constructor having been run, and Child.uber points at
 * Parent.prototype so overridden methods can still be reached.
 *
 * @param {Function} Child - constructor that should inherit
 * @param {Function} Parent - constructor to inherit from
 * @returns {undefined}
 */
function extend(Child,Parent){
    Child.prototype = Object.create(Parent.prototype, {
        // The constructor reference belongs on the prototype (not on Child itself),
        // otherwise instances of Child report Parent as their constructor.
        constructor: {
            value: Child,
            writable: true,
            configurable: true,
            enumerable: false
        }
    });
    Child.uber = Parent.prototype;
}
/**
 * CommonParser: shared template class for the site parsers.
 *
 * @param {string} url - request address, built from a prefix plus the search term
 */
function CommonParser(url) {
    this.url = url;
}
// Get the information of the matching search results.
// Placeholder on the base prototype: the body is empty, so calling it returns undefined.
CommonParser.prototype.getMatchResultInfo = function(){
// The concrete parser instance supplies the actual logic.
};
// Get the download link from a detail page.
// Placeholder on the base prototype: the body is empty, so calling it returns undefined.
CommonParser.prototype.getLink = function(){
// Implemented by the concrete parser instance.
};
/**
 * Download a file by running wget through child_process.exec.
 *
 * @param {string} link - download URL
 * @param {string} dst - directory the file is written to
 * @param {string} name - file name
 * @returns {undefined}
 */
CommonParser.prototype.download = function(link,dst,name){
    // Wrap a value in single quotes for the shell, escaping embedded single quotes.
    // Unquoted, a URL such as ...?UrlID=1&SoftID=2 is cut at the "&" by the shell.
    function shellQuote(value) {
        return "'" + String(value).replace(/'/g, "'\\''") + "'";
    }
    var cmd = "wget " + shellQuote(link) + " -O " + shellQuote(dst + "/" + name);
    console.log(cmd);
    exec(cmd,function(e){
        if(e)console.error("Error: " + e);
    });
};
// Public API of this module: the inheritance helper, the base parser class,
// and child_process.exec re-exported under the name exec.
exports.extend = extend;
exports.CommonParser = CommonParser;
exports.exec = exec;
//运行代码(需在 bin 目录下执行,因为下载目录使用的是相对路径):
cd bin && node downloadfile.js
//下载后的文件保存在 ../resultClass/class 目录中