最近在听小说《全职高手》,但是没找到打包下载的地方,就找了个小说网站准备弄个爬虫爬下来。
所以现学现用了一些 Node.js,菜鸡一只,大神勿喷。直接上代码:爬取的是下载地址,之后再用迅雷把文件下载回来。
这个脚本有个问题:爬一会儿就会死掉。学艺不精,望指教。
var request = require("request");
var cheerio = require("cheerio");
var async = require('async');
var moment = require("moment");
var fs = require('fs');
var iconv = require('iconv-lite')
// Site root; link hrefs scraped from the index page are appended to it.
var startUrl = 'http://www.ting89.com/';

// Book title; fetchUrl uses it as the base name of the output CSV file.
var name = '全职高手';

// Browser-like request settings. Nothing in the visible code reads this
// object; it is kept in case another part of the file depends on it.
var options = {
    method: 'GET',
    url: startUrl + '-1.html',
    charset: 'utf-8',
    headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36'
    }
};

// Number of pages currently waiting out fetchUrl's random delay.
var concurrencyCount = 0;
/**
 * Pulls the iframe `src` out of a fetched page, appends it to the CSV file
 * named after the book, then reports completion after a random pause.
 *
 * @param {*} body - Undecoded page content; it is decoded here as gb2312.
 * @param {string} url - Address of the page (used for logging and in the result).
 * @param {Function} callback - Invoked as callback(null, url + ' html content')
 *     once the pause has elapsed.
 */
var fetchUrl = function (body, url, callback) {
    var html = iconv.decode(body, 'gb2312');
    var $ = cheerio.load(html);
    var link = $('iframe').attr('src');

    if (link) {
        // Strip the down.php wrapper so only what followed `url=` is stored,
        // one address per line.
        var target = link.replace('http://play.ting89.com/down/down.php?url=', '');
        fs.appendFileSync(name + '.csv', target + "\n");
    }

    // Random whole number of milliseconds below 2000.
    var jitter = (Math.random() * 10000000) % 2000;
    var delay = parseInt(jitter, 10);

    concurrencyCount += 1;
    console.log('现在的并发数是', concurrencyCount, ',正在抓取的是', url, ',耗时' + delay + '毫秒\n');

    setTimeout(function () {
        concurrencyCount -= 1;
        callback(null, url + ' html content');
    }, delay);
};
// Maximum number of times one page is requested before it is given up on.
var MAX_ATTEMPTS = 3;

// Milliseconds before an unresponsive request is aborted, so a hung
// connection cannot occupy one of the mapLimit worker slots forever.
var REQUEST_TIMEOUT_MS = 15000;

/**
 * Builds the `request` options for downloading one page.
 *
 * @param {string} url - Absolute address of the page.
 * @returns {Object} Options object to pass to `request`.
 */
function buildPageRequest(url) {
    return {
        url: url,
        method: 'GET',
        // NOTE(review): `charset` does not appear to be a documented `request`
        // option; kept exactly as in the original.
        charset: "utf-8",
        // fetchUrl does the gb2312 decoding itself, so no encoding is applied here.
        encoding: null,
        timeout: REQUEST_TIMEOUT_MS,
        headers: {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36"
        }
    };
}

/**
 * Downloads one page and hands its body to fetchUrl, retrying on a network
 * error or a non-200 status.
 *
 * Every path ends by completing `done`: either through fetchUrl on success, or
 * directly with a null result once the attempts are used up. The previous code
 * left `done` uncalled on non-200 responses and failed retries, which stalled
 * async.mapLimit, and it dereferenced an undefined `response` after an error.
 *
 * @param {string} url - Absolute address of the page.
 * @param {number} attemptsLeft - Attempts remaining, including this one.
 * @param {Function} done - Node-style callback from async.mapLimit.
 */
function fetchPage(url, attemptsLeft, done) {
    request(buildPageRequest(url), function (err, response, body) {
        if (!err && response && response.statusCode === 200) {
            fetchUrl(body, url, done);
            return;
        }

        if (err) {
            console.log(err);
        } else {
            console.log(response && response.statusCode);
        }

        if (attemptsLeft > 1) {
            console.log("重新爬取页面:"+url);
            fetchPage(url, attemptsLeft - 1, done);
            return;
        }

        // Report the failure as a null result instead of an error, so one bad
        // page does not make mapLimit abandon the remaining pages.
        console.log('抓取失败:' + url);
        done(null, null);
    });
}

// Absolute addresses of every page to crawl, collected from the index page.
var urls = [];

// Fetch the book's index page, collect the links containing "down", then
// crawl them with at most 5 downloads in flight.
request({
    url: 'http://www.ting89.com/books/12828.html',
    method: 'GET',
    charset: "UTF-8",
    encoding: null,
    timeout: REQUEST_TIMEOUT_MS
}, function (err, response, body) {
    if (err) {
        // Previously swallowed silently; without the index there is nothing to crawl.
        console.log(err);
        return;
    }
    console.log(response.statusCode);

    // The body is parsed without gb2312 decoding; presumably fine because only
    // href attributes are read from it.
    var $ = cheerio.load(body);
    $('li').children('a').each(function (i, e) {
        var url = $(e).attr('href');
        // An <a> without href yields undefined, which used to throw on indexOf.
        // `> 0` (not `>= 0`) is kept from the original: an href that starts
        // with "down" is still treated as invalid.
        if (typeof url === 'string' && url.indexOf("down") > 0) {
            console.log('有效'+url);
            urls.push(startUrl + url);
        } else {
            console.log('无效'+url);
        }
    });
    console.log('获得有效连接数:'+urls.length);

    async.mapLimit(urls, 5, function (url, callback) {
        fetchPage(url, MAX_ATTEMPTS, callback);
    }, function (err, result) {
        if (err) {
            console.log(err);
        }
        console.log('final:');
        console.log(result);
    });
});