用nodejs+request+cheerio做一个爬取有声小说的小虫子。

最近在听小说《全职高手》,但是没找到打包下载的地方,就找了个小说网站准备弄个爬虫爬下来。

所以现学现用写了一个,本人菜鸡一只,大神勿喷。直接上代码:爬虫抓取的是音频的下载地址,保存下来之后再用迅雷下载。

这个程序有个问题:爬一会儿就会死掉。学艺不精,望指教。

var request = require("request");
var cheerio = require("cheerio");
var async = require('async');
var moment = require("moment");
var fs = require('fs');
var iconv = require('iconv-lite')



// Site root; hrefs collected from the index page are appended to this prefix.
var startUrl = 'http://www.ting89.com/';
// Book title; also the base name of the CSV file the download addresses go to.
var name = '全职高手';

// Desktop-browser identification sent along with the request.
var browserHeaders = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36'
};

// Request settings for the `<startUrl>-1.html` page.
// NOTE(review): nothing in the visible code reads `options` — confirm whether
// it is still needed before removing it.
var options = {
    url: startUrl + '-1.html',
    method: 'GET',
    charset: 'utf-8',
    headers: browserHeaders
};


// Number of fetchUrl calls that are currently waiting out their delay.
var concurrencyCount = 0;

/**
 * Pulls the download address out of one fetched page, appends it to
 * `<name>.csv`, and reports completion after a random pause.
 *
 * @param {*} body - Page content as received by the caller; it is decoded
 *     here as gb2312 before parsing.
 * @param {string} url - Address the page came from (used for logging and
 *     for the result string only).
 * @param {Function} callback - Node-style callback; invoked once as
 *     `callback(null, url + ' html content')` when the pause is over.
 */
function fetchUrl(body, url, callback) {
    var html = iconv.decode(body, 'gb2312');
    var $ = cheerio.load(html);
    var iframeSrc = $('iframe').attr('src');

    if (iframeSrc) {
        // Strip the down.php prefix so only the value after `url=` is stored.
        var downloadAddress = iframeSrc.replace('http://play.ting89.com/down/down.php?url=', '');
        fs.appendFileSync(name + '.csv', downloadAddress + "\n");
    }

    // The delay is a random integer below 2000 (milliseconds).
    var delay = parseInt((Math.random() * 10000000) % 2000, 10);

    concurrencyCount += 1;
    console.log('现在的并发数是', concurrencyCount, ',正在抓取的是', url, ',耗时' + delay + '毫秒\n');

    setTimeout(function onDelayElapsed() {
        concurrencyCount -= 1;
        callback(null, url + ' html content');
    }, delay);
}

// Page URLs collected from the book's index page.
var urls = [];

// How many times a page is requested again after a transport-level error.
var MAX_RETRIES = 1;

/**
 * Builds the request options for one page.
 *
 * @param {string} url - Page address.
 * @returns {Object} Options object for `request`; `encoding: null` is kept
 *     so the body is handed over undecoded and fetchUrl can decode it.
 */
function buildPageRequest(url) {
    return {
        url: url,
        method: 'GET',
        charset: "utf-8",
        encoding: null,
        headers: {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36",
        }
    };
}

/**
 * Downloads one page and passes it to fetchUrl.
 *
 * Every path ends by completing `callback` (directly, through a retry, or
 * through fetchUrl). A path that forgets to do so permanently occupies one
 * of the mapLimit slots, and the crawl stalls once all slots are lost.
 * Failures are reported as a result string instead of an error so that one
 * bad page does not abort the remaining downloads.
 *
 * @param {string} url - Page address.
 * @param {number} retriesLeft - Remaining retries after a request error.
 * @param {Function} callback - Node-style completion callback.
 */
function crawlPage(url, retriesLeft, callback) {
    request(buildPageRequest(url), function (err, response, body) {
        if (err) {
            // `response` is undefined here, so it must not be touched.
            if (retriesLeft > 0) {
                console.log("重新爬取页面:" + url);
                crawlPage(url, retriesLeft - 1, callback);
                return;
            }
            console.log(err);
            callback(null, url + ' request failed');
            return;
        }

        if (response && response.statusCode === 200) {
            fetchUrl(body, url, callback);
            return;
        }

        // Non-200 answer: log it and release the slot instead of hanging.
        console.log('跳过页面:' + url + ',状态码:' + (response ? response.statusCode : '无'));
        callback(null, url + ' skipped');
    });
}

// Fetch the book's index page, collect the links whose href contains "down",
// then crawl them with at most 5 requests in flight.
request({
    url: 'http://www.ting89.com/books/12828.html',
    method: 'GET',
    charset: "UTF-8",
    encoding: null,
}, function (err, response, body) {
    if (err) {
        // Surface the failure instead of exiting silently.
        console.log(err);
        return;
    }
    console.log(response.statusCode);
    // The body is parsed without decoding; only href attributes are read here.
    var $ = cheerio.load(body);

    $('li').children('a').each(function (i, e) {
        var url = $(e).attr('href');
        // Anchors without an href yield undefined; treat them as invalid
        // rather than calling indexOf on undefined.
        if (url && url.indexOf("down") > 0) {
            console.log('有效'+url);
            urls.push(startUrl + url);
        } else {
            console.log('无效'+url);
        }
    });

    console.log('获得有效连接数:'+urls.length);
    async.mapLimit(urls, 5, function (url, callback) {
        crawlPage(url, MAX_RETRIES, callback);
    }, function (err, result) {
        if (err) {
            console.log(err);
        }
        console.log('final:');
        console.log(result);
    });
});

 

你可能感兴趣的:(node.js)