用nodejs库cheerio抓取网页内容与图片

     之前一直用 PHP(phpQuery)抓取网页,但 jQuery 的选择器更强大,于是改用 Node.js。只是 node-jquery 的依赖太多,只好改用 cheerio。

    下面是一个抓取脚本:

var http = require('http');
var fs = require('fs');
var cheerio = require("cheerio");
var bufferhelper = require("bufferhelper");
var Iconv = require("iconv").Iconv;
var mysql = require("mysql");


// Connection settings for the MySQL database that receives the scraped rows.
// NOTE(review): host and credentials are hard-coded here; consider loading
// them from configuration instead.
var dbOptions = {
    host: '192.168.1.133',
    port: '3306',
    user: 'php',
    password: 'php',
    database: 'crawl_db',
    charset: 'UTF8MB4_GENERAL_CI',
    connectTimeout: 1000
};

// Shared connection handle used by the scraping code below.
var myConn = mysql.createConnection(dbOptions);

//var IMAGE_DIR = __dirname+"/images/";
// Root directory that relative image paths are resolved against when saving.
var BASE_DIR = '/web/wwwroot/crawl_data/';
// Image directory, relative to BASE_DIR; this relative form is what
// create_file_dest returns.
var IMAGE_DIR = 'public/media/cards/';

// Build the relative destination path for an image from its source URL and id.
//
//   src - image URL; only its file extension is used.
//   id  - identifier used as the base name of the saved file.
//
// Returns IMAGE_DIR + id + extension. The extension is taken from the last
// path segment only, after dropping any query string or fragment, so
// "a.jpg?v=1" yields ".jpg" and a URL whose last segment has no dot yields no
// extension (instead of picking up text from the host name or the path).
// A missing or empty src yields a path without extension rather than throwing.
var create_file_dest = function(src, id){
    // Drop "?query" and "#fragment" so they cannot leak into the file name.
    var cleanSrc = String(src || '').split(/[?#]/)[0];
    // Look for the extension only inside the final path segment.
    var fileName = cleanSrc.substr( cleanSrc.lastIndexOf('/') + 1 );
    var dotAt = fileName.lastIndexOf('.');
    // dotAt === 0 is a dot-file name such as ".hidden", not an extension.
    var ext = dotAt > 0 ? fileName.substr(dotAt) : '';
    return IMAGE_DIR + id + ext;
};

// Download the image at `src` and write it to the file path `dest`.
//
//   src  - URL (or options object) passed straight to http.get.
//   dest - path of the file to write.
//
// Best-effort and fire-and-forget: nothing is returned, and every failure is
// logged to the console instead of being thrown, so one bad image does not
// stop the remaining downloads.
var download_image = function(src, dest){
    var request;

    try {
        request = http.get(src, function(res){
            // Do not save an error page as if it were an image.
            if (res.statusCode !== 200) {
                console.log("Error: unexpected HTTP status "+ res.statusCode +" for ["+ src +"]");
                // Discard the body so the socket can be released.
                res.resume();
                return;
            }

            var chunks = [];
            res.on('data', function(chunk){
                chunks.push(chunk);
            }).on('end', function(){
                // The file is written only once the whole body has arrived.
                fs.writeFile(dest, Buffer.concat(chunks), function(err){
                    if(err) console.log("Error: write file failure ["+ dest +"]");
                });
            }).on('error', function(e){
                console.log( "Got error:"+ e.message );
            });
        });
    } catch (e) {
        // http.get can throw synchronously for input it cannot handle,
        // for example a URL with a protocol other than "http:".
        console.log( "Got error:"+ e.message );
        return;
    }

    // Connection-level failures (DNS, refused connection, ...) are emitted on
    // the request object; without a listener they would crash the process.
    request.on('error', function(e){
        console.log( "Got error:"+ e.message );
    });
};


// Entry point: fetch the listing page, convert it from Shift_JIS to UTF-8,
// extract one card per "#news li" element, download each card image, insert
// all cards into the xc_card table in a single statement, and dump the
// extracted data to details.json next to this script.
http.get(
    {
        hostname: '2gree.info',
        port: 80,
        path: '/artgenre?category_id=45&type=bbsios2',
        headers: {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/600.1.3 (KHTML, like Gecko) Version/8.0 Mobile/12A4345d Safari/600.1.4'
        }
    },
    function(res){
        // Parsing an error page would silently produce zero cards.
        if (res.statusCode !== 200) {
            console.log('Error: unexpected HTTP status ' + res.statusCode);
            res.resume();
            return;
        }

        var chunks = [];

        res.on('data', function(chunk){
            chunks.push(chunk);
        });
        res.on('end', function(){
            // Convert the complete body at once, so a multi-byte character
            // split across two chunks is still decoded correctly.
            var convt = new Iconv('SHIFT_JIS', 'UTF-8');
            var html = convt.convert( Buffer.concat(chunks) ).toString();

            var sql = 'INSERT INTO xc_card(card_id, name, image) VALUES ?';
            var cats = [], values = [];
            var $ = cheerio.load(html);
            $('#news li').each(function(i){
                var $item = $(this);
                var e = {"title":"", src:"", dest:""};
                // Card ids are assigned sequentially, starting at 2001.
                var cardId = 2001+i;
                e.src = $item.find('.image img').attr('src');
                e.title = $item.find('.title').text();
                // Only derive a destination when the item has an image, so an
                // item without one keeps dest "" instead of aborting the run.
                if (e.src) e.dest = create_file_dest(e.src, cardId);
                cats.push(e);

                values.push([cardId, e.title, e.dest]);

                if(!e.src) return true;

                download_image(e.src, BASE_DIR + e.dest);
            });

            if (values.length > 0) {
                // The nested array expands "VALUES ?" into one tuple per card.
                myConn.query(sql, [values], function(err){
                    if(err) console.log(err);
                    myConn.end();
                });
            } else {
                // An empty list would produce an invalid INSERT statement.
                console.log('No items matched "#news li"; nothing to insert.');
            }

            // A callback is passed because fs.writeFile requires one on
            // current Node versions.
            fs.writeFile(__dirname+'/details.json', JSON.stringify(cats), function(err){
                if(err) console.log(err.message);
            });
        });
        res.on('error', function(e){
            console.log(e.message);
        });
    }
).on('error', function(e){
    // Connection-level failures are emitted on the request, not the response.
    console.log(e.message);
});

 

转载于:https://www.cnblogs.com/antarctican/p/5191231.html

你可能感兴趣的:(用nodejs库cheerio抓取网页内容与图片)