node 爬虫 实战 - 爬取拉勾网职位数据

Node 爬虫实战——爬取拉勾网职位数据。主要是想把这些数据用于大数据学习:之后做大数据分析时,可以自己分析一下职位的整体情况,并比较一下我现在的职位在深圳乃至全国的开发人员中处于什么水平。

涉及到的技术栈:Node.js、MongoDB、Express、node-crawler

源码下载:https://gitee.com/draven_lee/node-spider ,如果有帮助到你,不需要打赏,欢迎给star。

这个爬虫功能有点简单,仅供学习参考。

  1. 先爬取 lagou.com 首页 menu 菜单中的 url,并把每个 menu 的前 30 页 url 加入爬虫队列;之后沿着队列中的 url 逐页爬取,把需要的数据提取出来。
  2. 爬取过程中发现:如果请求太频繁,或者没有带上用户的登录态,就会被重定向到其他页面,应该是拉勾网做了防爬虫机制。所以我把爬取速度放慢(每 25 秒请求一次),并加上模拟的登录态(Cookie),这样就可以成功爬取到数据。不过按照这个比较慢的速度,我爬了四天四夜,一共爬了 10W+ 条职位数据。

废话不说,先上代码。

// Dependencies for the crawler route module.
var express = require('express');
var MongoClient = require('mongodb').MongoClient;
// NOTE(review): `request` (superagent) is not referenced in the code visible here — confirm whether it is still needed.
const request = require('superagent');
// MongoDB connection string (database "draven" on localhost).
var url = "mongodb://localhost:27017/draven";
var router = express.Router();
// NOTE(review): `cheerio` is not referenced directly in the visible code; the crawler is created with `jQuery: true` and the callbacks read `res.$` instead — confirm whether this require is needed.
var cheerio = require('cheerio');
var Crawler = require("crawler");


// Crawl data
/**
 * GET /crawlData — starts a crawl of lagou.com and stores the results in MongoDB.
 *
 * Flow:
 *   1. Connect to MongoDB (database "draven").
 *   2. Fetch the lagou.com home page, store every `.menu_sub a` link in the
 *      "menu" collection and queue pages 1..30 of each link.
 *   3. For every queued list page, extract the `.con_list_item` rows and
 *      insert them into the "job" collection.
 *   4. Close the MongoDB connection once the crawler queue has drained.
 *
 * The 'craw' view is rendered as soon as the home-page request has been
 * queued; the crawl itself keeps running in the background. A failed MongoDB
 * connection is forwarded to Express via `next(err)`.
 *
 * NOTE(review): the session cookie below is hard-coded and contains login
 * tokens; consider loading it from configuration instead of source control.
 */
router.get('/crawlData', function(req, res, next) {
    const url = "mongodb://localhost:27017/draven";
    const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36';
    const pagesPerMenu = 30;
    const location = '深圳';
    const menuList = [];
    const urlList = [];

    MongoClient.connect(url, function(err, db) {
        if (err) {
            // Throwing inside this async callback would crash the process and
            // leave the HTTP request unanswered; hand the error to Express.
            return next(err);
        }
        const dbo = db.db("draven");

        /**
         * Insert `docs` into `collectionName`, then call `finished` exactly once.
         * Failures are logged instead of thrown: a throw inside the driver
         * callback cannot be caught by the caller and would kill the process.
         */
        function saveAll(collectionName, docs, successMessage, finished) {
            let settled = false;
            function finish() {
                if (!settled) {
                    settled = true;
                    finished();
                }
            }
            // An empty document list is an error for insertMany, so skip the write.
            if (docs.length === 0) {
                finish();
                return;
            }
            try {
                dbo.collection(collectionName).insertMany(docs, function(insertErr) {
                    if (insertErr) {
                        console.log(insertErr);
                    } else {
                        console.log(successMessage);
                    }
                    finish();
                });
            } catch (e) {
                console.log(e);
                finish();
            }
        }

        const c = new Crawler({
            preRequest: function(options, done) {
                // 'options' here is not the 'options' you pass to 'c.queue', instead, it's the options that is going to be passed to 'request' module
                console.log(options.uri);
                // when done is called, the request will start
                done();
            },
            jQuery: true,       // expose cheerio's jQuery-like API as res.$
            rateLimit: 25000,   // crawl speed: one request every 25 seconds
            maxConnections: 1,  // maximum crawl concurrency: 1
            headers: {          // imitate a real, logged-in browser session
                // Cookies must be separated by "; " — without it the city cookie
                // and user_trace_token are fused into one malformed cookie.
                'Cookie': 'index_location_city=' + encodeURI(location) + '; user_trace_token=20181127172617-5d56fc60-618b-4486-9762-21efad3c49df; JSESSIONID=ABAAABAAAFCAAEG70DFEA8B139FF80287ABDF2F4C137946; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; index_location_city=%E6%B7%B1%E5%9C%B3; _ga=GA1.2.405959562.1543310779; _gid=GA1.2.577762828.1543310779; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543310402,1543310779; LGSID=20181127172618-7ef404ed-f226-11e8-80e4-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGUID=20181127172618-7ef406c2-f226-11e8-80e4-525400f775ce; _gat=1; TG-TRACK-CODE=index_navigation; SEARCH_ID=88db5c7fa2464090a6dd7041f35074ba; X_HTTP_TOKEN=492369107a1a20441020ab9b771f2f6d; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221675489482d244-0f3ef5ad6aef94-4313362-2073600-1675489482e36f%22%2C%22%24device_id%22%3A%221675489482d244-0f3ef5ad6aef94-4313362-2073600-1675489482e36f%22%7D; sajssdk_2015_cross_new_user=1; ab_test_random_num=0; _putrc=69D503B669D896FC123F89F2B170EADC; login=true; hasDeliver=0; gate_login_token=33f3414d87f12e09e089b3b6daf10134f0a5ebf49fad63dfd9b8bc4e3a4f162b; unick=hello; LGRID=20181127174101-8d501f2b-f228-11e8-8c21-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543311662',
                'User-Agent': userAgent,
            },
            // Default callback: handles every queued job-list page.
            callback: function (error, crawlRes, done) {
                if (error) {
                    console.log(error);
                    done();
                    return;
                }

                const jobList = [];
                try {
                    const $ = crawlRes.$;
                    console.log($('title').text());
                    $('.con_list_item').each(function (idx, item) {
                        const $item = $(item);
                        jobList.push({
                            name: $item.find('.position_link').find('h3').text(),
                            address: $item.find('.add').find('em').text(),
                            company: $item.find('.company_name').find('a').text(),
                            companyLink: $item.find('a').attr('href'),
                            companyImg: $item.find('.com_logo').find('img').attr('src'),
                            money: $item.find('.money').text(),
                            label: $item.find('.list_item_bot').find('span').text(),
                            welfare: $item.find('.li_b_r').text()
                        });
                    });
                    console.log('c.queueSize', c.queueSize);
                    console.log('jobList', jobList.length);
                } catch (e) {
                    // e.g. the response was not parseable HTML, so res.$ is unusable
                    console.log(e);
                    done();
                    return;
                }

                // Save to the database; the crawler slot is released only after
                // the write finishes, so 'drain' cannot fire with a write pending.
                saveAll("job", jobList, 'job 数据导入成功!', done);
            }
        });

        // The queue is empty only after every page has been processed and its
        // insert has completed, so the connection can be released here.
        c.on('drain', function() {
            db.close();
        });

        // Crawl the menu data on the home page
        c.queue([
            {
            uri: 'https://www.lagou.com/',
            headers: {
                // `Set-Cookie` is a response header; a client sends its cookies
                // in `Cookie`. The former ad-hoc JSESSIONID/login headers are
                // cookies too, so they are folded into the same header.
                'Cookie': 'index_location_city=' + encodeURI(location) + '; JSESSIONID=ABAAABAAAGFABEFC5E29AF672C4DAF0B10AEE494D83FD62; login=true',
                'User-Agent': userAgent,
            },
            // The global callback won't be called
            callback: function (error, crawlRes, done) {
                if (error) {
                    console.log(error);
                    done();
                    return;
                }

                try {
                    const $ = crawlRes.$;
                    $('.menu_sub a').each(function (idx, element) {
                        const $element = $(element);
                        const href = $element.attr('href');
                        menuList.push({
                            name: $element.text(),
                            tjId: $element.attr('data-lg-tj-id'),
                            tjNo: $element.attr('data-lg-tj-no'),
                            tjCid: $element.attr('data-lg-tj-cid'),
                            link: href,
                        });
                        // An anchor without href would otherwise yield "undefined1/" URLs.
                        if (!href) {
                            return;
                        }
                        // Build the URLs of the first 30 pages of this menu entry
                        for (let page = 1; page <= pagesPerMenu; page++) {
                            urlList.push(href + page + '/');
                        }
                    });
                    // Add the menu URLs found on the home page to the crawl queue
                    if (urlList.length > 0) {
                        c.queue(urlList);
                    }
                    console.log(urlList, urlList.length);
                    console.log('menuList 共', menuList.length, '条数据');
                } catch (e) {
                    console.log(e);
                    done();
                    return;
                }

                saveAll("menu", menuList, '数据导入成功!', done);
            }
        }]);
        res.render('craw');
    });
});



// Expose the router as this module's export.
module.exports = router;

 

你可能感兴趣的:(node.js,javascript)