Just for fun——写个爬虫抓取whois信息

目标对象和过程

爬取的网站是西部数码,该网站在https://www.west.cn/web/whois...可以查询whois信息,通过chrome调试知道,数据是从接口:https://www.west.cn/web/whois...中获取的

cookie

分析该请求的cookie发现,qtoken2016这个key最为重要(它是反爬虫的标记;我之前写代码的时候,这个key还叫qtoken)。这个token不是在本页生成的:在https://www.west.cn/main/whoi...这一页点击查看whois时,页面会发送请求https://www.west.cn/services/...来生成新的token,返回的结果是一段混淆过的js代码:

// Sample of the obfuscated token script returned by the server.
// `l` holds the char codes of the real script, each shifted by +1; the packed
// loader below rebuilds that script (String.fromCharCode(l[i]-1)) and eval()s it.
// The decoded script ends with: $.cookie('qtoken2016', b, {path:'/'});
// It relies on sloppy-mode implicit globals and on a `$` object being in scope.
var l=[119,98,115,33,117,116,101,112,98,62,92,50,50,54,45,50,49,50,45,50,50,52,45,50,50,49,45,50,50,55,45,50,51,51,45,50,49,58,45,50,50,54,45,50,50,52,45,50,49,55,45,50,50,54,45,50,49,50,45,50,50,51,45,50,50,54,45,50,50,52,45,50,50,51,45,50,51,51,45,50,50,51,45,50,50,58,45,50,49,55,45,50,50,55,45,50,49,50,94,60,119,98,115,33,101,99,105,107,114,62,92,57,45,53,45,50,56,45,50,49,45,50,50,45,50,57,45,58,45,50,51,45,51,49,45,50,54,45,49,45,50,52,45,55,45,54,45,50,55,45,51,50,45,52,45,50,58,45,50,53,45,50,45,56,45,51,94,60,119,98,115,33,99,62,35,35,60,103,112,115,33,41,100,62,49,60,100,61,101,99,105,107,114,47,109,102,111,104,117,105,60,100,44,44,42,124,99,44,62,84,117,115,106,111,104,47,103,115,112,110,68,105,98,115,68,112,101,102,41,117,116,101,112,98,92,101,99,105,107,114,92,100,94,94,42,126,60,37,47,100,112,112,108,106,102,41,40,114,117,112,108,102,111,51,49,50,55,40,45,99,45,124,113,98,117,105,59,40,48,40,126,42,60];eval(function(p,a,c,k,e,d){e=function(c){return(c<a?'':e(parseInt(c/a)))+((c=c%a)>35?String.fromCharCode(c+29):c.toString(36))};if(!''.replace(/^/,String)){while(c--)d[e(c)]=k[c]||e(c);k=[function(e){return d[e]}];e=function(){return'\\w+'};c=1;};while(c--)if(k[c])p=p.replace(new RegExp('\\b'+e(c)+'\\b','g'),k[c]);return p;}('6 3=\'\';7(2=0;2<4.5;2++){3+=8.a(4[2]-1)};9(3)',11,11,'||i|t|l|length|var|for|String|eval|fromCharCode'.split('|'),0,{}))

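对于这一点,用js的eval函数模拟一下就可以解决:这段代码解混淆后,最终执行的是$.cookie('qtoken2016', token, {path:'/'}),所以只要先伪造一个带有cookie方法的$对象,再eval返回的脚本,就能拿到token。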

代码

/**
 * Created by salamander on 2016/11/8.
 */
let request = require('request');
let Q = require('q');
let datetime = require('locutus/php/datetime');

// Endpoint that returns the obfuscated token script; a timestamp is appended as cache buster.
const getTokenUrl = 'http://www.west.cn/main/whois.asp?act=gettok&_=';
// Whois JSON endpoint; the domain name is appended to this prefix.
const whoisUrl = 'http://www.west.cn/web/whois/whoisinfo?domain=';
// Names of the fields of interest.
const needFields = [
    'domain',
    'registrar',
    'country',
    'mail',
    'whoisinfo',
    'add_time',
    'registrant_name',
    'expire_date'
];

// Minimal jQuery stand-in: the token script calls `$.cookie(name, value, options)`,
// so the shim only needs a `cookie` method that remembers the value it was given.
const jQuery = {
    token: '',
    // Stores the cookie value on the object the method is called on; name and options are ignored.
    cookie: function (name, value, options) {
        this.token = value;
    }
};
const $ = jQuery;

/**
 * Fetch the whois record of a domain from West.cn (西部数码).
 *
 * Two requests are made: the first downloads a JavaScript snippet that sets the
 * anti-crawler token through `$.cookie(name, value, options)`; the second calls
 * the whois endpoint with that cookie attached.
 *
 * @param {string} domain Domain name to look up.
 * @param {string} [proxy] Optional proxy as "host:port"; "http://" is prepended.
 * @returns {Object} Q promise, resolved with the result of extractWestData() or
 *     rejected with the request error / a message describing the failure.
 */
function getWestWhois(domain, proxy) {
    const defer = Q.defer();

    // Attach the proxy (when one was supplied) to a request options object.
    const buildOptions = function (options) {
        if (proxy) {
            options.proxy = 'http://' + proxy.trim();
        }
        return options;
    };

    request(buildOptions({url: getTokenUrl + Date.now()}), function (error, response, body) {
        if (error || !response || response.statusCode !== 200) {
            // `error` is null on a non-200 response; never reject with null.
            defer.reject(error || '请求西部数码token失败');
            return;
        }

        // Run the token script against a per-call shim so that a token left over
        // from an earlier call can never be reused by mistake.
        let cookie;
        try {
            cookie = runTokenScript(body);
        } catch (err) {
            defer.reject('执行token脚本出错:' + err);
            return;
        }
        if (!cookie.value) {
            defer.reject('生成token失败');
            return;
        }

        const options = buildOptions({
            url: whoisUrl + encodeURIComponent(domain) + '&server=&refresh=1',
            headers: {
                // Send the cookie under the name the script itself used
                // (e.g. "qtoken2016"), instead of a hard-coded "qtoken".
                'Cookie': (cookie.name || 'qtoken') + '=' + cookie.value,
                'X-Requested-With': 'XMLHttpRequest'
            }
        });

        request(options, function (error, response, body) {
            if (error || !response || response.statusCode !== 200) {
                defer.reject('请求西部数码whois失败');
                return;
            }
            let data = null;
            try {
                data = JSON.parse(body);
            } catch (err) {
                defer.reject('解析json出错:' + err);
                return;
            }
            // `data` may legitimately parse to null, so guard before reading `code`.
            if (!data || data['code'] !== 200) {
                defer.reject('查询西部数码whois失败');
                return;
            }
            let result;
            try {
                result = extractWestData(domain, data);
            } catch (err) {
                // Without this the promise would never settle if extraction throws.
                defer.reject('提取西部数码whois数据出错:' + err);
                return;
            }
            defer.resolve(result);
        });
    });
    return defer.promise;
}

/**
 * Execute the token script returned by the server and capture the cookie it sets.
 *
 * SECURITY: this eval()s code received over the network, so the remote server
 * (or anyone able to tamper with the plain-HTTP response) can run arbitrary code
 * in this process. Kept because the token can only be obtained by running the script.
 *
 * @param {string} script JavaScript source that calls `$.cookie(name, value, options)`.
 * @returns {{name: string, value: string}} Name and value passed to `$.cookie`
 *     (both empty strings when the script never called it).
 */
function runTokenScript(script) {
    // `var` on purpose: eval() hoists the script's own `var` declarations into this
    // function, and a clash with a let/const binding would be a SyntaxError.
    var captured = {name: '', value: ''};
    var $ = {
        cookie: function (name, value) {
            captured.name = name;
            captured.value = value;
        }
    };
    // Alias looked up by the evaluated script if it uses `jQuery` instead of `$`.
    var jQuery = $;
    eval(script);
    return captured;
}

/**
 * Build the normalized whois record from a West.cn whois API response.
 *
 * @param {string} domain Domain that was queried.
 * @param {Object} data Parsed JSON response; the fields read are `body`,
 *     `dom_em`, `dom_org`, `registrer` and `expdate`.
 * @returns {Object} Record with `domain`, `mail`, `errcode`, `country`,
 *     `registrant_name`, `registrar`, `expire`, plus `whoisinfo`: a JSON string
 *     of those same seven fields.
 */
function extractWestData(domain, data) {
    /**
     * Determine the registrant country code.
     * @param {string} domain Queried domain; a ".cn" suffix short-circuits to "CN".
     * @param {string} html Raw whois text in which "Registrant Country:" is searched.
     * @returns {string} Country value, or '' when it cannot be determined.
     */
    function solveCountry(domain, html) {
        // The original `substr(-1, 3)` only ever returned the last character,
        // so the ".cn" shortcut could never trigger.
        if (typeof domain === 'string' && domain.toLowerCase().endsWith('.cn')) {
            return 'CN';
        }
        if (typeof html !== 'string') {
            return '';
        }
        // Capture the whole value up to whitespace or the start of an HTML tag
        // (a lazy `\S+?` would stop after a single character).
        const match = html.match(/Registrant Country:[ \t]*([^\s<]+)/);
        return match ? match[1] : '';
    }

    const record = {
        domain: domain,
        mail: data['dom_em'],
        errcode: 0,
        country: solveCountry(domain, data['body']),
        registrant_name: data['dom_org'],
        registrar: data['registrer'],
        expire: data['expdate']
    };
    // `whoisinfo` is the serialized form of the record itself (without `whoisinfo`).
    return Object.assign({}, record, {whoisinfo: JSON.stringify(record)});
}


module.exports.getWestWhois = getWestWhois;

使用

const westWhois = require('./west_whois.js');

// Look up a domain and print the record. getWestWhois() rejects on any
// network/token/parse failure, so the rejection is reported as well.
westWhois.getWestWhois('51nazi.com').then((info) => {
    console.log(info);
}).catch((err) => {
    console.error(err);
});

结果:

另外

51nazi.com这个域名是我的,有意出售。

你可能感兴趣的:(javascript,node.js,nodejs爬虫)