目标对象和过程
爬取的网站是西部数码,该网站在https://www.west.cn/web/whois...可以查询whois信息,通过chrome调试知道,数据是从接口:https://www.west.cn/web/whois...中获取的
cookie
分析该请求发现,Cookie中的qtoken2016这个key最为重要(它是反爬虫的标记;我最初写这段代码的时候,这个key还叫qtoken)。这个token并不是在本页生成的,而是在https://www.west.cn/main/whoi...这一页点击查看whois的时候,通过发送请求https://www.west.cn/services/...来生成的,返回的结果是一段混淆过的js代码:
var l=[119,98,115,33,117,116,101,112,98,62,92,50,50,54,45,50,49,50,45,50,50,52,45,50,50,49,45,50,50,55,45,50,51,51,45,50,49,58,45,50,50,54,45,50,50,52,45,50,49,55,45,50,50,54,45,50,49,50,45,50,50,51,45,50,50,54,45,50,50,52,45,50,50,51,45,50,51,51,45,50,50,51,45,50,50,58,45,50,49,55,45,50,50,55,45,50,49,50,94,60,119,98,115,33,101,99,105,107,114,62,92,57,45,53,45,50,56,45,50,49,45,50,50,45,50,57,45,58,45,50,51,45,51,49,45,50,54,45,49,45,50,52,45,55,45,54,45,50,55,45,51,50,45,52,45,50,58,45,50,53,45,50,45,56,45,51,94,60,119,98,115,33,99,62,35,35,60,103,112,115,33,41,100,62,49,60,100,61,101,99,105,107,114,47,109,102,111,104,117,105,60,100,44,44,42,124,99,44,62,84,117,115,106,111,104,47,103,115,112,110,68,105,98,115,68,112,101,102,41,117,116,101,112,98,92,101,99,105,107,114,92,100,94,94,42,126,60,37,47,100,112,112,108,106,102,41,40,114,117,112,108,102,111,51,49,50,55,40,45,99,45,124,113,98,117,105,59,40,48,40,126,42,60];eval(function(p,a,c,k,e,d){e=function(c){return(c<a?'':e(parseInt(c/a)))+((c=c%a)>35?String.fromCharCode(c+29):c.toString(36))};if(!''.replace(/^/,String)){while(c--)d[e(c)]=k[c]||e(c);k=[function(e){return d[e]}];e=function(){return'\\w+'};c=1;};while(c--)if(k[c])p=p.replace(new RegExp('\\b'+e(c)+'\\b','g'),k[c]);return p;}('6 3=\'\';7(2=0;2<4.5;2++){3+=8.a(4[2]-1)};9(3)',11,11,'||i|t|l|length|var|for|String|eval|fromCharCode'.split('|'),0,{}))
对于这一点,用js的eval函数模拟一下就可以解决。
代码
/**
* Created by salamander on 2016/11/8.
*/
let request = require('request');
let Q = require('q');
let datetime = require('locutus/php/datetime');
// Endpoint that serves the token-generating script; the caller appends a timestamp.
const getTokenUrl = 'http://www.west.cn/main/whois.asp?act=gettok&_=';
// Endpoint that serves the whois data; the caller appends the domain name.
const whoisUrl = 'http://www.west.cn/web/whois/whoisinfo?domain=';
// Names of the fields we are interested in
const needFields = [
    'domain',
    'registrar',
    'country',
    'mail',
    'whoisinfo',
    'add_time',
    'registrant_name',
    'expire_date'
];
// Minimal stand-in for jQuery: the token script served by the site calls
// `$.cookie(name, value, options)`, so that is the only method provided here.
const jQuery = {
    // Value passed to the most recent `$.cookie(name, value)` call
    token: '',
    // Cookie name passed to the most recent `$.cookie(name, value)` call
    tokenName: ''
};
const $ = jQuery;
/**
 * Record the cookie the token script tries to set.
 *
 * @param {string} name Cookie name chosen by the script
 * @param {*} [value] Cookie value; when omitted the call is treated as a read
 * @param {Object} [options] Ignored (accepted for signature compatibility)
 * @returns {*} On a read: the recorded value if `name` matches the recorded
 *     name, otherwise undefined. On a write: undefined.
 */
jQuery.cookie = function(name, value, options) {
    if (value === undefined) {
        // A read must not wipe out a token that was recorded earlier.
        return name === this.tokenName ? this.token : undefined;
    }
    this.tokenName = name;
    this.token = value;
};
/**
 * Run the token script returned by the site.
 *
 * The script is expected to call `$.cookie(name, value, ...)` on the
 * module-level jQuery stand-in, which records the token. Running it in its own
 * function keeps any `var` the script declares from colliding with the locals
 * of the caller.
 *
 * @param {string} script JavaScript source received from the token endpoint
 */
function runTokenScript(script) {
    // SECURITY: this executes code supplied by a remote server with the full
    // privileges of this process. Only use it against an endpoint you trust.
    eval(script);
}
/**
 * Fetch whois information for a domain from west.cn.
 *
 * Two requests are made: the first downloads the token script (getTokenUrl
 * plus a timestamp) and evaluates it; the second queries whoisUrl, sending the
 * token as a cookie.
 *
 * @param {string} domain Domain name to look up
 * @param {string} [proxy] Optional proxy address; it is trimmed and prefixed
 *     with "http://" (presumably "host:port" - confirm with callers)
 * @returns {Promise} Q promise resolved with the value of extractWestData, or
 *     rejected with an error message / the request error
 */
function getWestWhois(domain, proxy) {
    const defer = Q.defer();
    const proxyUrl = proxy ? 'http://' + proxy.trim() : null;

    const tokenOptions = {
        url: getTokenUrl + Date.now()
    };
    if (proxyUrl) {
        tokenOptions.proxy = proxyUrl;
    }

    request(tokenOptions, function (error, response, body) {
        if (error || response.statusCode !== 200) {
            // Never reject with null: report the HTTP status when there is no transport error.
            defer.reject(error || '请求token失败,状态码:' + response.statusCode);
            return;
        }

        // Forget the token of any earlier call so that a script which sets
        // nothing is reported as a failure instead of reusing a stale token.
        $.token = '';
        try {
            runTokenScript(body);
        } catch (err) {
            defer.reject('执行token脚本出错:' + err);
            return;
        }

        const token = $.token;
        if (!token) {
            defer.reject('生成token失败');
            return;
        }
        // Send the cookie under the name the script used when one was
        // recorded; otherwise fall back to the historical name.
        const cookieName = $.tokenName || 'qtoken';

        const whoisOptions = {
            url: whoisUrl + encodeURIComponent(domain) + '&server=&refresh=1',
            headers: {
                'Cookie': cookieName + '=' + token,
                'X-Requested-With': 'XMLHttpRequest'
            }
        };
        if (proxyUrl) {
            whoisOptions.proxy = proxyUrl;
        }

        request(whoisOptions, function (whoisError, whoisResponse, whoisBody) {
            if (whoisError || whoisResponse.statusCode !== 200) {
                defer.reject('请求西部数码whois失败');
                return;
            }

            let data;
            try {
                data = JSON.parse(whoisBody);
            } catch (err) {
                defer.reject('解析json出错:' + err);
                return;
            }

            // `data` may legitimately parse to null; treat that as a failed query.
            if (!data || data['code'] !== 200) {
                defer.reject('查询西部数码whois失败');
                return;
            }

            try {
                defer.resolve(extractWestData(domain, data));
            } catch (err) {
                // Keep the promise from hanging if extraction throws.
                defer.reject('解析whois数据出错:' + err);
            }
        });
    });

    return defer.promise;
}
/**
 * Build the result record from a parsed west.cn whois response.
 *
 * @param {string} domain Domain name that was queried
 * @param {Object} data Parsed response; the fields read are `dom_em`,
 *     `dom_org`, `registrer`, `expdate` and `body`
 * @returns {Object} Record with domain, mail, errcode (always 0), country,
 *     registrant_name, registrar and expire, plus `whoisinfo`, a JSON string
 *     of those same seven fields
 */
function extractWestData(domain, data) {
    /**
     * Determine the registrant country.
     *
     * @param {string} domain Domain name that was queried
     * @param {string} html Raw whois text (`data.body`)
     * @returns {string} 'CN' for .cn domains, otherwise the value of the
     *     "Registrant Country:" line, or '' when it cannot be found
     */
    function solveCountry(domain, html) {
        // `substr(-1, 3)` only ever yields the last character, so compare the
        // real suffix instead (case-insensitively, as domain names are).
        if (typeof domain === 'string' && domain.toLowerCase().endsWith('.cn')) {
            return 'CN';
        }
        // The whois text may be absent from the response.
        if (typeof html !== 'string') {
            return '';
        }
        // Accept both LF and CRLF line endings after the country value.
        const result = html.match(/Registrant Country: (\S+?)\r?\n/);
        return result ? result[1].trim() : '';
    }

    const record = {
        domain: domain,
        mail: data['dom_em'],
        errcode: 0,
        country: solveCountry(domain, data['body']),
        registrant_name: data['dom_org'],
        registrar: data['registrer'],
        expire: data['expdate']
    };
    // `whoisinfo` is the serialized form of the fields above, appended last.
    return Object.assign({}, record, { whoisinfo: JSON.stringify(record) });
}
// Expose the whois lookup function to code that requires this module.
module.exports.getWestWhois = getWestWhois;
使用
const westWhois = require('./west_whois.js');
// getWestWhois returns a promise that is rejected when the token or whois
// request fails, so handle both outcomes instead of leaving the rejection unhandled.
westWhois.getWestWhois('51nazi.com').then((info) => {
    console.log(info);
}).catch((err) => {
    console.error(err);
});
结果:
另外
51nazi.com这个域名是我的,有意出售。