运营人员需要抓取快递信息,但用的第三方服务不太靠谱,于是自己在前端遍历请求。结果发现每个 IP 抓取约 50 条就会被屏蔽;也可以按每秒 5~6 个的速度慢慢抓,测试过这样不会被屏蔽。
准备工作
用的是 云连HTTP代理 的每天10个的免费 IP地址
用的request去抓取信息和获取IP地址
前端对接 用的是koa2 ,koa-bodyparser处理post请求,koa2-cors处理前端请求跨域
动态IP的设置方法是在网上(百度)搜到的,自己做了修改
package
{ "name": "ip", "version": "1.0.0", "description": "", "main": "index.js", "scripts": { "test": "echo \"Error: no test specified\" && exit 1" }, "author": "", "license": "ISC", "dependencies": { "bluebird": "^3.5.1", "koa": "^2.5.2", "koa-bodyparser": "^3.2.0", "koa2-cors": "^2.0.6", "query-string": "^6.1.0", "request": "^2.88.0" } }
以下是动态IP设置和快递信息请求
const request = require("request");
const Promise = require("bluebird");
const queryString = require('query-string');
/**
 * Pool of browser User-Agent strings. One entry is picked at random for each
 * outgoing tracking request so that the requests do not all carry the same
 * browser signature.
 *
 * Every entry is listed exactly once so that each User-Agent is equally
 * likely to be chosen.
 */
const userAgents = [
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
  'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) ,Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
  'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
  'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
  'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',
  'Opera/9.25 (Windows NT 5.1; U; en), Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
  'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
];
// Module-level variables are used here for testing only; a real deployment should use a proper data cache.
const expiryTime = 1 * 60 * 1000;// Expiry interval, in milliseconds
let ips = null; // Proxy IPs
let time = Date.now();// Time at which the proxy IPs were stored; intended for deciding whether they have expired and must be requested again
/**
 * Request a fresh list of proxy addresses from the proxy provider API.
 *
 * The resolved list is also stored in the module-level `ips` variable, and
 * `time` is set to the moment the request completed. Every call issues a
 * request; no expiry check is performed here.
 *
 * When the request fails, the response is not valid JSON, the payload has no
 * usable `data` array, or the API answers with `success === 'false'`, the
 * previously stored list is kept and returned instead of throwing.
 *
 * @param {boolean} [flag] - Callers pass `true` to ask for a forced refresh.
 *   The value is not read: every call already refreshes. Kept so existing
 *   call sites stay valid.
 * @returns {Promise<?string[]>} Resolves with "IP:Port" strings, or with the
 *   previously stored list (possibly `null`) when no new list was obtained.
 *   Never rejects.
 */
const getProxyList = (flag) => {
  return new Promise((resolve) => {
    // Proxy provider API endpoint
    const apiURL = 'http://xx.xxx.xxx.xx:xxxx/Index-generate_api_url.html?packid=7&fa=5&qty=10&port=1&format=json&ss=5&css=&pro=&city=';
    const options = {
      method: 'GET',
      url: apiURL,
      gzip: true,
      encoding: null,
    };
    request(options, (error, response, body) => {
      // Start from the list we already have; it is only replaced by a valid new one.
      let ret = ips;
      if (error) {
        // Without this guard `body` is undefined and the callback would throw,
        // leaving the promise pending forever.
        console.log(`获取代理IP失败,${error}`);
      } else {
        try {
          const payload = JSON.parse(body.toString());
          console.log('body:', payload);
          if (payload.success !== 'false') {
            ret = payload.data.map((item) => item.IP + ':' + item.Port);
          }
        } catch (parseError) {
          // Non-JSON response or a payload without a `data` array.
          console.log(`获取代理IP失败,${parseError}`);
        }
      }
      ips = ret;
      console.log(ret);
      time = Date.now();
      resolve(ret);
    });
  });
};
/**
 * Send one tracking query to kuaidi100 with a randomly chosen User-Agent.
 *
 * A proxy address is also picked at random from `ips` and logged, but the
 * request itself is currently NOT routed through it (the `proxy` option is
 * disabled). To enable proxying, add `proxy: 'http://' + ip` to the options.
 *
 * @param {Object} data - Query parameters; serialized into the query string.
 * @returns {Promise<*>} Resolves with the response body handed back by
 *   `request`, or `undefined` when the request failed. Request failures are
 *   logged, not thrown; no retry with another proxy is attempted here.
 */
async function reptile(data) {
  // Math.floor instead of parseInt: parseInt stringifies its argument, so a
  // tiny value such as 5e-7 would be read as 5 rather than 0.
  const pickRandom = (list) => list[Math.floor(Math.random() * list.length)];

  const userAgent = pickRandom(userAgents);
  // `ips` is still null when the proxy API has never returned a list; since the
  // proxy is only logged, that must not abort the query.
  const ip = Array.isArray(ips) && ips.length > 0 ? pickRandom(ips) : undefined;
  console.log('ip:', ip);

  const options = {
    method: 'GET',
    url: 'http://www.kuaidi100.com/query?' + queryString.stringify(data),
    gzip: true,
    encoding: null,
    headers: {
      'User-Agent': userAgent, // rotate the browser signature per request
    },
    timeout: 8000,
  };

  return new Promise((resolve) => {
    request(options, (error, response, body) => {
      if (error) {
        // Some addresses are unreachable; the caller decides what to do next.
        console.log(`爬取页面失败,${error},正在重新寻找代理ip... ×`);
      } else {
        console.log('爬取页面成功, √', body.toString());
        console.log('爬取页面成功, √', data);
      }
      resolve(body);
    });
  });
}
/**
 * Entry point: run one tracking query and wrap the outcome in a result object.
 *
 * Loads the proxy list first when none is stored yet. When the query yields
 * no body, or the body contains the "IP banned" marker, the proxy list is
 * refreshed and a failure result is returned.
 *
 * @param {Object} data - Query parameters forwarded to `reptile`.
 * @returns {Promise<{code: number, msg: string, data?: string}>}
 *   `{ code: 200, data, msg: '' }` on success, `{ code: 400, msg }` on failure.
 */
async function startFun(data) {
  const bannedMarker = '非法访问:IP禁止访问';
  const failure = { code: 400, msg: '抓取失败' };

  if (!ips) {
    await getProxyList();
  }

  const raw = await reptile(data);
  if (!raw) {
    await getProxyList(true);
    return failure;
  }

  const text = raw.toString();
  if (text.indexOf(bannedMarker) !== -1) {
    await getProxyList(true);
    return failure;
  }

  // The body is returned as text; parsing is left to the consumer.
  return { code: 200, data: text, msg: '' };
}
// Export the entry-point function as this module's only export.
module.exports = startFun;
koa的代码
const Koa = require('koa')
const app = new Koa()
const startFun = require('./startFun')
const bodyParser = require('koa-bodyparser')
// Parse request bodies before any other middleware runs.
app.use(bodyParser());

/**
 * CORS middleware: attaches the cross-origin headers to every response and
 * answers preflight (OPTIONS) requests directly.
 *
 * Without the explicit OPTIONS branch a preflight falls through to the route
 * handler, matches nothing and is answered with 404, so the browser blocks
 * the real request.
 *
 * NOTE(review): browsers refuse credentialed requests when Allow-Origin is
 * the `*` wildcard; if cookies are ever needed, echo a specific origin here.
 */
app.use(async (ctx, next) => {
  // Allow requests from every origin
  ctx.set("Access-Control-Allow-Origin", "*");
  ctx.set("Access-Control-Allow-Methods", "OPTIONS, GET, PUT, POST, DELETE");
  ctx.set("Access-Control-Allow-Headers", "x-requested-with, accept, origin, content-type");
  ctx.set("Content-Type", "application/json;charset=utf-8");
  ctx.set("Access-Control-Allow-Credentials", true);
  ctx.set("Access-Control-Max-Age", 300);
  ctx.set("Access-Control-Expose-Headers", "myData");

  if (ctx.method === 'OPTIONS') {
    // Preflight: the headers above are the whole answer.
    ctx.status = 204;
    return;
  }

  await next();
});
/**
 * Route handler for `GET /chakuaidi`: forwards the query-string parameters to
 * `startFun` and sends its result object back as the response body.
 *
 * Any other path or method is left untouched (Koa answers with 404).
 *
 * If `startFun` rejects, a JSON failure object is returned instead of letting
 * the request end as a bare HTTP 500, so clients that inspect `code` in the
 * body still receive an answer they can act on.
 */
app.use(async (ctx) => {
  if (ctx.path !== '/chakuaidi' || ctx.method !== 'GET') {
    return;
  }

  try {
    ctx.body = await startFun(ctx.query);
  } catch (err) {
    console.log(`查询失败,${err}`);
    ctx.body = { code: 500, msg: '抓取失败' };
  }
});
// Start the HTTP server on port 3000 and log a message once it is listening.
app.listen(3000, function onListening() {
  console.log('demo2 is run');
});
前端代码
// Tracking numbers to look up; each one is sent as the `postid` query parameter (masked values in this example).
var a = ['xxxxxxxx', 'xxxxxxx', 'xxxxxx', ]
/**
 * Look up the tracking numbers in `a` one after another, starting at `index`.
 *
 * Each lookup is delayed by one second. After a successful answer the next
 * number is queried; after a failed one (non-200 `code`, or a payload that is
 * not valid JSON) the same number is queried again.
 *
 * @param {number} [index] - Position in `a` to start from; defaults to 0.
 */
function aa(index) {
    var current = index || 0;
    if (current >= a.length) return;
    setTimeout(function () {
        // Encode the tracking number so characters such as '&' or spaces cannot corrupt the query string.
        var url = 'http://xxx.xx.xx.xxx:xxxx/chakuaidi?type=yunda&postid=' + encodeURIComponent(a[current]) + '&temp=' + Math.random();
        $.get(url, function (res) {
            var info = null;
            if (res.code === 200) {
                try {
                    info = JSON.parse(res.data);
                } catch (e) {
                    // A non-JSON payload is treated like a failed lookup instead of
                    // throwing inside the callback and silently stopping the loop.
                    console.log(a[current] + ',' + current + ',' + e);
                    info = null;
                }
            }
            if (!info) {
                aa(current);
                return;
            }
            // A payload without a `data` array is reported as "no data".
            var records = Array.isArray(info.data) ? info.data : [];
            console.log((info.nu || a[current]) + ',' + current + ',' + (!records.length ? '暂无数据' : records[0].context));
            aa(current + 1);
        });
    }, 1000);
}
// Print how many tracking numbers are queued.
// NOTE(review): aa() is not invoked anywhere in this snippet; call aa() to start the lookups.
console.log(a.length)
这样请求比每秒查几个快了很多,但发现查到 150 个左右就开始报“禁止IP”,应该是代理IP太少了。不过比一条一条查感觉快很多,数据量少时推荐使用。
前端拿到数据后,再写一个前端的导出功能,把数据直接导出来,就可以直接用了。