[原创]记一次链家爬虫经历

最近闲得手痒,想获取某个区域内链家挂牌的二手房价格,于是有了这次爬虫经历。

1-找一个显示我们需要数据的网站

https://sh.lianjia.com/ditu/

2-F12抓一下包

2.1-分析请求/返回


目前可以确定的是
1.数据通过这个请求返回:https://ajax.lianjia.com/map/search/ershoufang 。
2.请求带有 city_id、group_type、max_lat、min_lat、max_lng、min_lng、request_ts 七个参数。

总而言之先照葫芦画瓢把请求头cookies和get参数原封不动放进python里看看能不能正确返回

import requests


# Replay the captured browser request verbatim (same query string as seen in the F12 capture).
ret = requests.get('https://ajax.lianjia.com/map/search/ershoufang/?callback=jQuery111108991260227644935_1534302676901&city_id=310000&group_type=district&max_lat=31.498134&min_lat=30.979427&max_lng=121.245974&min_lng=121.0735&filters=%7B%7D&request_ts=1534302686664&source=ljpc&authorization=8a1a509b870c5d9f9a350de01cf9729d&_=1534302676903 ')
# The response object is bound to `ret`; printing `re.text` raised NameError because `re` is undefined.
print(ret.text)

发现返回值是
jQuery111108991260227644935_1534302676901({"request_id":"106302269","uniq_id":"D319-0C6F-89D5-7EEE-017203C43185","errno":10001,"error":"invalid request","data":null})
无效请求

2.2-分析原因

GET请求所带的参数如下

city_id //城市ID身份证前六位 (北京110000,上海310000)
group_type //community社区 district地区
max_lat //最大纬度
min_lat //最小纬度
max_lng //最大经度
min_lng //最小经度
request_ts //13位时间戳
authorization=8a1a509b870c5d9f9a350de01cf9729d←此参数意义不明需要研究
_=1534302676903 //13位时间戳

authorization参数应该是一个校验参数,校验请求是否合法

2.3-分析authorization参数

1.开始抓包js下断点
2.搜索下authorization看看
3.发现index.js这里有
4.定位到这个参数的位置
5.下断点开始debug,随后发现一个名为getMd5的函数
把参数传进去,对比计算结果与请求中的authorization,两者完全一样,下一步就是替换get参数再试试
找到了目标函数getMd5(),把它复制下来放进js调试器,稍微改写一下代码

2.4-编写JS并调试输出

// Provide a stand-in `window` object when the script runs outside a browser
// (getMd5 assigns to window.md5). An existing truthy `window` is left untouched.
var window;
if (!window) {
    window = {};
}
/**
 * Adds two integers as 32-bit values, wrapping on overflow.
 * The sum is built from separate 16-bit halves so the carry out of the low
 * half is propagated explicitly into the high half.
 *
 * @param {number} e - first operand
 * @param {number} t - second operand
 * @returns {number} the wrapped sum as a signed 32-bit integer
 */
function e(e, t) {
    var lowSum = (e & 65535) + (t & 65535);
    var highSum = (e >> 16) + (t >> 16) + (lowSum >> 16);
    return (highSum << 16) | (lowSum & 65535);
}
/**
 * Rotates a 32-bit value left by the given number of bits.
 *
 * @param {number} e - value to rotate
 * @param {number} t - rotation distance in bits
 * @returns {number} the rotated value as a signed 32-bit integer
 */
function t(e, t) {
    var shiftedOut = e >>> (32 - t);
    var shiftedUp = e << t;
    return shiftedUp | shiftedOut;
}
/**
 * Shared step used by the four round functions i, a, r and o.
 * Computes e(t(e(e(i, n), e(r, s)), o), a) using the sibling helpers
 * e (wrapping add) and t (rotate left).
 *
 * @param {number} n - mixed value supplied by the calling round function
 * @param {number} i - accumulator being updated
 * @param {number} a - value added after the rotation
 * @param {number} r - message word
 * @param {number} o - rotation distance
 * @param {number} s - additive constant
 * @returns {number} the new accumulator value
 */
function n(n, i, a, r, o, s) {
    var accumulated = e(i, n);
    var wordPlusConstant = e(r, s);
    var rotated = t(e(accumulated, wordPlusConstant), o);
    return e(rotated, a);
}
/**
 * Round function that mixes its state words as (t & i) | (~t & a) and
 * forwards the result to the shared step n.
 *
 * @param {number} e - accumulator being updated
 * @param {number} t - state word, also passed on to n
 * @param {number} i - state word
 * @param {number} a - state word
 * @param {number} r - message word
 * @param {number} o - rotation distance
 * @param {number} s - additive constant
 * @returns {number} whatever n returns for the mixed value
 */
function i(e, t, i, a, r, o, s) {
    var mixed = (t & i) | (~t & a);
    return n(mixed, e, t, r, o, s);
}
/**
 * Round function that mixes its state words as (t & a) | (i & ~a) and
 * forwards the result to the shared step n.
 *
 * @param {number} e - accumulator being updated
 * @param {number} t - state word, also passed on to n
 * @param {number} i - state word
 * @param {number} a - state word
 * @param {number} r - message word
 * @param {number} o - rotation distance
 * @param {number} s - additive constant
 * @returns {number} whatever n returns for the mixed value
 */
function a(e, t, i, a, r, o, s) {
    var mixed = (t & a) | (i & ~a);
    return n(mixed, e, t, r, o, s);
}
/**
 * Round function that mixes its state words as t ^ i ^ a and forwards the
 * result to the shared step n.
 *
 * @param {number} e - accumulator being updated
 * @param {number} t - state word, also passed on to n
 * @param {number} i - state word
 * @param {number} a - state word
 * @param {number} r - message word
 * @param {number} o - rotation distance
 * @param {number} s - additive constant
 * @returns {number} whatever n returns for the mixed value
 */
function r(e, t, i, a, r, o, s) {
    var mixed = t ^ i ^ a;
    return n(mixed, e, t, r, o, s);
}
/**
 * Round function that mixes its state words as i ^ (t | ~a) and forwards the
 * result to the shared step n.
 *
 * @param {number} e - accumulator being updated
 * @param {number} t - state word, also passed on to n
 * @param {number} i - state word
 * @param {number} a - state word
 * @param {number} r - message word
 * @param {number} o - rotation distance
 * @param {number} s - additive constant
 * @returns {number} whatever n returns for the mixed value
 */
function o(e, t, i, a, r, o, s) {
    var mixed = i ^ (t | ~a);
    return n(mixed, e, t, r, o, s);
}
/**
 * Core digest routine: pads the word array `t` in place, then folds it,
 * 16 words at a time, into four 32-bit state words.
 *
 * The structure (four groups of 16 steps using the sibling round functions
 * i, a, r and o, followed by a wrapping add of the saved state via e) looks
 * like the MD5 block transform; the entry point in this file is named getMd5.
 *
 * Note: the locals s, l, c, d, u and g declared below shadow the top-level
 * functions of the same names inside this body.
 *
 * @param {number[]} t - message as 32-bit words; modified by the padding writes
 * @param {number} n - message length in bits
 * @returns {number[]} the four state words [g, f, m, p]
 */
function s(t, n) {
// Padding: set a single 0x80 marker at bit offset n, then store the bit
// length n at index 14 + (((n + 64) >>> 9) << 4).
t[n >> 5] |= 128 << n % 32,
t[14 + (n + 64 >>> 9 << 4)] = n;
// g, f, m, p hold the running state; l, c, d, u keep a copy per chunk.
var s, l, c, d, u, g = 1732584193,
f = -271733879,
m = -1732584194,
p = 271733878;
// The whole loop body is one comma-chained expression, so every line below
// runs once per 16-word chunk starting at index s.
for (s = 0; s < t.length; s += 16) l = g,
c = f,
d = m,
u = p,
// Group 1: round function i, rotations 7 / 12 / 17 / 22.
g = i(g, f, m, p, t[s], 7, -680876936),
p = i(p, g, f, m, t[s + 1], 12, -389564586),
m = i(m, p, g, f, t[s + 2], 17, 606105819),
f = i(f, m, p, g, t[s + 3], 22, -1044525330),
g = i(g, f, m, p, t[s + 4], 7, -176418897),
p = i(p, g, f, m, t[s + 5], 12, 1200080426),
m = i(m, p, g, f, t[s + 6], 17, -1473231341),
f = i(f, m, p, g, t[s + 7], 22, -45705983),
g = i(g, f, m, p, t[s + 8], 7, 1770035416),
p = i(p, g, f, m, t[s + 9], 12, -1958414417),
m = i(m, p, g, f, t[s + 10], 17, -42063),
f = i(f, m, p, g, t[s + 11], 22, -1990404162),
g = i(g, f, m, p, t[s + 12], 7, 1804603682),
p = i(p, g, f, m, t[s + 13], 12, -40341101),
m = i(m, p, g, f, t[s + 14], 17, -1502002290),
f = i(f, m, p, g, t[s + 15], 22, 1236535329),
// Group 2: round function a, rotations 5 / 9 / 14 / 20.
g = a(g, f, m, p, t[s + 1], 5, -165796510),
p = a(p, g, f, m, t[s + 6], 9, -1069501632),
m = a(m, p, g, f, t[s + 11], 14, 643717713),
f = a(f, m, p, g, t[s], 20, -373897302),
g = a(g, f, m, p, t[s + 5], 5, -701558691),
p = a(p, g, f, m, t[s + 10], 9, 38016083),
m = a(m, p, g, f, t[s + 15], 14, -660478335),
f = a(f, m, p, g, t[s + 4], 20, -405537848),
g = a(g, f, m, p, t[s + 9], 5, 568446438),
p = a(p, g, f, m, t[s + 14], 9, -1019803690),
m = a(m, p, g, f, t[s + 3], 14, -187363961),
f = a(f, m, p, g, t[s + 8], 20, 1163531501),
g = a(g, f, m, p, t[s + 13], 5, -1444681467),
p = a(p, g, f, m, t[s + 2], 9, -51403784),
m = a(m, p, g, f, t[s + 7], 14, 1735328473),
f = a(f, m, p, g, t[s + 12], 20, -1926607734),
// Group 3: round function r, rotations 4 / 11 / 16 / 23.
g = r(g, f, m, p, t[s + 5], 4, -378558),
p = r(p, g, f, m, t[s + 8], 11, -2022574463),
m = r(m, p, g, f, t[s + 11], 16, 1839030562),
f = r(f, m, p, g, t[s + 14], 23, -35309556),
g = r(g, f, m, p, t[s + 1], 4, -1530992060),
p = r(p, g, f, m, t[s + 4], 11, 1272893353),
m = r(m, p, g, f, t[s + 7], 16, -155497632),
f = r(f, m, p, g, t[s + 10], 23, -1094730640),
g = r(g, f, m, p, t[s + 13], 4, 681279174),
p = r(p, g, f, m, t[s], 11, -358537222),
m = r(m, p, g, f, t[s + 3], 16, -722521979),
f = r(f, m, p, g, t[s + 6], 23, 76029189),
g = r(g, f, m, p, t[s + 9], 4, -640364487),
p = r(p, g, f, m, t[s + 12], 11, -421815835),
m = r(m, p, g, f, t[s + 15], 16, 530742520),
f = r(f, m, p, g, t[s + 2], 23, -995338651),
// Group 4: round function o, rotations 6 / 10 / 15 / 21.
g = o(g, f, m, p, t[s], 6, -198630844),
p = o(p, g, f, m, t[s + 7], 10, 1126891415),
m = o(m, p, g, f, t[s + 14], 15, -1416354905),
f = o(f, m, p, g, t[s + 5], 21, -57434055),
g = o(g, f, m, p, t[s + 12], 6, 1700485571),
p = o(p, g, f, m, t[s + 3], 10, -1894986606),
m = o(m, p, g, f, t[s + 10], 15, -1051523),
f = o(f, m, p, g, t[s + 1], 21, -2054922799),
g = o(g, f, m, p, t[s + 8], 6, 1873313359),
p = o(p, g, f, m, t[s + 15], 10, -30611744),
m = o(m, p, g, f, t[s + 6], 15, -1560198380),
f = o(f, m, p, g, t[s + 13], 21, 1309151649),
g = o(g, f, m, p, t[s + 4], 6, -145523070),
p = o(p, g, f, m, t[s + 11], 10, -1120210379),
m = o(m, p, g, f, t[s + 2], 15, 718787259),
f = o(f, m, p, g, t[s + 9], 21, -343485551),
// Fold the state saved at the start of this chunk back in (wrapping add).
g = e(g, l),
f = e(f, c),
m = e(m, d),
p = e(p, u);
return [g, f, m, p]
}
/**
 * Converts an array of 32-bit words into a string with one character per
 * byte, emitting the lowest byte of each word first.
 *
 * @param {number[]} e - words to serialize
 * @returns {string} string of 4 * e.length characters, each with code 0-255
 */
function l(e) {
    var chars = [];
    var wordIndex;
    var shift;
    for (wordIndex = 0; wordIndex < e.length; wordIndex += 1) {
        for (shift = 0; shift < 32; shift += 8) {
            chars.push(String.fromCharCode((e[wordIndex] >>> shift) & 255));
        }
    }
    return chars.join("");
}
/**
 * Packs a byte string into an array of 32-bit words, lowest byte first
 * (the inverse layout of l). Only the low 8 bits of each character code
 * are used.
 *
 * The previous implementation pre-sized the array by assigning
 * n[(e.length >> 2) - 1]; for inputs shorter than 4 characters that index
 * is -1, which left a stray "-1" property on the returned array. The word
 * count is now computed up front so the result is always a plain dense
 * array. The word values are unchanged.
 *
 * @param {string} e - string whose characters are treated as bytes
 * @returns {number[]} ceil(e.length / 4) words; empty array for empty input
 */
function c(e) {
    var wordCount = (e.length + 3) >> 2;
    var words = [];
    var index;
    for (index = 0; index < wordCount; index += 1) {
        words[index] = 0;
    }
    for (index = 0; index < e.length; index += 1) {
        // Byte `index` lands in word index >> 2 at bit offset 8 * (index % 4).
        words[index >> 2] |= (e.charCodeAt(index) & 255) << ((index % 4) * 8);
    }
    return words;
}
/**
 * Digests a byte string: packs it into words with c, runs the block
 * transform s with the length in bits, and serializes the result with l.
 *
 * @param {string} e - input treated as a byte string
 * @returns {string} whatever l returns for the resulting state words
 */
function d(e) {
    var words = c(e);
    var bitLength = 8 * e.length;
    var state = s(words, bitLength);
    return l(state);
}
/**
 * Keyed digest built from two nested calls to s. The key words are XORed
 * with 909522486 (0x36363636) and 1549556828 (0x5c5c5c5c) to form an inner
 * and an outer 16-word pad; a key longer than 16 words is first replaced by
 * its own digest. This has the shape of an HMAC construction.
 *
 * @param {string} e - key, treated as a byte string
 * @param {string} t - message, treated as a byte string
 * @returns {string} the serialized outer digest as returned by l
 */
function u(e, t) {
    var keyWords = c(e);
    var innerPad = [];
    var outerPad = [];
    var index;
    var innerDigest;

    // Pre-size both pads to 16 slots.
    innerPad[15] = outerPad[15] = void 0;
    if (keyWords.length > 16) {
        keyWords = s(keyWords, 8 * e.length);
    }
    for (index = 0; index < 16; index += 1) {
        innerPad[index] = 909522486 ^ keyWords[index];
        outerPad[index] = 1549556828 ^ keyWords[index];
    }
    // Inner pass: 512 bits of pad followed by the message.
    innerDigest = s(innerPad.concat(c(t)), 512 + 8 * t.length);
    // Outer pass: 512 bits of pad followed by the 128-bit inner digest.
    return l(s(outerPad.concat(innerDigest), 640));
}
/**
 * Hex-encodes a string: each character becomes two lowercase hex digits
 * taken from the low 8 bits of its character code.
 *
 * @param {string} e - string to encode
 * @returns {string} hex string of length 2 * e.length
 */
function g(e) {
    var hexDigits = "0123456789abcdef";
    var encoded = "";
    var position;
    var code;
    for (position = 0; position < e.length; position += 1) {
        code = e.charCodeAt(position);
        encoded += hexDigits.charAt((code >>> 4) & 15) + hexDigits.charAt(code & 15);
    }
    return encoded;
}
/**
 * Converts a string to its UTF-8 byte-string form: percent-encodes it as
 * UTF-8, then decodes each %XX escape into a single character.
 *
 * @param {string} e - text to convert
 * @returns {string} string with one character (code 0-255) per UTF-8 byte
 */
function f(e) {
    var percentEncoded = encodeURIComponent(e);
    return unescape(percentEncoded);
}
/**
 * Raw digest of a text string: converts it to UTF-8 bytes with f, then
 * digests the bytes with d.
 *
 * @param {string} e - text to digest
 * @returns {string} whatever d returns for the UTF-8 byte string
 */
function m(e) {
    var utf8Bytes = f(e);
    return d(utf8Bytes);
}
/**
 * Hex digest of a text string: the raw digest from m, hex-encoded by g.
 *
 * @param {string} e - text to digest
 * @returns {string} whatever g returns for the raw digest
 */
function p(e) {
    var rawDigest = m(e);
    return g(rawDigest);
}
/**
 * Raw keyed digest of two text strings: both are converted to UTF-8 bytes
 * with f and passed to u.
 *
 * @param {string} e - first argument forwarded to u (the key)
 * @param {string} t - second argument forwarded to u (the message)
 * @returns {string} whatever u returns
 */
function _(e, t) {
    var keyBytes = f(e);
    var messageBytes = f(t);
    return u(keyBytes, messageBytes);
}
/**
 * Hex keyed digest: the raw keyed digest from _, hex-encoded by g.
 *
 * @param {string} e - first argument forwarded to _
 * @param {string} t - second argument forwarded to _
 * @returns {string} whatever g returns for the raw keyed digest
 */
function h(e, t) {
    var rawDigest = _(e, t);
    return g(rawDigest);
}
/**
 * Dispatcher over the four digest variants.
 *
 * - key given, raw requested: _(t, e)
 * - key given, raw not requested: h(t, e)
 * - no key, raw requested: m(e)
 * - no key, raw not requested: p(e)
 *
 * @param {string} e - message
 * @param {string} [t] - optional key; any falsy value selects the unkeyed variants
 * @param {*} [n] - truthy to get the raw result instead of the hex one
 * @returns {string} result of the selected variant
 */
function v(e, t, n) {
    if (t) {
        if (n) {
            return _(t, e);
        }
        return h(t, e);
    }
    if (n) {
        return m(e);
    }
    return p(e);
}
/**
 * Builds the request signature for a parameter object.
 *
 * The enumerable keys of `e` are sorted, the key "filters" is skipped, and
 * the remaining pairs are concatenated as key=value with no separator. If
 * that string is non-empty it is prefixed with a fixed secret and passed to
 * v; otherwise an empty string is returned.
 *
 * Side effect: when a signature is computed, window.md5 is set to the
 * top-level function n (carried over unchanged from the copied site code).
 *
 * @param {Object} e - request parameters
 * @returns {string} the value returned by v, or "" when nothing is signed
 */
function getMd5(e) {
    var keys = [];
    var key;
    var position;
    var payload = "";

    for (key in e) {
        keys.push(key);
    }
    keys.sort();

    for (position = 0; position < keys.length; position += 1) {
        key = keys[position];
        if (key === "filters") {
            continue;
        }
        payload += key + "=" + e[key];
    }

    if (!payload) {
        return "";
    }
    window.md5 = n;
    return v("vfkpbin1ix2rb88gfjebs0f60cbvhedl" + payload);
}


返回值与网站一致,说明我们计算成功

主要问题解决了,程序也就写出来了。具体的python与js代码如何交互在这里就不写了,进项目自己看吧,都写了注释。地址在下面,基础功能都已经写好了,日后或许会更新一下。

GitHub链接:https://github.com/xjkj123/Lianjia

项目内包含:

1.爬取市区所有区域轮廓经纬度并存入数据库
2.爬取区域内所有住房信息地理位置区域均价

完成,看看返回值

[{'id': '5011000002522', 'name': '控江路896弄', 'longitude': 121.5400975, 'latitude': 31.28779031, 'unit_price': 58333.3, 'count': 1}, {'id': '5011000012863', 'name': '海林大楼', 'longitude': 121.5408274, 'latitude': 31.28532439, 'unit_price': 66857.1, 'count': 1}, {'id': '5011000013110', 'name': '双阳二村', 'longitude': 121.5405593, 'latitude': 31.28424652, 'unit_price': 49603.2, 'count': 1}, {'id': '5011000019497', 'name': '隆昌路718弄', 'longitude': 121.5436034, 'latitude': 31.28960961, 'unit_price': 59514.9, 'count': 1}, {'id': '5011000020513', 'name': '永吉路81弄', 'longitude': 121.540138, 'latitude': 31.28891289, 'unit_price': 64888.2, 'count': 1}, {'id': '5011102209082', 'name': '双阳路501号', 'longitude': 121.5375665, 'latitude': 31.289226, 'unit_price': 73375.3, 'count': 1}, {'id': '5011102209084', 'name': '双阳路503号', 'longitude': 121.537643, 'latitude': 31.289278, 'unit_price': 63868.6, 'count': 1}, {'id': '5011102209214', 'name': '控江路641号', 'longitude': 121.5463299, 'latitude': 31.29040561, 'unit_price': 52173.9, 'count': 1}, {'id': '5010023573140072', 'name': '控江路764弄', 'longitude': 121.544502, 'latitude': 31.290065, 'unit_price': 53513.7, 'count': 1}, {'id': '5011000000353', 'name': '控江路802弄', 'longitude': 121.542579, 'latitude': 31.289577, 'unit_price': 56257.9, 'count': 2}, {'id': '5011000005204', 'name': '隆昌路696弄', 'longitude': 121.5459504, 'latitude': 31.28833351, 'unit_price': 56122, 'count': 2}, {'id': '5011000014811', 'name': '双阳大楼', 'longitude': 121.5387095, 'latitude': 31.2877461, 'unit_price': 58382.2, 'count': 2}, {'id': '5011000018378', 'name': '建发公园首府', 'longitude': 121.5391605, 'latitude': 31.2874396, 'unit_price': 98843.3, 'count': 2}, {'id': '5011000001982', 'name': '隆昌路700弄', 'longitude': 121.5441811, 'latitude': 31.2887839, 'unit_price': 54543.2, 'count': 3}, {'id': '5011000004717', 'name': '心仪雅苑', 'longitude': 121.5465933, 'latitude': 31.28487159, 'unit_price': 70531.3, 'count': 3}, {'id': '5011000004825', 'name': '杨浦公寓', 'longitude': 121.538266, 'latitude': 
31.286736, 'unit_price': 80917.8, 'count': 3}, {'id': '5011000016193', 'name': '国富苑', 'longitude': 121.5387341, 'latitude': 31.28608776, 'unit_price': 58787.1, 'count': 3}, {'id': '5011000007481', 'name': '控江路680弄', 'longitude': 121.5455006, 'latitude': 31.2902507, 'unit_price': 56308.2, 'count': 4}, {'id': '5011000008269', 'name': '双阳路490弄', 'longitude': 121.5372191, 'latitude': 31.28942731, 'unit_price': 77336.5, 'count': 4}, {'id': '5011000001938', 'name': '延吉一村', 'longitude': 121.542634, 'latitude': 31.291257, 'unit_price': 62665.9, 'count': 5}, {'id': '5011000004561', 'name': '控江路874弄', 'longitude': 121.5415793, 'latitude': 31.28853888, 'unit_price': 59520.2, 'count': 5}, {'id': '5011000013973', 'name': '泰鸿新苑', 'longitude': 121.5409604, 'latitude': 31.29084533, 'unit_price': 75630.4, 'count': 5}, {'id': '5011000004772', 'name': '永吉路97弄', 'longitude': 121.540425, 'latitude': 31.290023, 'unit_price': 58043.1, 'count': 7}, {'id': '5010023573139558', 'name': '控江路740弄', 'longitude': 121.545283, 'latitude': 31.290516, 'unit_price': 56861.2, 'count': 7}, {'id': '5011000004570', 'name': '控江路888弄', 'longitude': 121.5414509, 'latitude': 31.28844646, 'unit_price': 56924.2, 'count': 9}, {'id': '5011000008245', 'name': '延吉中路117弄', 'longitude': 121.542088, 'latitude': 31.289952, 'unit_price': 59854.8, 'count': 9}, {'id': '5011000002603', 'name': '控江路1039弄', 'longitude': 121.5371509, 'latitude': 31.28595732, 'unit_price': 61710.2, 'count': 10}, {'id': '5011000015671', 'name': '友谊新村', 'longitude': 121.542485, 'latitude': 31.283752, 'unit_price': 59760.2, 'count': 10}, {'id': '509821540319238', 'name': '控江西三村', 'longitude': 121.537798, 'latitude': 31.291252, 'unit_price': 58060.2, 'count': 10}, {'id': '5011000004654', 'name': '控江七村', 'longitude': 121.5402786, 'latitude': 31.28960679, 'unit_price': 62334.5, 'count': 12}, {'id': '5011000004758', 'name': '控江路645弄', 'longitude': 121.5462676, 'latitude': 31.28994767, 'unit_price': 58386.2, 'count': 12}]
难点:
1. JS部分authorization的计算
2. python与js代码的交互
优点:
1.接口数据齐全
2.目前调用没有限制

你可能感兴趣的:([原创]记一次链家爬虫经历)