python爬虫 煎蛋网妹子图 动态爬虫1

                                                                                                                      ——————来自某潮汕人的菜鸟教程



    爬虫目标网站:http://jandan.net/ooxx

    爬虫目标中文名称:煎蛋网 妹子图

    爬虫需求:分析js下载 煎蛋网的妹子图

    



    爬虫分析:

        

        1、通过查看网页的源代码我们可以发现,这个网站的图片地址img src是被js重新修改的,img标签上的onload的意思是当这张图片(img元素)加载完成之后触发 jandan_load_img 这个函数

python爬虫 煎蛋网妹子图 动态爬虫1_第1张图片

通过F12开发者工具我们可以在Console端轻易地获取到jandan_load_img这个函数的代码,

1 function jandan_load_img(b) {
2    var d = $(b);
3    var f = d.next("span.img-hash");
4    var e = f.text();
5    f.remove();
6    var c = jdVgpgvDt3f2JfTm99E1w9KZcus9yR9FW4(e, "myblSB23xVOq7qKlgBHAUD52BF1yZlql");
7    var a = $('[查看原图]');
8    d.before(a);
9    d.before("<br>");
10   d.removeAttr("onload");
11   d.attr("src", location.protocol + c.replace(/(\/\/\w+\.sinaimg\.cn\/)(\w+)(\/.+\.gif)/, "$1thumb180$3"));
12   if (/\.gif$/.test(c)) {
13       d.attr("org_src", location.protocol + c);
14       b.onload = function() {
15           add_img_loading_mask(this, load_sina_gif) } } }
    函数体如上,下面开始解析这个函数
3838708

Ly93eDIuc2luYWltZy5jbi9tdzYwMC9lNDMxNjM2Nmx5MWZydmtmdDBhdHlqMjBoczBucG15aC5qcGc=

    首先先举个例子,比如上面这张美女图的源代码

    那么在函数体中经过前4行时var e="Ly93eDIuc2luYWltZy5jbi9tdzYwMC9lNDMxNjM2Nmx5MWZydmtmdDBhdHlqMjBoczBucG15aC5qcGc="

    

 var c = jdVgpgvDt3f2JfTm99E1w9KZcus9yR9FW4(e, "myblSB23xVOq7qKlgBHAUD52BF1yZlql");

   第6行jdVgpgvDt3f2JfTm99E1w9KZcus9yR9FW4,这个也是个函数,同样用F12开发者工具获得这个的函数体

   jdVgpgvDt3f2JfTm99E1w9KZcus9yR9FW4函数体如下:

/**
 * Decodes the string `n` using the key `t`.
 *
 * The body derives key material from md5(t), runs an RC4-style key schedule
 * and XOR keystream over the Base64-decoded tail of `n`, and validates the
 * result -- but the very last assignment overwrites that result with
 * `base64_decode(d)`, where `d` is the original, untouched `n`. The effective
 * return value is therefore simply `base64_decode(n)`.
 *
 * `md5`, `base64_decode`, `chr`, `ord` and `time` are not defined in this
 * block. NOTE(review): presumably PHP-style helpers supplied elsewhere on the
 * page -- confirm before porting.
 *
 * @param n - the encoded input string.
 * @param t - the key; a falsy value is replaced by "".
 * @param e - defaults to 0 when falsy; never read afterwards.
 * @returns the value of `base64_decode(n)` for the original `n`.
 */
var jdVgpgvDt3f2JfTm99E1w9KZcus9yR9FW4 = function(n, t, e) {
    // Mode flag. It is hard-coded, so every `f == "DECODE"` branch below is always taken.
    var f = "DECODE";
    var t = t ? t : "";
    // `e` is normalised here but not used anywhere else in this function.
    var e = e ? e : 0;
    // Length of the prefix that is cut off the front of `n`.
    var r = 4;
    t = md5(t);
    // Keep the original input: `n` itself is truncated further down.
    var d = n;
    // Two sub-keys: md5 of the first and of the second 16 characters of md5(t).
    var p = md5(t.substr(0, 16));
    var o = md5(t.substr(16, 16));
    // `m` is the first `r` characters of `n` (r is 4, so the else branch is never reached).
    if (r) {
        if (f == "DECODE") {
            var m = n.substr(0, r)
        }
    } else {
        var m = ""
    }
    // Key string fed into the key schedule below.
    var c = p + md5(p + m);
    var l;
    // Drop the prefix, then Base64-decode the remainder.
    if (f == "DECODE") {
        n = n.substr(r);
        l = base64_decode(n)
    }
    // k: identity permutation 0..255.
    var k = new Array(256);
    for (var h = 0; h < 256; h++) {
        k[h] = h
    }
    // b: character codes of the key string `c`, repeated cyclically to 256 entries.
    var b = new Array();
    for (var h = 0; h < 256; h++) {
        b[h] = c.charCodeAt(h % c.length)
    }
    // Key scheduling: shuffle `k` under control of `b`.
    // Note: `tmp` is assigned without a declaration, so it is an implicit global.
    for (var g = h = 0; h < 256; h++) {
        g = (g + k[h] + b[h]) % 256;
        tmp = k[h];
        k[h] = k[g];
        k[g] = tmp
    }
    var u = "";
    l = l.split("");
    // Keystream generation: XOR each character of `l` with the next keystream byte.
    for (var q = g = h = 0; h < l.length; h++) {
        q = (q + 1) % 256;
        g = (g + k[q]) % 256;
        tmp = k[q];
        k[q] = k[g];
        k[g] = tmp;
        u += chr(ord(l[h]) ^ (k[(k[q] + k[g]) % 256]))
    }
    if (f == "DECODE") {
        // Validation of the decrypted text: characters 0-9 must be 0 or greater than time()
        // (NOTE(review): presumably an expiry timestamp -- confirm), and characters 10-25 must
        // equal the first 16 characters of md5(payload + o). The payload starts at offset 26.
        if ((u.substr(0, 10) == 0 || u.substr(0, 10) - time() > 0) && u.substr(10, 16) == md5(u.substr(26) + o).substr(0, 16)) {
            u = u.substr(26)
        } else {
            u = ""
        }
        // This unconditionally overwrites `u`, discarding everything computed above:
        // the function ends up returning base64_decode of the ORIGINAL input (d === original n).
        u = base64_decode(d)
    }
    return u
}

 传进来的参数n="Ly93eDIuc2luYWltZy5jbi9tdzYwMC9lNDMxNjM2Nmx5MWZydmtmdDBhdHlqMjBoczBucG15aC5qcGc="

                       t="myblSB23xVOq7qKlgBHAUD52BF1yZlql"

 然后发现最后的

u = base64_decode(d)

而d=n,因此这个函数返回的是一个

base64_decode(n)

在此例中n="Ly93eDIuc2luYWltZy5jbi9tdzYwMC9lNDMxNjM2Nmx5MWZydmtmdDBhdHlqMjBoczBucG15aC5qcGc="

base64_decode在python中应该是这样子写的:

def base64_decode1(s):
    """Decode a Base64 string, tolerating missing ``=`` padding.

    Args:
        s: Base64 text as ``str``; trailing ``=`` padding may be absent.

    Returns:
        The decoded data as ``bytes``.

    Raises:
        binascii.Error: if the padded text is still not valid Base64.
    """
    s = s.encode()
    # Base64 text must be a multiple of 4 characters long; top it up with '='.
    missing_padding = len(s) % 4
    if missing_padding != 0:
        s += b'=' * (4 - missing_padding)
    # base64.decodestring() was removed in Python 3.9; b64decode() is the
    # supported equivalent and returns the same bytes.
    return base64.b64decode(s)
运行测试程序:
import base64

def base64_decode1(s):
    """Decode a Base64 string, tolerating missing ``=`` padding.

    Args:
        s: Base64 text as ``str``; trailing ``=`` padding may be absent.

    Returns:
        The decoded data as ``bytes``.

    Raises:
        binascii.Error: if the padded text is still not valid Base64.
    """
    s = s.encode()
    # Base64 text must be a multiple of 4 characters long; top it up with '='.
    missing_padding = len(s) % 4
    if missing_padding != 0:
        s += b'=' * (4 - missing_padding)
    # base64.decodestring() was removed in Python 3.9; b64decode() is the
    # supported equivalent and returns the same bytes.
    return base64.b64decode(s)

# Demo: decode the img-hash from the example above into the real image URL.
print(base64_decode1('Ly93eDIuc2luYWltZy5jbi9tdzYwMC9lNDMxNjM2Nmx5MWZydmtmdDBhdHlqMjBoczBucG15aC5qcGc='))

结果:b'//wx2.sinaimg.cn/mw600/e4316366ly1frvkft0atyj20hs0npmyh.jpg'

解析到这里,后面的等你来完善

不懂的请留言




你可能感兴趣的:(python爬虫)