Implementing a jandan.net ooxx image crawler in Python

URL: http://jandan.net/ooxx/
Goal: crawl the images and save them locally

#2019/3/27

import requests
import os
from base64 import b64decode
import time


def open_url(url):  # fetch the target page and return the raw bytes
    response = requests.get(url)
    html = response.content

    return html


def find_imgs(url):  # collect the image URLs on the current page
    html = open_url(url).decode('utf-8')

    img_addrs = []  # decoded image URLs
    img_hash = []  # Base64-encoded hash strings

    start = html.find('img-hash')
    while start != -1:
        end = html.find('<', start)
        img_hash.append(html[start+10 : end])  # skip the 10 characters of 'img-hash">'
        start = html.find('img-hash', end)
    
    for i in range(len(img_hash)):
        imgurl = 'http:' + b64decode(img_hash[i]).decode('utf-8')
        img_addrs.append(imgurl)
    
    return img_addrs


def save_img(img_addrs):  # download each image and save it to the current directory
    for each in img_addrs:
        file_name = str(each).split('/')[-1]
        
        with open(file_name, 'wb') as f:
            img = open_url(str(each))
            f.write(img)


def main(pages = 2):  # main routine
    folder = 'OOXX'
    os.mkdir(folder)  # create the download directory
    os.chdir(folder)  # switch into it
    
    url = 'http://jandan.net/ooxx/'
    html = open_url(url).decode('utf-8')

    start = html.find('current-comment-page') + 23
    end = html.find(']', start)
    pages_num = int(html[start : end])  # the latest page number

    for i in range(pages):
        page_url = url + 'page-' + str(pages_num - i) + '#comments'  # step back one page per iteration
        img_addrs = find_imgs(page_url)
        save_img(img_addrs)
        
        time.sleep(1)


if __name__ == '__main__':
    main(10)
1. Analyzing the page source
<p>
<img src="//img.jandan.net/img/blank.gif" onload="jandan_load_img(this)" />
<span class="img-hash">Ly93czEuc2luYWltZy5jbi9tdzYwMC82MjMwNmVlYWx5MWcxNm9va2RnY3hqMjB1MDE0MHUweC5qcGc=</span>
</p>

Where the image link should be, the src is replaced by blank.gif, and an onload handler calls jandan_load_img(), so the real address has to come out of that function.
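The find_imgs function above pulls these spans out with str.find; the same extraction can be written as a regular expression (a minimal sketch, assuming the class attribute is exactly img-hash as in the snippet above):

import re

# matches the Base64 payload between <span class="img-hash"> and the closing tag
HASH_RE = re.compile(r'<span class="img-hash">([^<]+)</span>')

def extract_hashes(html):
    return HASH_RE.findall(html)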

Press F12 to open the element inspector, then Ctrl + Shift + F to search all sources for jandan_load_img and find the function definition:

function jandan_load_img(b) {
    var d = $(b);
    var f = d.next("span.img-hash");
    var e = f.text();
    f.remove();
    var c = jdOT1NSEizUgpefOcYePSffeQdahhkY3EM(e, "jlhmmbpome4lNkdFfWiqXYUioZ2sKzb3");
    var a = $('<a href="' + location.protocol + c + '" target="_blank">[查看原图]</a>');
    d.before(a);
    d.before("<br>");
    d.removeAttr("onload");
    d.attr("src", location.protocol + c.replace(/(\/\/\w+\.sinaimg\.cn\/)(\w+)(\/.+\.gif)/, "$1thumb180$3"));
    if (/\.gif$/.test(c)) {
        d.attr("org_src", location.protocol + c);
        b.onload = function() {
            add_img_loading_mask(this, load_sina_gif)
        }
    }
}

Next, look up the function named jdOT1NSEizUgpefOcYePSffeQdahhkY3EM():

var jdOT1NSEizUgpefOcYePSffeQdahhkY3EM = function(n, t, e) {
    var f = "DECODE";
    var t = t ? t : "";
    var e = e ? e : 0;
    var r = 4;
    t = md5(t);
    var d = n;
    var p = md5(t.substr(0, 16));
    var o = md5(t.substr(16, 16));
    if (r) {
        if (f == "DECODE") {
            var m = n.substr(0, r)
        }
    } else {
        var m = ""
    }
    var c = p + md5(p + m);
    var l;
    if (f == "DECODE") {
        n = n.substr(r);
        l = base64_decode(n)
    }
    var k = new Array(256);
    for (var h = 0; h < 256; h++) {
        k[h] = h
    }
    var b = new Array();
    for (var h = 0; h < 256; h++) {
        b[h] = c.charCodeAt(h % c.length)
    }
    for (var g = h = 0; h < 256; h++) {
        g = (g + k[h] + b[h]) % 256;
        tmp = k[h];
        k[h] = k[g];
        k[g] = tmp
    }
    var u = "";
    l = l.split("");
    for (var q = g = h = 0; h < l.length; h++) {
        q = (q + 1) % 256;
        g = (g + k[q]) % 256;
        tmp = k[q];
        k[q] = k[g];
        k[g] = tmp;
        u += chr(ord(l[h]) ^ (k[(k[q] + k[g]) % 256]))
    }
    if (f == "DECODE") {
        if ((u.substr(0, 10) == 0 || u.substr(0, 10) - time() > 0) && u.substr(10, 16) == md5(u.substr(26) + o).substr(0, 16)) {
            u = u.substr(26)
        } else {
            u = ""
        }
        u = base64_decode(d)
    }
    return u
};

Analyzing this function, the return value is u = base64_decode(d), where d = n and n is the first argument. Going back to jandan_load_img, that first argument is exactly the string following img-hash in the page source, so whatever the intermediate steps do, the final result is simply the Base64 decode of that string.

Searching further for base64_decode shows that it is nothing more than plain Base64 decoding:

function base64_decode(a) {
    return window.atob(a)
}

WindowOrWorkerGlobalScope.atob() decodes a string of data that has been encoded with Base64. You can use btoa() to encode data that might otherwise cause communication problems, transmit it, and then use atob() to decode it again; for example, control characters (ASCII values 0 through 31) can be encoded, transmitted and decoded this way.

In Python, the standard-library base64 module handles this.
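For example, decoding the img-hash value from the snippet above recovers the image address directly (prefixing 'http:' turns the protocol-relative URL into a usable one, just as find_imgs does):

from base64 import b64decode

img_hash = 'Ly93czEuc2luYWltZy5jbi9tdzYwMC82MjMwNmVlYWx5MWcxNm9va2RnY3hqMjB1MDE0MHUweC5qcGc='
print('http:' + b64decode(img_hash).decode('utf-8'))
# http://ws1.sinaimg.cn/mw600/62306eealy1g16ookdgcxj20u0140u0x.jpg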


Because the page source later changed, the following is the revised version.

#2019/4/13

import requests
from bs4 import BeautifulSoup
import lxml
import time
import os


def ooxx(pages=2):
    folder = 'OOXX'
    os.mkdir(folder)  # create the download directory
    os.chdir(folder)  # switch into it

    url = 'http://jandan.net/ooxx/'
    time.sleep(1)
    req = requests.get(url).text
    soup = BeautifulSoup(req, 'lxml')

    pages_num = int(soup.find('span', class_='current-comment-page').text[1:-1])  # strip the surrounding [ ] to get the newest page number

    for i in range(pages):
        page_url = url + 'page-' + str(pages_num - i) + '#comments'  # step back one page per iteration
        time.sleep(1)
        page_req = requests.get(page_url).text
        page_soup = BeautifulSoup(page_req, 'lxml')
        
        for j in page_soup.find_all('img'):
            img_addrs = 'http:' + j.attrs['src']  # src values are protocol-relative (//...)
            filename = img_addrs.split('/')[-1]

            with open(filename, 'wb') as f:
                time.sleep(1)
                img = requests.get(img_addrs).content  # download the image bytes
                f.write(img)


if __name__ == '__main__':
    ooxx()
