网址:‘http://jandan.net/ooxx/’
目标:爬取妹子图并保存
#2019/3/27
import requests
import os
from base64 import b64decode
import time
def open_url(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the raw response body.

    Args:
        url: Address of the page or image to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would freeze the whole crawl.

    Returns:
        The undecoded response body as ``bytes``; callers decode it
        themselves when they need text.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)
    return response.content
def find_imgs(url):
    """Return the image URLs published on the listing page at *url*.

    The page does not expose image addresses directly. Each one is stored
    as a Base64 string inside a ``<span class="img-hash">`` element, so
    every such string is extracted, decoded and prefixed with ``http:``.

    Args:
        url: Address of one listing page.

    Returns:
        A list of image URLs in the order they appear on the page.

    Raises:
        binascii.Error, UnicodeDecodeError: If an extracted string is not
            valid Base64 / UTF-8.
    """
    html = open_url(url).decode('utf-8')
    marker = 'img-hash'
    # Skip the marker itself plus the `">` that closes the opening tag.
    payload_offset = len(marker) + 2

    img_addrs = []
    start = html.find(marker)
    while start != -1:
        end = html.find('<', start)
        if end == -1:
            # Unterminated element at the end of the document: slicing up
            # to -1 would yield a garbage payload, so stop here.
            break
        encoded = html[start + payload_offset:end]
        img_addrs.append('http:' + b64decode(encoded).decode('utf-8'))
        start = html.find(marker, end)
    return img_addrs
def save_img(img_addrs):
    """Download every image in *img_addrs* into the current directory.

    Each file is named after the last ``/``-separated segment of its URL.

    Args:
        img_addrs: Iterable of image URLs.
    """
    for addr in img_addrs:
        addr = str(addr)
        file_name = addr.split('/')[-1]
        if not file_name:
            # URL ends with '/', so there is no usable file name.
            continue
        # Download first: opening the file before the request would leave
        # an empty file behind whenever the download fails.
        data = open_url(addr)
        with open(file_name, 'wb') as f:
            f.write(data)
def main(pages=2):
    """Download the images of the newest *pages* listing pages.

    Creates (or reuses) the ``OOXX`` directory, makes it the current
    working directory, reads the newest page number from the landing page
    and then walks backwards one page at a time.

    Args:
        pages: How many consecutive pages to crawl, starting at the newest.

    Raises:
        ValueError: If the current page number cannot be located or parsed
            in the landing page.
    """
    folder = 'OOXX'
    # exist_ok lets the script be re-run without failing on the directory.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)

    url = 'http://jandan.net/ooxx/'
    html = open_url(url).decode('utf-8')

    marker = 'current-comment-page'
    marker_pos = html.find(marker)
    if marker_pos == -1:
        raise ValueError('current page number not found in ' + url)
    # 23 = len(marker) + 3: skips the marker and the `">[` that precedes
    # the number, which is terminated by `]`.
    start = marker_pos + 23
    end = html.find(']', start)
    newest_page = int(html[start:end])

    for i in range(pages):
        # Compute each page from the fixed newest page. Decrementing a
        # running counter by i would skip pages (N, N-1, N-3, N-6, ...).
        page_num = newest_page - i
        if page_num < 1:
            break  # ran out of pages
        page_url = url + 'page-' + str(page_num) + '#comments'
        save_img(find_imgs(page_url))
        time.sleep(1)  # pause between pages to go easy on the server
if __name__ == '__main__':
    # Script entry point: crawl the ten newest pages.
    main(10)
<p>
<img src="//img.jandan.net/img/blank.gif" onload="jandan_load_img(this)" />
<span class="img-hash">Ly93czEuc2luYWltZy5jbi9tdzYwMC82MjMwNmVlYWx5MWcxNm9va2RnY3hqMjB1MDE0MHUweC5qcGc=</span>
</p>
在本该是图片链接的地方被blank.gif代替,后面跟一个jandan_load_img()函数。
F12打开审查元素,ctrl + shift + f 全局搜索jandan_load_img,找到函数的定义
// Replaces a placeholder <img> with the real picture.
// b is the <img> element itself: the page invokes this through the
// element's onload attribute as jandan_load_img(this).
// Relies on `$` (presumably jQuery - confirm), the decoder
// jdOT1NSEizUgpefOcYePSffeQdahhkY3EM, add_img_loading_mask and
// load_sina_gif, none of which are defined in this function.
function jandan_load_img(b) {
var d = $(b);
// Read the encoded address from the span.img-hash next to the image,
// then remove that span from the page.
var f = d.next("span.img-hash");
var e = f.text();
f.remove();
// Decode the span text; the second argument is a fixed key string.
var c = jdOT1NSEizUgpefOcYePSffeQdahhkY3EM(e, "jlhmmbpome4lNkdFfWiqXYUioZ2sKzb3");
// NOTE(review): the markup string below looks truncated by extraction;
// it presumably built a "[查看原图]" (view original) link - confirm
// against the original script.
var a = $('[查看原图]a>');
// Insert the link and a line break before the image.
d.before(a);
d.before("<br>");
// Drop the onload attribute now that the placeholder has been handled.
d.removeAttr("onload");
// Point the image at the decoded address. Where it matches the
// sinaimg.cn ".gif" pattern, the path segment right after the host is
// replaced by "thumb180"; otherwise the address is used unchanged.
d.attr("src", location.protocol + c.replace(/(\/\/\w+\.sinaimg\.cn\/)(\w+)(\/.+\.gif)/, "$1thumb180$3"));
// For addresses ending in ".gif", keep the unmodified address in org_src
// and install a new onload handler.
if (/\.gif$/.test(c)) {
d.attr("org_src", location.protocol + c);
b.onload = function() {
add_img_loading_mask(this, load_sina_gif)
}
}
}
接着查找名为jdOT1NSEizUgpefOcYePSffeQdahhkY3EM()的函数
// Decoder called by jandan_load_img.
//   n - encoded text taken from the page
//   t - key string (empty string when omitted)
//   e - normalised to 0 when omitted, then never read again
// Returns base64_decode(n) for the ORIGINAL n: the last statement of the
// final DECODE branch overwrites u with base64_decode(d), where d is the
// untouched copy of n, so the keyed stream computation above it has no
// effect on the returned value.
// Relies on md5, base64_decode, chr, ord and time defined elsewhere.
var jdOT1NSEizUgpefOcYePSffeQdahhkY3EM = function(n, t, e) {
// Mode is hard-coded, so every `f == "DECODE"` test below is true.
var f = "DECODE";
var t = t ? t : "";
var e = e ? e : 0;
// Length of the prefix split off the front of n.
var r = 4;
t = md5(t);
// Keep the original input; it is what finally gets decoded and returned.
var d = n;
// Two sub-keys derived from the two 16-character halves of md5(t).
var p = md5(t.substr(0, 16));
var o = md5(t.substr(16, 16));
if (r) {
if (f == "DECODE") {
var m = n.substr(0, r)
}
} else {
var m = ""
}
// Stream key: first sub-key plus a hash that mixes in the prefix m.
var c = p + md5(p + m);
var l;
if (f == "DECODE") {
// Strip the prefix and Base64-decode the remainder.
n = n.substr(r);
l = base64_decode(n)
}
// k starts as the identity table 0..255.
var k = new Array(256);
for (var h = 0; h < 256; h++) {
k[h] = h
}
// b repeats the character codes of c to fill 256 entries.
var b = new Array();
for (var h = 0; h < 256; h++) {
b[h] = c.charCodeAt(h % c.length)
}
// Shuffle k under control of b (same shape as RC4 key scheduling).
// `tmp` is assigned without `var`, so it becomes an implicit global.
for (var g = h = 0; h < 256; h++) {
g = (g + k[h] + b[h]) % 256;
tmp = k[h];
k[h] = k[g];
k[g] = tmp
}
var u = "";
l = l.split("");
// XOR every character of l with a byte drawn from the evolving table k.
for (var q = g = h = 0; h < l.length; h++) {
q = (q + 1) % 256;
g = (g + k[q]) % 256;
tmp = k[q];
k[q] = k[g];
k[g] = tmp;
u += chr(ord(l[h]) ^ (k[(k[q] + k[g]) % 256]))
}
if (f == "DECODE") {
// Check the first 10 characters (compared with 0 and with time()) and a
// 16-character md5 check field; keep the rest on success, else clear u.
if ((u.substr(0, 10) == 0 || u.substr(0, 10) - time() > 0) && u.substr(10, 16) == md5(u.substr(26) + o).substr(0, 16)) {
u = u.substr(26)
} else {
u = ""
}
// Unconditionally replaces whatever was computed above with a plain
// Base64 decode of the original input.
u = base64_decode(d)
}
return u
};
分析该函数,发现返回值u = base64_decode(d),而d = n,n则是输入的第一个参数,回到jandan_load_img函数中,输入的第一个参数就是源代码中的img-hash后面的字符串。
继续搜索base64_decode函数,发现它只是做了普通的Base64解码(Base64是一种编码方式,并非加密)。
// Decode the Base64 string `a` with the browser's window.atob() and
// return the decoded string.
function base64_decode(a) {
    return window.atob(a);
}
WindowOrWorkerGlobalScope.atob() 函数用于对经过Base64编码的数据字符串进行解码。您可以先使用btoa()方法对可能导致通信问题的数据进行编码并传输,接收后再使用atob()方法将其解码。例如,您可以对控制字符(如 ASCII 值0到31)进行编码、传输和解码。
在python中,有专门的包base64处理该问题。
因网页源码改变,此为后续修改版本
#2019/4/13
import requests
from bs4 import BeautifulSoup
import lxml
import time
import os
def ooxx(pages=2):
    """Download every ``<img>`` found on the newest *pages* listing pages.

    Creates (or reuses) the ``OOXX`` directory, makes it the current
    working directory, reads the newest page number from the
    ``span.current-comment-page`` element of the landing page and then
    walks backwards one page at a time, saving each image under the last
    path segment of its URL.

    Args:
        pages: How many consecutive pages to crawl, starting at the newest.
    """
    # Local import: the surrounding file does not import urllib.
    from urllib.parse import urljoin

    timeout = 10  # seconds; without it a stalled server blocks forever

    folder = 'OOXX'
    # exist_ok lets the script be re-run without failing on the directory.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)

    url = 'http://jandan.net/ooxx/'
    time.sleep(1)
    req = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(req, 'lxml')
    # The element text looks like "[123]"; strip the surrounding brackets.
    newest_page = int(soup.find('span', class_='current-comment-page').text[1:-1])

    for i in range(pages):
        # Compute each page from the fixed newest page. Decrementing a
        # running counter by i would skip pages (N, N-1, N-3, N-6, ...).
        page_num = newest_page - i
        if page_num < 1:
            break  # ran out of pages
        page_url = url + 'page-' + str(page_num) + '#comments'
        time.sleep(1)
        page_req = requests.get(page_url, timeout=timeout).text
        page_soup = BeautifulSoup(page_req, 'lxml')
        for tag in page_soup.find_all('img'):
            src = tag.get('src')
            if not src:
                continue  # <img> without a src attribute
            # urljoin copes with protocol-relative ("//host/x.jpg"),
            # absolute and relative addresses; blindly prefixing 'http:'
            # would corrupt the latter two.
            img_addrs = urljoin(page_url, src)
            filename = img_addrs.split('/')[-1]
            if not filename:
                continue  # URL ends with '/', no usable file name
            time.sleep(1)
            # Download before opening the file so a failed request does
            # not leave an empty file behind.
            img = requests.get(img_addrs, timeout=timeout).content
            with open(filename, 'wb') as f:
                f.write(img)
if __name__ == '__main__':
    # Script entry point: crawl with the default page count.
    ooxx()