Python 爬取煎蛋妹子图

 

# python3
# jiandan meizi tu
import urllib.request
import os
import time
import random


def url_open(url):
    """Fetch *url* and return the raw response body.

    A User-Agent header is chosen at random from a small fixed pool on every
    call, so consecutive requests do not all carry the same header.

    Args:
        url: Address to fetch; anything ``urllib.request.Request`` accepts.

    Returns:
        The complete response body as ``bytes``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises when the request fails
        (errors are not caught here).
    """
    user_agents = ('Mozilla/4.0', 'Mozilla/4.1', 'Mozilla/4.5', 'Mozilla/5.1')
    request = urllib.request.Request(
        url, headers={'User-Agent': random.choice(user_agents)})
    # Use the response as a context manager so the underlying connection is
    # closed as soon as the body has been read, instead of being leaked.
    with urllib.request.urlopen(request) as response:
        return response.read()

def get_current_page(url):
    """Return the current comment-page number shown on the page at *url*.

    The page is downloaded, decoded as UTF-8 and scanned for the text
    ``current-comment-page``. The value returned is the text that starts
    three characters after that marker and runs up to the next ``]``.

    Args:
        url: Address of the listing page to inspect.

    Returns:
        The extracted page number as a ``str`` (the caller converts it).

    Raises:
        ValueError: If the marker or the closing ``]`` is not present, which
            previously produced a meaningless slice of the document.
    """
    marker = 'current-comment-page'
    html = url_open(url).decode('utf-8')

    marker_pos = html.find(marker)
    if marker_pos == -1:
        raise ValueError(
            "marker 'current-comment-page' not found in page: " + url)

    # Skip the marker plus the three characters that separate it from the
    # number (presumably the `">[` that closes the attribute - confirm
    # against the live markup).
    start = marker_pos + len(marker) + 3
    end = html.find(']', start)
    if end == -1:
        raise ValueError(
            "closing ']' after 'current-comment-page' not found in page: "
            + url)
    return html[start:end]

def find_imgs(url):
    """Collect the addresses of JPEG images referenced on the page at *url*.

    The page is downloaded, decoded as UTF-8 and scanned for every
    occurrence of ``img src="http``. For each occurrence the address is the
    text from ``http`` up to and including the first ``.jpg`` that follows.

    The ``.jpg`` search is limited to 255 characters from the start of the
    tag and, additionally, to the closing quote of the ``src`` value. Without
    the second limit a non-JPEG image followed closely by some other
    ``.jpg`` text would yield a bogus address containing markup.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of image address strings in document order (possibly empty).
    """
    prefix = 'img src="http'
    url_offset = len('img src="')  # the address starts right after the quote

    html = url_open(url).decode('utf-8')
    img_addrs = []
    tag_pos = html.find(prefix)
    while tag_pos != -1:
        url_start = tag_pos + url_offset
        search_end = tag_pos + 255
        closing_quote = html.find('"', url_start)
        if closing_quote != -1:
            # Never look past the end of this attribute value.
            search_end = min(search_end, closing_quote)

        ext_pos = html.find('.jpg', tag_pos, search_end)
        if ext_pos != -1:
            img_addrs.append(html[url_start:ext_pos + 4])
            resume_at = ext_pos
        else:
            # Not a JPEG: step over this prefix and keep scanning.
            resume_at = tag_pos + len(prefix)
        tag_pos = html.find(prefix, resume_at)
    return img_addrs

def save_imgs(folder, img_addrs):
    """Download every address in *img_addrs* and write it to disk.

    Each file is named after the last ``/``-separated component of its
    address and is written to the current working directory.

    Args:
        folder: Not used by this function; files go to the current working
            directory. Kept so existing callers keep working.
        img_addrs: Iterable of image address strings.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        # Download before opening the file so that a failed request does not
        # leave an empty file behind.
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)

def download_mm(folder='xx', pages=300):
    """Download images from up to *pages* listing pages into *folder*.

    The current page number is read from the site's landing page. Pages are
    then visited in descending order, starting with the page just below the
    current one, and every image found on each page is saved. A two-second
    pause is inserted before every third page.

    Side effects: creates *folder* if needed, changes the process working
    directory to it, performs network requests and prints progress lines.

    Args:
        folder: Directory that receives the images; created when missing.
        pages: Maximum number of listing pages to visit.
    """
    # Create the target directory when it is missing so the chdir below does
    # not fail on a fresh run.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)

    url = 'http://jandan.net/ooxx/'
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    current_page_num = int(get_current_page(url))
    for i in range(pages):
        current_page_num -= 1
        if current_page_num < 1:
            # Fewer pages exist than were requested; do not request page 0
            # or negative page numbers.
            break
        # Log after decrementing so the number shown is the page fetched.
        print(time.strftime(timestamp_format), 'current_page_num', current_page_num)
        if i % 3 == 0:
            print(time.strftime(timestamp_format), "sleep 2 seconds...")
            time.sleep(2)
        page_url = url + 'page-' + str(current_page_num) + '#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)

# Script entry point: start the download with the default folder and page
# count when this file is executed directly (not when it is imported).
if __name__ == '__main__':
    download_mm()

 

你可能感兴趣的:(python爬煎蛋妹子图)