妹子图 (Meizitu) image scraper
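
A small crawler for mzitu.com: it scrapes the full album index, then walks each album page by page and saves every image into a per-album folder under d:\1.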

import os
import re
import time
from urllib import request

from bs4 import BeautifulSoup

# Fetch the index page that lists every album on the site.
start_url = 'http://www.mzitu.com/all'
headers = {"User-Agent": "Mozilla/5.0 (X11; U; SunOS sun4u; en-US; rv:1.6) Gecko/20040503"}
req = request.Request(url=start_url, headers=headers)
with request.urlopen(req) as res:
    html = res.read()


html = html.decode('utf-8')
soup = BeautifulSoup(html, 'lxml')
# Album links live inside <ul class="archives"> blocks.
archives = soup.find_all("ul", "archives")
album_urls, album_names = [], []
for block in archives:
    for a_tag in block.find_all('a'):
        album_urls.append(a_tag['href'])
        album_names.append(a_tag.get_text())


def download_album(url, save_dir):
    headers = {"User-Agent": "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6"}
    time.sleep(3)  # be polite: pause before hitting each album
    req = request.Request(url=url, headers=headers)
    with request.urlopen(req) as res:
        html = res.read().decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    # Full-size images are hosted on i.meizitu.net; pull them out of the <img> tags.
    img_urls = re.findall(r'http://i.meizitu.net/20.*?jpg', str(soup.find_all('img')))
    last_img = None
    for img_url in img_urls:
        last_img = img_url
        savefile(img_url, save_dir)
    if last_img is None:
        return  # no images found on the first page
    # Pagination links look like <album url>/2, <album url>/3, ...;
    # the highest number is the page count.
    page_urls = re.findall(r'%s/\d+' % re.escape(url), html)
    if not page_urls:
        return  # single-page album
    last_page = max(int(p[-2:].strip('/')) for p in page_urls)
    # Filenames are sequential ("01.jpg", "02.jpg", ...), so the images on the
    # remaining pages can be reached by rewriting the suffix of the last URL.
    for i in ('%02d' % x for x in range(2, last_page + 1)):
        img_url = last_img[:-6] + i + '.jpg'
        print(img_url)
        savefile(img_url, save_dir)


def savefile(url, save_dir):
    # Use the last path component of the URL as the local filename.
    name = url[url.rindex('/') + 1:]
    path = os.path.join(save_dir, name)
    print(url)
    try:
        request.urlretrieve(url, path)
    except Exception:
        pass  # skip images that fail to download rather than aborting the run
    time.sleep(3)  # throttle between downloads


for album_url, album_name in zip(album_urls, album_names):
    print(album_url, album_name)
    # Keep only the first four characters of the title as the folder name.
    album_name = album_name[:4]
    save_dir = os.path.join('d:\\1', album_name)
    if os.path.isdir(save_dir):
        print('already exists, skipping')
        continue
    os.makedirs(save_dir)
    download_album(album_url, save_dir)
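
One caveat: request.urlretrieve sends no Referer header, and image hosts often reject such requests. Below is a minimal sketch of a header-aware replacement for savefile(), assuming the host checks Referer; the function name and header values here are illustrative, not part of the original script.

def savefile_with_headers(url, save_dir, referer='http://www.mzitu.com/'):
    # Hypothetical variant of savefile(): send explicit headers and write
    # the response bytes ourselves instead of using urlretrieve().
    name = url[url.rindex('/') + 1:]
    path = os.path.join(save_dir, name)
    req = request.Request(url, headers={
        "User-Agent": "Mozilla/5.0",
        "Referer": referer,  # assumption: the image host validates this header
    })
    try:
        with request.urlopen(req) as res, open(path, 'wb') as out:
            out.write(res.read())
    except Exception:
        pass  # skip failed downloads, as the original does
    time.sleep(3)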
 
