import re
import os
import time
from urllib import request

from bs4 import BeautifulSoup
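# A minimal single-file crawler: it reads the gallery index at
# http://www.mzitu.com/all, collects every gallery link and title, and
# saves each gallery's images under d:\1\<title>.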
start_url = 'http://www.mzitu.com/all'
headers = {"User-Agent": "Mozilla/5.0 (X11; U; SunOS sun4u; en-US; rv:1.6) Gecko/20040503"}

# Fetch the index page that lists every gallery.
req = request.Request(url=start_url, headers=headers)
with request.urlopen(req) as res:
    html = res.read().decode('utf-8')
soup = BeautifulSoup(html, 'lxml')
data = soup.find_all("ul", "archives")
# Collect each gallery's URL and title in two parallel lists.
links, names = [], []
for ul in data:
    for a_tag in ul.find_all('a'):
        links.append(a_tag['href'])
        names.append(a_tag.get_text())
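# At this point the two lists line up index by index, e.g. (hypothetical values):
#   links[0] -> 'http://www.mzitu.com/12345'
#   names[0] -> 'the matching gallery title'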
def f(x, a):
    """Download every image of the gallery at URL x into folder a."""
    headers = {"User-Agent": "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6"}
    time.sleep(3)  # be polite: pause before each gallery request
    r = request.Request(url=x, headers=headers)
    with request.urlopen(r) as res:  # 'res', not 'f', so the function name is not shadowed
        html = res.read().decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    # First-page image URLs; the site numbers files 01.jpg, 02.jpg, ...
    data = re.findall(r'http://i.meizitu.net/20.*?jpg', str(soup.find_all('img')))
    iurl = ''
    for imgurl in data:
        iurl = imgurl
        savefile(imgurl, a)
    if not iurl:
        return  # nothing matched on the first page
    # Pagination links look like '<gallery-url>/2', '<gallery-url>/3', ...;
    # the largest number is the page count.
    urls = re.findall(re.escape(x) + r'/\d+', html)
    yy = [ur[-2:].strip('/') for ur in urls]
    page = max(map(int, yy)) if yy else 1
    # Build the remaining file names by swapping the two-digit suffix;
    # range(2, page + 1) so the last page is included.
    for i in ['%02d' % p for p in range(2, page + 1)]:
        iurl = iurl[:-6] + i + '.jpg'
        print(iurl)
        savefile(iurl, a)
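# Example of the suffix swap above, assuming the site's 'NN.jpg' naming
# (the URL below is hypothetical):
#   'http://i.meizitu.net/2016/01/abc01.jpg'[:-6] + '02' + '.jpg'
#   -> 'http://i.meizitu.net/2016/01/abc02.jpg'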
def savefile(url, a):
    """Save one image URL into folder a under its original file name."""
    name = url[url.rindex('/') + 1:]
    b = os.path.join(a, name)
    print(url)
    try:
        # Note: urlretrieve does not send the custom User-Agent set above.
        request.urlretrieve(url, b)
    except Exception:
        print('download failed:', url)
    time.sleep(3)  # throttle between image downloads
for x, y in zip(links, names):
    print(x, y)
    y = y[:4]  # keep only the first four characters of the title as the folder name
    a = os.path.join('d:\\1', y)
    if not os.path.isdir(a):
        os.makedirs(a)  # also creates d:\1 itself if it does not exist yet
        f(x, a)
    else:
        print('already exists')  # folder exists, so skip this gallery
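# Requirements/assumptions: Python 3 with beautifulsoup4 and lxml installed,
# and a writable d:\1 target directory (i.e. a Windows machine with drive D:).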