首先爬取的网址是:https://www.mzitu.com/zipai/
打开网站
发现一页会有好几张图片:
这里不光要爬取当前这一页上的图片,还要继续爬取它前面几页的图片。
例如:现在要爬取第455、454、453这3页的图片。
首先写出基本框架
import urllib.request
import os
def download_mm(folder='ooxx', pages=3):
    """Download the images of the newest `pages` comment pages into `folder`.

    The newest page number is read from the listing page, then that page and
    the `pages - 1` pages before it are scraped one by one.

    Args:
        folder: Directory the images are saved in; created if missing.
        pages: How many consecutive pages to scrape, counting down from the
            newest one.

    Side effects:
        Changes the process working directory to `folder`.
    """
    # exist_ok=True so that re-running the script does not crash on the
    # folder left behind by a previous run.
    os.makedirs(folder, exist_ok=True)
    # save_imgs writes into the current working directory, so move there.
    os.chdir(folder)

    url = 'http://www.mzitu.com/zipai/'
    newest_page = int(get_page(url))
    for offset in range(pages):
        # Subtract the offset from the fixed newest page. Decrementing a
        # running counter by `offset` would skip pages (455, 454, 452, ...).
        page_num = newest_page - offset
        if page_num < 1:
            # Fewer pages exist than were requested.
            break
        # This is the address shown in the browser; only the page number
        # changes from page to page.
        page_url = url + 'comment-page-' + str(page_num) + '/#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)


if __name__ == '__main__':
    download_mm()
定义url_open函数
def url_open(url):
    """Fetch `url` with a browser-like User-Agent and return the body as bytes.

    The requested URL is printed once the body has been read.

    Args:
        url: Address to fetch.

    Returns:
        The raw, undecoded response body.
    """
    req = urllib.request.Request(url)
    # The header value can be copied from the browser: right click ->
    # Inspect -> Network -> Request Headers.
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5221.400 QQBrowser/10.0.1125.400')
    # Open the Request object rather than the bare URL so the header is sent;
    # reportedly a bare URL gets rejected by the site's anti-scraping checks.
    # The `with` block closes the connection even if read() fails.
    with urllib.request.urlopen(req) as response:
        html = response.read()
    print(url)
    return html
定义get_page函数
def get_page(url):
    """Return the current page number shown on the listing page, as a string.

    The number is the text between the 'page-numbers current' marker and the
    next '<' in the page's HTML.

    Args:
        url: Address of the listing page.

    Returns:
        The page number text (the caller converts it with int()).

    Raises:
        ValueError: If the marker or the terminating '<' is not in the page.
    """
    html = url_open(url).decode('utf-8')
    marker = 'page-numbers current'
    marker_pos = html.find(marker)
    if marker_pos == -1:
        # Without this check find()'s -1 would silently yield a garbage slice.
        raise ValueError('current page marker not found in ' + url)
    # Skip the marker plus the two characters before the number
    # (presumably the closing quote and '>' of the tag).
    start = marker_pos + len(marker) + 2
    end = html.find('<', start)
    if end == -1:
        raise ValueError('page number is not terminated in ' + url)
    return html[start:end]
定义find_imgs函数
···
def find_imgs(url):
    """Collect the .jpg addresses stored in 'data-original=' attributes.

    Each address found is printed, one per line, before the list is returned.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of image address strings in page order.
    """
    html = url_open(url).decode('utf-8')
    marker = 'data-original='
    max_span = 255  # an address must end within this many chars of the marker
    img_addrs = []

    pos = html.find(marker)
    while pos != -1:
        # The value starts after the marker and its opening quote.
        value_start = pos + len(marker) + 1
        search_end = pos + max_span
        quote = html[value_start - 1:value_start]
        if quote in ('"', "'"):
            # Stop at the closing quote so that a '.jpg' belonging to a later
            # attribute is never glued onto this value.
            closing = html.find(quote, value_start, search_end)
            if closing != -1:
                search_end = closing
        ext = html.find('.jpg', value_start, search_end)
        if ext != -1:
            img_addrs.append(html[value_start:ext + 4])
            resume = ext
        else:
            resume = value_start
        pos = html.find(marker, resume)

    for each in img_addrs:
        print(each)
    return img_addrs
···
定义save_imgs函数
def save_imgs(folder, img_addrs):
    """Download every address in `img_addrs` into the current directory.

    Each file is named after the last path segment of its address.

    Args:
        folder: Accepted for interface compatibility but not used; files go
            to the current working directory (download_mm changes into the
            target folder before calling this).
        img_addrs: Iterable of image addresses to download.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        # Download before opening the file, so a failed request does not
        # leave an empty file behind.
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)
全代码如下:
import urllib.request
import os
def url_open(url):
    """Fetch `url` with a browser-like User-Agent and return the body as bytes.

    The requested URL is printed once the body has been read.

    Args:
        url: Address to fetch.

    Returns:
        The raw, undecoded response body.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5221.400 QQBrowser/10.0.1125.400')
    # The `with` block closes the connection even if read() fails.
    with urllib.request.urlopen(req) as response:
        html = response.read()
    print(url)
    return html
def get_page(url):
    """Return the current page number shown on the listing page, as a string.

    The number is the text between the 'page-numbers current' marker and the
    next '<' in the page's HTML.

    Args:
        url: Address of the listing page.

    Returns:
        The page number text (the caller converts it with int()).

    Raises:
        ValueError: If the marker or the terminating '<' is not in the page.
    """
    html = url_open(url).decode('utf-8')
    marker = 'page-numbers current'
    marker_pos = html.find(marker)
    if marker_pos == -1:
        # Without this check find()'s -1 would silently yield a garbage slice.
        raise ValueError('current page marker not found in ' + url)
    # Skip the marker plus the two characters before the number
    # (presumably the closing quote and '>' of the tag).
    start = marker_pos + len(marker) + 2
    end = html.find('<', start)
    if end == -1:
        raise ValueError('page number is not terminated in ' + url)
    return html[start:end]
def find_imgs(url):
    """Collect the .jpg addresses stored in 'data-original=' attributes.

    Each address found is printed, one per line, before the list is returned.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of image address strings in page order.
    """
    html = url_open(url).decode('utf-8')
    marker = 'data-original='
    max_span = 255  # an address must end within this many chars of the marker
    img_addrs = []

    pos = html.find(marker)
    while pos != -1:
        # The value starts after the marker and its opening quote.
        value_start = pos + len(marker) + 1
        search_end = pos + max_span
        quote = html[value_start - 1:value_start]
        if quote in ('"', "'"):
            # Stop at the closing quote so that a '.jpg' belonging to a later
            # attribute is never glued onto this value.
            closing = html.find(quote, value_start, search_end)
            if closing != -1:
                search_end = closing
        ext = html.find('.jpg', value_start, search_end)
        if ext != -1:
            img_addrs.append(html[value_start:ext + 4])
            resume = ext
        else:
            resume = value_start
        pos = html.find(marker, resume)

    for each in img_addrs:
        print(each)
    return img_addrs
def save_imgs(folder, img_addrs):
    """Download every address in `img_addrs` into the current directory.

    Each file is named after the last path segment of its address.

    Args:
        folder: Accepted for interface compatibility but not used; files go
            to the current working directory (download_mm changes into the
            target folder before calling this).
        img_addrs: Iterable of image addresses to download.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        # Download before opening the file, so a failed request does not
        # leave an empty file behind.
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)
def download_mm(folder='ooxx', pages=3):
    """Download the images of the newest `pages` comment pages into `folder`.

    The newest page number is read from the listing page, then that page and
    the `pages - 1` pages before it are scraped one by one.

    Args:
        folder: Directory the images are saved in; created if missing.
        pages: How many consecutive pages to scrape, counting down from the
            newest one.

    Side effects:
        Changes the process working directory to `folder`.
    """
    # exist_ok=True so that re-running the script does not crash on the
    # folder left behind by a previous run.
    os.makedirs(folder, exist_ok=True)
    # save_imgs writes into the current working directory, so move there.
    os.chdir(folder)

    url = 'http://www.mzitu.com/zipai/'
    newest_page = int(get_page(url))
    for offset in range(pages):
        # Subtract the offset from the fixed newest page. Decrementing a
        # running counter by `offset` would skip pages (455, 454, 452, ...).
        page_num = newest_page - offset
        if page_num < 1:
            # Fewer pages exist than were requested.
            break
        # This is the address shown in the browser; only the page number
        # changes from page to page.
        page_url = url + 'comment-page-' + str(page_num) + '/#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)


if __name__ == '__main__':
    download_mm()
结果如下(当然还有个生成的文件夹):
http://www.mzitu.com/zipai/
http://www.mzitu.com/zipai/comment-page-455/#comments
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5poldp9j20j60pkk1o.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pom3uvj20j60pkdiy.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pon9ygj20j60pi47j.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pp3e5xj20j60pi112.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5ppjdbdj20j60pk0y2.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5poqe7pj20j60pj7es.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5poqexsj20qr0zkdim.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pp9k9yj20j60pktfg.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5poldp9j20j60pkk1o.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pom3uvj20j60pkdiy.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pon9ygj20j60pi47j.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pp3e5xj20j60pi112.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5ppjdbdj20j60pk0y2.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5poqe7pj20j60pj7es.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5poqexsj20qr0zkdim.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pp9k9yj20j60pktfg.jpg
http://www.mzitu.com/zipai/comment-page-454/#comments
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pp9ixrj20sg11x1kx.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5loi7ijomj20sg0zkn18.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohxcodyj20sg0lbn0e.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohy6z09j20sg0sggow.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohv3lawj20sg0seacw.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohvclinj20sg0sgadx.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohvcf4qj20sg0sgjvg.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5loia7rrmj20sg0zkq77.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohvis0zj20sg0zkdjz.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohvkfxhj20sg0mnac9.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p3h8ihj20u0140npe.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p13e10j20k00qowjz.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p26wnhj20sg11xker.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p210rcj20sg0venp0.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p2oeiij20k30p4b29.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p12zuuj20j60ny13k.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pp9ixrj20sg11x1kx.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5loi7ijomj20sg0zkn18.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohxcodyj20sg0lbn0e.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohy6z09j20sg0sggow.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohv3lawj20sg0seacw.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohvclinj20sg0sgadx.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohvcf4qj20sg0sgjvg.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5loia7rrmj20sg0zkq77.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohvis0zj20sg0zkdjz.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohvkfxhj20sg0mnac9.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p3h8ihj20u0140npe.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p13e10j20k00qowjz.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p26wnhj20sg11xker.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p210rcj20sg0venp0.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p2oeiij20k30p4b29.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p12zuuj20j60ny13k.jpg
http://www.mzitu.com/zipai/comment-page-452/#comments
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4skej8rj20j60piag9.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4sl5510j20qo0zkdpz.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4sju2r5j20u011hgp7.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4slojnvj20qo0zh7aw.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4sk393dj20ia0odn37.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i91zg8qj20j60y30z1.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i9214z3j20qo0zkk27.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i91yorqj20hs0qo403.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i922of2j20kw0rsafv.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i920qyfj20hs0npq3q.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i97qr5lj20u0140b2a.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i924zlmj20lc0sg7g1.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i927emhj20lf0qo75r.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i93fk2aj20u011h78y.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g50w32crrhj20qo0zk4qp.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g50w2shpd0j20hs0koabc.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4skej8rj20j60piag9.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4sl5510j20qo0zkdpz.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4sju2r5j20u011hgp7.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4slojnvj20qo0zh7aw.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4sk393dj20ia0odn37.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i91zg8qj20j60y30z1.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i9214z3j20qo0zkk27.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i91yorqj20hs0qo403.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i922of2j20kw0rsafv.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i920qyfj20hs0npq3q.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i97qr5lj20u0140b2a.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i924zlmj20lc0sg7g1.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i927emhj20lf0qo75r.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i93fk2aj20u011h78y.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g50w32crrhj20qo0zk4qp.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g50w2shpd0j20hs0koabc.jpg