爬虫-爬取图片

首先爬取的网址是:https://www.mzitu.com/zipai/
打开网站
发现一页会有好几张图片:

image.png

这里不光要爬取这一页上的图片,还要爬取其他页码上的图片:
image2.png

举例:现在要爬取第455、454、453这3页的图片。
首先写出基本框架
image3.png

import urllib.request
import os

def download_mm(folder='ooxx',pages=3):
    """Download the images from the newest `pages` comment pages.

    Creates `folder` if needed, makes it the working directory, reads the
    current (newest) page number from the landing page, then walks backwards
    one page at a time and saves every image found on each page.

    Args:
        folder: Directory the images are saved into (default 'ooxx').
        pages: Number of consecutive pages to crawl, newest first (default 3).
    """
    # exist_ok lets the script be re-run without crashing on the existing folder.
    os.makedirs(folder,exist_ok=True)
    os.chdir(folder)

    url='http://www.mzitu.com/zipai/'
    newest_page=int(get_page(url))

    for i in range(pages):
        # Subtract the offset from the fixed starting page. An in-place
        # `page_num-=i` accumulates (455, 454, 452, ...) and skips pages.
        page_num=newest_page-i
        if page_num<1:
            # Fewer pages exist than were requested; nothing left to crawl.
            break
        # Same address the browser shows; only the page number changes.
        page_url=url+'comment-page-'+str(page_num)+'/#comments'
        img_addrs=find_imgs(page_url)
        save_imgs(folder,img_addrs)

# Start the crawl with the default folder and page count, but only when this
# file is executed as a script (not when it is imported).
if __name__=='__main__':
    download_mm()

定义url_open函数

def url_open(url):
    """Fetch `url` and return the raw response body as bytes.

    The request carries a browser-like User-Agent header. The URL is printed
    after the body has been read, as a simple progress log.

    Args:
        url: Address to request.

    Returns:
        The undecoded response body (bytes).
    """
    req=urllib.request.Request(url)
    # The header value can be copied from the browser: right-click the page ->
    # Inspect -> Network -> Request Headers.
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5221.400 QQBrowser/10.0.1125.400')

    # Pass the Request object rather than the bare URL: the original lesson
    # used the bare URL, which reportedly fails because of the site's
    # anti-crawler check. The `with` block closes the connection after reading.
    with urllib.request.urlopen(req) as response:
        html=response.read()
    print(url)
    return html

定义get_page函数


image4.png
def get_page(url):
    """Return the current page number shown on the page at `url`, as a string.

    The page is fetched and decoded as UTF-8, then the text that follows the
    'page-numbers current' marker (up to the next '<') is returned.

    Args:
        url: Address of the listing page.

    Returns:
        The text of the current-page element (expected to be digits).

    Raises:
        ValueError: If the marker or the following '<' is not in the page.
    """
    html=url_open(url).decode('utf-8')

    # Locate the pager by right-clicking the page number in the browser and
    # inspecting the element; the offsets below follow that markup.
    marker='page-numbers current'
    pos=html.find(marker)
    if pos==-1:
        # Without this check find() returns -1 and an arbitrary slice is returned.
        raise ValueError('current page marker not found in '+url)
    # Skip the marker plus two more characters (presumably the closing quote
    # and '>' of the tag), i.e. the original fixed offset of 22.
    a=pos+len(marker)+2
    b=html.find('<',a)
    if b==-1:
        raise ValueError('end of current page number not found in '+url)

    return html[a:b]

定义find_imgs函数


image5.png

···
def find_imgs(url):
    """Return the .jpg addresses found in `data-original=` attributes at `url`.

    The page is fetched and decoded as UTF-8, then scanned attribute by
    attribute. Each address is reported once, in first-seen order, and printed.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of image URL strings without duplicates.
    """
    html=url_open(url).decode('utf-8')
    img_addrs=[]
    seen=set()

    # Find the attribute by inspecting an image in the browser; the address is
    # the value that follows 'data-original=', not the one before it.
    marker='data-original='
    a=html.find(marker)
    while a!=-1:
        # Look only 255 characters ahead so '.jpg' belongs to this attribute.
        b=html.find('.jpg',a,a+255)

        if b!=-1:
            # a+15 skips the 14-character marker plus one more character
            # (presumably the opening quote); b+4 keeps the '.jpg' suffix.
            addr=html[a+15:b+4]
            # The page can reference the same image more than once; keeping
            # one copy avoids downloading it repeatedly.
            if addr not in seen:
                seen.add(addr)
                img_addrs.append(addr)
        else:
            # No .jpg close by: step past this attribute and keep scanning.
            b=a+15
        a=html.find(marker,b)
    for each in img_addrs:
        print(each)

    return img_addrs

···
定义save_imgs函数

def save_imgs(folder,img_addrs):
    """Download every address in `img_addrs` into the current directory.

    Each file is named after the last path segment of its URL.

    Args:
        folder: Not used by this function; files are written relative to the
            current working directory.
        img_addrs: Iterable of image URL strings.
    """
    for each in img_addrs:
        filename=each.split('/')[-1]
        # Download before opening the file, so a failed request does not leave
        # an empty file behind or truncate an earlier good copy.
        img=url_open(each)
        with open(filename,'wb') as f:
            f.write(img)

全代码如下:

import urllib.request
import os

def url_open(url):
    """Fetch `url` and return the raw response body as bytes.

    The request carries a browser-like User-Agent header. The URL is printed
    after the body has been read, as a simple progress log.

    Args:
        url: Address to request.

    Returns:
        The undecoded response body (bytes).
    """
    req=urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5221.400 QQBrowser/10.0.1125.400')

    # The `with` block closes the connection once the body has been read,
    # instead of leaving it open until garbage collection.
    with urllib.request.urlopen(req) as response:
        html=response.read()
    print(url)
    return html

def get_page(url):
    """Return the current page number shown on the page at `url`, as a string.

    The page is fetched and decoded as UTF-8, then the text that follows the
    'page-numbers current' marker (up to the next '<') is returned.

    Args:
        url: Address of the listing page.

    Returns:
        The text of the current-page element (expected to be digits).

    Raises:
        ValueError: If the marker or the following '<' is not in the page.
    """
    html=url_open(url).decode('utf-8')

    marker='page-numbers current'
    pos=html.find(marker)
    if pos==-1:
        # Without this check find() returns -1 and an arbitrary slice is returned.
        raise ValueError('current page marker not found in '+url)
    # Skip the marker plus two more characters (presumably the closing quote
    # and '>' of the tag), i.e. the original fixed offset of 22.
    a=pos+len(marker)+2
    b=html.find('<',a)
    if b==-1:
        raise ValueError('end of current page number not found in '+url)

    return html[a:b]

def find_imgs(url):
    """Return the .jpg addresses found in `data-original=` attributes at `url`.

    The page is fetched and decoded as UTF-8, then scanned attribute by
    attribute. Each address is reported once, in first-seen order, and printed.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of image URL strings without duplicates.
    """
    html=url_open(url).decode('utf-8')
    img_addrs=[]
    seen=set()

    marker='data-original='
    a=html.find(marker)
    while a!=-1:
        # Look only 255 characters ahead so '.jpg' belongs to this attribute.
        b=html.find('.jpg',a,a+255)

        if b!=-1:
            # a+15 skips the 14-character marker plus one more character
            # (presumably the opening quote); b+4 keeps the '.jpg' suffix.
            addr=html[a+15:b+4]
            # The page can reference the same image more than once; keeping
            # one copy avoids downloading it repeatedly.
            if addr not in seen:
                seen.add(addr)
                img_addrs.append(addr)
        else:
            # No .jpg close by: step past this attribute and keep scanning.
            b=a+15
        a=html.find(marker,b)
    for each in img_addrs:
        print(each)

    return img_addrs

def save_imgs(folder,img_addrs):
    """Download every address in `img_addrs` into the current directory.

    Each file is named after the last path segment of its URL.

    Args:
        folder: Not used by this function; files are written relative to the
            current working directory.
        img_addrs: Iterable of image URL strings.
    """
    for each in img_addrs:
        filename=each.split('/')[-1]
        # Download before opening the file, so a failed request does not leave
        # an empty file behind or truncate an earlier good copy.
        img=url_open(each)
        with open(filename,'wb') as f:
            f.write(img)
    
def download_mm(folder='ooxx',pages=3):
    """Download the images from the newest `pages` comment pages.

    Creates `folder` if needed, makes it the working directory, reads the
    current (newest) page number from the landing page, then walks backwards
    one page at a time and saves every image found on each page.

    Args:
        folder: Directory the images are saved into (default 'ooxx').
        pages: Number of consecutive pages to crawl, newest first (default 3).
    """
    # exist_ok lets the script be re-run without crashing on the existing folder.
    os.makedirs(folder,exist_ok=True)
    os.chdir(folder)

    url='http://www.mzitu.com/zipai/'
    newest_page=int(get_page(url))

    for i in range(pages):
        # Subtract the offset from the fixed starting page. An in-place
        # `page_num-=i` accumulates (455, 454, 452, ...) and skips pages.
        page_num=newest_page-i
        if page_num<1:
            # Fewer pages exist than were requested; nothing left to crawl.
            break
        # Same address the browser shows; only the page number changes.
        page_url=url+'comment-page-'+str(page_num)+'/#comments'
        img_addrs=find_imgs(page_url)
        save_imgs(folder,img_addrs)

# Start the crawl with the default folder and page count, but only when this
# file is executed as a script (not when it is imported).
if __name__=='__main__':
    download_mm()

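结果如下(当然还有个生成的文件夹)。注意:这份输出里实际抓到的是第455、454、452页,而不是前文所说的455、454、453页——生成这份输出时,循环里用的是累计递减的 `page_num-=i`(455→454→452),所以跳过了第453页;要连续抓取,应在每轮用"起始页码减去 i"来计算页码。另外每个图片地址都出现了两次,说明同一张图片在页面里被匹配到了两遍: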
http://www.mzitu.com/zipai/
http://www.mzitu.com/zipai/comment-page-455/#comments
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5poldp9j20j60pkk1o.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pom3uvj20j60pkdiy.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pon9ygj20j60pi47j.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pp3e5xj20j60pi112.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5ppjdbdj20j60pk0y2.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5poqe7pj20j60pj7es.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5poqexsj20qr0zkdim.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pp9k9yj20j60pktfg.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5poldp9j20j60pkk1o.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pom3uvj20j60pkdiy.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pon9ygj20j60pi47j.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pp3e5xj20j60pi112.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5ppjdbdj20j60pk0y2.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5poqe7pj20j60pj7es.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5poqexsj20qr0zkdim.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pp9k9yj20j60pktfg.jpg
http://www.mzitu.com/zipai/comment-page-454/#comments
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pp9ixrj20sg11x1kx.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5loi7ijomj20sg0zkn18.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohxcodyj20sg0lbn0e.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohy6z09j20sg0sggow.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohv3lawj20sg0seacw.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohvclinj20sg0sgadx.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohvcf4qj20sg0sgjvg.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5loia7rrmj20sg0zkq77.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohvis0zj20sg0zkdjz.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohvkfxhj20sg0mnac9.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p3h8ihj20u0140npe.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p13e10j20k00qowjz.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p26wnhj20sg11xker.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p210rcj20sg0venp0.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p2oeiij20k30p4b29.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p12zuuj20j60ny13k.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g5p5pp9ixrj20sg11x1kx.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5loi7ijomj20sg0zkn18.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohxcodyj20sg0lbn0e.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohy6z09j20sg0sggow.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohv3lawj20sg0seacw.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohvclinj20sg0sgadx.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohvcf4qj20sg0sgjvg.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5loia7rrmj20sg0zkq77.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohvis0zj20sg0zkdjz.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5lohvkfxhj20sg0mnac9.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p3h8ihj20u0140npe.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p13e10j20k00qowjz.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p26wnhj20sg11xker.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p210rcj20sg0venp0.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p2oeiij20k30p4b29.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5i7p12zuuj20j60ny13k.jpg
http://www.mzitu.com/zipai/comment-page-452/#comments
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4skej8rj20j60piag9.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4sl5510j20qo0zkdpz.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4sju2r5j20u011hgp7.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4slojnvj20qo0zh7aw.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4sk393dj20ia0odn37.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i91zg8qj20j60y30z1.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i9214z3j20qo0zkk27.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i91yorqj20hs0qo403.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i922of2j20kw0rsafv.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i920qyfj20hs0npq3q.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i97qr5lj20u0140b2a.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i924zlmj20lc0sg7g1.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i927emhj20lf0qo75r.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i93fk2aj20u011h78y.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g50w32crrhj20qo0zk4qp.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g50w2shpd0j20hs0koabc.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4skej8rj20j60piag9.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4sl5510j20qo0zkdpz.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4sju2r5j20u011hgp7.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4slojnvj20qo0zh7aw.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073gy1g5a4sk393dj20ia0odn37.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i91zg8qj20j60y30z1.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i9214z3j20qo0zkk27.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i91yorqj20hs0qo403.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i922of2j20kw0rsafv.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i920qyfj20hs0npq3q.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i97qr5lj20u0140b2a.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i924zlmj20lc0sg7g1.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i927emhj20lf0qo75r.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g55i93fk2aj20u011h78y.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g50w32crrhj20qo0zk4qp.jpg
https://wxt.sinaimg.cn/mw1024/9d52c073ly1g50w2shpd0j20hs0koabc.jpg

你可能感兴趣的:(爬虫-爬取图片)