[Python]自学笔记37:论一只爬虫的自我修养4:爬取图片

import urllib.request
import os

def url_open(url):
    """Fetch *url* and return the raw response body.

    The request is sent with a desktop-browser ``User-Agent`` header
    (presumably because the target site rejects urllib's default agent --
    confirm if this ever needs changing).

    Args:
        url: Address to request.

    Returns:
        The complete response body as ``bytes``.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
    req = urllib.request.Request(url)
    req.add_header('User-Agent', user_agent)
    # Use the response as a context manager so the underlying connection is
    # closed as soon as the body has been read, even if read() raises.
    with urllib.request.urlopen(req) as response:
        return response.read()


def find_img(url):
    """Return the addresses of the .jpg images referenced by the page at *url*.

    The page is fetched with ``url_open`` and decoded as UTF-8, then scanned
    for every ``img src=`` occurrence. For each occurrence, the text from
    just after the opening quote up to and including the first ``.jpg``
    found within 255 characters of the tag start is collected. Occurrences
    with no ``.jpg`` in that window are skipped.

    Args:
        url: Address of the HTML page to scan.

    Returns:
        A list of address strings in document order; empty when the page
        contains no matching image.
    """
    html = url_open(url).decode('utf-8')
    marker = 'img src='
    # Offset from the tag start to the address: the marker plus one quote char.
    skip = len(marker) + 1
    img_addrs = []

    start = html.find(marker)
    # Loop until no tag is left. A fixed iteration count would keep searching
    # after the last tag with start == -1, which restarts the scan near the
    # top of the page and collects the same addresses again; it would also
    # silently drop images beyond the count.
    while start != -1:
        end = html.find('.jpg', start, start + 255)
        if end != -1:
            img_addrs.append(html[start + skip:end + 4])
            resume = end
        else:
            # No .jpg near this tag: continue just past its opening quote.
            resume = start + skip
        start = html.find(marker, resume)

    return img_addrs
        
def save_imgs(img_addrs):
    """Download every address in *img_addrs* into the current directory.

    Files are named ``1.jpg``, ``2.jpg``, ... Numbers whose file already
    exists in the current directory are skipped, so calling this function
    several times in the same directory appends new images instead of
    overwriting the ones saved by an earlier call.

    Args:
        img_addrs: Iterable of image addresses accepted by ``url_open``.

    Returns:
        None.
    """
    index = 1
    for each in img_addrs:
        # Advance to the first number that is not taken yet.
        while os.path.exists(str(index) + '.jpg'):
            index += 1
        filename = str(index) + '.jpg'
        # Download before opening the file so a failed request does not
        # leave an empty image file behind.
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)
        index += 1

def download_mm(folder='ooxx', pages=10):
    """Download the images of a multi-page gallery into *folder*.

    The folder is created when missing and becomes the process's current
    working directory. Each gallery page is then scanned with ``find_img``
    and the addresses found are handed to ``save_imgs``.

    Args:
        folder: Directory that receives the images. An existing directory
            is reused rather than treated as an error.
        pages: Number of gallery pages to scan.
    """
    # exist_ok lets the script be run again without failing on the folder
    # left behind by a previous run.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)

    url = 'https://www.meitulu.com/'

    # Pages 2 .. pages + 1 are fetched (the numbering starts at 2).
    # NOTE(review): presumably the first gallery page uses a URL without the
    # "_N" suffix and is therefore never fetched -- confirm whether it should be.
    for page_num in range(2, pages + 2):
        page_url = url + 'item/' + '12568_' + str(page_num) + '.html'
        img_addrs = find_img(page_url)   # addresses of the .jpg files on this page
        save_imgs(img_addrs)             # write them to the current directory

# Start the download with the default folder and page count, but only when
# this file is executed as a script (not when it is imported).
if __name__ == "__main__":
    download_mm()
    


[Python]自学笔记37:论一只爬虫的自我修养4:爬取图片_第1张图片
爬取成功

你可能感兴趣的:(Python自学,python)