# Python3爬取meitulu(源码+详细注释)

# -*- coding=UTF-8 -*-

import time
import requests
import re
import os
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError, ReadTimeout
##
# Base URL of the "rihan" category on the site being crawled.
url = 'https://www.meitulu.com/rihan/'

# Mobile Chrome (Android / Nexus 5) user-agent string shared by both header sets.
_MOBILE_USER_AGENT = (
    'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/79.0.3945.79 Mobile Safari/537.36'
)

# Request headers sent by getURL when fetching HTML listing pages.
# NOTE(review): 'Host' names m.meitulu.com while the crawled URLs use
# www.meitulu.com, and 'br' is advertised in Accept-Encoding, which presumably
# needs a brotli decoder to be installed -- confirm both are intentional.
headers = {
    'User-Agent': _MOBILE_USER_AGENT,
    'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Host': 'm.meitulu.com',
    "Referer": "https://mtl.gzhuibei.com/",
}

# Minimal request headers sent by downImage when fetching the image files.
headers2 = {
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': _MOBILE_USER_AGENT,
}

def getURL(url, timeout=10):
    """Fetch a page with the module-level ``headers`` and return its text.

    Args:
        url: Address of the page to download.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a single stalled connection would block the
            whole crawl indefinitely.

    Returns:
        The response body decoded as UTF-8. The HTTP status code is not
        checked, so an error page is returned as text like any other page.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 decoding so the page text is not garbled.
    response.encoding = 'utf-8'
    return response.text

def doDown(save_root='C:/Users/25308/Desktop/Python/美图录/', delay=None):
    """Crawl every listing page of the category and download all albums.

    For each of the 94 listing pages, every ``<li>`` inside ``<ul class="img">``
    is treated as one album: the thumbnail's ``alt`` text is the album title
    and its ``src`` is the thumbnail URL from which the album id is extracted.
    A folder per album is created under ``save_root`` (via ``mkdir``, which
    also changes the working directory into it) and each image is fetched
    with ``downImage``.

    Args:
        save_root: Directory under which one sub-folder per album is created.
        delay: Seconds to sleep after each image. When ``None``, the
            module-level ``yanshi`` is used if it exists, otherwise 0.5.

    Raises:
        requests.exceptions.RequestException: If a listing page cannot be
            fetched. Failures inside a single album are caught and printed,
            and the crawl continues with the next album.
    """
    if delay is None:
        # ``yanshi`` is only defined when the file runs as a script; fall back
        # to the same 0.5 s default when this function is called on its own.
        delay = globals().get('yanshi', 0.5)

    # The page count (94) is hard-coded for this category.
    for i in range(1, 95):
        # Example: https://www.meitulu.com/rihan/2.html
        mu_lu_url = "https://www.meitulu.com/rihan/" + str(i) + ".html"
        soup2 = BeautifulSoup(getURL(mu_lu_url), 'html.parser')

        album_list = soup2.find("ul", {"class": "img"})
        if album_list is None:
            # Unexpected page layout (or an error page): skip instead of crashing.
            print('第', i, '页未找到图集列表,跳过')
            continue

        for mu_lu in album_list.find_all("li"):
            cover = mu_lu.find("img")
            if cover is None or not cover.get('alt') or not cover.get('src'):
                # Not an album entry; nothing to download from this <li>.
                continue
            result = {
                'title': cover['alt'],
                'link': cover['src'],
            }
            print('图集名称为:', result['title'],',图片地址为:',result['link'])

            # The image count is the bracketed number within the last five
            # characters of the title, e.g. "...[45]".
            img_title = result['title'][-5:]
            img_count = img_title[img_title.find('[') + 1:img_title.rfind(']')]

            # Album id: the path segment between "img/" and "/0.jpg" in the
            # thumbnail URL. It is the same for every image, so compute it once.
            start2 = result['link'].find('img/')
            end2 = result['link'].rfind('/0.jpg')
            img_url = result['link'][start2 + 4:end2]

            # Replace characters that are not allowed in Windows file names so
            # that an unusual title cannot make folder creation fail.
            folder = re.sub(r'[\\/:*?"<>|]', '_', result['title']).strip()

            try:
                mkdir(os.path.join(save_root, folder))
                # Images are numbered starting at 1 (the listing thumbnail is
                # 0.jpg), so the upper bound must include img_count itself.
                for j in range(1, int(img_count) + 1):
                    # Example: https://mtl.gzhuibei.com/images/img/17868/2.jpg
                    real_img_url = 'https://mtl.gzhuibei.com/images/img/' + img_url + '/' + str(j) + '.jpg'
                    downImage(j, real_img_url)
                    time.sleep(delay)
            except Exception as ex:
                # Best effort: report the problem and move on to the next album.
                print("出现如下异常%s"%ex)

def downImage(name, image, timeout=10):
    """Download one image and save it as ``<name>.jpg`` in the working directory.

    The file is only created when the server answers with HTTP 200, so a
    failed download no longer leaves an empty ``.jpg`` behind.

    Args:
        name: Value used as the file name (converted with ``str``).
        image: URL of the image to download.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
        OSError: If the file cannot be written.
    """
    print('第', name, '张图片', ',图片实际地址为:', image)
    # A separate header set is used here; per the original author's note,
    # the request otherwise comes back as 404.
    img = requests.get(image, headers=headers2, timeout=timeout)
    print('请求返回结果:', img)
    # Compare the numeric status: str(img) is "<Response [200]>", never an
    # empty string, so the old string comparison never wrote any data.
    if img.status_code != 200:
        return
    print('下载图片...', end='')
    with open(str(name) + '.jpg', 'wb') as f:
        f.write(img.content)

def mkdir(name):
    """Ensure directory ``name`` exists and make it the working directory.

    Args:
        name: Path of the directory to create and enter. Missing parent
            directories are created as well.

    Raises:
        OSError: If the directory cannot be created or entered (for example
            when ``name`` exists but is a regular file).
    """
    if os.path.exists(name):
        print('文件已经存在,不需要创建!')
    else:
        print('创建文件夹:')
        # makedirs creates missing parents too; exist_ok covers the case where
        # the directory appears between the check above and this call.
        os.makedirs(name, exist_ok=True)
    # Later downloads write to relative paths, so switch into the directory.
    os.chdir(name)

if __name__ == '__main__':
    # Delay between image downloads, in seconds; doDown reads this
    # module-level name.
    yanshi = 0.5
    # Create (or reuse) the download root folder and switch into it.
    mkdir('C:/Users/25308/Desktop/Python/美图录/')
    # Start the crawl.
    doDown()

# 你可能感兴趣的:(Python,爬虫)