## Scraping every image on MM131 to local disk with Python

After the rewrite, the script uses a queue and a thread pool, which speeds the crawl up considerably.
Note: building the initial URL queue takes a bit over a minute at start-up, because every listing page is fetched one by one before any download threads are started.
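
If that start-up wait matters, one way to hide it is to run the URL producer alongside the download workers, so downloads begin as soon as the first gallery URL is queued. Below is a minimal sketch of that producer-consumer pattern, assuming a sentinel-terminated queue; `worker`, `run`, and `produce_urls` are illustrative names and not part of the script that follows:

```python
from queue import Queue
from threading import Thread

task_queue = Queue()
SENTINEL = object()  # unique marker: no more URLs will arrive

def worker(download):
    """Consume gallery URLs until the producer signals completion."""
    while True:
        url = task_queue.get()
        if url is SENTINEL:
            task_queue.put(SENTINEL)  # re-post so the other workers also exit
            break
        download(url)

def run(produce_urls, download, n_workers=15):
    """Start the workers first, then fill the shared queue."""
    threads = [Thread(target=worker, args=(download,)) for _ in range(n_workers)]
    for t in threads:
        t.start()
    produce_urls(task_queue)   # downloads begin as soon as the first URL lands
    task_queue.put(SENTINEL)
    for t in threads:
        t.join()
```

With that layout the pool stays busy while later listing pages are still being fetched. The full script follows.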

```python
import os
import time
from concurrent.futures import ThreadPoolExecutor
from queue import Queue

import requests
from lxml import etree


class MmSpider:
    def __init__(self):
        self.base_url = 'https://m.mm131.net/more.php?page='  # base URL of the paginated listing
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                                      '(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
                        'Referer': 'https://m.mm131.net/'}

    def get_html(self, url):
        """
        Fetch and parse a page.
        :param url: the URL to fetch
        :return: the page as an lxml element tree, or None if nothing came back
        """
        response = requests.get(url, headers=self.headers, timeout=10)
        # Only parse responses that actually carry content
        if response.ok and response.text:
            response.encoding = response.apparent_encoding
            return etree.HTML(response.text)
        return None

    def get_url(self):
        """
        Collect every gallery URL from the paginated listing and put it on the queue.
        """
        i = 1
        while True:
            page_url = self.base_url + str(i)
            html = self.get_html(page_url)
            if html is None:
                break
            post_links = html.xpath('//article/div[2]/a/@href')
            if not post_links:  # no gallery links -> past the last listing page
                break
            for item in post_links:
                queue.put(item)
            print(post_links)
            i += 1

    def get_picture_info(self, url):
        """
        Collect every image URL in one gallery, then download it.
        :param url: the gallery's URL
        """
        html = self.get_html(url)
        if html is None:
            return
        picture_info = {}  # holds the gallery title and its image URLs
        picture_info['title'] = html.xpath('//div[@class="post-header"]/h2/text()')[0]  # gallery title
        # The first image's URL gives the fixed prefix shared by every image in the gallery
        first_url = html.xpath('//div[@class="post-content single-post-content"]/a/img/@src')[0].rsplit('/', 1)[0]
        # The page counter text looks like '1/45页'; take the number after the slash
        last_page = html.xpath('//div[@class="paging"]/span/text()')[0].replace('页', '').split('/')[1]
        # Build the URL of every image; a list (not a set) keeps them in page order
        picture_info['url'] = [first_url + '/' + str(i) + '.jpg' for i in range(1, int(last_page) + 1)]
        self.download(picture_info)
        print(time.time() - start)  # elapsed time so far

    def download(self, picture_info):
        """
        Download every image in one gallery.
        :param picture_info: the gallery title and its image URLs
        """
        title = picture_info['title']
        file_path = 'MM131图集/' + title + '/'  # directory for this gallery
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        for i, url in enumerate(picture_info['url'], start=1):
            response = requests.get(url, headers=self.headers, timeout=10)
            with open(file_path + str(i) + '.jpg', 'wb') as f:
                f.write(response.content)
            print('Downloaded: {} {}'.format(title, i))


if __name__ == '__main__':
    start = time.time()
    queue = Queue()
    spider = MmSpider()
    spider.get_url()  # fill the queue first; this is the one-minute wait noted above
    pool = ThreadPoolExecutor(max_workers=15)
    # Hand every gallery over to the thread pool
    while not queue.empty():
        pool.submit(spider.get_picture_info, queue.get())
    pool.shutdown(wait=True)  # block until every submitted gallery has finished
```

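One caveat: the gallery title is used verbatim as a directory name, so a title containing a character such as `/`, `?`, or `:` would make `os.makedirs` fail or create unintended nesting. A small, hypothetical `safe_title` helper (not in the original script) could guard against that:

```python
import re

def safe_title(title):
    """Strip characters that are invalid in Windows/Unix file names."""
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

# e.g. inside download():
# file_path = 'MM131图集/' + safe_title(picture_info['title']) + '/'
```
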
The result looks like this:
*(screenshot of the script's output)*
