Python crawler -- scraping photos from Mzitu

## Bulk-scraping Mzitu image galleries

## URL: www.mzitu.com

Works with Python 3+; can be run directly in PyCharm.
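
The script starts from a tag listing page, extracts all gallery links with XPath, follows each gallery's next-page links to download up to `num` images, then moves on to the next listing page. It depends on the third-party `requests` and `lxml` packages, which you may need to install first (e.g. `pip install requests lxml`).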

```python
# -*- coding: utf-8 -*-
'''
@author: zzg
@contact: [email protected]
@file: zzg_sister.py
@time: 2018/11/11 0:58
'''
import os
import requests
from lxml import etree
import time


class MM(object):
    """爬url"""

    def __init__(self, url):
        self.url = url
        self.headers = {
            # a browser-style User-Agent so the request is not rejected as a bot
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36",
            # mzitu checks the Referer header to block hotlinking
            "Referer": "https://www.mzitu.com/tag/zhennihua/",
        }

    def parse_url(self, url):  # fetch a URL and return the decoded HTML
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_tag(self, response, tag):  # extract nodes matching an XPath expression
        html = etree.HTML(response)
        ret = html.xpath(tag)
        return ret

    def save_pict(self, file_path, url_jpg):  # download and save a single image
        ret = requests.get(url_jpg, headers=self.headers)
        with open(file_path, 'wb') as f:
            f.write(ret.content)
        print("Image saved")

    def run(self, num):
        while True:
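            # loop over listing pages until a lookup below fails
            # (the except block in __main__ then ends the run)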
            # 1. Request the listing page
            response = self.parse_url(self.url)

            # 2. Extract all gallery URLs from the current listing page
            url_list = self.get_tag(response, '//ul[@id="pins"]/li/a/@href')
            print(url_list)

            # 3. Walk each gallery and fetch its image URLs
            for url in url_list:
                i = 1  # number of images to fetch from this gallery
                while i <= num:
                    # 3.1 Request the gallery page
                    response_jpg = self.parse_url(url)
                    error_url = url
                    try:
                        # 3.2 Extract the image URL, the next-page URL, and the image title
                        url_jpg = self.get_tag(response_jpg, '//div[@class="main-image"]//a/img/@src')[0]  # xpath returns a list; take the first item
                        url = self.get_tag(response_jpg, '//div[@class="pagenavi"]/a[last()]/@href')[0]
                        name = self.get_tag(response_jpg, '//div[@class="main-image"]//a/img/@alt')[0]
                        print(url_jpg, url, name)
                    except IndexError:
                        print("Index error, failing URL: {}".format(error_url))
                        with open("error_url.txt", 'a') as f:
                            f.write(error_url)
                            f.write('\n')
                        break  # skip this gallery and move on

                    try:
                        # 3.3 Save the image
                        file_path = './zengni/{}_{}.jpg'.format(name, i)  # build the save path and file name
                        self.save_pict(file_path, url_jpg)
                    except Exception as e:
                        print(e)
                        print("Image path: {}".format(file_path))
                        break  # skip to the next gallery

                    # 3.4 Advance to the next page of this gallery
                    i += 1
                time.sleep(0.2)

            # 4. Get the next listing page of galleries
            self.url = self.get_tag(response, '//div[@class="nav-links"]/a[last()]/@href')[0]
            print("下一套套图url:{}".format(self.url))

            print("休息休息---1秒")
            time.sleep(1)

            # 5. Loop back and crawl the next listing page


if __name__ == '__main__':
    print('---start---')
    url = "https://www.mzitu.com/tag/zhennihua//"
    m = MM(url)
    # m.run(6)
    # create the ./zengni output folder if it does not exist
    if not os.path.exists("./zengni"):
        os.mkdir('./zengni')

    try:
        m.run(40)
    except Exception as e:
        print(e)
        print("结束")
        print(m.url)
    print('---over---')
```
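
As a side note, everything above hinges on one pattern: parse the HTML with lxml and pull attributes out with XPath, where `xpath()` always returns a list (hence the `[0]` indexing). Below is a minimal, self-contained sketch of that pattern; the HTML fragment is a made-up stand-in for a real mzitu gallery page, so only the selector shapes match the script above.

```python
# Minimal sketch of the lxml + XPath pattern behind MM.get_tag().
# The HTML below is a hypothetical stand-in for a mzitu gallery page.
from lxml import etree

html_text = """
<div class="main-image">
  <a href="page-2.html"><img src="https://i.example.com/1.jpg" alt="demo gallery"></a>
</div>
<div class="pagenavi">
  <a href="page-1.html">1</a>
  <a href="page-2.html">next</a>
</div>
"""

html = etree.HTML(html_text)  # parse (possibly partial) HTML into an element tree
# xpath() returns a list of matches, which is why the script indexes [0]
url_jpg = html.xpath('//div[@class="main-image"]//a/img/@src')[0]
next_url = html.xpath('//div[@class="pagenavi"]/a[last()]/@href')[0]
name = html.xpath('//div[@class="main-image"]//a/img/@alt')[0]
print(url_jpg, next_url, name)  # -> https://i.example.com/1.jpg page-2.html demo gallery
```

If a selector matches nothing, the `[0]` raises IndexError, which is exactly the case the script catches in order to log `error_url` and skip the gallery.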
