2019-01-14 Image Crawling

import requests
import os
from lxml import etree


class Spider(object):
    def __init__(self):
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
            "referer": "https://www.mzitu.com/"
        }

    def request_data(self):
        # 1. Fetch the listing page
        response = requests.get("https://www.mzitu.com/", headers=self.headers)
        html = etree.HTML(response.text)
        # 2. Extract each album's link and title
        class_tit = html.xpath('//ul[@id="pins"]/li/span/a/text()')
        class_href = html.xpath('//ul[@id="pins"]/li/span/a/@href')
        # Create a folder per album, then download its images
        for tit, src in zip(class_tit, class_href):
            if not os.path.exists(tit):
                os.mkdir(tit)
                self.download_img_data(src, tit)
    def download_img_data(self, src, tit):
        # 3. Fetch the album page and read the total number of image pages
        response = requests.get(src, headers=self.headers)
        html = etree.HTML(response.text)
        img_num = html.xpath('//div[@class="pagenavi"]/a[5]/span/text()')
        for i in range(1, int(img_num[0]) + 1):
            # 4. Fetch each image page, then extract its title and the full-size image URL
            img_data = requests.get(src + "/" + str(i), headers=self.headers)
            html = etree.HTML(img_data.text)
            img_tit = html.xpath('//h2/text()')
            img_href = html.xpath('//div[@class="main-image"]/p/a/img/@src')
            for imgtit, imgsrc in zip(img_tit, img_href):
                jpg_name = os.path.join(tit, tit + str(i) + ".jpg")
                content = requests.get(imgsrc, headers=self.headers).content
                print("Downloading image...")
                # 5. Save the image to disk
                with open(jpg_name, "wb") as f:
                    f.write(content)


spider = Spider()
spider.request_data()
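
The script above has no error handling: a timed-out request, a missing pagenavi element, or a broken image URL will abort the whole run. Below is a minimal sketch of one way to harden the download step (step 5); the save_image helper, the retry count, and the 10-second timeout are my own assumptions, not part of the original script.

import time

import requests


def save_image(img_url, jpg_name, headers, retries=3):
    """Download img_url to jpg_name, retrying a few times on network errors."""
    for attempt in range(retries):
        try:
            # Stream the response so large images are written in chunks
            resp = requests.get(img_url, headers=headers, stream=True, timeout=10)
            resp.raise_for_status()
            with open(jpg_name, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    f.write(chunk)
            return True
        except requests.RequestException as exc:
            print("Attempt %d failed for %s: %s" % (attempt + 1, img_url, exc))
            time.sleep(1)  # brief pause before retrying, and to stay polite to the site
    return False

Inside download_img_data, step 5 would then shrink to a single save_image(imgsrc, jpg_name, self.headers) call, and a False return value can simply be logged instead of crashing the whole crawl.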
