Scraping Beauty Images
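
This post is a small, self-contained image scraper built on requests and lxml. It walks the listing pages of a meitulu.com tag, uses XPath to pull out each album's numeric ID, title, and image count, then downloads every JPEG in every album, rotating User-Agent strings and sleeping a random interval between requests to sidestep the site's anti-crawling measures. The full script follows.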

import requests
import lxml.html
import re
import time
import os
import random



# A pool of User-Agent strings; every requests call picks one at random to make
# blocking by the server less likely
user_agent_list = [
    # Windows / Firefox 58
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:58.0) Gecko/20100101 Firefox/58.0",
    # Linux / Firefox 58
    "Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0",
    # Mac OS X / Safari 11.0.2
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_2) AppleWebKit/603.1.13 (KHTML, like Gecko) Version/11.0.2 Safari/603.1.13",
    # Windows / IE 11
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
    # Windows / Edge 16
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/16.16299.15.0",
    # Windows / Chrome 63
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    # Android Phone / Chrome 63
    "Mozilla/5.0 (Linux; Android 7.0; SM-G935P Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 Mobile Safari/537.36",
    # Android Tablet / Chrome 63
    "Mozilla/5.0 (Linux; Android 4.4.4; Lenovo TAB 2 A10-70L Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 Safari/537.36",
    # iPhone / Safari 11.1.1
    # "Mozilla/5.0 (iPhone; CPU iPhone OS 11_1_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/11.1.1 Mobile/14E304 Safari/602.1",
    # iPad / Safari 11.1.1
    "Mozilla/5.0 (iPad; CPU OS 11_1_1 like Mac OS X) AppleWebKit/603.3.3 (KHTML, like Gecko) Version/11.1.1 Mobile/14G5037b Safari/602.1"]

requests_header = {
    "Host": "",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Accept": "",
    "Accept-Language": "zh-CN,en-US;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Referer": "",
    "Connectionv": "keep-alive",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache"
}
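
# Note: "Host", "Accept", and "Referer" above start out blank on purpose; they
# are filled in per request by download_page_html() and download_picture() below.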

PICTURE_PATH = "f:/meitulu"  # root directory downloaded albums are saved under



def download_page_html(url):
    """Fetch url and return its HTML text, or None if the request fails."""
    phtml = None
    page = None

    try:
        requests_header["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
        requests_header["Host"] = "www.meitulu.com"
        requests_header["Referer"] = url

        # Pick a random User-Agent for this request
        requests_header["User-Agent"] = random.choice(user_agent_list)
        page = requests.get(url=url, headers=requests_header,
                            timeout=15)  # request the given page
        if page.encoding == "ISO-8859-1":
            page.encoding = "utf-8"  # force UTF-8 so the Chinese text is not garbled
        phtml = page.text  # the HTML body of the response
    except requests.exceptions.RequestException as e:
        print("requests error:", e)
        phtml = None
    finally:
        if page is not None:
            page.close()  # release the connection
    return phtml
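
# A more resilient variant (a sketch, not part of the original script) would
# reuse one requests.Session with automatic retries instead of issuing a fresh
# top-level request each time:
#
#     from requests.adapters import HTTPAdapter
#     from urllib3.util.retry import Retry
#
#     session = requests.Session()
#     session.mount("https://",
#                   HTTPAdapter(max_retries=Retry(total=3, backoff_factor=1)))
#     page = session.get(url, headers=requests_header, timeout=15)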

def download_picture(url, page, dirname):
    """Download one image from url and save it under PICTURE_PATH/dirname."""
    try:
        picdir = "{0}/{1}".format(PICTURE_PATH, dirname)  # directory the image is saved into
        print(picdir)
        os.makedirs(picdir, exist_ok=True)  # create the directory tree if needed

        pic_name = url.split("/")[-1]  # name the file after the part following the last /
        pic_full_name = "{0}/{1}".format(picdir, pic_name)

        requests_header["Accept"] = "image/webp,*/*"
        requests_header["Host"] = "mtl.ttsqgs.com"
        requests_header["Referer"] = page

        response = requests.get(
            url, headers=requests_header, timeout=15)  # the body is the raw image
        imgdata = response.content  # image bytes to be written to disk

        if len(imgdata) > (5 * 1024):  # only keep images larger than 5 KB
            with open(pic_full_name, 'wb') as f:  # 'wb': write in binary mode
                f.write(imgdata)  # the with statement closes f automatically
            print("save picture to:", pic_full_name)
        else:
            print("picture size too small")
        response.close()
    except Exception as e:
        print("download picture {0} error: {1}".format(url, e))


# Work out the total album count and how many listing pages need to be crawled
def get_page_list_num(tree):
    page_all_num = 0
    try:
        # Use an XPath selector to pull the pager text out of the page
        page_all_num = tree.xpath('//div[@id="pages"]/a/text()')[0]
        page_all_num = re.sub(r"\D", "", str(page_all_num))  # drop every non-digit character
        page_all_num = int(page_all_num)
        print("max_page_number:", page_all_num)
    except Exception as e:
        print("get page number error:", e)
        page_all_num = 0
    # Each listing page holds 60 albums, so divide the album total by 60,
    # rounding up
    page_list_num = page_all_num // 60
    if page_all_num % 60 != 0:
        page_list_num += 1
    return page_list_num, page_all_num
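
# The floor-division-plus-remainder above is just ceiling division; an
# equivalent one-liner (a sketch, not in the original script) would be:
#
#     import math
#     page_list_num = math.ceil(page_all_num / 60)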


def get_page_album_list(tree):  # extract the list of album IDs on the page
    page_album_list = tree.xpath('//ul[@class="img"]/li/a/@href')
    for i in range(len(page_album_list)):
        # keep the part after the last /, e.g. "17748.html"
        page_album_list[i] = page_album_list[i].split("/")[-1]
        # strip the non-digits, leaving the numeric album ID
        page_album_list[i] = re.sub(r"\D", "", page_album_list[i])
    return page_album_list
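
# Equivalently, the whole extraction collapses into one comprehension
# (a behavior-preserving sketch):
#
#     return [re.sub(r"\D", "", href.split("/")[-1])
#             for href in tree.xpath('//ul[@class="img"]/li/a/@href')]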


def get_page_title_list(tree):  # extract the album titles on the page
    return tree.xpath('//ul[@class="img"]/li/a/img/@alt')


def get_page_jpgnum_list(tree):  # extract the per-album image counts on the page
    page_jpgnum_list = tree.xpath('//ul[@class="img"]/li/p[1]/text()')
    for i in range(len(page_jpgnum_list)):
        # keep only the digits, then convert to an integer
        page_jpgnum_list[i] = int(re.sub(r"\D", "", page_jpgnum_list[i]))
    return page_jpgnum_list


REQUEST_URL0 = "https://www.meitulu.com/t/siwayouhuo/"
REQUEST_URL1 = "https://www.meitulu.com/t/siwayouhuo/{0}.html"

REQUEST_ALBUM_URL = "https://www.meitulu.com/item/{0}.html"
REQUEST_JPEG_URL = "https://mtl.ttsqgs.com/images/img/{0}/{1}.jpg"
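
# Example, using the sample album ID "17748" from the comment in
# get_page_album_list():
#     REQUEST_ALBUM_URL.format("17748")   -> "https://www.meitulu.com/item/17748.html"
#     REQUEST_JPEG_URL.format("17748", 1) -> "https://mtl.ttsqgs.com/images/img/17748/1.jpg"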

if __name__ == "__main__":

    requests_url = REQUEST_URL0

    print("requests_url:", requests_url)
    page_html_list = download_page_html(requests_url)  # fetch the first listing page
    if page_html_list is None:
        exit()

    tree = lxml.html.fromstring(page_html_list)
    page_list_num, page_all_num = get_page_list_num(tree)  # listing-page count / total albums
    print(page_list_num, page_all_num)

    for idx in range(page_list_num):
        # The first listing page has no page number in its URL
        if idx == 0:
            requests_url = REQUEST_URL0
        else:
            requests_url = REQUEST_URL1.format(idx + 1)
        print(requests_url)

        page_html_list = download_page_html(requests_url)  # fetch this listing page
        if page_html_list is None:
            continue

        tree = lxml.html.fromstring(page_html_list)

        page_album_list = get_page_album_list(tree)
        print(idx, len(page_album_list))
        page_title_list = get_page_title_list(tree)
        print(idx, len(page_title_list))
        page_jpgnum_list = get_page_jpgnum_list(tree)
        print(idx, len(page_jpgnum_list))

        # Skip this page if the extracted lists are inconsistent or empty
        if (len(page_album_list) != len(page_title_list)
                or len(page_album_list) == 0
                or len(page_jpgnum_list) == 0):
            continue

        for lst in range(len(page_album_list)):
            for img in range(page_jpgnum_list[lst]):
                jpeg_url = REQUEST_JPEG_URL.format(page_album_list[lst], img + 1)
                page_url = REQUEST_ALBUM_URL.format(page_album_list[lst])
                jpg_title = page_title_list[lst]
                print("Download [{0}] on [{1}], title [{2}]".format(
                    jpeg_url, page_album_list[lst], jpg_title))
                download_picture(jpeg_url, page_url, jpg_title)

                # Sleep a random 1-5 seconds so the crawl does not hammer the server
                web_sleep = random.randint(1, 5)
                time.sleep(web_sleep)
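
The throttling decisions are deliberately simple: every request draws a fresh User-Agent from user_agent_list, every image download is followed by a random 1-5 second sleep, and anything under 5 KB is discarded as a likely placeholder. Tuning the crawl speed or the storage location (PICTURE_PATH) only means touching those few spots.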
