Web Scraping in Practice: Starting with Mzitu (妹子图)

With web crawlers everywhere these days, anti-scraping mechanisms mainly fall into three categories:

  1. Checking the request headers
  2. Checking user behavior
  3. Loading data through Ajax requests

The site scraped in this post uses the first two mechanisms; below I'll show how to get around them and pull the data. The requests and lxml libraries used here need to be installed manually:

pip install requests
pip install lxml
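
Before the full script, here is the core idea of the two workarounds in miniature (a minimal sketch using the same listing URL and headers as the script below): send browser-like headers so the referer/user-agent check passes, and sleep between requests so the behavior check passes.

import time
import requests

# Browser-like headers: the referer and user-agent are what the site checks
headers = {
    'referer': 'https://www.mzitu.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
}

r = requests.get('https://www.mzitu.com/xinggan/', headers=headers)
print(r.status_code)
time.sleep(1)  # pause before the next request so we do not look like a bot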

Alright, enough talk, let's get started. The full source comes first:

import os
import random
import re
import time

import requests
from lxml import etree

headers = {
    'accept-encoding':'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'referer': 'https://www.mzitu.com/',
    'cookie': 'Hm_lvt_dbc355aef238b6c32b43eacbbf161c3c=1586682048,1586697823,1586749867; Hm_lpvt_dbc355aef238b6c32b43eacbbf161c3c=1586749931',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
}

def fix_headers_page(number_page):
    '''Rebuild the referer; used when requesting a listing page'''
    headers_01 = {
    'accept-encoding':'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'referer': 'https://www.mzitu.com/xinggan/page/'+ number_page,
    'cookie': 'Hm_lvt_dbc355aef238b6c32b43eacbbf161c3c=1586682048,1586697823,1586749867; Hm_lpvt_dbc355aef238b6c32b43eacbbf161c3c=1586749931',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
    }
    return headers_01

def fix_headers_s(word):
    '''Rebuild the referer; used when requesting a gallery page'''
    headers_01 = {
    'accept-encoding':'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'referer': 'https://www.mzitu.com/'+ word,
    'cookie': 'Hm_lvt_dbc355aef238b6c32b43eacbbf161c3c=1586682048,1586697823,1586749867; Hm_lpvt_dbc355aef238b6c32b43eacbbf161c3c=1586749931',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
    }
    return headers_01

def fix_headers(word):
    '''Rebuild the referer; used when requesting each individual photo'''
    headers_01 = {
    'accept-encoding':'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'referer': 'https://www.mzitu.com/'+ word,
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
    }
    return headers_01

url = 'https://www.mzitu.com/xinggan/'

def create(addr):
    '''Create a local directory for each gallery ID'''
    creat_dir = []
    page_link_s,count = get_main_link()
    page_link_s_two = get_page_link()
    for page in page_link_s:
        pattern = re.compile(r'\d{6,10}')
        number = re.findall(pattern, page)
        word = number[0]
        creat_dir.append(word)
    for page in page_link_s_two:
        pattern = re.compile(r'\d{6,10}')
        number = re.findall(pattern, page)
        word = number[0]
        creat_dir.append(word)
    for i in creat_dir:
        os.makedirs(addr + '/' + str(i), exist_ok=True)  # exist_ok so a second call to create() does not raise FileExistsError

def get_main_link():
    '''Get the gallery links on the first listing page, plus the total number of listing pages'''
    r = requests.get(url,headers=headers)
    # time.sleep(random.randint(5,10))
    # print(r.status_code)
    html = etree.HTML(r.text)
    page_link = html.xpath('//*[@id="pins"]//span/a/@href')
    page_count_link = html.xpath('//*[@class="page-numbers"][4]/@href')
    pattern = re.compile(r'\d{1,4}')
    count = re.findall(pattern, page_count_link[0])[0]
    return page_link,count

def get_page_link():
    '''Walk the remaining listing pages and collect the gallery links on each one'''
    new_urls = []
    page_link, count = get_main_link()
    tag = 51  # caps how many listing pages are crawled; see the note at the end of the post
    if int(count) > int(tag):
        count = tag
    page_count_link = []
    url = 'https://www.mzitu.com/xinggan/page/' 
    for page in range(2,int(count)):
        temp = url + str(page)
        new_urls.append(temp)
    for new_url in new_urls:
        time.sleep(1)
        pattern = re.compile(r'\d{1,4}')
        number_page_temp = re.findall(pattern, new_url)
        number_page = number_page_temp[0]
        header = fix_headers_page(number_page)
        r = requests.get(new_url,headers=header)
        html = etree.HTML(r.text)
        page_link_count = html.xpath('//*[@id="pins"]//span/a/@href')
        if page_link_count:
            page_count_link.extend(page_link_count)  # keep every gallery link on the page, not just the first
    print(page_count_link)
    return page_count_link

def get_photo_one_link(page_link):
    '''For each gallery, get the photo page links and page count, then download the photos'''
    create(os.getcwd())
    new_urls = []
    page = []
    for i in page_link:
        # Rebuild the headers
        pattern = re.compile(r'\d{6,10}')
        number_num = re.findall(pattern, i)
        words_words = number_num[0]
        r = requests.get(i,headers=fix_headers_s(words_words))
        time.sleep(random.randint(1,5))
        html = etree.HTML(r.text)
        # Get the total number of photo pages in this gallery
        get_zong_page = html.xpath('//*[@class="pagenavi"]//a/@href')
        numberss = get_zong_page[-2][-2:]
        int_num = int(numberss)
        for num in range(2,int_num):
            page.append(num)
            num_url = i+'/'+ str(num)
            new_urls.append(num_url)
        for new_url in new_urls:
            # Get the link of each single-photo page
            # Rebuild the headers
            index_s = new_urls.index(new_url) + 2
            pattern = re.compile(r'\d{6,10}')
            number = re.findall(pattern, new_url)
            word = '/' +number[0]+ '/' + str(index_s)
            # Use the rebuilt headers
            header = fix_headers(word)
            rr = requests.get(new_url,headers=header)
            # time.sleep(random.randint(1,5))
            html_01 = etree.HTML(rr.text)
            # Extract the image download link on this page
            links = html_01.xpath('//*[@class="main-image"]//img/@src')
            for link in links:
                filename = number[0]+ '/' +link[-9:]
                print(link[-9:])
                with open(filename,'wb') as f:
                    # time.sleep(random.randint(1,2))
                    time.sleep(1)
                    ret = requests.get(link,headers=header)
                    f.write(ret.content)
        new_urls = []

        
if __name__ =='__main__':
    print('Starting download...')
    page_link, count = get_main_link()
    get_photo_one_link(page_link) # galleries on the first listing page
    page_link_ss = get_page_link()
    get_photo_one_link(page_link_ss) # galleries on the remaining listing pages
    print('Download finished!')

Let's look at the result:

[Screenshot: the crawl results]

Thanks for reading. Now some follow-up notes.

The header check on this site mainly validates the referer, which is the part that changes from request to request. When downloading the images you must pass the headers along with the request; otherwise the downloaded files cannot be opened. The relevant part of the code:

with open(filename,'wb') as f:
    time.sleep(1)
    ret = requests.get(link,headers=header)
    f.write(ret.content)
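
If you want to see the check for yourself, here is a minimal sketch (the image URL and gallery ID below are made-up placeholders; substitute ones printed by the script): fetch the same image once with the rebuilt referer and once without it, and compare the responses. Without the referer the server typically returns an error or an anti-hotlinking image instead of the real photo.

import requests

# NOTE: placeholder URL and gallery ID, for illustration only
image_url = 'https://example-image-host.com/2020/04/12345a01.jpg'
ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'

with_referer = requests.get(image_url, headers={'referer': 'https://www.mzitu.com/12345/1', 'user-agent': ua})
without_referer = requests.get(image_url, headers={'user-agent': ua})

# The hotlink-protected response is usually a different size and/or status code
print(with_referer.status_code, len(with_referer.content))
print(without_referer.status_code, len(without_referer.content))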

As for the second mechanism, the user-behavior check, adding a delay between requests is enough.
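
The script does this with time.sleep plus random.randint; if you want to tidy it up, a tiny helper like this (polite_sleep is just a name I made up) keeps the delays in one place:

import random
import time

def polite_sleep(low=1, high=5):
    '''Sleep a random number of seconds between requests to look more like a human reader.'''
    time.sleep(random.randint(low, high))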

Note: to crawl more data you need to lift the limit in the get_page_link function; just comment it out. Otherwise you can only grab about 52 listing pages of galleries (24 galleries per page, and each gallery has quite a few photos)!
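
Concretely, the limit is the tag check in get_page_link. A small sketch of the same logic (get_capped_count is a hypothetical helper, not part of the script above) shows what commenting it out changes:

def get_capped_count(count, tag=51, use_cap=True):
    '''Same capping logic as in get_page_link; with use_cap=False every listing page is crawled.'''
    if use_cap and int(count) > int(tag):
        count = tag
    return int(count)

# get_capped_count('200')                 -> 51  (capped)
# get_capped_count('200', use_cap=False)  -> 200 (no cap)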


The code is pretty rough (a bit embarrassing), so just take it as a reference.
