爬虫案例之爬取win4000的图片

代码如下:

# coding=utf-8


import os
import requests
from PIL import Image
from io import BytesIO
from lxml import etree
from PIL import ImageFile


ImageFile.LOAD_TRUNCATED_IMAGES = True


# 先定义一个opener函数:
def open_mn_web(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
                   'Cookie': 'BAIDU_SSP_lcr=https://www.baidu.com/link?url=23hwYBuWpwUfbSbQSvJgY869r7hcpListVsxpmAC-8WMK1c1KF5UZJmeUzQ5tn7ZgIs48xWvDcxNo4KNgSsk0a&wd=&eqid=acacb6c4000067ee000000045cd99b1d; Hm_lvt_492109f03bd65de28452325006c4a53c=1557764936; security_session_verify=409ca2b5630fd5ab4c56c0bee5a3540b; Hm_lpvt_492109f03bd65de28452325006c4a53c=1557771292',
                   'Referer': url}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.content
        return None
    except (RecursionError, ConnectionError):
        return None


# 解析器
def parse_mn_web(html,xpath):
    htmls = etree.HTML(html)
    putout = htmls.xpath(xpath)
    return putout


# 存储图片
def save_img(data, offset):
    path_root = r'E:\7160mn\win_4000'
    path = os.path.join(path_root, offset,)
    if not os.path.exists(path):
        os.makedirs(path)
    if requests.get(data):
        img_temp = Image.open(BytesIO(requests.get(data).content))
    else:
        img_temp = Image.open(BytesIO('win_4000/fail_to_load_img.jpg'.encode('utf-8')))
    img_name = data[(data.rfind('/')+1):]
    img_temp.save(path + '\\' + img_name, dpi=(300, 300))
    print(img_name, '已经保存')


# 主函数
def main():
    img_xpath = '//div[contains(@class,"scroll-img-cont")]/ul/li/a/img/@data-original'
    url_xpath = '//div[@class="Left_bar"]//ul[@class="clearfix"]/li/a/@href'
    tags = {
        '2': '美女写真',
        '3': '清纯美女',
        '4': '性感美女',
        '5': '明星美女',
        '6': '空姐学生',
        '7': '游戏美女',
        '26': 'CosPlay'
    }
    for key in tags.keys():
        for k in ['1', '2', '3', '4', '5']:
            url = 'http://www.win4000.com/meinvtag' + key + '_' + k + '.html'
            html = open_mn_web(url)
            url_data = parse_mn_web(html, url_xpath)
            for i in url_data:
                html_1 = open_mn_web(i)
                img_src = parse_mn_web(html_1, img_xpath)
                for j in img_src:
                    img_really = j.replace('_130_170.jpg', '.jpg')
                    save_img(img_really, tags[key])


if __name__ == '__main__':
    main()



实际运行的时候在爬到第三分类页面时报错了:

requests.exceptions.ConnectionError: HTTPConnectionPool(host='pic1.win4000.com', port=80): Max retries exceeded with url: /pic/a/90/d377cfc6b1.jpg (Caused by NewConnectionError(': Failed to establish a new connection: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。'))

有没有知道怎么解决的大神。。。。

你可能感兴趣的:(爬虫,python,美女图片,win4000)