Fixing incomplete downloads when batch-downloading files with urllib.request.urlretrieve(), while avoiding an endless loop when a download takes too long

Combining fixes found online, this post resolves the problems I ran into while downloading files with urlretrieve(): files arriving incomplete, and the download getting stuck in an endless loop when it takes too long.
References: reference 1, reference 2
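
The core of the fix is small: give every socket operation a default timeout, so a stalled transfer raises socket.timeout instead of hanging forever, and cap the number of retries, so one bad URL cannot loop endlessly. A minimal sketch of that idea (the function name fetch and its parameters are only illustrative; the full script follows below):

import socket
import urllib.request
from urllib.error import ContentTooShortError

socket.setdefaulttimeout(10)  # a stalled transfer now raises socket.timeout instead of hanging

def fetch(url, filename, max_retries=5):
    """Call urlretrieve() at most max_retries times instead of retrying forever."""
    for attempt in range(1, max_retries + 1):
        try:
            urllib.request.urlretrieve(url, filename)
            return True
        except (socket.timeout, ContentTooShortError) as err:
            print('Attempt %d failed: %s' % (attempt, err))
    return False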

# -*- coding: utf-8 -*-

import pathlib
import re
import socket
import time
import urllib.request

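# The trailing backslashes make Python join all of these lines into one long string;
# re.findall() further down pulls the individual .jpg URLs back out of it.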
url = "http://www.cc518.com/res/2017/10-09/13/d359aa8796e5612be81623bb6f03aa19.jpg\
http://www.cc518.com/res/2017/10-09/13/cc5851398893b72fc58aadc85d4dafe7.jpg\
http://www.cc518.com/res/2017/10-09/13/1cb1c68aac661884638d4a9700f56733.jpg\
http://www.cc518.com/res/2017/10-09/13/09cb849cdcf54068bb24ec90b7fd3008.jpg\
http://www.cc518.com/res/2017/10-09/13/860ba3ba2ed5c20f36cddcd3f02d691a.jpg\
http://www.cc518.com/res/2017/10-09/13/66888f05ce1678b3734d10850e72fb2a.jpg\
http://www.cc518.com/res/2017/10-09/13/7e4838993718e8c78bd8f4c319c609a2.jpg\
http://www.cc518.com/res/2017/10-09/13/10fa45fc14981e835019f25e84e4481e.jpg\
"

# urlretrieve() has no headers parameter (its 4th positional argument is POST data), so
# install a global opener that sends a browser User-Agent with every request instead.
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36')]
urllib.request.install_opener(opener)

def callback(blocknum, blocksize, totalsize):
    """
    Report download progress; passed to urlretrieve() as its reporthook.
    :param blocknum: number of data blocks transferred so far
    :param blocksize: size of each data block, in bytes
    :param totalsize: total size of the remote file, in bytes
    :return: None
    """
    download_pg = 100.0 * blocknum * blocksize / totalsize
    if download_pg > 100:
        download_pg = 100

    print("Current download progress: %.2f%%" % download_pg)



def download(url, filename, callback):
    """
    Wrapper around urlretrieve() that calls itself again when a download fails,
    e.g. when the file arrives incomplete (ContentTooShortError).
    :param url: address to download from
    :param filename: path to save the downloaded file to
    :param callback: progress hook passed straight through to urlretrieve()
    :return: None
    """

    try:
        urllib.request.urlretrieve(url, filename, callback)
    except socket.timeout:
        # Let a stalled connection propagate to the caller's bounded retry loop;
        # swallowing it here would make this function recurse without limit.
        raise
    except Exception as e:
        print(e)
        print('Network conditions are not good.\nReloading...')
        download(url, filename, callback)


img_urls = re.findall('http.*?jpg', url)
print(img_urls)
file_name = "图片"
index = 1
print("\n")
for img_src in img_urls:
    # print(img_src)
    print("\n")
    img_path = '.\\' + file_name + '{}.jpg'.format(index)
    index += 1
    i = pathlib.Path(img_path)
    if not i.exists():
        print("\n")
        print('Downloading data from %s' % img_src)
        time.sleep(1.5)  # short pause before each download (personal habit)

        # Give every socket operation a 10 s default timeout
        socket.setdefaulttimeout(10)
        # Fix incomplete downloads without getting stuck in an endless loop
        try:
            download(img_src, img_path, callback)
            print(img_path, '\nDownload finished!')
            # print('File size = %.2f Mb' % (filesize / 1024 / 1024))
        except socket.timeout:
            count = 1
            while count <= 5:
                try:
                    download(img_src, img_path, callback)  # retry the same image with the urlretrieve() wrapper
                    break
                except socket.timeout:
                    err_info = 'Reloading for %d time' % count if count == 1 else 'Reloading for %d times' % count
                    print(err_info)
                    count += 1
            if count > 5:
                print("downloading picture fialed!")
    else:
        print("\n")
        print(img_path, "'File already exsits!'")
    # Report the size of the file on disk (st_size is in bytes)
    if i.exists():
        filesize = i.stat().st_size
        print('File size = %.2f MB (%.2f KB)' % (filesize / 1024 / 1024, filesize / 1024))
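
One case the script does not cover: a file that already exists is skipped even if an earlier, interrupted run left it incomplete on disk. If you want to double-check such files, one option is to compare the size on disk with the server's Content-Length header. The helper below is only an illustrative sketch (the name is_complete and the HEAD-request approach are mine, not part of the script above):

import pathlib
import urllib.request

def is_complete(url, path, timeout=10):
    """Rough completeness check: compare the saved file's size with the server's Content-Length."""
    req = urllib.request.Request(url, method='HEAD')
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        expected = resp.getheader('Content-Length')
    if expected is None:
        return True  # the server did not report a size, so there is nothing to compare against
    return pathlib.Path(path).stat().st_size == int(expected)

A file that fails this check can simply be deleted so that the next run downloads it again.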
