Implementing a URL retry mechanism for a Python crawler (Python 2.7 and Python 3.5)

Use case:

Retry a URL several times when the response status is not 200


The code is fairly simple and already carries some comments.


Python 2.7 implementation:

# -*-coding:utf-8-*-
"""
ayou
"""

import requests

def url_retry(url,num_retries=3):
    print("access!")
    try:
        request = requests.get(url,timeout=60)
        #raise_for_status() raises an HTTPError if the status is not 200
        request.raise_for_status()
        html = request.content
    except requests.HTTPError as e:
        html=None
        if num_retries>0:
            #retry on non-200, decrementing the retry count each time
            return url_retry(url,num_retries-1)
    #a non-existent URL raises a ConnectionError; do not retry in that case
    except requests.exceptions.ConnectionError as e:
        return
    return html

url_retry("http://httpbin.org/status/404")

Python 3.5 implementation:

# -*-coding:utf-8-*-
"""
ayou
"""
import asyncio
import aiohttp

async def print_page(url,num_retries=3):
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url,timeout=60) as response:
                print("access!")
                #raise_for_status() raises an HttpProcessingError if the status is not 200
                response.raise_for_status()
                body = await response.text()
        except aiohttp.errors.HttpProcessingError as e:
            body = None
            if num_retries > 0:
                #retry on non-200, decrementing the retry count each time
                return await print_page(url, num_retries - 1)
        #a non-existent URL raises a ClientResponseError
        except aiohttp.errors.ClientResponseError as e:
            return e
    print(body)
    return body

def main():
    #this URL does not exist
    # url = 'http://httpbin.org/status/404111'
    #this URL returns a 404
    url = 'http://httpbin.org/status/404'
    loop = asyncio.get_event_loop()
    loop.run_until_complete(print_page(url))
    loop.close()

if __name__ == '__main__':
    main()
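Because print_page is a coroutine, several URLs can be fetched (and retried) concurrently by driving them through asyncio.gather. A minimal sketch, assuming the print_page defined above; the main_many name and the URL list are illustrative:

def main_many():
    #each URL is fetched and retried independently
    urls = [
        'http://httpbin.org/status/404',
        'http://httpbin.org/status/200',
    ]
    loop = asyncio.get_event_loop()
    #gather schedules all coroutines on the same loop and waits for all of them
    loop.run_until_complete(asyncio.gather(*[print_page(url) for url in urls]))
    loop.close()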



Wrapping the crawler URL retry mechanism into a decorator (Python 2.7 and Python 3.5+)


Python 2.7 version:

# -*-coding:utf-8-*-
"""
ayou
"""
import requests

#define a retry decorator; by default it retries once
def retry(num_retries=1):
    #receives the function being decorated
    def wrapper(func):
        #receives the arguments of the decorated function
        def inner(*args,**kwargs):
            #keep the last exception so it can be inspected later
            last_exception = None
            #call the wrapped function in a loop
            for _ in range(num_retries):
                try:
                    #if no error is raised, return the result, which also ends the loop
                    return func(*args, **kwargs)
                except Exception as e:
                    #don't return on error here, otherwise the retry loop would stop
                    last_exception = e
            #uncomment to re-raise the last error after all retries failed
            # raise last_exception
        return inner
    return wrapper

if __name__=="__main__":
    @retry(5)
    def url_retry(url):
        request = requests.get(url, timeout=60)
        print("access!")
        request.raise_for_status()
        html = request.content
        print(html)
        return html

    url_retry("http://httpbin.org/status/404")
    # url_retry("http://httpbin.org/status/404111")
    # url_retry("http://www.baidu.com")


Python 3.5+ version:

# -*-coding:utf-8-*-
"""
ayou
"""
import asyncio
import aiohttp

#define a retry decorator; by default it retries once
def retry(num_retries=1):
    #receives the function being decorated
    def wrapper(func):
        #receives the arguments of the decorated function
        def inner(*args,**kwargs):
            #keep the last exception so it can be inspected later
            last_exception = None
            #call the wrapped function in a loop
            for _ in range(num_retries):
                try:
                    #if no error is raised, return the result, which also ends the loop
                    return func(*args, **kwargs)
                except Exception as e:
                    #don't return on error here, otherwise the retry loop would stop
                    last_exception = e
            #uncomment to re-raise the last error after all retries failed
            # raise last_exception
        return inner
    return wrapper

async def print_page(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url,timeout=60) as response:
            print("access!")
            #raise_for_status() raises an HttpProcessingError if the status is not 200
            response.raise_for_status()
            body = await response.text()
    print(body)
    return body

@retry(5)
def loop_get():
    # url = "http://www.baidu.com"
    # url = 'http://httpbin.org/status/404111'
    url = 'http://httpbin.org/status/404'
    loop = asyncio.get_event_loop()
    #do not close the loop here: the decorator may call this function again on a retry
    loop.run_until_complete(print_page(url))

if __name__ == '__main__':
    loop_get()
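Here the decorator wraps the synchronous loop_get, so every retry re-runs the event loop. On Python 3.5+ the retry logic can also live in an async decorator that awaits the wrapped coroutine directly; a minimal sketch, with async_retry and delay as illustrative names:

def async_retry(num_retries=1, delay=1):
    def wrapper(coro_func):
        async def inner(*args, **kwargs):
            last_exception = None
            for attempt in range(num_retries):
                try:
                    return await coro_func(*args, **kwargs)
                except Exception as e:
                    last_exception = e
                    if attempt < num_retries - 1:
                        #pause without blocking the event loop
                        await asyncio.sleep(delay)
            raise last_exception
        return inner
    return wrapper

#usage: decorate the coroutine itself instead of the function that drives the loop
# @async_retry(5)
# async def print_page(url): ...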

