Timing Python Multithreading, Multiprocessing, and Coroutines on Time-Consuming Per-Item Operations

2023-03-29

CPU-bound (compute-intensive):

≈2.17 s per task

Multithreading in Python is constrained by the GIL: what looks like parallelism is really only concurrency, so a purely compute-bound workload takes about as long as running it serially.

Approach | Time (s) | Note
Plain sequential loop | 22.68 | serial
Thread pool | 22.10 | concurrent
Process pool | 6.95 | parallel
Coroutines + thread pool | 22.39 | concurrent
Coroutines + process pool | 6.67 | parallel
Coroutines + process pool (direct, no nested call) | 6.88 | parallel
Coroutines + process pool (with async iterator) | 7.52 | parallel
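
For reference, a minimal self-contained sketch (not part of the measurements above; the function name burn and the loop sizes are illustrative) that reproduces the GIL effect: the same CPU-bound function gains nothing from a thread pool but does scale with a process pool.

import time
import concurrent.futures


def burn(n=10_000_000):
    # CPU-bound busy loop, same style as the simulation used in the test code below
    total = 0
    for i in range(n):
        total += i
        total -= i
    return total


if __name__ == "__main__":
    t0 = time.time()
    for _ in range(4):
        burn()
    print("serial       :", round(time.time() - t0, 2), "s")

    t0 = time.time()
    with concurrent.futures.ThreadPoolExecutor(4) as ex:
        list(ex.map(burn, [10_000_000] * 4))
    print("thread pool  :", round(time.time() - t0, 2), "s  (GIL: roughly the serial time)")

    t0 = time.time()
    with concurrent.futures.ProcessPoolExecutor(4) as ex:
        list(ex.map(burn, [10_000_000] * 4))
    print("process pool :", round(time.time() - t0, 2), "s  (true parallelism)")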

I/O-bound:

≈1 s per task

For I/O-bound tasks the GIL is released while a thread waits, so the concurrency behaves essentially like parallelism; and because threads carry less startup overhead than processes, the thread-based variants can even finish slightly sooner.

Approach | Time (s) | Note
Plain sequential loop | 25.05 | serial
Thread pool | 5.01 | parallel
Process pool | 6.12 | parallel
Coroutines + thread pool | 5.01 | parallel
Coroutines + process pool | 5.04 | parallel
Coroutines + process pool (direct, no nested call) | 5.04 | parallel
Coroutines + process pool (with async iterator) | 5.03 | parallel
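
As a side note (not one of the measured variants): when the wait itself is awaitable, for example asyncio.sleep standing in for an async HTTP or file call, plain coroutines on a single thread already overlap all of the 1 s waits without any thread or process pool. A minimal sketch:

import time
import asyncio


async def fake_io(row):
    # Non-blocking 1 s wait; the event loop runs the other coroutines in the meantime
    await asyncio.sleep(1)
    return row


async def run_all(rows):
    return await asyncio.gather(*(fake_io(r) for r in rows))


if __name__ == "__main__":
    t0 = time.time()
    asyncio.run(run_all(range(25)))
    print("pure coroutines:", round(time.time() - t0, 2), "s for 25 x 1 s waits")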

Test code

  • Imports:
# pip install nest_asyncio
import nest_asyncio

nest_asyncio.apply()  # These two lines are only needed where an event loop is already running (e.g. Jupyter); not needed when running as a plain script (e.g. from PyCharm)

import time
import asyncio
import concurrent.futures
from multiprocessing import Pool

  • Set the thread/process pool size to 5:

POOL_SIZE = 5


def download_file(row):
    # Plain (non-coroutine) function simulating one time-consuming task
    #     print(row, '_downloading')
    # I/O-bound simulation
    time.sleep(1)
    # CPU-bound simulation (swap with the sleep above to test it)
    #     total = 0
    #     for i in range(10000000):
    #         total += i
    #         total -= i

    #     print(row,'_downloaded')
    return row


async def download_prepare(row, pool):
    # Coroutine wrapper around the plain function: a coroutine can await another
    # coroutine directly, but a blocking call has to be handed off to a thread/process
    # pool via run_in_executor, which returns an awaitable Future.
    loop = asyncio.get_running_loop()
    fut = loop.run_in_executor(pool, download_file, row)
    response = await fut
    return response
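

# For reference only (not one of the timed variants): a minimal standalone driver for
# download_prepare using asyncio.run + asyncio.gather. The helper name demo_download is
# illustrative; this runs as a plain script and, thanks to nest_asyncio, also in Jupyter.
async def demo_download(datalist):
    with concurrent.futures.ThreadPoolExecutor(max_workers=POOL_SIZE) as pool:
        return await asyncio.gather(*(download_prepare(row, pool) for row in datalist))

# Example call (uncomment to try):
# print(asyncio.run(demo_download(["a", "b", "c"])))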


# Async iterator: wraps a plain iterable and yields one download Task per item
class Reader(object):
    def __init__(self, datalist, pool=None):
        self.datalist = iter(datalist)
        self.pool = pool

    async def readrow(self):
        try:
            row = next(self.datalist)  # self.datalist is already an iterator
            # create_task needs a coroutine object (the work to run) and schedules it on the running loop
            task = asyncio.create_task(download_prepare(row, self.pool))
            # Return an awaitable (here, a Task)
            return task
        except StopIteration:
            return None

    def __aiter__(self):
        return self

    async def __anext__(self):
        val = await self.readrow()
        if val is None:
            raise StopAsyncIteration
        return val


# Plain sequential loop (serial)
def main(datalist):
    for row in datalist:
        download_file(row)


# Thread pool (concurrent)
def main1(datalist):
    threadpool = concurrent.futures.ThreadPoolExecutor(POOL_SIZE)
    for row in datalist:
        # Note: submit() does not block; the main thread keeps running while workers execute
        threadpool.submit(download_file, row)
    threadpool.shutdown(True)  # Block until all submitted tasks have finished before continuing


# Process pool (parallel)
def main2(datalist):
    pool = Pool(POOL_SIZE)
    pool.map(download_file, datalist)
    pool.close()  # Close the pool: no new tasks will be accepted
    pool.join()  # Block until the worker processes have finished and exited


#  Coroutines + thread pool
#  (Note: create_task and the nested run_until_complete below rely on an already
#   running event loop, e.g. the Jupyter loop made re-entrant by nest_asyncio)
def main3(datalist):
    with concurrent.futures.ThreadPoolExecutor(max_workers=POOL_SIZE) as pool:
        tasks = [asyncio.create_task(download_prepare(row, pool)) for row in datalist]

        loop = asyncio.get_event_loop()
        loop.run_until_complete(asyncio.wait(tasks))


#  Coroutines + process pool (same note as main3 about the already running loop)
def main4(datalist):
    with concurrent.futures.ProcessPoolExecutor(max_workers=POOL_SIZE) as pool:
        tasks = [asyncio.create_task(download_prepare(row, pool)) for row in datalist]

        loop = asyncio.get_event_loop()
        loop.run_until_complete(asyncio.wait(tasks))


#  Coroutines + process pool, more direct: no nested coroutine call
def main5(datalist):
    loop = asyncio.get_running_loop()  # requires an already running loop (Jupyter + nest_asyncio)
    with concurrent.futures.ProcessPoolExecutor(max_workers=POOL_SIZE) as pool:
        futs = []
        for row in datalist:
            # Submit the blocking function straight to the pool; no coroutine wrapper needed
            futs.append(loop.run_in_executor(pool, download_file, row))

        # Wait on the same loop the futures were created on (asyncio.run would start a new loop)
        done_s, pending_s = loop.run_until_complete(asyncio.wait(futs))

#         print(done_s, pending_s)


#  Coroutines + process pool, exercising the async iterator (Reader)
async def main6(datalist):
    with concurrent.futures.ProcessPoolExecutor(max_workers=POOL_SIZE) as pool:
        obj_t = Reader(datalist, pool)
        tasks = []
        async for task in obj_t:
            tasks.append(task)

        done_s, pending_s = await asyncio.wait(tasks)


#         print(done_s, pending_s)

  • Iterate over 25 items and time the CPU-bound and I/O-bound variants under each approach:

if __name__ == "__main__":

    print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
    t1 = time.time()
    datalist = [
        "0_0_0_0_353",
        "1_1_0_0_353",
        "2_2_0_0_353",
        "3_4_0_0_353",
        "4_5_0_0_353",
        "0_0_0_0_353",
        "1_1_0_0_353",
        "2_2_0_0_353",
        "3_4_0_0_353",
        "4_5_0_0_353",
        "0_0_0_0_353",
        "1_1_0_0_353",
        "2_2_0_0_353",
        "3_4_0_0_353",
        "4_5_0_0_353",
        "0_0_0_0_353",
        "1_1_0_0_353",
        "2_2_0_0_353",
        "3_4_0_0_353",
        "4_5_0_0_353",
        "0_0_0_0_353",
        "1_1_0_0_353",
        "2_2_0_0_353",
        "3_4_0_0_353",
        "4_5_0_0_353",
    ]

    main(datalist)
    print("Serial: plain loop, elapsed:", float(time.time() - t1), "seconds")
    t1 = time.time()
    main1(datalist)
    print("Concurrent: thread pool, elapsed:", float(time.time() - t1), "seconds")
    t1 = time.time()
    main2(datalist)
    print("Parallel: process pool, elapsed:", float(time.time() - t1), "seconds")
    t1 = time.time()
    main3(datalist)
    print("Concurrent: coroutines + thread pool, elapsed:", float(time.time() - t1), "seconds")
    t1 = time.time()
    main4(datalist)
    print("Parallel: coroutines + process pool, elapsed:", float(time.time() - t1), "seconds")
    t1 = time.time()
    main5(datalist)
    print("Parallel: coroutines + process pool (direct, no nested call), elapsed:", float(time.time() - t1), "seconds")
    t1 = time.time()
    asyncio.run(main6(datalist))
    print("Parallel: coroutines + process pool (async iterator), elapsed:", float(time.time() - t1), "seconds")
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
