浅析python多线程及ThreadPoolExecutor的使用及问题

首先上一个多线程获取多个excel数据为dataFrame的例子

前言:

本次测试数据为120个excel文件,每个文件1000条数据,共120000条数据需要读取

直接创建线程与使用线程池对比

1. 直接依任务数创建线程

# @datetime:6/27/0027
"desc"
__author__ = "[email protected]"
import threading
import os.path
import time
from service import logger
import pandas as pd

# NOTE: this rebinds the name `logger` from the imported module to a logger instance.
logger = logger.MyLogger("multi_thread").getLogger()
# Module-level list shared by all threads; each thread appends its result here.
data = []


# Custom thread class derived from threading.Thread.
class MyThread(threading.Thread):
    """Worker thread that reads one Excel file and stores the result in ``data``."""

    def __init__(self, name, path):
        """Remember the thread name and the Excel path this thread will read."""
        super().__init__()
        # Thread name, assigned through the regular Thread.name attribute.
        self.name = name
        # Input needed by the task: the path of the Excel file.
        self.path = path

    def run(self):
        """Called automatically after start(); read the file and publish the result."""
        result = getExcelData(self.path)
        # Append to the module-level list shared by all worker threads.
        data.append(result)


def getExcelData(path):
    """Read one Excel file into a DataFrame.

    Args:
        path: Path of the Excel file to read.

    Returns:
        The DataFrame produced by ``pd.read_excel`` (one leading row and one
        trailing row skipped), or None when ``path`` exists but is not a
        regular file.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    # Validate first so a missing file fails fast and names the offending path.
    if not os.path.exists(path):
        raise FileNotFoundError("Excel path does not exist: " + str(path))
    # current_thread() is the supported spelling; currentThread() is deprecated.
    thread_name = str(threading.current_thread().name)
    logger.info("开始读取excel,当前线程:" + thread_name)
    if not os.path.isfile(path):
        return None
    # Return the frame directly instead of appending it to an empty DataFrame:
    # DataFrame.append is not available in pandas 2.x and only added a copy.
    frame = pd.read_excel(path, skiprows=1, skipfooter=1)
    # Report completion only once the file has actually been read.
    logger.info("读取Excel文件完毕,当前线程:" + thread_name)
    return frame


if __name__ == "__main__":
    threads = []

    excel_path = os.path.join(os.getcwd(), "../excels")
    xls_names = [x for x in os.listdir(excel_path) if x.endswith(".xls")]
    startTime = time.time()

    # Start one thread per Excel file found in the directory.
    for file_name in xls_names:
        thread = MyThread(str(file_name), os.path.join(excel_path, file_name))
        thread.start()
        threads.append(thread)
    # Block the main thread until every worker thread has finished.
    for thread in threads:
        thread.join()

    # Combine the per-thread results with a single concat instead of growing a
    # DataFrame with append() in a loop (quadratic copying, and DataFrame.append
    # is not available in pandas 2.x). getExcelData may return None for
    # non-file paths, and concat() rejects an empty list, so guard both cases.
    frames = [d for d in data if d is not None]
    data_frame = pd.concat(frames, sort=False) if frames else pd.DataFrame()
    endTime = time.time()
    logger.info(endTime - startTime)
    logger.info(len(data_frame))

执行结果:

耗时:12.755005836486816
获取数据量:120000

2. 使用线程池创建线程获取

# @datetime:6/27/0027
"desc"
__author__ = "[email protected]"
from service import logger
from concurrent.futures import *
import threading
import os
import pandas as pd
import time

# NOTE: this rebinds the name `logger` from the imported module to a logger instance.
logger = logger.MyLogger("multi_process").getLogger()


def getExcelData(path):
    """Read one Excel file into a DataFrame.

    Args:
        path: Path of the Excel file to read.

    Returns:
        The DataFrame produced by ``pd.read_excel`` (one leading row and one
        trailing row skipped), or None when ``path`` exists but is not a
        regular file.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    # Validate first so a missing file fails fast and names the offending path.
    if not os.path.exists(path):
        raise FileNotFoundError("Excel path does not exist: " + str(path))
    # current_thread() is the supported spelling; currentThread() is deprecated.
    thread_name = str(threading.current_thread().name)
    logger.info("开始读取excel,当前线程:" + thread_name)
    # Only regular files are handed to read_excel; a directory would make it fail.
    if not os.path.isfile(path):
        return None
    # Return the frame directly instead of appending it to an empty DataFrame:
    # DataFrame.append is not available in pandas 2.x and only added a copy.
    frame = pd.read_excel(path, skiprows=1, skipfooter=1)
    # Report completion only once the file has actually been read.
    logger.info("读取Excel文件完毕,当前线程:" + thread_name)
    return frame


if __name__ == "__main__":
    excel_path = os.path.join(os.getcwd(), "../excels")
    xls_names = [x for x in os.listdir(excel_path) if x.endswith(".xls")]
    startTime = time.time()
    frames = []
    # The context manager shuts the pool down and waits for its workers on
    # exit, even when a task raises, so no explicit wait() call is needed.
    with ThreadPoolExecutor(max_workers=10) as executor:
        # submit() hands the callable to the pool and returns a Future at once.
        all_task = [
            executor.submit(getExcelData, os.path.join(excel_path, file_name))
            for file_name in xls_names
        ]
        # as_completed() yields each future as soon as its task has finished.
        for future in as_completed(all_task):
            result = future.result()
            if result is not None:
                frames.append(result)
    # Build the final frame once; appending inside the loop copies the data on
    # every iteration, and DataFrame.append is not available in pandas 2.x.
    # concat() rejects an empty list, hence the fallback.
    data = pd.concat(frames) if frames else pd.DataFrame()

    end = time.time()
    logger.info(len(data))
    logger.info(end - startTime)

执行结果:

耗时:12.771236181259155

将最大线程数量(max_workers=120)设置为120后:

耗时:12.948451519012451

3. 单线程直接获取

# @datetime:6/27/0027
"desc"
__author__ = "[email protected]"
import os.path
import time
from service import logger
import pandas as pd

# NOTE: this rebinds the name `logger` from the imported module to a logger instance.
logger = logger.MyLogger("excelUtils").getLogger()


class ExcelReader:
    """Read all Excel files with a given suffix from one directory."""

    def __init__(self, path, file_suffix=".xls"):
        """Store the directory path and the file-name suffix to select.

        Args:
            path: Directory expected to contain the Excel files.
            file_suffix: Only file names ending with this suffix are read.
        """
        self.path = path
        self.file_suffix = file_suffix

    def getData(self):
        """Load every matching file under ``self.path`` into one DataFrame.

        Returns:
            The row-wise concatenation of all matching files (one leading row
            and one trailing row skipped per file). An empty DataFrame is
            returned when ``self.path`` is not a directory or when no file
            name matches ``self.file_suffix``.

        Raises:
            FileNotFoundError: If ``self.path`` does not exist.
        """
        if not os.path.exists(self.path):
            raise FileNotFoundError("Path does not exist: " + str(self.path))
        if not os.path.isdir(self.path):
            return pd.DataFrame()

        xls_names = [x for x in os.listdir(self.path) if x.endswith(self.file_suffix)]
        logger.info("开始")
        # Collect the frames and concatenate once: growing a DataFrame with
        # append() in a loop copies all rows each time, and DataFrame.append
        # is not available in pandas 2.x.
        frames = [
            pd.read_excel(os.path.join(self.path, xls_name), skiprows=1, skipfooter=1)
            for xls_name in xls_names
        ]
        logger.info("读取Excel文件完毕,共读取" + str(len(xls_names)) + "个文件")
        # concat() raises on an empty list, so fall back to an empty frame.
        return pd.concat(frames, sort=False) if frames else pd.DataFrame()


if __name__ == "__main__":
    # Time a single-threaded read of every file under ../excels.
    began = time.time()
    frame = ExcelReader(os.path.join(os.getcwd(), "../excels")).getData()
    elapsed = time.time() - began
    # Print the elapsed seconds first, then the number of rows read.
    print(elapsed)
    print(len(frame))

结果:

耗时:12.019948959350586

结论(按耗时比较):单线程 < 多线程 ≈ 使用线程池(10) < 使用线程池(120)

happy 不?python中使用多线程耗时比单线程还多了,不能怪python,不得不谈一下GIL

GIL是什么

首先GIL不是Python的特性,而是CPython(Python解释器)所使用的一个概念。实际上各大环境基本上都使用了CPython解释器。

GIL全称Global Interpreter Lock,通俗点说就是一把超级大锁,即全局排他锁,保护了数据安全性的同时,使得多线程提高效率的能力几乎丧失,尤其表现在CPU密集型计算上。解决方法之一是Python提供了multiprocessing(多进程)来弥补这个问题,由于多个进程各自拥有独立的GIL,彼此之间不会发生阻塞。

关于GIL,可以参见这篇博文[python中的GIL详解]

你可能感兴趣的:(python,python)