First, an example that uses multiple threads to read several Excel files into a DataFrame.
Preface:
The test data for this run is 120 Excel files with 1,000 rows each, i.e. 120,000 rows to read in total.
# @datetime:6/27/0027
"desc"
__author__ = "[email protected]"
import threading
import os.path
import time
from service import logger
import pandas as pd

logger = logger.MyLogger("multi_thread").getLogger()

# Global list in which every thread deposits its DataFrame
data = []


# Custom thread class inheriting from threading.Thread
class MyThread(threading.Thread):
    def __init__(self, name, path):
        threading.Thread.__init__(self)
        # Thread name
        self.name = name
        # Data the thread needs for its task; here, the Excel file path
        self.path = path

    # run() is called automatically after start(); it calls getExcelData
    # and appends the result to the shared list
    def run(self):
        global data
        path = self.path
        data.append(getExcelData(path))


def getExcelData(path):
    logger.info("Start reading Excel, current thread: " + str(threading.currentThread().name))
    data = pd.DataFrame()
    if not os.path.exists(path):
        raise FileNotFoundError(path)
    if os.path.isfile(path):
        df = data.append(pd.read_excel(path, skiprows=1, skipfooter=1), sort=False)
        logger.info("Finished reading Excel file, current thread: " + str(threading.currentThread().name))
        return df
    return None


if __name__ == "__main__":
    data_frame = pd.DataFrame()
    threads = []
    excel_path = os.path.join(os.getcwd(), "../excels")
    xls_names = [x for x in os.listdir(excel_path) if x.endswith(".xls")]
    startTime = time.time()
    for file_name in xls_names:
        # One thread per Excel file found in the directory
        thread = MyThread(str(file_name), os.path.join(excel_path, file_name))
        thread.start()
        threads.append(thread)
    # Block here until every worker thread has finished; only then does
    # the main thread continue
    for thread in threads:
        thread.join()
    # Merge the per-thread results into a single DataFrame
    for d in data:
        data_frame = data_frame.append(d, sort=False)
    endTime = time.time()
    logger.info(endTime - startTime)
    logger.info(len(data_frame))
Execution result:
Elapsed time: 12.755005836486816
Rows read: 120000
# @datetime:6/27/0027
"desc"
__author__ = "[email protected]"
from service import logger
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
import threading
import os
import pandas as pd
import time

logger = logger.MyLogger("multi_process").getLogger()


def getExcelData(path):
    logger.info("Start reading Excel, current thread: " + str(threading.currentThread().name))
    data = pd.DataFrame()
    if not os.path.exists(path):
        raise FileNotFoundError(path)
    if os.path.isfile(path):
        df = data.append(pd.read_excel(path, skiprows=1, skipfooter=1), sort=False)
        logger.info("Finished reading Excel file, current thread: " + str(threading.currentThread().name))
        return df


if __name__ == "__main__":
    excel_path = os.path.join(os.getcwd(), "../excels")
    xls_names = [x for x in os.listdir(excel_path) if x.endswith(".xls")]
    startTime = time.time()
    executor = ThreadPoolExecutor(max_workers=10)
    # submit() hands each task to the pool and returns immediately,
    # without blocking
    all_task = [executor.submit(getExcelData, os.path.join(excel_path, file_name)) for file_name in xls_names]
    data = pd.DataFrame()
    # as_completed() yields each future as soon as its thread finishes,
    # so results are collected as they arrive
    for future in as_completed(all_task):
        data = data.append(future.result())
    # Wait until all worker threads are done (redundant here, since
    # as_completed has already consumed every future)
    wait(all_task)
    end = time.time()
    logger.info(len(data))
    logger.info(end - startTime)
Execution result:
Elapsed time: 12.771236181259155
After raising the maximum number of threads to 120 (max_workers=120):
Elapsed time: 12.948451519012451
# @datetime:6/27/0027
"desc"
__author__ = "[email protected]"
import os.path
import time
from service import logger
import pandas as pd

logger = logger.MyLogger("excelUtils").getLogger()


class ExcelReader:
    def __init__(self, path, file_suffix=".xls"):
        self.path = path
        self.file_suffix = file_suffix

    def getData(self):
        if not os.path.exists(self.path):
            raise FileNotFoundError(self.path)
        data = pd.DataFrame()
        if os.path.isdir(self.path):
            xls_names = [x for x in os.listdir(self.path) if x.endswith(self.file_suffix)]
            logger.info("Start reading Excel files")
            for xls_name in xls_names:
                df = pd.read_excel(os.path.join(self.path, xls_name), skiprows=1, skipfooter=1)
                data = data.append(df, sort=False)
            logger.info("Finished reading Excel files, " + str(len(xls_names)) + " files in total")
        return data


if __name__ == "__main__":
    start = time.time()
    reader = ExcelReader(os.path.join(os.getcwd(), "../excels"))
    data = reader.getData()
    end = time.time()
    print(end - start)
    print(len(data))
Result:
Elapsed time: 12.019948959350586
Conclusion (ordering by elapsed time): single thread < plain multithreading = thread pool (10 workers) < thread pool (120 workers)
Happy? Using multiple threads in Python took even longer than a single thread. But don't blame Python itself; for that we have to talk about the GIL.
First of all, the GIL is not a feature of the Python language, but a mechanism of CPython, the Python interpreter. In practice, virtually every major environment runs on the CPython interpreter.
GIL stands for Global Interpreter Lock. Put plainly, it is one giant lock, a global exclusive lock: while it guarantees data safety, it strips multithreading of almost all of its ability to improve performance, most visibly for CPU-bound computation. One of Python's remedies is the multiprocessing approach: because each process runs its own interpreter with its own GIL, processes do not block one another on the lock (a minimal demonstration and a process-pool sketch follow below).
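To see the GIL's effect in isolation from file I/O, here is a minimal sketch (not from the original post; cpu_task and the loop size N are made-up illustration values) that times the same pure-Python CPU-bound function run twice in one thread and once in each of two threads. Under CPython the two-thread version is typically no faster, and often slightly slower, because the GIL lets only one thread execute bytecode at a time.

# A minimal GIL demonstration; cpu_task and N are hypothetical values
# chosen for illustration, not taken from the original benchmark.
import threading
import time

def cpu_task(n):
    # Pure-Python arithmetic: CPU-bound, so the GIL serializes it
    total = 0
    for i in range(n):
        total += i * i
    return total

if __name__ == "__main__":
    N = 10_000_000

    start = time.time()
    cpu_task(N)
    cpu_task(N)
    print("single thread:", time.time() - start)

    start = time.time()
    threads = [threading.Thread(target=cpu_task, args=(N,)) for _ in range(2)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print("two threads:", time.time() - start)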
For more details on the GIL, see the post [python中的GIL详解].
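Finally, a sketch of the multiprocessing remedy mentioned above, applied to the Excel task (an illustration, not the original author's code): swapping ThreadPoolExecutor for ProcessPoolExecutor from the same concurrent.futures module parses each file in a worker process with its own interpreter and GIL. It assumes the same ../excels layout as the examples above; max_workers=4 is an arbitrary choice, and since each returned DataFrame is pickled back to the parent process, the actual speedup depends on how CPU-bound the parsing is relative to that transfer cost.

# A hypothetical process-pool variant of the thread-pool example above;
# the directory layout and max_workers are assumptions, not measurements.
import os
import time
from concurrent.futures import ProcessPoolExecutor, as_completed
import pandas as pd

def get_excel_data(path):
    # Runs in a separate worker process with its own interpreter and GIL
    return pd.read_excel(path, skiprows=1, skipfooter=1)

if __name__ == "__main__":
    excel_path = os.path.join(os.getcwd(), "../excels")
    xls_names = [x for x in os.listdir(excel_path) if x.endswith(".xls")]
    start = time.time()
    frames = []
    with ProcessPoolExecutor(max_workers=4) as executor:
        tasks = [executor.submit(get_excel_data, os.path.join(excel_path, name))
                 for name in xls_names]
        # Each DataFrame is pickled and sent back to the parent process
        for future in as_completed(tasks):
            frames.append(future.result())
    data = pd.concat(frames, sort=False)
    print(len(data))
    print(time.time() - start)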