Repost: https://blog.csdn.net/sinat_30353259/article/details/83818646
The idea is simple: joblib splits the result of df.groupby() into chunks (one per group), runs each chunk in a separate worker process, and finally the per-process results are stitched back together with concat.
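A minimal self-contained sketch of that pattern (the DataFrame, the group key 'g', and the summarize function below are made up for illustration; they are not from the original post):

import pandas as pd
from joblib import Parallel, delayed

# Toy data: three groups keyed by 'g' (hypothetical example data).
df = pd.DataFrame({'g': ['a', 'a', 'b', 'b', 'c'],
                   'x': [1, 2, 3, 4, 5]})

def summarize(group):
    # Any per-group transformation works here; sum is just a stand-in.
    return group.assign(total=group['x'].sum())

# Serial version: pandas applies the function group by group.
serial = df.groupby('g').apply(summarize)

# Parallel version: iterate the groups ourselves, run each in a
# worker process, then glue the pieces back together with concat.
parts = Parallel(n_jobs=2)(delayed(summarize)(group)
                           for _, group in df.groupby('g'))
parallel = pd.concat(parts)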
Single-process vs. multi-process comparison:
Single process:
%%time
def run(df):
    # Columns 's' and 'e' hold the start/end of each interval: [[s1,e1],[s2,e2],...]
    res = df.loc[:, ['s', 'e']]
    row = res.shape[0]  # number of rows in this group
    if row == 1:
        return df
    else:
        # Build the interval list and sort it by start value.
        old_list = [[int(s), int(e)] for s, e in zip(res.loc[:, 's'], res.loc[:, 'e'])]
        old_list.sort(key=lambda x: x[0])
        # e.g. [[90200, 90259], [90300, 93659], [94200, 103159], [112500, 134759]]
        result = []
        for interval in old_list:
            if result and result[-1][1] >= interval[0]:
                # Overlap: fold this interval into the last merged one.
                result[-1][1] = max(result[-1][1], interval[1])
            else:
                result.append(interval)
        a_len = len(result)
        # Write the merged starts/ends back into the last two columns
        # and keep only the first a_len rows.
        df.iloc[:a_len, -2] = [s for s, _ in result]
        df.iloc[:a_len, -1] = [e for _, e in result]
        df2 = df.iloc[:a_len, :]
        return df2

df1.groupby('hu').apply(run)
# takes about 33 seconds
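The per-group work inside run is a standard merge-overlapping-intervals pass. Pulled out on its own (with made-up sample intervals, not data from the post), it looks like this:

def merge_intervals(intervals):
    # Sort by start, then sweep left to right, extending the last
    # merged interval whenever the next one overlaps or touches it.
    intervals = sorted(intervals, key=lambda x: x[0])
    merged = []
    for s, e in intervals:
        if merged and merged[-1][1] >= s:
            merged[-1][1] = max(merged[-1][1], e)
        else:
            merged.append([s, e])
    return merged

print(merge_intervals([[1, 3], [2, 6], [8, 10], [15, 18]]))
# [[1, 6], [8, 10], [15, 18]]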
Multi-process:
%%time
import pandas as pd
from joblib import Parallel, delayed

def run(df):
    # Same interval-merging logic as the single-process version.
    res = df.loc[:, ['s', 'e']]
    row = res.shape[0]  # number of rows in this group
    if row != 1:
        old_list = [[int(s), int(e)] for s, e in zip(res.loc[:, 's'], res.loc[:, 'e'])]
        old_list.sort(key=lambda x: x[0])
        result = []
        for interval in old_list:
            if result and result[-1][1] >= interval[0]:
                # Overlap: fold this interval into the last merged one.
                result[-1][1] = max(result[-1][1], interval[1])
            else:
                result.append(interval)
        a_len = len(result)
        df.iloc[:a_len, -2] = [s for s, _ in result]
        df.iloc[:a_len, -1] = [e for _, e in result]
        df2 = df.iloc[:a_len, :]
        return df2
    else:
        return df

def apply_parallel(df_grouped, func):
    # Run func on each group in its own worker, then glue the results back.
    results = Parallel(n_jobs=8)(delayed(func)(group) for name, group in df_grouped)
    return pd.concat(results)

df_grouped = df1.groupby('hu')
df2 = apply_parallel(df_grouped, run)
# takes about 19 seconds
n_jobs is the number of worker processes. On a 4-core/8-thread i5, 8 is a sensible choice; on a 4-core/4-thread machine use 4, and so on.
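If you would rather not count cores by hand, joblib also accepts n_jobs=-1, which uses all available CPUs:

# Let joblib pick up every available core.
results = Parallel(n_jobs=-1)(delayed(run)(group) for name, group in df_grouped)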