pandas apply方法速度太慢优化方法之 joblib 包实现多进程

转载: https://blog.csdn.net/sinat_30353259/article/details/83818646

原理就是把 df.groupby() 的结果按分组切分,joblib 把每个块(分组)分发到不同进程去跑,最后通过 concat 把各进程返回的结果合并即可.

单进程 多进程代码对比:

单进程:

%%time

def run(df):
    """Merge overlapping [s, e] intervals within one group.

    Expects 's' and 'e' to be the last two columns of *df*.
    Returns the group truncated to one row per merged interval;
    a single-row group is returned unchanged.
    """
    res = df.loc[:, ['s', 'e']]
    if res.shape[0] == 1:
        # Nothing to merge in a single-row group.
        return df
    # NOTE: the builtin int replaces np.int, which was removed in NumPy 1.24.
    old_list = [[int(s), int(e)] for s, e in zip(res.loc[:, 's'], res.loc[:, 'e'])]
    # Sort by interval start so overlaps are adjacent,
    # e.g. [[112500, 134759], [90200, 90259], ...] -> ascending by s.
    old_list.sort(key=lambda x: x[0])
    result = []
    for interval in old_list:
        if result and result[-1][1] >= interval[0]:
            # Overlaps the last merged interval: extend its end.
            result[-1][1] = max(result[-1][1], interval[1])
        else:
            result.append(interval)
    a_len = len(result)
    # Write the merged bounds back into the last two columns ('s', 'e')
    # of the first a_len rows, then keep only those rows.
    df.iloc[:a_len, -2] = [s for s, _ in result]
    df.iloc[:a_len, -1] = [e for _, e in result]
    df2 = df.iloc[:a_len, :]
    return df2
# Apply the interval merge per 'hu' group, sequentially in one process.
df1.groupby('hu').apply(run)
# took ~33 seconds

多进程:

%%time

def run(df):
    """Collapse overlapping [s, e] intervals inside a single group.

    's' and 'e' must be the last two columns of *df*; the returned
    frame keeps one row per merged interval. A one-row group is
    passed through untouched.
    """
    pairs = df.loc[:, ['s', 'e']]
    if pairs.shape[0] == 1:
        # Guard clause: nothing to merge.
        return df
    # Build [start, end] pairs sorted by start so overlaps sit next
    # to each other, e.g. [[90200, 90259], [90300, 93659], ...].
    intervals = sorted(
        ([int(lo), int(hi)] for lo, hi in zip(pairs.loc[:, 's'], pairs.loc[:, 'e'])),
        key=lambda p: p[0],
    )
    merged = []
    for lo, hi in intervals:
        if merged and merged[-1][1] >= lo:
            # Current interval overlaps the previous one: widen it.
            merged[-1][1] = max(merged[-1][1], hi)
        else:
            merged.append([lo, hi])
    n_merged = len(merged)
    # Rewrite the last two columns ('s', 'e') of the leading rows with
    # the merged bounds, then slice the frame down to those rows.
    df.iloc[:n_merged, -2] = [lo for lo, _ in merged]
    df.iloc[:n_merged, -1] = [hi for _, hi in merged]
    return df.iloc[:n_merged, :]

def apply_parallel(df_grouped, func):
    """Run *func* over every group of *df_grouped* in parallel.

    Each group is dispatched to a joblib worker process; the per-group
    results are concatenated back into a single DataFrame.
    """
    pieces = Parallel(n_jobs=8)(
        delayed(func)(group) for name, group in df_grouped
    )
    return pd.concat(pieces)

# Group once, then fan the groups out to worker processes.
df_grouped = df1.groupby('hu')
df2 = apply_parallel(df_grouped,run)
# took ~19 seconds

n_jobs变量是进程数,如果i5 四核八线程 建议写8,四核四线程建议写4,等等

你可能感兴趣的:(数据分析)