python高效使用方法03_pandas中的rolling.mean滚动求均值效率究竟如何?

python高效使用方法03_pandas中的rolling.mean滚动求均值效率究竟如何?_第1张图片
先上图来说明pandas.rolling(n).mean()滚动求均值的方法效率其实并不是最高的,我自己尝试使用cython把滚动求均值的方法重新编译了一下,发现效率总体上是pandas的三倍以上。
总结:pandas比较适合用于普通的研究分析工作;如果要用在追求高效率的生产环境中,则需要考虑是否改写具体的函数。

代码:
效率对比代码:

import numpy as np
import pandas as pd
import time
from pyecharts import options as opts
from pyecharts.charts import Bar
from cal_rolling_mean_by_cython import cal_rolling_mean_by_cython

# n_rows = 100
# n_times = 1000
# np.random.seed(1)
# df = pd.DataFrame({i: np.random.randn(n_rows) for i in ['open', 'high', 'low', 'close', 'volume']})
# df['ma_1'] = df['close'].rolling(20).mean()
# df['ma_2'] = cal_rolling_mean_by_cython(df['close'].to_numpy(), 20)
# df = df.dropna()
# a = list(df['ma_1'])
# b = list(df['ma_2'])
# if not a[0] == b[0]:
#     print(a[0], b[0])
#     print(df[['ma_1', 'ma_2']])
# assert list(df['ma_1']) == list(df['ma_2'])
# 经过验证,发现两者的数据精度存在差异:在小数点后很多位处结果可能不一致,如 0.22949341595210063 与 0.22949341595210065,因此不能直接用 == 比较。

def test_func_time(n_rows, n_times=1000, window=20):
    """Time pandas ``rolling(window).mean()`` against the Cython version.

    A DataFrame with five columns of ``n_rows`` random normal values is
    built. Each implementation then computes the rolling mean of the
    ``close`` column ``n_times`` times, storing the result in a new column
    on every iteration, and the total elapsed time is printed.

    Args:
        n_rows: Number of rows in the randomly generated DataFrame.
        n_times: How many times each implementation is executed.
        window: Rolling window length handed to both implementations.

    Returns:
        ``[pandas_seconds, cython_seconds]`` -- the total elapsed time, as
        measured with ``time.perf_counter``, for each implementation.
    """
    columns = ['open', 'high', 'low', 'close', 'volume']
    df = pd.DataFrame({name: np.random.randn(n_rows) for name in columns})

    # Time pandas' built-in rolling mean.
    begin_time = time.perf_counter()
    for _ in range(n_times):
        df['ma_1'] = df['close'].rolling(window).mean()
    pandas_seconds = time.perf_counter() - begin_time
    print(f"rolling.mean耗费的时间:{pandas_seconds}")

    # Time the Cython implementation on the same column.
    begin_time = time.perf_counter()
    for _ in range(n_times):
        df['ma_2'] = cal_rolling_mean_by_cython(df['close'].to_numpy(), window)
    cython_seconds = time.perf_counter() - begin_time
    print(f"cal_rolling_mean_by_cython耗费的时间:{cython_seconds}")

    # The two columns are not asserted equal here: the results can differ in
    # the last floating-point digits, so an exact comparison would fail.
    return [pandas_seconds, cython_seconds]


if __name__ == '__main__':
    # Run the benchmark at three data sizes. Index 0 of each result feeds the
    # pandas series and index 1 feeds the Cython series below.
    r1 = test_func_time(n_rows=1000)
    r2 = test_func_time(n_rows=100000)
    r3 = test_func_time(n_rows=10000000)
    pandas_times, cython_times = (list(series) for series in zip(r1, r2, r3))

    # Build a horizontal bar chart comparing both implementations and write
    # it to an HTML file in the current directory.
    chart = Bar()
    chart = chart.add_xaxis(["1千行", "十万行", "一千万行"])
    chart = chart.add_yaxis("rolling_mean_by_pandas", pandas_times)
    chart = chart.add_yaxis("cal_rolling_mean_by_cython", cython_times)
    chart = chart.reversal_axis()
    chart = chart.set_series_opts(label_opts=opts.LabelOpts(position="right"))
    chart = chart.set_global_opts(title_opts=opts.TitleOpts(title="耗费时间对比"))
    c = chart.render("./rolling_mean_by_pandas和cal_rolling_mean_by_cython耗费时间对比.html")

计算滚动平均的cython代码文件:cal.pyx

#cython: language_level=3
#distutils:language=c
#cython: c_string_type=unicode, c_string_encoding=utf8

import numpy as np
from numpy import NaN as nan
cimport numpy as np
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(False)
cpdef cal_rolling_mean_by_cython(np.ndarray[np.double_t, ndim=1] arr,int look_back_days):
    """Return the trailing rolling mean of a 1-D float64 array.

    Element ``i`` of the result is the mean of
    ``arr[i - look_back_days + 1 : i + 1]``. The first
    ``look_back_days - 1`` elements, for which a full window is not
    available, are NaN. If the window is longer than the array, every
    element of the result is NaN.

    Args:
        arr: One-dimensional array of doubles.
        look_back_days: Window length; must be at least 1.

    Returns:
        A new float64 array with the same length as ``arr``.

    Raises:
        ValueError: If ``look_back_days`` is smaller than 1.
    """
    cdef Py_ssize_t data_len = arr.shape[0]
    cdef np.ndarray[np.double_t, ndim=1] result
    cdef Py_ssize_t i
    cdef Py_ssize_t j
    cdef Py_ssize_t warmup
    cdef double window_sum = 0.0
    cdef double nan_value = np.nan

    # Bounds checking and negative-index wraparound are disabled above, so a
    # non-positive window would index outside the buffers; reject it here.
    if look_back_days < 1:
        raise ValueError("look_back_days must be a positive integer")

    result = np.empty(data_len)

    # Clamp the NaN prefix to the array length: without bounds checking, a
    # window longer than the data would otherwise write past the end.
    warmup = look_back_days - 1
    if warmup > data_len:
        warmup = data_len
    for i in range(warmup):
        result[i] = nan_value

    # Each window is summed from scratch, so a value only influences the
    # outputs whose window actually contains it.
    for i in range(look_back_days - 1, data_len):
        window_sum = 0.0
        for j in range(i - look_back_days + 1, i + 1):
            window_sum += arr[j]
        result[i] = window_sum / look_back_days
    return result

#def

setup.py

from setuptools import setup, Extension
from Cython.Build import cythonize
import numpy as np
import sys


# Describe the extension module: cal.pyx is compiled as C, with the NumPy
# headers on the include path.
ext = Extension(
    name="cal_rolling_mean_by_cython",
    sources=["cal.pyx"],
    include_dirs=[np.get_include()],
    language="c",
)

# Run the .pyx source through Cython and hand the result to setuptools.
cythonized_modules = cythonize([ext])
setup(
    name="cal_rolling_mean_by_cython",
    ext_modules=cythonized_modules,
)

你可能感兴趣的:(python高效使用方法,python,pandas,均值算法)