目录
一、K线分析
二、pandas.DataFrame.resample - 内置聚合方法
2-0 测试数据初始化
2-1 resample 的 3分钟聚合
2-2 resample 的 30s 拆成
2-3 自定义聚合逻辑 - apply
2-4 DataFrame 对象的 resample 处理方式
三、K线聚合的业务实现
方式一、获取批量数据 + resample处理
一、K线分析
需求分析:源数据为 1min 的 K 线数据,需要将其聚合转换成 5min、10min 的 K 线数据
细节注意
- 1min to 5min
- 时间戳 间隔为300S
- 数据计算的区间:[1563156300, 1563156600) - 注意左闭右开
- 即,1min线 9:00-9:04 的数据聚合为 5min线的 9:00 的数据
- k线数据
k线数据:
{
  _id    : 10位时间戳(秒),
  ts     : 13位时间戳(毫秒),
  vol    : 成交额,即 sum(每一笔成交价 * 该笔的成交量),
  low    : 最低价,
  high   : 最高价,
  open   : 开盘价,
  close  : 收盘价(当K线为最晚的一根时,是最新成交价),
  count  : 成交笔数,
  amount : 成交量
}
二、pandas.DataFrame.resample - 内置聚合方法
官方 resample 文档
2-0 测试数据初始化
import pandas

# Build a Series holding 9 consecutive one-minute timestamps.
# 'min' is the minute alias accepted by both old and new pandas; the
# legacy 'T' alias is deprecated since pandas 2.2.
index = pandas.date_range('1/1/2000', periods=9, freq='min')
series = pandas.Series(range(9), index=index)
print(series)
# Expected output (pandas < 2.2 prints the frequency as "T"):
'''
2000-01-01 00:00:00    0
2000-01-01 00:01:00    1
2000-01-01 00:02:00    2
2000-01-01 00:03:00    3
2000-01-01 00:04:00    4
2000-01-01 00:05:00    5
2000-01-01 00:06:00    6
2000-01-01 00:07:00    7
2000-01-01 00:08:00    8
Freq: min, dtype: int64
'''
2-1 resample 的 3分钟聚合
# Default (closed='left', label='left'): sum(0:00 - 0:02) is stored at 0:00.
data = series.resample('3min').sum()
print(data)
'''
2000-01-01 00:00:00     3
2000-01-01 00:03:00    12
2000-01-01 00:06:00    21
Freq: 3min, dtype: int64
'''

# label='right': each bin is labelled with its right edge,
# i.e. sum(0:00 - 0:02) is stored at 0:03.
data = series.resample('3min', label='right').sum()
print(data)
'''
2000-01-01 00:03:00     3
2000-01-01 00:06:00    12
2000-01-01 00:09:00    21
Freq: 3min, dtype: int64
'''

# closed='right': the right edge of each bin is inclusive,
# i.e. sum(0:01 - 0:03) is stored at 0:03.
data = series.resample('3min', label='right', closed='right').sum()
print(data)
'''
2000-01-01 00:00:00     0
2000-01-01 00:03:00     6
2000-01-01 00:06:00    15
2000-01-01 00:09:00    15
Freq: 3min, dtype: int64
'''
2-2 resample 的 30s 拆成
# resample() on its own only builds a lazy Resampler object.
data = series.resample('30s')
print(data)
# e.g. (the exact repr depends on the pandas version):
# DatetimeIndexResampler [freq=<30 * Seconds>, axis=0, closed=left, label=left, convention=start, base=0]

# Upsample to 30-second steps; the newly created slots are NaN.
data = series.resample('30s').asfreq()
print(data)
'''
2000-01-01 00:00:00    0.0
2000-01-01 00:00:30    NaN
2000-01-01 00:01:00    1.0
2000-01-01 00:01:30    NaN
2000-01-01 00:02:00    2.0
2000-01-01 00:02:30    NaN
2000-01-01 00:03:00    3.0
2000-01-01 00:03:30    NaN
2000-01-01 00:04:00    4.0
2000-01-01 00:04:30    NaN
2000-01-01 00:05:00    5.0
2000-01-01 00:05:30    NaN
2000-01-01 00:06:00    6.0
2000-01-01 00:06:30    NaN
2000-01-01 00:07:00    7.0
2000-01-01 00:07:30    NaN
2000-01-01 00:08:00    8.0
Freq: 30s, dtype: float64
'''

# Take 5 rows starting from row 0.
data = series.resample('30s').asfreq()[0:5]
print(data)
'''
2000-01-01 00:00:00    0.0
2000-01-01 00:00:30    NaN
2000-01-01 00:01:00    1.0
2000-01-01 00:01:30    NaN
2000-01-01 00:02:00    2.0
Freq: 30s, dtype: float64
'''

# Forward fill: each NaN takes the previous value.
# (Resampler.pad() was removed in pandas 2.0; ffill() is the replacement.)
data = series.resample('30s').ffill()[0:5]

# Backward fill: each NaN takes the next value.
data = series.resample('30s').bfill()[0:5]

# Older pandas also offered fillna(method=...) with one of
# {'backfill', 'bfill', 'pad', 'ffill', None}; Resampler.fillna is
# deprecated, so call the dedicated method instead ('pad' == ffill).
data = series.resample('30s').ffill()
2-3 自定义聚合逻辑 - apply
import numpy
import pandas

index = pandas.date_range('1/1/2000', periods=9, freq='min')
series = pandas.Series(range(9), index=index)


def custom_resampler(array_like):
    """Aggregate one resample bin: the sum of its values plus 5."""
    return numpy.sum(array_like) + 5


# apply() calls custom_resampler once per 3-minute bin.
data = series.resample('3min').apply(custom_resampler)
print(data)
'''
2000-01-01 00:00:00     8
2000-01-01 00:03:00    17
2000-01-01 00:06:00    26
Freq: 3min, dtype: int64
'''
2-4 DataFrame 对象的 resample 处理方式
d = dict({'price': [10, 11, 9, 13, 14, 18, 17, 19], 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}) df = pandas.DataFrame(d) df['week_starting'] = pandas.date_range('01/01/2018', periods=8, freq='W') ''' price volume week_starting 0 10 50 2018-01-07 1 11 60 2018-01-14 2 9 40 2018-01-21 3 13 100 2018-01-28 4 14 50 2018-02-04 5 18 100 2018-02-11 6 17 40 2018-02-18 7 19 50 2018-02-25 ''' # 以月为单位,on 指定非索引字段取代索引,mean() 获取均值 data = df.resample('M', on='week_starting').mean() ''' price volume week_starting 2018-01-31 10.75 62.5 2018-02-28 17.00 60.0 ''' def custom_resampler(df): if df.name == 'price': return numpy.min(df) if df.name == 'volume': return numpy.sum(df) data = df.resample('M', on='week_starting').apply(custom_resampler) print(data) ''' price volume week_starting week_starting 2018-01-31 9 250 None 2018-02-28 14 240 None '''
三、K线聚合的业务实现
方式一、获取批量数据 + resample处理
"""Aggregate 1-minute K-line documents stored in MongoDB into N-minute bars."""
from datetime import datetime, timedelta
from decimal import Decimal

import numpy
import pandas

# NOTE: pymongo is imported inside the functions that actually talk to
# MongoDB, so the pure aggregation helpers below can be imported and tested
# without a MongoDB driver installed.

# Columns aggregated by taking the first value of the window.
_FIRST_COLUMNS = ('open', '_id', 'ts')
# Columns aggregated by summing the window (result formatted as a string).
_SUM_COLUMNS = ('count', 'amount', 'vol')
# Value columns copied into every stored bar, in storage order.
_OUTPUT_COLUMNS = ('ts', 'amount', 'close', 'count', 'high', 'low', 'open', 'vol')


def get_timestamp_without_seconds(t):
    """Return the Unix timestamp (whole seconds) of *t* truncated to the minute.

    Only the year..minute fields of *t* are used and they are interpreted
    as naive local time, which mirrors :func:`stamp2time`.
    """
    return int(datetime(t.year, t.month, t.day, t.hour, t.minute).timestamp())


def stamp2time(stamp):
    """Convert a Unix timestamp in seconds to a naive local ``datetime``."""
    return datetime.fromtimestamp(stamp)


def digital_utils(temps):
    """Format a number as a plain decimal string with at most 8 decimals.

    Scientific notation (``1e-05``) is expanded and trailing zeros are
    stripped, so ``120.0 -> '120'`` and ``1e-05 -> '0.00001'``.

    :param temps: an int/float/numpy scalar or a numeric string
    :return: the normalised decimal string
    """
    # Always normalise through Decimal. The original guarded this with
    # `if temps.find('E'):`, a truthiness test on an index that is -1
    # (truthy) when 'E' is absent, so it only worked by accident.
    fixed = '{:.8f}'.format(Decimal(str(temps)))
    whole, _, fraction = fixed.partition('.')
    fraction = fraction.rstrip('0')
    return '{}.{}'.format(whole, fraction) if fraction else whole


class DataCanNotFoundException(Exception):
    """Raised when the source collection has no data for the queried window."""

    def __init__(self):
        super().__init__('The data is not found, check it please')


def kline_get_data(query_coll, period=5, now=None):
    """Fetch the 1-minute documents of the most recent complete window.

    The window is ``[end - period minutes, end)`` where ``end`` is *now*
    rounded down to a multiple of *period* minutes within the hour; e.g.
    with ``period=5`` at minute 23 the window is minutes ``[15, 20)``.
    *period* is expected to divide 60 so windows line up across hours.

    :param query_coll: source collection; only ``find(query)`` is called
    :param period: window length in minutes
    :param now: reference time (naive local ``datetime``); defaults to the
        current time. Passing it allows re-computing historical windows.
    :return: the matching documents as a list
    :raises DataCanNotFoundException: when the window contains no documents
    """
    if now is None:
        now = datetime.now()
    end = now.replace(minute=now.minute - now.minute % period)
    end_stamp = get_timestamp_without_seconds(end)
    start_stamp = get_timestamp_without_seconds(end - timedelta(minutes=period))
    query = {'_id': {"$gte": start_stamp, "$lt": end_stamp}}
    data_list = list(query_coll.find(query))
    if not data_list:
        raise DataCanNotFoundException()
    return data_list


def custom_resampler(df):
    """Aggregate one column of one resample bin according to its name.

    :param df: the values of a single column inside one bin; its ``name``
        attribute is the column name
    :return: first value for open/_id/ts, last value for close, min for low,
        max for high, the sum formatted by :func:`digital_utils` for
        count/amount/vol, and None for an empty bin or an unknown column
    """
    if len(df) == 0:
        # A bin without any 1-minute bar has nothing to aggregate.
        return None
    name = df.name
    if name in _FIRST_COLUMNS:
        return numpy.asarray(df)[0]
    if name == 'close':
        return numpy.asarray(df)[-1]
    if name == 'low':
        return numpy.min(df)
    if name == 'high':
        return numpy.max(df)
    if name in _SUM_COLUMNS:
        return digital_utils(numpy.sum(df))
    return None


def _aggregate_klines(data_list, period):
    """Resample 1-minute documents into *period*-minute documents (no I/O)."""
    if not data_list:
        return []
    df = pandas.DataFrame(data_list)
    df['_id'] = df['_id'].apply(stamp2time)
    # open/close are "first"/"last" of each bin, so rows must be in time order.
    df = df.sort_values('_id')
    res_data = df.resample('%smin' % period, on='_id').apply(custom_resampler)
    columns = res_data.to_dict('list')
    docs = []
    # The bin label (start of the window) is the _id of the aggregated bar;
    # it comes from the resample index, not from the first row of the window.
    for i, bin_start in enumerate(res_data.index):
        if pandas.isnull(columns['open'][i]):
            continue  # empty bin: no source bars in this window
        doc = {'_id': get_timestamp_without_seconds(bin_start.to_pydatetime())}
        doc.update((name, columns[name][i]) for name in _OUTPUT_COLUMNS)
        docs.append(doc)
    return docs


def kline_process(save_coll, data_list, period=5):
    """Aggregate K-line documents and store the result.

    :param save_coll: target collection; ``insert_one`` is called per bar
    :param data_list: list of 1-minute K-line dicts, ``[{}, {}]``
    :param period: aggregation window in minutes
    :return: None
    """
    from pymongo.errors import DuplicateKeyError

    for data in _aggregate_klines(data_list, period):
        print(data)
        try:
            save_coll.insert_one(data)
        except DuplicateKeyError as e:
            # The bar for this window was already stored; keep going.
            print('库中已存在该_id:%s' % e)


def main():
    """Aggregate the latest 5-minute window of the local test database."""
    import pymongo

    # One client serves both collections; the `with` block closes it.
    with pymongo.MongoClient("mongodb://localhost:27017/") as conn:
        db = conn['test']
        coll = db['quota_huobi_18cbtc_1min']
        save_coll = db['quota_huobi_18cbtc_5min']
        try:
            data_list = kline_get_data(coll, period=5)
            print(data_list)
            kline_process(save_coll, data_list, period=5)
        except DataCanNotFoundException as e:
            print(e)


if __name__ == '__main__':
    main()