>>> import numpy as np
>>> import pandas as pd
>>> from pandas import DataFrame,Series
>>> df=DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','two','one'],'data1':np.random.randn(5),'data2':np.random.randn(5)})
>>> df
key1 key2 data1 data2
0 a one -0.738145 0.494023
1 a two 0.553763 -2.618996
2 b one 1.081573 -0.713015
3 b two -0.173333 1.399356
4 a one 0.021549 0.204140
>>> groupd1=df['data1'].groupby([df['key1'],df['key2']]).sum()
>>> df['data1']
0 -0.738145
1 0.553763
2 1.081573
3 -0.173333
4 0.021549
Name: data1, dtype: float64
>>> groupd1
key1 key2
a one -0.716596
two 0.553763
b one 1.081573
two -0.173333
Name: data1, dtype: float64
>>> groupd2=df['data1'].groupby([df['key1']]).sum()
>>> groupd2
key1
a -0.162833
b 0.908240
Name: data1, dtype: float64
>>> df.groupby(df['key1']).mean()
data1 data2
key1
a -0.054278 -0.640278
b 0.454120 0.343171
>>> -0.738145+1.081573++0.021549
0.36497699999999983
>>> (-0.738145+1.081573++0.021549)/3
0.12165899999999995
>>> (-0.738145+0.553763++0.021549)/3
-0.05427766666666669
>>> (0.494023-2.618996+0.204140)/3
-0.6402776666666667
>>> df.groupby([df['key1'],df['key2']]).mean()
data1 data2
key1 key2
a one -0.358298 0.349081
two 0.553763 -2.618996
b one 1.081573 -0.713015
two -0.173333 1.399356
>>> (-0.738145+0.021549 )/2
-0.358298
>>> (0.494023+0.204140)/2
0.3490815
>>> for group1,group2 in df.groupby(df['key1']):
...     print(group1,group2)
a key1 key2 data1 data2
0 a one -0.738145 0.494023
1 a two 0.553763 -2.618996
4 a one 0.021549 0.204140
b key1 key2 data1 data2
2 b one 1.081573 -0.713015
3 b two -0.173333 1.399356
>>> for group1,group2 in df.groupby(df['key1']):
...     print(group1,'\n',group2)
a
key1 key2 data1 data2
0 a one -0.738145 0.494023
1 a two 0.553763 -2.618996
4 a one 0.021549 0.204140
b
key1 key2 data1 data2
2 b one 1.081573 -0.713015
3 b two -0.173333 1.399356
>>> for group1,group2 in df.groupby(df['key1']):
...     print(group1,'\r',group2)
a
key1 key2 data1 data2
0 a one -0.738145 0.494023
1 a two 0.553763 -2.618996
4 a one 0.021549 0.204140
b
key1 key2 data1 data2
2 b one 1.081573 -0.713015
3 b two -0.173333 1.399356
>>> def get_stats(group):
return {'min':group.min(),'max':group.max(),'count':group.count(),'mean':group.mean()}
>>> def group_by_test():
frame=DataFrame({'data1':np.random.randn(100),'data2':np.random.randn(100)})
factor=pd.cut(frame.data1,4)
print (factor)
>>> def group_by_test():
frame=DataFrame({'data1':np.random.randn(100),'data2':np.random.randn(100)})
factor=pd.cut(frame.data1,4)
print (factor)
print (frame.data2.groupby(factor).apply(get_stats))
>>> for g in frame.data2.groupbyby(facotr):
...     get_stats(g)
Traceback (most recent call last):
File "" , line 1, in <module>
for g in frame.data2.groupbyby(facotr):
NameError: name 'frame' is not defined
>>> def draw(deck,n=5):
return deck.take(np.random.permutation(len(deck))[:n])
>>> def group_by_test2():
card_val=(range(1,11)+[10]*3)*4 #牌的序号
base_name=['A']+range(2,11)+['J','K','Q'] #牌名
cards=[]
suits=['H','S','C','D'] #花色:红桃(Hearts),黑桃(Spades),梅花(Clubs),方片(Diamonds)
for suit in ['H','S','C','D']:
cards.extend(str(num)+suit for num in base_name) #产生牌
deck=Series(card_val,index=cards)
get_suit=lambda card:card[-1] #根据牌名最后一个字符也就是花色进行分组。
>>> print (deck.groupby(get_suit).apply(draw,n=2))
Traceback (most recent call last):
File "" , line 1, in <module>
print (deck.groupby(get_suit).apply(draw,n=2))
NameError: name 'deck' is not defined
>>>
>>> import datetime
>>> datetime.datetime.now
<built-in method now of type object at 0x00000000676F2DF0>
>>> datetime.datetime.now()
datetime.datetime(2020, 5, 15, 9, 56, 58, 806307)
>>> from datetime import datetime
>>> t1=datetime(2018,4,11)
>>> t2=datetime(2918,3,3)
>>> t2=datetime(2018,3,3)
>>> t1-t2
datetime.timedelta(39)
>>> from datetime import timedelta
>>> delta=timedelta(12)
>>> t1+delta
datetime.datetime(2018, 4, 23, 0, 0)
>>> value='2018-4-12'
>>> datetime.strptime(value,'%Y-%m-%d')
datetime.datetime(2018, 4, 12, 0, 0)
>>> from dateutil.parser import parse
>>> print('April,12,2018,12:00 PM')
April,12,2018,12:00 PM
>>> parse('April,12,2018,12:00 PM')
datetime.datetime(2020, 4, 12, 12, 0)
>>> parse('12/4/2018',dayfirst=True)
datetime.datetime(2018, 4, 12, 0, 0)
>>> datestr=['4/12/2018','3/12/2018']
>>> import pandas as pd
>>> pd.to_datetime(datestr)
DatetimeIndex(['2018-04-12', '2018-03-12'], dtype='datetime64[ns]', freq=None)
>>> datestr=[datetime(2018,4,12),datetime(2018,4,11),datetime(2018,4,10),datetime(2018,4,9)]
>>> from pandas import DataFrame,Series
>>> import numpy as np
>>> ts=Series(np.random.randn(4),index=datestr)
>>> ts
2018-04-12 0.844791
2018-04-11 -0.295957
2018-04-10 0.728659
2018-04-09 -1.516450
dtype: float64
>>> stamp=ts.index[2]
>>> ts[stamp]
0.7286591295809405
>>> ts=Series(np.random.randn(20),index=pd.date_range('4/12/2018',periods=20))
>>> ts
2018-04-12 -1.137414
2018-04-13 1.342568
2018-04-14 0.978639
2018-04-15 0.542504
2018-04-16 -0.525884
2018-04-17 -0.250660
2018-04-18 -0.851703
2018-04-19 0.638443
2018-04-20 -0.294546
2018-04-21 -1.415922
2018-04-22 -0.197610
2018-04-23 -0.375781
2018-04-24 -1.558825
2018-04-25 0.648640
2018-04-26 0.088823
2018-04-27 -0.474935
2018-04-28 1.740024
2018-04-29 1.430090
2018-04-30 -0.103884
2018-05-01 1.438667
Freq: D, dtype: float64
>>> ts['2018-4']
2018-04-12 -1.137414
2018-04-13 1.342568
2018-04-14 0.978639
2018-04-15 0.542504
2018-04-16 -0.525884
2018-04-17 -0.250660
2018-04-18 -0.851703
2018-04-19 0.638443
2018-04-20 -0.294546
2018-04-21 -1.415922
2018-04-22 -0.197610
2018-04-23 -0.375781
2018-04-24 -1.558825
2018-04-25 0.648640
2018-04-26 0.088823
2018-04-27 -0.474935
2018-04-28 1.740024
2018-04-29 1.430090
2018-04-30 -0.103884
Freq: D, dtype: float64
>>> ts['2018/4']
2018-04-12 -1.137414
2018-04-13 1.342568
2018-04-14 0.978639
2018-04-15 0.542504
2018-04-16 -0.525884
2018-04-17 -0.250660
2018-04-18 -0.851703
2018-04-19 0.638443
2018-04-20 -0.294546
2018-04-21 -1.415922
2018-04-22 -0.197610
2018-04-23 -0.375781
2018-04-24 -1.558825
2018-04-25 0.648640
2018-04-26 0.088823
2018-04-27 -0.474935
2018-04-28 1.740024
2018-04-29 1.430090
2018-04-30 -0.103884
Freq: D, dtype: float64
>>> ts['2018/4/12':'2018/4/23']
2018-04-12 -1.137414
2018-04-13 1.342568
2018-04-14 0.978639
2018-04-15 0.542504
2018-04-16 -0.525884
2018-04-17 -0.250660
2018-04-18 -0.851703
2018-04-19 0.638443
2018-04-20 -0.294546
2018-04-21 -1.415922
2018-04-22 -0.197610
2018-04-23 -0.375781
Freq: D, dtype: float64
>>> ts['2018/4/20':'2018/4/23']
2018-04-20 -0.294546
2018-04-21 -1.415922
2018-04-22 -0.197610
2018-04-23 -0.375781
Freq: D, dtype: float64
>>> ts.truncate(after='2018/4/15')
2018-04-12 -1.137414
2018-04-13 1.342568
2018-04-14 0.978639
2018-04-15 0.542504
Freq: D, dtype: float64
>>> ts.truncate(before='2018/4/15')
2018-04-15 0.542504
2018-04-16 -0.525884
2018-04-17 -0.250660
2018-04-18 -0.851703
2018-04-19 0.638443
2018-04-20 -0.294546
2018-04-21 -1.415922
2018-04-22 -0.197610
2018-04-23 -0.375781
2018-04-24 -1.558825
2018-04-25 0.648640
2018-04-26 0.088823
2018-04-27 -0.474935
2018-04-28 1.740024
2018-04-29 1.430090
2018-04-30 -0.103884
2018-05-01 1.438667
Freq: D, dtype: float64
>>> pd.date_range('4/12/2018',periods=100,freq='D')
DatetimeIndex(['2018-04-12', '2018-04-13', '2018-04-14', '2018-04-15',
'2018-04-16', '2018-04-17', '2018-04-18', '2018-04-19',
'2018-04-20', '2018-04-21', '2018-04-22', '2018-04-23',
'2018-04-24', '2018-04-25', '2018-04-26', '2018-04-27',
'2018-04-28', '2018-04-29', '2018-04-30', '2018-05-01',
'2018-05-02', '2018-05-03', '2018-05-04', '2018-05-05',
'2018-05-06', '2018-05-07', '2018-05-08', '2018-05-09',
'2018-05-10', '2018-05-11', '2018-05-12', '2018-05-13',
'2018-05-14', '2018-05-15', '2018-05-16', '2018-05-17',
'2018-05-18', '2018-05-19', '2018-05-20', '2018-05-21',
'2018-05-22', '2018-05-23', '2018-05-24', '2018-05-25',
'2018-05-26', '2018-05-27', '2018-05-28', '2018-05-29',
'2018-05-30', '2018-05-31', '2018-06-01', '2018-06-02',
'2018-06-03', '2018-06-04', '2018-06-05', '2018-06-06',
'2018-06-07', '2018-06-08', '2018-06-09', '2018-06-10',
'2018-06-11', '2018-06-12', '2018-06-13', '2018-06-14',
'2018-06-15', '2018-06-16', '2018-06-17', '2018-06-18',
'2018-06-19', '2018-06-20', '2018-06-21', '2018-06-22',
'2018-06-23', '2018-06-24', '2018-06-25', '2018-06-26',
'2018-06-27', '2018-06-28', '2018-06-29', '2018-06-30',
'2018-07-01', '2018-07-02', '2018-07-03', '2018-07-04',
'2018-07-05', '2018-07-06', '2018-07-07', '2018-07-08',
'2018-07-09', '2018-07-10', '2018-07-11', '2018-07-12',
'2018-07-13', '2018-07-14', '2018-07-15', '2018-07-16',
'2018-07-17', '2018-07-18', '2018-07-19', '2018-07-20'],
dtype='datetime64[ns]', freq='D')
>>> pd.date_range('4/12/2018',periods=100,freq='M')
DatetimeIndex(['2018-04-30', '2018-05-31', '2018-06-30', '2018-07-31',
'2018-08-31', '2018-09-30', '2018-10-31', '2018-11-30',
'2018-12-31', '2019-01-31', '2019-02-28', '2019-03-31',
'2019-04-30', '2019-05-31', '2019-06-30', '2019-07-31',
'2019-08-31', '2019-09-30', '2019-10-31', '2019-11-30',
'2019-12-31', '2020-01-31', '2020-02-29', '2020-03-31',
'2020-04-30', '2020-05-31', '2020-06-30', '2020-07-31',
'2020-08-31', '2020-09-30', '2020-10-31', '2020-11-30',
'2020-12-31', '2021-01-31', '2021-02-28', '2021-03-31',
'2021-04-30', '2021-05-31', '2021-06-30', '2021-07-31',
'2021-08-31', '2021-09-30', '2021-10-31', '2021-11-30',
'2021-12-31', '2022-01-31', '2022-02-28', '2022-03-31',
'2022-04-30', '2022-05-31', '2022-06-30', '2022-07-31',
'2022-08-31', '2022-09-30', '2022-10-31', '2022-11-30',
'2022-12-31', '2023-01-31', '2023-02-28', '2023-03-31',
'2023-04-30', '2023-05-31', '2023-06-30', '2023-07-31',
'2023-08-31', '2023-09-30', '2023-10-31', '2023-11-30',
'2023-12-31', '2024-01-31', '2024-02-29', '2024-03-31',
'2024-04-30', '2024-05-31', '2024-06-30', '2024-07-31',
'2024-08-31', '2024-09-30', '2024-10-31', '2024-11-30',
'2024-12-31', '2025-01-31', '2025-02-28', '2025-03-31',
'2025-04-30', '2025-05-31', '2025-06-30', '2025-07-31',
'2025-08-31', '2025-09-30', '2025-10-31', '2025-11-30',
'2025-12-31', '2026-01-31', '2026-02-28', '2026-03-31',
'2026-04-30', '2026-05-31', '2026-06-30', '2026-07-31'],
dtype='datetime64[ns]', freq='M')
>>> pd.date_range('4/12/2018',periods=100,freq='Y')
DatetimeIndex(['2018-12-31', '2019-12-31', '2020-12-31', '2021-12-31',
'2022-12-31', '2023-12-31', '2024-12-31', '2025-12-31',
'2026-12-31', '2027-12-31', '2028-12-31', '2029-12-31',
'2030-12-31', '2031-12-31', '2032-12-31', '2033-12-31',
'2034-12-31', '2035-12-31', '2036-12-31', '2037-12-31',
'2038-12-31', '2039-12-31', '2040-12-31', '2041-12-31',
'2042-12-31', '2043-12-31', '2044-12-31', '2045-12-31',
'2046-12-31', '2047-12-31', '2048-12-31', '2049-12-31',
'2050-12-31', '2051-12-31', '2052-12-31', '2053-12-31',
'2054-12-31', '2055-12-31', '2056-12-31', '2057-12-31',
'2058-12-31', '2059-12-31', '2060-12-31', '2061-12-31',
'2062-12-31', '2063-12-31', '2064-12-31', '2065-12-31',
'2066-12-31', '2067-12-31', '2068-12-31', '2069-12-31',
'2070-12-31', '2071-12-31', '2072-12-31', '2073-12-31',
'2074-12-31', '2075-12-31', '2076-12-31', '2077-12-31',
'2078-12-31', '2079-12-31', '2080-12-31', '2081-12-31',
'2082-12-31', '2083-12-31', '2084-12-31', '2085-12-31',
'2086-12-31', '2087-12-31', '2088-12-31', '2089-12-31',
'2090-12-31', '2091-12-31', '2092-12-31', '2093-12-31',
'2094-12-31', '2095-12-31', '2096-12-31', '2097-12-31',
'2098-12-31', '2099-12-31', '2100-12-31', '2101-12-31',
'2102-12-31', '2103-12-31', '2104-12-31', '2105-12-31',
'2106-12-31', '2107-12-31', '2108-12-31', '2109-12-31',
'2110-12-31', '2111-12-31', '2112-12-31', '2113-12-31',
'2114-12-31', '2115-12-31', '2116-12-31', '2117-12-31'],
dtype='datetime64[ns]', freq='A-DEC')
>>> dates=[datetime(2017,1,6),datetime(2017,1,10),datetime(2017,1,13),
datetime(2017,1,15),datetime(2017,1,18),datetime(2017,1,20)]
>>> ts=Series(np.random.randn(6),index=dates)
>>> ts
2017-01-06 -1.346653
2017-01-10 0.725038
2017-01-13 -0.409668
2017-01-15 -1.216957
2017-01-18 -0.071602
2017-01-20 -0.792183
dtype: float64
>>> dates=[datetime(2018,4,12),datetime(2018,4,13),datetime(2018,4,13),
datetime(2018,4,14)]
>>> ts=Series(np.random.randn(6),index=dates)
Traceback (most recent call last):
File "" , line 1, in <module>
ts=Series(np.random.randn(6),index=dates)
File "C:\Python36\lib\site-packages\pandas\core\series.py", line 262, in __init__
.format(val=len(data), ind=len(index)))
ValueError: Length of passed values is 6, index implies 4
>>> ts=Series(np.random.randn(4),index=dates)
>>> ts
2018-04-12 -0.296342
2018-04-13 0.624015
2018-04-13 -0.761311
2018-04-14 0.673819
dtype: float64
>>> ts=Series(np.random.randn(4),index=dates,dtype='int64')
>>> ts
2018-04-12 0
2018-04-13 0
2018-04-13 0
2018-04-14 0
dtype: int64
>>> ts.index.is_unique
False
>>> pd.date_range('4/12/2018','5/12/2018')
DatetimeIndex(['2018-04-12', '2018-04-13', '2018-04-14', '2018-04-15',
'2018-04-16', '2018-04-17', '2018-04-18', '2018-04-19',
'2018-04-20', '2018-04-21', '2018-04-22', '2018-04-23',
'2018-04-24', '2018-04-25', '2018-04-26', '2018-04-27',
'2018-04-28', '2018-04-29', '2018-04-30', '2018-05-01',
'2018-05-02', '2018-05-03', '2018-05-04', '2018-05-05',
'2018-05-06', '2018-05-07', '2018-05-08', '2018-05-09',
'2018-05-10', '2018-05-11', '2018-05-12'],
dtype='datetime64[ns]', freq='D')
>>> ts=Series(np.random.randn(4),index=pd.date_range('4/12/2018',periods=4,freq='M'))
>>> ts
2018-04-30 -0.682872
2018-05-31 0.920610
2018-06-30 0.925642
2018-07-31 -0.623782
Freq: M, dtype: float64
>>> ts.shift(2)
2018-04-30 NaN
2018-05-31 NaN
2018-06-30 -0.682872
2018-07-31 0.920610
Freq: M, dtype: float64
>>> ts.shift(2,freq='M')
2018-06-30 -0.682872
2018-07-31 0.920610
2018-08-31 0.925642
2018-09-30 -0.623782
Freq: M, dtype: float64
>>> rng=pd.date_range('1/1/2000',periods=3,freq='M')
>>> ts=Series(randn(3),index=rng)
Traceback (most recent call last):
File "" , line 1, in <module>
ts=Series(randn(3),index=rng)
NameError: name 'randn' is not defined
>>> rng=pd.date_range('1/1/2000',periods=3,freq='M')
>>> ts=Series(np.random.randn(3),index=rng)
>>> ts
2000-01-31 1.167103
2000-02-29 -0.319709
2000-03-31 0.681588
Freq: M, dtype: float64
>>> pts2=ts.to_period(freq="M")
>>> pts2
2000-01 1.167103
2000-02 -0.319709
2000-03 0.681588
Freq: M, dtype: float64
>>> pts2.to_timestamp(how='end')
2000-01-31 1.167103
2000-02-29 -0.319709
2000-03-31 0.681588
Freq: M, dtype: float64
>>> rng=pd.date_range('1/1/2000',periods=50,freq='D')
>>> ts=Series(np.random.randn(50),index=rng)
>>> print(ts.resample('M').mean())
2000-01-31 0.126353
2000-02-29 0.039958
Freq: M, dtype: float64
>>> print(ts.mean())
SyntaxError: unexpected indent
>>> print(ts.mean())
0.09352305334643628
>>> rng=pd.date_range('1/1/2000',periods=12,freq='T')
>>> ts=Series(np.arange(12),index=rng)
>>> ts
2000-01-01 00:00:00 0
2000-01-01 00:01:00 1
2000-01-01 00:02:00 2
2000-01-01 00:03:00 3
2000-01-01 00:04:00 4
2000-01-01 00:05:00 5
2000-01-01 00:06:00 6
2000-01-01 00:07:00 7
2000-01-01 00:08:00 8
2000-01-01 00:09:00 9
2000-01-01 00:10:00 10
2000-01-01 00:11:00 11
Freq: T, dtype: int32
>>> print(ts.resample('5min',closed='left').sum())
2000-01-01 00:00:00 10
2000-01-01 00:05:00 35
2000-01-01 00:10:00 21
Freq: 5T, dtype: int32
>>> print(ts.resample('5min',closed='right').sum())
1999-12-31 23:55:00 0
2000-01-01 00:00:00 15
2000-01-01 00:05:00 40
2000-01-01 00:10:00 11
Freq: 5T, dtype: int32
>>> ts.resample('5min', closed='left').ohlc()
open high low close
2000-01-01 00:00:00 0 4 0 4
2000-01-01 00:05:00 5 9 5 9
2000-01-01 00:10:00 10 11 10 11
>>> ts.resample('5min',closed='left').ohlc()
open high low close
2000-01-01 00:00:00 0 4 0 4
2000-01-01 00:05:00 5 9 5 9
2000-01-01 00:10:00 10 11 10 11
>>>
参考:https://www.cnblogs.com/zhanghongfeng/p/8745415.html