利用python进行数据分析 数据聚合与分组运算groupby 和时间序列操作


>>> from pandas import DataFrame,Series
>>> import numpy as np
>>> df=DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','two','one'],'data1':np.random.randn(5),'data2':np.random.randn(5)})


>>> df
  key1 key2     data1     data2
0    a  one -0.738145  0.494023
1    a  two  0.553763 -2.618996
2    b  one  1.081573 -0.713015
3    b  two -0.173333  1.399356
4    a  one  0.021549  0.204140
>>> groupd1=df['data1'].groupby([df['key1'],df['key2']]).sum()
>>> df['data1']
0   -0.738145
1    0.553763
2    1.081573
3   -0.173333
4    0.021549
Name: data1, dtype: float64
>>> groupd1
key1  key2
a     one    -0.716596
      two     0.553763
b     one     1.081573
      two    -0.173333
Name: data1, dtype: float64
>>> groupd2=df['data1'].groupby([df['key1']]).sum()
>>> groupd2
key1
a   -0.162833
b    0.908240
Name: data1, dtype: float64
>>> df.groupby(df['key1']).mean()
         data1     data2
key1                    
a    -0.054278 -0.640278
b     0.454120  0.343171
>>> -0.738145+1.081573++0.021549
0.36497699999999983
>>> (-0.738145+1.081573++0.021549)/3
0.12165899999999995
>>> (-0.738145+0.553763++0.021549)/3
-0.05427766666666669
>>> (0.494023-2.618996+0.204140)/3
-0.6402776666666667
>>> df.groupby([df['key1'],df['key2']]).mean()
              data1     data2
key1 key2                    
a    one  -0.358298  0.349081
     two   0.553763 -2.618996
b    one   1.081573 -0.713015
     two  -0.173333  1.399356
>>> (-0.738145+0.021549 )/2
-0.358298
>>> (0.494023+0.204140)/2
0.3490815

>>> for group1,group2 in df.groupby(df['key1']):
	print(group1,group2)

	
a   key1 key2     data1     data2
0    a  one -0.738145  0.494023
1    a  two  0.553763 -2.618996
4    a  one  0.021549  0.204140
b   key1 key2     data1     data2
2    b  one  1.081573 -0.713015
3    b  two -0.173333  1.399356
>>> for group1,group2 in df.groupby(df['key1']):
	print(group1,'\n',group2)

	
a 
   key1 key2     data1     data2
0    a  one -0.738145  0.494023
1    a  two  0.553763 -2.618996
4    a  one  0.021549  0.204140
b 
   key1 key2     data1     data2
2    b  one  1.081573 -0.713015
3    b  two -0.173333  1.399356
>>> for group1,group2 in df.groupby(df['key1']):
	print(group1,'\r',group2)

	
a 
   key1 key2     data1     data2
0    a  one -0.738145  0.494023
1    a  two  0.553763 -2.618996
4    a  one  0.021549  0.204140
b 
   key1 key2     data1     data2
2    b  one  1.081573 -0.713015
3    b  two -0.173333  1.399356
>>> def get_stats(group):
    """Summarise one group of values as a plain dict.

    Args:
        group: An object exposing ``min()``, ``max()``, ``count()`` and
            ``mean()`` methods (the session passes pandas Series groups
            via ``groupby(...).apply``).

    Returns:
        dict: The results of those four calls under the keys ``'min'``,
        ``'max'``, ``'count'`` and ``'mean'``, in that order.
    """
    lowest = group.min()
    highest = group.max()
    size = group.count()
    average = group.mean()
    return {'min': lowest, 'max': highest, 'count': size, 'mean': average}

>>> def group_by_test():
    """Bin 100 random values into four intervals and print the binning.

    Builds a two-column DataFrame of standard-normal samples, cuts the
    ``data1`` column into 4 bins with ``pd.cut`` and prints the resulting
    categorical Series.

    Returns:
        None. The result is only printed.
    """
    # Imported locally: this function is defined before any
    # ``import pandas as pd`` has been executed in the session, so a bare
    # ``pd`` reference would raise NameError when the function is called.
    import pandas as pd

    frame = DataFrame({'data1': np.random.randn(100),
                       'data2': np.random.randn(100)})
    factor = pd.cut(frame.data1, 4)
    print(factor)

    

>>> def group_by_test():
    """Bin random data into four intervals and print per-bin statistics.

    Builds a two-column DataFrame of 100 standard-normal samples, cuts the
    ``data1`` column into 4 bins with ``pd.cut``, prints the binning, then
    groups ``data2`` by those bins and prints the result of applying
    ``get_stats`` to each group.

    Relies on ``get_stats`` being defined in the enclosing session.

    Returns:
        None. The results are only printed.
    """
    # Imported locally: this function is defined before any
    # ``import pandas as pd`` has been executed in the session, so a bare
    # ``pd`` reference would raise NameError when the function is called.
    import pandas as pd

    frame = DataFrame({'data1': np.random.randn(100),
                       'data2': np.random.randn(100)})
    factor = pd.cut(frame.data1, 4)
    print(factor)
    print(frame.data2.groupby(factor).apply(get_stats))

    
>>> for g in frame.data2.groupbyby(facotr):

  get_stats(g)

  
Traceback (most recent call last):
  File "", line 1, in <module>
    for g in frame.data2.groupbyby(facotr):
NameError: name 'frame' is not defined
>>> def draw(deck, n=5):
    """Pick entries of ``deck`` at randomly permuted positions.

    Args:
        deck: An object supporting ``len()`` and ``take(positions)``
            (the session uses a pandas Series of cards).
        n: How many entries to keep from the front of the random
            permutation. Defaults to 5.

    Returns:
        Whatever ``deck.take`` returns for the first ``n`` positions of a
        random permutation of ``range(len(deck))``. Each position occurs
        at most once.
    """
    shuffled_positions = np.random.permutation(len(deck))
    chosen = shuffled_positions[:n]
    return deck.take(chosen)

>>> def group_by_test2():
    """Build a 52-card deck as a Series and a key function for its suits.

    Card labels are the rank followed by a one-letter suit, e.g. ``'AH'``
    or ``'10D'``. Card values are 1-10 for ace through ten and 10 for each
    of the three face cards.

    Returns:
        tuple: ``(deck, get_suit)`` where ``deck`` is a Series of card
        values indexed by card label, and ``get_suit`` maps a card label
        to its suit letter (usable as a ``groupby`` key function).
    """
    # range() objects cannot be concatenated with lists on Python 3, so
    # materialise them first.
    card_val = (list(range(1, 11)) + [10] * 3) * 4  # values, one run per suit
    base_name = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']  # rank names

    # Suits: Hearts, Spades, Clubs, Diamonds.
    suits = ['H', 'S', 'C', 'D']
    cards = []
    for suit in suits:
        # Generate every rank for this suit.
        cards.extend(str(num) + suit for num in base_name)

    deck = Series(card_val, index=cards)

    def get_suit(card):
        """Return the suit, i.e. the last character of the card label."""
        return card[-1]

    # Hand both objects back so they can be used after the call instead of
    # being discarded when the function returns.
    return deck, get_suit

    
>>> print (deck.groupby(get_suit).apply(draw,n=2))
Traceback (most recent call last):
  File "", line 1, in <module>
    print (deck.groupby(get_suit).apply(draw,n=2))
NameError: name 'deck' is not defined
>>> 

>>> import datetime
>>> datetime.datetime.now
<built-in method now of type object at 0x00000000676F2DF0>
>>> datetime.datetime.now()
datetime.datetime(2020, 5, 15, 9, 56, 58, 806307)

>>> from datetime import datetime
>>> t1=datetime(2018,4,11)
>>> t2=datetime(2918,3,3)
>>> t2=datetime(2018,3,3)
>>> t1-t2
datetime.timedelta(39)

>>> from datetime import timedelta
>>> delta=timedelta(12)
>>> t1+delta
datetime.datetime(2018, 4, 23, 0, 0)
>>> value='2018-4-12'

>>> datetime.strptime(value,'%Y-%m-%d')
datetime.datetime(2018, 4, 12, 0, 0)


>>> from dateutil.parser import parse
>>> print('April,12,2018,12:00 PM')
April,12,2018,12:00 PM
>>> parse('April,12,2018,12:00 PM')
datetime.datetime(2020, 4, 12, 12, 0)
>>> parse('12/4/2018',dayfirst=True)
datetime.datetime(2018, 4, 12, 0, 0)
>>> datestr=['4/12/2018','3/12/2018']


>>> import pandas as pd

>>> pd.to_datetime(datestr)
DatetimeIndex(['2018-04-12', '2018-03-12'], dtype='datetime64[ns]', freq=None)
>>> datestr=[datetime(2018,4,12),datetime(2018,4,11),datetime(2018,4,10),datetime(2018,4,9)]

>>> from pandas import DataFrame,Series
>>> import numpy as np
>>> ts=Series(np.random.randn(4),index=datestr)
>>> ts
2018-04-12    0.844791
2018-04-11   -0.295957
2018-04-10    0.728659
2018-04-09   -1.516450
dtype: float64
>>> stamp=ts.index[2]
>>> ts[stamp]
0.7286591295809405
>>> ts=Series(np.random.randn(20),index=pd.date_range('4/12/2018',periods=20))
>>> ts
2018-04-12   -1.137414
2018-04-13    1.342568
2018-04-14    0.978639
2018-04-15    0.542504
2018-04-16   -0.525884
2018-04-17   -0.250660
2018-04-18   -0.851703
2018-04-19    0.638443
2018-04-20   -0.294546
2018-04-21   -1.415922
2018-04-22   -0.197610
2018-04-23   -0.375781
2018-04-24   -1.558825
2018-04-25    0.648640
2018-04-26    0.088823
2018-04-27   -0.474935
2018-04-28    1.740024
2018-04-29    1.430090
2018-04-30   -0.103884
2018-05-01    1.438667
Freq: D, dtype: float64
>>> ts['2018-4']
2018-04-12   -1.137414
2018-04-13    1.342568
2018-04-14    0.978639
2018-04-15    0.542504
2018-04-16   -0.525884
2018-04-17   -0.250660
2018-04-18   -0.851703
2018-04-19    0.638443
2018-04-20   -0.294546
2018-04-21   -1.415922
2018-04-22   -0.197610
2018-04-23   -0.375781
2018-04-24   -1.558825
2018-04-25    0.648640
2018-04-26    0.088823
2018-04-27   -0.474935
2018-04-28    1.740024
2018-04-29    1.430090
2018-04-30   -0.103884
Freq: D, dtype: float64
>>> ts['2018/4']
2018-04-12   -1.137414
2018-04-13    1.342568
2018-04-14    0.978639
2018-04-15    0.542504
2018-04-16   -0.525884
2018-04-17   -0.250660
2018-04-18   -0.851703
2018-04-19    0.638443
2018-04-20   -0.294546
2018-04-21   -1.415922
2018-04-22   -0.197610
2018-04-23   -0.375781
2018-04-24   -1.558825
2018-04-25    0.648640
2018-04-26    0.088823
2018-04-27   -0.474935
2018-04-28    1.740024
2018-04-29    1.430090
2018-04-30   -0.103884
Freq: D, dtype: float64
>>> ts['2018/4/12':'2018/4/23']
2018-04-12   -1.137414
2018-04-13    1.342568
2018-04-14    0.978639
2018-04-15    0.542504
2018-04-16   -0.525884
2018-04-17   -0.250660
2018-04-18   -0.851703
2018-04-19    0.638443
2018-04-20   -0.294546
2018-04-21   -1.415922
2018-04-22   -0.197610
2018-04-23   -0.375781
Freq: D, dtype: float64
>>> ts['2018/4/20':'2018/4/23']
2018-04-20   -0.294546
2018-04-21   -1.415922
2018-04-22   -0.197610
2018-04-23   -0.375781
Freq: D, dtype: float64
>>> ts.truncate(after='2018/4/15')
2018-04-12   -1.137414
2018-04-13    1.342568
2018-04-14    0.978639
2018-04-15    0.542504
Freq: D, dtype: float64
>>> ts.truncate(before='2018/4/15')
2018-04-15    0.542504
2018-04-16   -0.525884
2018-04-17   -0.250660
2018-04-18   -0.851703
2018-04-19    0.638443
2018-04-20   -0.294546
2018-04-21   -1.415922
2018-04-22   -0.197610
2018-04-23   -0.375781
2018-04-24   -1.558825
2018-04-25    0.648640
2018-04-26    0.088823
2018-04-27   -0.474935
2018-04-28    1.740024
2018-04-29    1.430090
2018-04-30   -0.103884
2018-05-01    1.438667
Freq: D, dtype: float64
>>> pd.date_range('4/12/2018',periods=100,freq='D')
DatetimeIndex(['2018-04-12', '2018-04-13', '2018-04-14', '2018-04-15',
               '2018-04-16', '2018-04-17', '2018-04-18', '2018-04-19',
               '2018-04-20', '2018-04-21', '2018-04-22', '2018-04-23',
               '2018-04-24', '2018-04-25', '2018-04-26', '2018-04-27',
               '2018-04-28', '2018-04-29', '2018-04-30', '2018-05-01',
               '2018-05-02', '2018-05-03', '2018-05-04', '2018-05-05',
               '2018-05-06', '2018-05-07', '2018-05-08', '2018-05-09',
               '2018-05-10', '2018-05-11', '2018-05-12', '2018-05-13',
               '2018-05-14', '2018-05-15', '2018-05-16', '2018-05-17',
               '2018-05-18', '2018-05-19', '2018-05-20', '2018-05-21',
               '2018-05-22', '2018-05-23', '2018-05-24', '2018-05-25',
               '2018-05-26', '2018-05-27', '2018-05-28', '2018-05-29',
               '2018-05-30', '2018-05-31', '2018-06-01', '2018-06-02',
               '2018-06-03', '2018-06-04', '2018-06-05', '2018-06-06',
               '2018-06-07', '2018-06-08', '2018-06-09', '2018-06-10',
               '2018-06-11', '2018-06-12', '2018-06-13', '2018-06-14',
               '2018-06-15', '2018-06-16', '2018-06-17', '2018-06-18',
               '2018-06-19', '2018-06-20', '2018-06-21', '2018-06-22',
               '2018-06-23', '2018-06-24', '2018-06-25', '2018-06-26',
               '2018-06-27', '2018-06-28', '2018-06-29', '2018-06-30',
               '2018-07-01', '2018-07-02', '2018-07-03', '2018-07-04',
               '2018-07-05', '2018-07-06', '2018-07-07', '2018-07-08',
               '2018-07-09', '2018-07-10', '2018-07-11', '2018-07-12',
               '2018-07-13', '2018-07-14', '2018-07-15', '2018-07-16',
               '2018-07-17', '2018-07-18', '2018-07-19', '2018-07-20'],
              dtype='datetime64[ns]', freq='D')
>>> pd.date_range('4/12/2018',periods=100,freq='M')
DatetimeIndex(['2018-04-30', '2018-05-31', '2018-06-30', '2018-07-31',
               '2018-08-31', '2018-09-30', '2018-10-31', '2018-11-30',
               '2018-12-31', '2019-01-31', '2019-02-28', '2019-03-31',
               '2019-04-30', '2019-05-31', '2019-06-30', '2019-07-31',
               '2019-08-31', '2019-09-30', '2019-10-31', '2019-11-30',
               '2019-12-31', '2020-01-31', '2020-02-29', '2020-03-31',
               '2020-04-30', '2020-05-31', '2020-06-30', '2020-07-31',
               '2020-08-31', '2020-09-30', '2020-10-31', '2020-11-30',
               '2020-12-31', '2021-01-31', '2021-02-28', '2021-03-31',
               '2021-04-30', '2021-05-31', '2021-06-30', '2021-07-31',
               '2021-08-31', '2021-09-30', '2021-10-31', '2021-11-30',
               '2021-12-31', '2022-01-31', '2022-02-28', '2022-03-31',
               '2022-04-30', '2022-05-31', '2022-06-30', '2022-07-31',
               '2022-08-31', '2022-09-30', '2022-10-31', '2022-11-30',
               '2022-12-31', '2023-01-31', '2023-02-28', '2023-03-31',
               '2023-04-30', '2023-05-31', '2023-06-30', '2023-07-31',
               '2023-08-31', '2023-09-30', '2023-10-31', '2023-11-30',
               '2023-12-31', '2024-01-31', '2024-02-29', '2024-03-31',
               '2024-04-30', '2024-05-31', '2024-06-30', '2024-07-31',
               '2024-08-31', '2024-09-30', '2024-10-31', '2024-11-30',
               '2024-12-31', '2025-01-31', '2025-02-28', '2025-03-31',
               '2025-04-30', '2025-05-31', '2025-06-30', '2025-07-31',
               '2025-08-31', '2025-09-30', '2025-10-31', '2025-11-30',
               '2025-12-31', '2026-01-31', '2026-02-28', '2026-03-31',
               '2026-04-30', '2026-05-31', '2026-06-30', '2026-07-31'],
              dtype='datetime64[ns]', freq='M')
>>> pd.date_range('4/12/2018',periods=100,freq='Y')
DatetimeIndex(['2018-12-31', '2019-12-31', '2020-12-31', '2021-12-31',
               '2022-12-31', '2023-12-31', '2024-12-31', '2025-12-31',
               '2026-12-31', '2027-12-31', '2028-12-31', '2029-12-31',
               '2030-12-31', '2031-12-31', '2032-12-31', '2033-12-31',
               '2034-12-31', '2035-12-31', '2036-12-31', '2037-12-31',
               '2038-12-31', '2039-12-31', '2040-12-31', '2041-12-31',
               '2042-12-31', '2043-12-31', '2044-12-31', '2045-12-31',
               '2046-12-31', '2047-12-31', '2048-12-31', '2049-12-31',
               '2050-12-31', '2051-12-31', '2052-12-31', '2053-12-31',
               '2054-12-31', '2055-12-31', '2056-12-31', '2057-12-31',
               '2058-12-31', '2059-12-31', '2060-12-31', '2061-12-31',
               '2062-12-31', '2063-12-31', '2064-12-31', '2065-12-31',
               '2066-12-31', '2067-12-31', '2068-12-31', '2069-12-31',
               '2070-12-31', '2071-12-31', '2072-12-31', '2073-12-31',
               '2074-12-31', '2075-12-31', '2076-12-31', '2077-12-31',
               '2078-12-31', '2079-12-31', '2080-12-31', '2081-12-31',
               '2082-12-31', '2083-12-31', '2084-12-31', '2085-12-31',
               '2086-12-31', '2087-12-31', '2088-12-31', '2089-12-31',
               '2090-12-31', '2091-12-31', '2092-12-31', '2093-12-31',
               '2094-12-31', '2095-12-31', '2096-12-31', '2097-12-31',
               '2098-12-31', '2099-12-31', '2100-12-31', '2101-12-31',
               '2102-12-31', '2103-12-31', '2104-12-31', '2105-12-31',
               '2106-12-31', '2107-12-31', '2108-12-31', '2109-12-31',
               '2110-12-31', '2111-12-31', '2112-12-31', '2113-12-31',
               '2114-12-31', '2115-12-31', '2116-12-31', '2117-12-31'],
              dtype='datetime64[ns]', freq='A-DEC')

>>> dates=[datetime(2017,1,6),datetime(2017,1,10),datetime(2017,1,13),
      datetime(2017,1,15),datetime(2017,1,18),datetime(2017,1,20)]
>>> ts=Series(np.random.randn(6),index=dates)
>>> ts
2017-01-06   -1.346653
2017-01-10    0.725038
2017-01-13   -0.409668
2017-01-15   -1.216957
2017-01-18   -0.071602
2017-01-20   -0.792183
dtype: float64
>>> dates=[datetime(2018,4,12),datetime(2018,4,13),datetime(2018,4,13),
      datetime(2018,4,14)]
>>> ts=Series(np.random.randn(6),index=dates)
Traceback (most recent call last):
  File "", line 1, in <module>
    ts=Series(np.random.randn(6),index=dates)
  File "C:\Python36\lib\site-packages\pandas\core\series.py", line 262, in __init__
    .format(val=len(data), ind=len(index)))
ValueError: Length of passed values is 6, index implies 4
>>> ts=Series(np.random.randn(4),index=dates)
>>> ts
2018-04-12   -0.296342
2018-04-13    0.624015
2018-04-13   -0.761311
2018-04-14    0.673819
dtype: float64
>>> ts=Series(np.random.randn(4),index=dates,dtype='int64')
>>> ts
2018-04-12    0
2018-04-13    0
2018-04-13    0
2018-04-14    0
dtype: int64
>>> ts.index.is_unique
False
>>> pd.date_range('4/12/2018','5/12/2018')
DatetimeIndex(['2018-04-12', '2018-04-13', '2018-04-14', '2018-04-15',
               '2018-04-16', '2018-04-17', '2018-04-18', '2018-04-19',
               '2018-04-20', '2018-04-21', '2018-04-22', '2018-04-23',
               '2018-04-24', '2018-04-25', '2018-04-26', '2018-04-27',
               '2018-04-28', '2018-04-29', '2018-04-30', '2018-05-01',
               '2018-05-02', '2018-05-03', '2018-05-04', '2018-05-05',
               '2018-05-06', '2018-05-07', '2018-05-08', '2018-05-09',
               '2018-05-10', '2018-05-11', '2018-05-12'],
              dtype='datetime64[ns]', freq='D')
>>> ts=Series(np.random.randn(4),index=pd.date_range('4/12/2018',periods=4,freq='M'))
>>> ts
2018-04-30   -0.682872
2018-05-31    0.920610
2018-06-30    0.925642
2018-07-31   -0.623782
Freq: M, dtype: float64
>>> ts.shift(2)
2018-04-30         NaN
2018-05-31         NaN
2018-06-30   -0.682872
2018-07-31    0.920610
Freq: M, dtype: float64
>>> ts.shift(2,freq='M')
2018-06-30   -0.682872
2018-07-31    0.920610
2018-08-31    0.925642
2018-09-30   -0.623782
Freq: M, dtype: float64
>>> rng=pd.date_range('1/1/2000',periods=3,freq='M')
>>> ts=Series(randn(3),index=rng)
Traceback (most recent call last):
  File "", line 1, in <module>
    ts=Series(randn(3),index=rng)
NameError: name 'randn' is not defined

>>> rng=pd.date_range('1/1/2000',periods=3,freq='M')
>>> ts=Series(np.random.randn(3),index=rng)
>>> ts
2000-01-31    1.167103
2000-02-29   -0.319709
2000-03-31    0.681588
Freq: M, dtype: float64
>>> pts2=ts.to_period(freq="M")
>>> pts2
2000-01    1.167103
2000-02   -0.319709
2000-03    0.681588
Freq: M, dtype: float64
>>> pts2.to_timestamp(how='end')
2000-01-31    1.167103
2000-02-29   -0.319709
2000-03-31    0.681588
Freq: M, dtype: float64
>>> rng=pd.date_range('1/1/2000',periods=50,freq='D')
>>> ts=Series(np.random.randn(50),index=rng)
>>> print(ts.resample('M').mean())
2000-01-31    0.126353
2000-02-29    0.039958
Freq: M, dtype: float64
>>>  print(ts.mean())
SyntaxError: unexpected indent
>>> print(ts.mean())
0.09352305334643628
>>> rng=pd.date_range('1/1/2000',periods=12,freq='T')
>>> ts=Series(np.arange(12),index=rng)
>>> ts
2000-01-01 00:00:00     0
2000-01-01 00:01:00     1
2000-01-01 00:02:00     2
2000-01-01 00:03:00     3
2000-01-01 00:04:00     4
2000-01-01 00:05:00     5
2000-01-01 00:06:00     6
2000-01-01 00:07:00     7
2000-01-01 00:08:00     8
2000-01-01 00:09:00     9
2000-01-01 00:10:00    10
2000-01-01 00:11:00    11
Freq: T, dtype: int32
>>> print(ts.resample('5min',closed='left').sum())
2000-01-01 00:00:00    10
2000-01-01 00:05:00    35
2000-01-01 00:10:00    21
Freq: 5T, dtype: int32
>>> print(ts.resample('5min',closed='right').sum())
1999-12-31 23:55:00     0
2000-01-01 00:00:00    15
2000-01-01 00:05:00    40
2000-01-01 00:10:00    11
Freq: 5T, dtype: int32
>>> ts.resample('5min', closed='left').ohlc()
                     open  high  low  close
2000-01-01 00:00:00     0     4    0      4
2000-01-01 00:05:00     5     9    5      9
2000-01-01 00:10:00    10    11   10     11
>>> ts.resample('5min',closed='left').ohlc()
                     open  high  low  close
2000-01-01 00:00:00     0     4    0      4
2000-01-01 00:05:00     5     9    5      9
2000-01-01 00:10:00    10    11   10     11
>>> 

参考:https://www.cnblogs.com/zhanghongfeng/p/8745415.html

你可能感兴趣的:(利用python进行数据分析 数据聚合与分组运算groupby 和时间序列操作)