pandas groupby教程

pandas groupby

按功能区分

  • 分组数据

    df = pd.DataFrame({'Year' : ['2001', '2002', '2001', '2002',
                              '2001', '2002', '2001', '2002'],
                       'score' : ['primary', 'second', 'third', 'fourth',
                              'primary', 'second', 'fourth', 'third'],
                       'C' : np.random.randn(8),
                       'D' : np.random.randn(8)})
    
    print(df)
       Year    score         C         D
    0  2001  primary -1.277445  0.109920
    1  2002   second -1.814557  0.031803
    2  2001    third  0.246247  1.618513
    3  2002   fourth -1.042709  0.464520
    4  2001  primary  1.064651 -0.818139
    5  2002   second -0.197225  0.059878
    6  2001   fourth -0.292600 -0.432351
    7  2002    third  0.443334 -0.454662
    
    >>> df.groupby('Year')
    
    >>> df.groupby(['Year','score'])
    
    
    查看分组
    >>> df.groupby('Year').groups
    {'2001': Int64Index([0, 2, 4, 6], dtype='int64'), '2002': Int64Index([1, 3, 5, 7], dtype='int64')}
    
    分组打印
    for name,group in df.groupby(['Year','score']):
        print (name)
        print (group)
     
    ('2001', 'fourth')
       Year   score         C         D
    6  2001  fourth  0.240019  0.329059
    ('2001', 'primary')
       Year    score         C         D
    0  2001  primary  0.663303 -0.923613
    4  2001  primary -1.034212 -0.696587
    ('2001', 'third')
       Year  score         C         D
    2  2001  third  0.795344 -0.022841
    ('2002', 'fourth')
       Year   score         C         D
    3  2002  fourth -0.710015  2.105533
    ('2002', 'second')
       Year   score         C         D
    1  2002  second  0.701961  1.064701
    5  2002  second  0.796509 -0.582946
    ('2002', 'third')
       Year  score         C         D
    7  2002  third  0.849467 -0.248125
    
    
    
  • 选择一个分组

    import pandas as pd
    import numpy as np
    
    df = pd.DataFrame({'Year' : ['2001', '2002', '2001', '2002',
                              '2001', '2002', '2001', '2002'],
                       'score' : ['primary', 'second', 'third', 'fourth',
                              'primary', 'second', 'fourth', 'third'],
                       'C' : np.random.randn(8),
                       'D' : np.random.randn(8)})
    
    
    grouped = df.groupby('Year')
    print (grouped.get_group('2001'))
    
       Year    score         C         D
    0  2001  primary  1.155751 -1.306120
    2  2001    third  0.029471 -0.231977
    4  2001  primary  0.553366  0.313114
    6  2001   fourth  0.698063 -1.472859
    
  • 聚合

    聚合函数为每个组返回单个聚合值。当创建了分组(group by)对象,就可以对分组数据执行多个聚合操作。

    比较常用的做法是通过 aggregate 方法或与之等效的 agg 方法进行聚合:

    import pandas as pd
    import numpy as np
    
    df = pd.DataFrame({'Year' : ['2001', '2002', '2001', '2002',
                              '2001', '2002', '2001', '2002'],
                       'score' : ['primary', 'second', 'third', 'fourth',
                              'primary', 'second', 'fourth', 'third'],
                       'C' : np.random.randn(8),
                       'D' : np.random.randn(8)})
    
    
    grouped = df.groupby('Year')
    print (grouped['C'].agg(np.mean))
    
    • 使用size聚合(统计每个分组的元素个数)

      import pandas as pd
      import numpy as np
      
      df = pd.DataFrame({'Year' : ['2001', '2002', '2001', '2002',
                                '2001', '2002', '2001', '2002'],
                         'score' : ['primary', 'second', 'third', 'fourth',
                                'primary', 'second', 'fourth', 'third'],
                         'C' : np.random.randn(8),
                         'D' : np.random.randn(8)})
      
      
      grouped = df.groupby('Year')
      print (grouped['C'].agg(np.size))
      
    • 同时应用多个聚合函数

      import pandas as pd
      import numpy as np
      
      df = pd.DataFrame({'Year' : ['2001', '2002', '2001', '2002',
                                '2001', '2002', '2001', '2002'],
                         'score' : ['primary', 'second', 'third', 'fourth',
                                'primary', 'second', 'fourth', 'third'],
                         'C' : np.random.randn(8),
                         'D' : np.random.randn(8)})
      
      
      grouped = df.groupby('Year')
      print (grouped['C'].agg([np.size,np.sum,np.mean]))
      
            size       sum      mean
      Year                          
      2001   4.0 -3.839661 -0.959915
      2002   4.0 -3.633480 -0.908370
      
      
  • 转换

    对分组或某一列做转换(transform)时,返回对象的索引大小与被分组对象的索引相同。因此,传给transform的函数应当返回与组块大小相同的结果。

    score = lambda x: (x - x.mean())
    

    实例

    import pandas as pd
    import numpy as np
    
    df = pd.DataFrame({'Year' : ['2001', '2002', '2001', '2002',
                              '2001', '2002', '2001', '2002'],
                       'score' : ['primary', 'second', 'third', 'fourth',
                              'primary', 'second', 'fourth', 'third'],
                       'C' : np.random.randn(8),
                       'D' : np.random.randn(8)})
    
    
    grouped = df.groupby('Year')
    score = lambda x: (x - x.mean())
    print(df)
    print(grouped['C'].agg(np.mean))
    print (grouped['C'].transform(score))
    
       Year    score         C         D
    0  2001  primary  1.455783  0.272765
    1  2002   second -0.891035  1.399249
    2  2001    third -0.335082  0.777773
    3  2002   fourth -0.938394 -0.614140
    4  2001  primary -0.770775  0.677035
    5  2002   second  1.561887  0.343912
    6  2001   fourth -0.426879 -0.551144
    7  2002    third  1.070326 -0.438464
    Year
    2001   -0.019239
    2002    0.200696
    Name: C, dtype: float64
    0    1.475021
    1   -1.091731
    2   -0.315844
    3   -1.139090
    4   -0.751537
    5    1.361191
    6   -0.407640
    7    0.869630
    Name: C, dtype: float64
    
            
            可以看到,transform的结果是每一行的C值减去其所在分组的均值。例如第0行属于2001组:1.455783 - (-0.019239) ≈ 1.475021(末位的差异来自打印时的四舍五入)
    

你可能感兴趣的:(pandas groupby教程)