pandas 和 numpy 库处理数据(2)

接上篇:对数据进行基本的统计分析(各统计量的含义不再解释,主要是均值、方差等的计算)

>>> df
   num  class   name    sex  english  sport  army  math  possity  space
0   10      1   mary  woman       80     80    90  75.0       60     65
1   28      1   land    man       80     50    69  70.0       58     70
2   15      2   asnx    man       80     69    80  75.0       90     94
3   18      4  david    man       90     80    86  85.0       95     62
4   19      2    gry  woman       90     50    64   NaN       64     85
5   20      2  kitty  woman       84     58    97  94.0       63     21
6   14      3   lury  woman       98     77    88   0.0       55     40
7   21      1   facy    man       55     68    94  52.0       36     48
>>> df['sport'].describe()
count     8.000000
mean     66.500000
std      12.535663
min      50.000000
25%      56.000000
50%      68.500000
75%      77.750000
max      80.000000
Name: sport, dtype: float64
>>> df.describe()
             num     class    english    ...           math    possity      space
count   8.000000  8.000000   8.000000    ...       7.000000   8.000000   8.000000
mean   18.125000  2.000000  82.125000    ...      64.428571  65.125000  60.625000
std     5.383507  1.069045  12.699128    ...      31.245571  19.067081  23.820384
min    10.000000  1.000000  55.000000    ...       0.000000  36.000000  21.000000
25%    14.750000  1.000000  80.000000    ...      61.000000  57.250000  46.000000
50%    18.500000  2.000000  82.000000    ...      75.000000  61.500000  63.500000
75%    20.250000  2.250000  90.000000    ...      80.000000  70.500000  73.750000
max    28.000000  4.000000  98.000000    ...      94.000000  95.000000  94.000000

[8 rows x 8 columns]
>>> df['english'].size
8
>>> df['english'].max()
98
>>> df['english'].min()
55
>>> df['english'].sum()
657
>>> df['english'].mean()
82.125
>>> df['english'].var()
161.26785714285714
>>> df['english'].std()
12.699128204048383
>>> np.mean(df['english'])
82.125
>>> np.average(df['english'])
82.125
>>> df.median(numeric_only=True)
num        18.5
class       2.0
english    82.0
sport      68.5
army       87.0
math       75.0
possity    61.5
space      63.5
dtype: float64
>>> df.mode()
   num  class   name    sex  english  sport  army  math  possity  space
0   10    1.0   asnx    man     80.0   50.0    64  75.0       36     21
1   14    2.0  david  woman      NaN   80.0    69   NaN       55     40
2   15    NaN   facy    NaN      NaN    NaN    80   NaN       58     48
3   18    NaN    gry    NaN      NaN    NaN    86   NaN       60     62
4   19    NaN  kitty    NaN      NaN    NaN    88   NaN       63     65
5   20    NaN   land    NaN      NaN    NaN    90   NaN       64     70
6   21    NaN   lury    NaN      NaN    NaN    94   NaN       90     85
7   28    NaN   mary    NaN      NaN    NaN    97   NaN       95     94

>>> df.mode()
   num  class   name    sex  english  sport  army  math  possity  space
0   10    1.0   asnx    man     80.0   50.0    64  75.0       36     21
1   14    2.0  david  woman      NaN   80.0    69   NaN       55     40
2   15    NaN   facy    NaN      NaN    NaN    80   NaN       58     48
3   18    NaN    gry    NaN      NaN    NaN    86   NaN       60     62
4   19    NaN  kitty    NaN      NaN    NaN    88   NaN       63     65
5   20    NaN   land    NaN      NaN    NaN    90   NaN       64     70
6   21    NaN   lury    NaN      NaN    NaN    94   NaN       90     85
7   28    NaN   mary    NaN      NaN    NaN    97   NaN       95     94
>>> df
   num  class   name    sex  english  sport  army  math  possity  space
0   10      1   mary  woman       80     80    90  75.0       60     65
1   28      1   land    man       80     50    69  70.0       58     70
2   15      2   asnx    man       80     69    80  75.0       90     94
3   18      4  david    man       90     80    86  85.0       95     62
4   19      2    gry  woman       90     50    64   NaN       64     85
5   20      2  kitty  woman       84     58    97  94.0       63     21
6   14      3   lury  woman       98     77    88   0.0       55     40
7   21      1   facy    man       55     68    94  52.0       36     48
>>> df.groupby('class')[['english','sport','army']].mean()
         english  sport       army
class                             
1      71.666667   66.0  84.333333
2      84.666667   59.0  80.333333
3      98.000000   77.0  88.000000
4      90.000000   80.0  86.000000
>>> df.groupby(['class','sex'])['english'].agg([('total','sum'),('number','size'),('mean','mean'),('var','var')])

             total  number  mean    var
class sex                              
1     man      135       2  67.5  312.5
      woman     80       1  80.0    NaN
2     man       80       1  80.0    NaN
      woman    174       2  87.0   18.0
3     woman     98       1  98.0    NaN
4     man       90       1  90.0    NaN
>>> #建立透视表
>>> df.pivot_table(index=['class','name'])
             army  english  math  num  possity  space  sport
class name                                                  
1     facy     94       55  52.0   21       36     48     68
      land     69       80  70.0   28       58     70     50
      mary     90       80  75.0   10       60     65     80
2     asnx     80       80  75.0   15       90     94     69
      gry      64       90   NaN   19       64     85     50
      kitty    97       84  94.0   20       63     21     58
3     lury     88       98   0.0   14       55     40     77
4     david    86       90  85.0   18       95     62     80
>>> df
   num  class   name    sex  english  sport  army  math  possity  space
0   10      1   mary  woman       80     80    90  75.0       60     65
1   28      1   land    man       80     50    69  70.0       58     70
2   15      2   asnx    man       80     69    80  75.0       90     94
3   18      4  david    man       90     80    86  85.0       95     62
4   19      2    gry  woman       90     50    64   NaN       64     85
5   20      2  kitty  woman       84     58    97  94.0       63     21
6   14      3   lury  woman       98     77    88   0.0       55     40
7   21      1   facy    man       55     68    94  52.0       36     48
>>> #相关系数
>>> df['english'].corr(df['sport'])
0.0785215353368861
>>> df['english'].corr(df['army'])
-0.28518424251841296
>>> df.loc[:,['english','sport','army','math','possity','space']].corr()
          english     sport      army      math   possity     space
english  1.000000  0.078522 -0.285184 -0.210888  0.486667  0.020484
sport    0.078522  1.000000  0.604026 -0.275197  0.239372 -0.140894
army    -0.285184  0.604026  1.000000 -0.010708 -0.191855 -0.744345
math    -0.210888 -0.275197 -0.010708  1.000000  0.449533  0.180691
possity  0.486667  0.239372 -0.191855  0.449533  1.000000  0.445185
space    0.020484 -0.140894 -0.744345  0.180691  0.445185  1.000000

你可能感兴趣的:(pandas 和numpy库处理数据(2))