# 5. 日月光华 Python数据分析-Pandas-汇总统计方法与应用函数

import numpy as np
import pandas as pd

# Build a 9x6 frame of standard-normal samples with columns a..f.
# The generator is not seeded, so the sample outputs quoted in the comments
# below will differ from run to run.
data = pd.DataFrame(
    np.random.randn(9, 6),
    columns=['a', 'b', 'c', 'd', 'e', 'f'],
)

data.tail(5)   # show the last 5 rows (5 is also the default)
#   a   b   c   d   e   f
# 4 -0.783951   1.060859    0.199606    -0.209257   0.197639    -0.114448
# 5 1.421739    -0.646007   0.742340    -1.510422   -0.219927   0.225591
# 6 -0.524613   -0.058266   0.170682    0.293186    -1.505537   0.625299
# 7 -0.688985   -0.083023   -0.274630   -0.455460   -1.323609   -0.212020
# 8 0.396852    -0.086073   -0.374000   1.958154    -0.920253   0.530094

# Print a structural summary: index range, per-column non-null counts, dtypes.
data.info()
# (the first line of the printed summary, the class name, is missing from
#  this pasted sample)
# RangeIndex: 9 entries, 0 to 8
# Data columns (total 6 columns):
# a    9 non-null float64
# b    9 non-null float64
# c    9 non-null float64
# d    9 non-null float64
# e    9 non-null float64
# f    9 non-null float64
# dtypes: float64(6)
# memory usage: 512.0 bytes

# Per-column descriptive statistics (count, mean, std, min, quartiles, max).
# NOTE(review): the sample below reports count 8 for column c while the
# info() sample above reports 9 non-null values, and its max for column a
# differs from the tail() sample -- the pasted outputs appear to come from
# different runs.
data.describe()
#   a   b   c   d   e   f
# count 9.000000    9.000000    8.000000    9.000000    9.000000    9.000000
# mean  -0.018660   -0.262793   0.368564    0.117029    0.248969    0.235428
# std   1.221575    1.026121    0.764823    0.879644    0.685090    0.991456
# min   -2.142549   -1.641552   -0.647248   -1.192345   -1.313795   -1.445769
# 25%   -1.041688   -1.038682   -0.146752   -0.312632   0.133815    -0.521155
# 50%   0.447488    -0.242379   0.393846    0.190798    0.486768    0.357102
# 75%   0.567235    0.632706    0.837352    0.822798    0.623940    0.496832
# max   1.403796    1.162124    1.527134    1.394901    1.024189    1.614408

data.sum(axis=0)   # axis 0: one total per column
# a   -0.167941
# b   -2.365137
# c    2.948514
# d    1.053265
# e    2.240720
# f    2.118855
# dtype: float64

# Look up the value of column a at the index label where a is largest.
data.loc[data['a'].idxmax(), 'a']
# 1.4037960380489185
# Rebind `data` to a 5x7 frame of random integers in 1..9 (the upper bound
# passed to randint is exclusive). The generator is not seeded.
# NOTE(review): the sample outputs quoted below were pasted from an earlier
# run and do not all correspond to the frame printed here (e.g. the
# per-column range of column 0 below would be 6 for this frame, not 7).
data = pd.DataFrame(np.random.randint(1, 10, size=(5, 7)))
data
#   0   1   2   3   4   5   6
# 0 5   3   6   7   2   1   8
# 1 7   6   9   8   4   9   4
# 2 1   9   7   6   3   9   1
# 3 2   3   6   8   9   6   2
# 4 6   6   3   1   3   7   6

np.unique(data)  # sorted distinct values over the whole frame
# array([1, 2, 3, 4, 5, 6, 7, 8, 9])

data.iloc[2].unique()   # distinct values of the row at position 2 (zero-based)
# array([1, 9, 7, 6, 3])

data.iloc[:, -2].value_counts()    # occurrences of each value in the second-to-last column
# 9    2
# 1    1
# 6    1
# 7    1
# Name: 5, dtype: int64

s = pd.Series(['a', 'b', 'b', 'b', 'b', 'a', 'c'])
s.value_counts()   # occurrences of each distinct string
# b    4
# a    2
# c    1
# dtype: int64

s[s.isin(['a', 'c'])]   # keep only the elements equal to 'a' or 'c' (original index retained)
# 0    a
# 5    a
# 6    c
# dtype: object

data.apply(lambda col: col.max() - col.min(), axis=0)    # axis=0: the function receives each column
# 0    7
# 1    6
# 2    5
# 3    8
# 4    3
# 5    2
# 6    7
# dtype: int64

# Element-wise x**2 + x + 3 for every cell. Written as vectorized arithmetic
# instead of DataFrame.applymap(lambda x: x**2 + x + 3): applymap is
# deprecated since pandas 2.1 (renamed DataFrame.map), while this form gives
# the same values on every pandas version.
data ** 2 + data + 3

# Rebind `data` to a fresh 9x6 frame of standard-normal samples (unseeded,
# so the sample output quoted below varies per run).
data = pd.DataFrame(np.random.randn(9, 6), columns=list('abcdef'))
data.a.apply(lambda x: x + 10)   # Series.apply: the function receives each value of column a
# 0     9.801785
# 1     9.736178
# 2     9.717085
# 3    10.699237
# 4    11.847702
# 5    10.039552
# 6    10.537041
# 7     9.298466
# 8     9.151704
# Name: a, dtype: float64

data
# (output was a screenshot placeholder, "image.png", in the source text)

# Element-wise x**2 + x + 3 for every cell. Written as vectorized arithmetic
# instead of DataFrame.applymap, which is deprecated since pandas 2.1
# (renamed DataFrame.map). Must run before the string column g is added,
# because the arithmetic is only defined for the numeric columns.
data ** 2 + data + 3
# (output was a screenshot placeholder, "image.png", in the source text)

# Add a string column; the list length (9) matches the number of rows.
data['g'] = ['ssd', 'ddff', 'zsd', 'sdf', 'bfff', 'xxf', 'zxc', 'sadff', 'sdfsdd']
data
# (output was a screenshot placeholder, "image.png", in the source text)

# Title-case every string in g using the vectorized .str accessor
# (same result as applying str.title to each element).
data['g'] = data['g'].str.title()
data
# (output was a screenshot placeholder, "image.png", in the source text)

# 你可能感兴趣的:(5. 日月光华 Python数据分析-Pandas-汇总统计方法与应用函数)