xxx.sum()
xxx.mean()
xxx.max()
xxx.add()
等等等 方法
df1=DataFrame(
[
[3,2,np.nan],
[2,7,-5],
[5,np.nan,np.nan],
[7,6,4]
]
)
df1
0 1 2
0 3 2.0 NaN
1 2 7.0 -5.0
2 5 NaN NaN
3 7 6.0 4.0
df1.sum()
0 17.0
1 15.0
2 -1.0
dtype: float64
默认参数 axis=0
df1.sum(axis=0)
0 17.0
1 15.0
2 -1.0
dtype: float64
axis=0 还可以携程 axis=‘index’
df1.sum(axis='index')
0 17.0
1 15.0
2 -1.0
dtype: float64
df1.sum(axis='columns')
0 5.0
1 4.0
2 5.0
3 17.0
dtype: float64
print(df1)
df1.mean(skipna=True)
0 1 2
0 3 2.0 NaN
1 2 7.0 -5.0
2 5 NaN NaN
3 7 6.0 4.0
0 4.25
1 5.00
2 -0.50
dtype: float64
df1.mean(skipna=False)
0 4.25
1 NaN
2 NaN
dtype: float64
df1.fillna(0).mean(0)
0 4.25
1 3.75
2 -0.25
dtype: float64
df2=DataFrame(np.random.randint(1,10,(4,6)),columns=list('ABCDEF'))
df2
A B C D E F
0 9 9 3 8 8 1
1 8 4 1 9 3 4
2 8 1 2 7 5 8
3 6 5 6 4 4 9
df2.sum(axis=1)
0 38
1 29
2 31
3 34
dtype: int64
df2.cumsum(axis=1)
A B C D E F
0 9 18 21 29 37 38
1 8 12 13 22 25 29
2 8 9 11 18 23 31
3 6 11 17 21 25 34
print(df2)
df2.cummax(axis=1)
A B C D E F
0 9 9 3 8 8 1
1 8 4 1 9 3 4
2 8 1 2 7 5 8
3 6 5 6 4 4 9
A B C D E F
0 9 9 9 9 9 9
1 8 8 8 9 9 9
2 8 8 8 8 8 8
3 6 6 6 6 6 9
df3=DataFrame(
np.random.randint(1,100,[10,4]),
columns=list('ABCD'))
df3
A B C D
0 33 4 30 81
1 98 50 94 26
2 66 85 36 35
3 28 10 89 90
4 78 1 76 49
5 1 83 39 61
6 30 16 55 9
7 12 75 59 83
8 41 49 13 22
9 74 37 40 93
df3.describe()
A B C D
count 10.000000 10.000000 10.000000 10.000000
mean 46.100000 41.000000 53.100000 54.900000
std 31.388073 32.612199 26.518128 30.996236
min 1.000000 1.000000 13.000000 9.000000
25% 28.500000 11.500000 36.750000 28.250000
50% 37.000000 43.000000 47.500000 55.000000
75% 72.000000 68.750000 71.750000 82.500000
max 98.000000 85.000000 94.000000 93.000000
df3.A.describe()
count 10.000000
mean 46.100000
std 31.388073
min 1.000000
25% 28.500000
50% 37.000000
75% 72.000000
max 98.000000
Name: A, dtype: float64
s1 = Series(
[np.random.choice('西瓜,橘子,苹果,木瓜,双星,77'.split(','))for i in range(60)]
)
s1
0 苹果
1 77
2 木瓜
3 双星
4 西瓜
5 木瓜
.....
55 77
56 苹果
57 橘子
58 苹果
59 橘子
dtype: object
s1.describe()
count 60
unique 6
top 双星
freq 13
dtype: object
s1.unique()
array(['苹果', '77', '木瓜', '双星', '西瓜', '橘子'], dtype=object)
df2
A B C D E F
0 9 9 3 8 8 1
1 8 4 1 9 3 4
2 8 1 2 7 5 8
3 6 5 6 4 4 9
df2.apply(Series.unique)
A [9, 8, 6]
B [9, 4, 1, 5]
C [3, 1, 2, 6]
D [8, 9, 7, 4]
E [8, 3, 5, 4]
F [1, 4, 8, 9]
dtype: object
df2.apply(Series.unique,axis=1)
0 [9, 3, 8, 1]
1 [8, 4, 1, 9, 3]
2 [8, 1, 2, 7, 5]
3 [6, 5, 4, 9]
dtype: object
s1.value_counts()
双星 13
西瓜 12
橘子 10
苹果 10
77 8
木瓜 7
dtype: int64
s1.value_counts(normalize=True)
双星 0.216667
西瓜 0.200000
橘子 0.166667
苹果 0.166667
77 0.133333
木瓜 0.116667
dtype: float64
s1.value_counts().plot(kind='bar')
参数:
s1.value_counts()
双星 13
西瓜 12
橘子 10
苹果 10
77 8
木瓜 7
dtype: int64
s1.value_counts(normalize=True)
双星 0.216667
西瓜 0.200000
橘子 0.166667
苹果 0.166667
77 0.133333
木瓜 0.116667
dtype: float64
s1.value_counts(sort=False)
木瓜 7
77 8
西瓜 12
双星 13
苹果 10
橘子 10
dtype: int64
ages= Series(np.random.randint(1,101,100))
ages
0 20
1 9
2 11
3 65
4 11
..
95 67
96 15
97 24
98 35
99 64
Length: 100, dtype: int32
ages.value_counts(bins=4)
(51.0, 75.5] 29
(1.901, 26.5] 26
(26.5, 51.0] 25
(75.5, 100.0] 20
dtype: int64
ages.value_counts(bins=[1,18,40,60,100])
(60.0, 100.0] 40
(40.0, 60.0] 20
(18.0, 40.0] 20
(0.999, 18.0] 20
dtype: int64
ages.value_counts(normalize=True,bins=[18,40,60])
(17.999, 40.0] 0.21
(40.0, 60.0] 0.20
dtype: float64
ages.value_counts(normalize=True,bins=[18,40,60]).apply(lambda x :f'{x*100}%')
(17.999, 40.0] 21.0%
(40.0, 60.0] 20.0%
dtype: object
pd.value_counts(ages)
4 4
27 4
20 3
74 3
6 3
..
58 1
99 1
52 1
51 1
2 1
Length: 65, dtype: int64
pd.value_counts(s1)
双星 13
西瓜 12
橘子 10
苹果 10
77 8
木瓜 7
dtype: int64
df4= DataFrame(
np.random.randint(1,6,(4,6)),
index=list('ABCD'),
columns=list('甲乙丙丁戊已')
)
df4
甲 乙 丙 丁 戊 已
A 1 5 2 4 2 3
B 5 1 1 2 3 4
C 2 1 3 5 2 4
D 1 5 1 3 4 3
df4.value_counts()
甲 乙 丙 丁 戊 已
5 1 1 2 3 4 1
2 1 3 5 2 4 1
1 5 2 4 2 3 1
1 3 4 3 1
dtype: int64
df4.apply(pd.value_counts,axis=0)
甲 乙 丙 丁 戊 已
1 2.0 2.0 2.0 NaN NaN NaN
2 1.0 NaN 1.0 1.0 2.0 NaN
3 NaN NaN 1.0 1.0 1.0 2.0
4 NaN NaN NaN 1.0 1.0 2.0
5 1.0 2.0 NaN 1.0 NaN NaN
df4.apply(pd.value_counts,axis=1)
1 2 3 4 5
A 1.0 2.0 1.0 1.0 1.0
B 2.0 1.0 1.0 1.0 1.0
C 1.0 2.0 1.0 1.0 1.0
D 2.0 NaN 2.0 1.0 1.0
s2=Series(np.random.randint(1,6,10))
s2
0 1
1 1
2 1
3 3
4 3
5 1
6 1
7 5
8 5
9 5
dtype: int32
condition=s2.isin([2,4])
condition
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
dtype: bool
s2[condition]
Series([], dtype: int32)
s2.isin([2,4]).apply(lambda x:not x)
0 True
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
dtype: bool
s2[s2.isin([2,4]).apply(lambda x:not x)]
0 1
1 1
2 1
3 3
4 3
5 1
6 1
7 5
8 5
9 5
dtype: int32
~s2.isin([2,4])
0 True
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
dtype: bool
s2.isin([2,4])^True
0 True
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
dtype: bool
dropna(axis=0,how =‘any’,inplace=False) 默认只要有一个nan就 丢弃数据
how=‘all’ 表示 整行或者整列全部都是nan才丢掉
fillna() 填充值
isnull() 返回 是 nan 的布尔序列
notnull() 返回不是nan 的布尔序列
df1
0 1 2
0 3 2.0 NaN
1 2 7.0 -5.0
2 5 NaN NaN
3 7 6.0 4.0
df1.dropna()
0 1 2
1 2 7.0 -5.0
3 7 6.0 4.0
df1.dropna(axis=1)
0
0 3
1 2
2 5
3 7
df1.dropna(how='all')
0 1 2
0 3 2.0 NaN
1 2 7.0 -5.0
2 5 NaN NaN
3 7 6.0 4.0
df1[1]
0 2.0
1 7.0
2 NaN
3 6.0
Name: 1, dtype: float64
df1[1].isnull()
0 False
1 False
2 True
3 False
Name: 1, dtype: bool
df1[1].notnull()
0 True
1 True
2 False
3 True
Name: 1, dtype: bool
df1[1].fillna(0)
0 2.0
1 7.0
2 0.0
3 6.0
Name: 1, dtype: float64