'''
【课程2.14】 数值计算和统计基础
常用数学、统计方法
'''
import pandas as pd
import numpy as np
df=pd.DataFrame({
'key1':[4,5,3,np.nan,2],
'key2':[1,2,np.nan,3,5],
'key3':[1,2,3,'h','k']},
index=list('abcde')
)
print(df)
key1 key2 key3
a 4.0 1.0 1
b 5.0 2.0 2
c 3.0 NaN 3
d NaN 3.0 h
e 2.0 5.0 k
print(df['key1'])
a 4.0
b 5.0
c 3.0
d NaN
e 2.0
Name: key1, dtype: float64
m1=df.mean()
print(m1)
print(df['key2'].mean())
key1 3.50
key2 2.75
dtype: float64
2.75
m2=df.mean(axis=1)
print(m2)
a 2.5
b 3.5
c 3.0
d 3.0
e 3.5
dtype: float64
m3=df.mean(skipna=False)
print(m3)
m4=df.mean(axis=1,skipna=False)
print(m4)
key1 NaN
key2 NaN
dtype: float64
a 2.5
b 3.5
c NaN
d NaN
e 3.5
dtype: float64
df=pd.DataFrame({
'key1':np.arange(10),
'key2':np.random.rand(10)*10,
'key3':[1,2,3,4,5,6,8,9,7,np.nan]
})
print(df)
key1 key2 key3
0 0 7.414184 1.0
1 1 1.086788 2.0
2 2 3.489052 3.0
3 3 0.204554 4.0
4 4 8.666828 5.0
5 5 8.846589 6.0
6 6 5.601258 8.0
7 7 4.295757 9.0
8 8 0.190201 7.0
9 9 0.579211 NaN
print(df.count())
key1 10
key2 10
key3 9
dtype: int64
print(df.min())
key1 0.000000
key2 0.190201
key3 1.000000
dtype: float64
print(df.max())
key1 9.000000
key2 8.846589
key3 9.000000
dtype: float64
print(df.quantile(q=0.5))
key1 4.500000
key2 3.892404
key3 NaN
dtype: float64
F:\software\Anaconda3\lib\site-packages\numpy\lib\function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
RuntimeWarning)
print(df.sum())
key1 45.000000
key2 40.374422
key3 45.000000
dtype: float64
print(df.median())
key1 4.500000
key2 3.892404
key3 5.000000
dtype: float64
print(df.std())
key1 3.027650
key2 3.479024
key3 2.738613
dtype: float64
print(df.var())
key1 9.166667
key2 12.103610
key3 7.500000
dtype: float64
print(df.skew(),'→ skew样本的偏度\n')
print(df.kurt(),'→ kurt样本的峰度\n')
key1 0.000000
key2 0.238528
key3 0.000000
dtype: float64 → skew样本的偏度
key1 -1.200000
key2 -1.681431
key3 -1.200000
dtype: float64 → kurt样本的峰度
df['key1_sum']=df['key1'].cumsum()
df['key2_sum']=df['key2'].cumsum()
df['key3_sum']=df['key3'].cumsum()
print(df)
key1 key2 key3 key1_sum key2_sum key3_sum
0 0 7.414184 1.0 0 7.414184 1.0
1 1 1.086788 2.0 1 8.500972 3.0
2 2 3.489052 3.0 3 11.990024 6.0
3 3 0.204554 4.0 6 12.194578 10.0
4 4 8.666828 5.0 10 20.861406 15.0
5 5 8.846589 6.0 15 29.707995 21.0
6 6 5.601258 8.0 21 35.309254 29.0
7 7 4.295757 9.0 28 39.605010 38.0
8 8 0.190201 7.0 36 39.795211 45.0
9 9 0.579211 NaN 45 40.374422 NaN
df['key1_prod']=df['key1'].cumprod()
df['key2_prod']=df['key2'].cumprod()
df['key3_prod']=df['key3'].cumprod()
print(df)
key1 key2 key3 key1_sum key2_sum key3_sum key1_prod \
0 0 7.414184 1.0 0 7.414184 1.0 0
1 1 1.086788 2.0 1 8.500972 3.0 0
2 2 3.489052 3.0 3 11.990024 6.0 0
3 3 0.204554 4.0 6 12.194578 10.0 0
4 4 8.666828 5.0 10 20.861406 15.0 0
5 5 8.846589 6.0 15 29.707995 21.0 0
6 6 5.601258 8.0 21 35.309254 29.0 0
7 7 4.295757 9.0 28 39.605010 38.0 0
8 8 0.190201 7.0 36 39.795211 45.0 0
9 9 0.579211 NaN 45 40.374422 NaN 0
key2_prod key3_prod
0 7.414184 1.0
1 8.057643 2.0
2 28.113540 6.0
3 5.750740 24.0
4 49.840677 120.0
5 440.919971 720.0
6 2469.706722 5760.0
7 10609.258712 51840.0
8 2017.889816 362880.0
9 1168.783411 NaN
s=pd.Series(list('ffccdbaa'))
print(s)
0 f
1 f
2 c
3 c
4 d
5 b
6 a
7 a
dtype: object
print(s.unique())
sq=s.unique()
print(pd.Series(sq))
['f' 'c' 'd' 'b' 'a']
0 f
1 c
2 d
3 b
4 a
dtype: object
sq.sort()
print(sq)
['a' 'b' 'c' 'd' 'f']
print(s.value_counts(sort=False))
print(pd.value_counts(s))
b 1
d 1
c 2
f 2
a 2
dtype: int64
a 2
f 2
c 2
d 1
b 1
dtype: int64
s=pd.Series(np.arange(10,15))
df=pd.DataFrame({
'key1':list('abcdefghi1'),
'key2':np.arange(4,14)
})
print(s)
print(df)
0 10
1 11
2 12
3 13
4 14
dtype: int32
key1 key2
0 a 4
1 b 5
2 c 6
3 d 7
4 e 8
5 f 9
6 g 10
7 h 11
8 i 12
9 1 13
print(s.isin([5,13,14]))
0 False
1 False
2 False
3 True
4 True
dtype: bool
print(df.isin(['a','bc',10,8,1]))
key1 key2
0 True False
1 False False
2 False False
3 False False
4 False True
5 False False
6 False True
7 False False
8 False False
9 False False
'''
【课程2.15】 文本数据
Pandas针对字符串配备的一套方法,使其易于对数组的每个元素进行操作
'''
s=pd.Series(['A','b','C','bbhello','123',np.nan,'hj'])
df = pd.DataFrame({
'key1':list('abcdef'),
'key2':['hee','fv','w','hija','123',np.nan]})
print(s)
print(df)
0 A
1 b
2 C
3 bbhello
4 123
5 NaN
6 hj
dtype: object
key1 key2
0 a hee
1 b fv
2 c w
3 d hija
4 e 123
5 f NaN
print(s.str.count('b'))
0 0.0
1 1.0
2 0.0
3 2.0
4 0.0
5 NaN
6 0.0
dtype: float64
print(df['key2'].str.upper())
0 HEE
1 FV
2 W
3 HIJA
4 123
5 NaN
Name: key2, dtype: object
df.columns = df.columns.str.upper()
print(df)
KEY1 KEY2
0 a hee
1 b fv
2 c w
3 d hija
4 e 123
5 f NaN
s=pd.Series(['A','b','AAbb','123',np.nan])
print(s)
0 A
1 b
2 AAbb
3 123
4 NaN
dtype: object
print(s.str.lower())
0 a
1 b
2 aabb
3 123
4 NaN
dtype: object
print(s.str.upper())
0 A
1 B
2 AABB
3 123
4 NaN
dtype: object
print(s.str.len())
0 1.0
1 1.0
2 4.0
3 3.0
4 NaN
dtype: float64
print(s.str.startswith('b'))
0 False
1 True
2 False
3 False
4 NaN
dtype: object
print(s.str.endswith('3'))
0 False
1 False
2 False
3 True
4 NaN
dtype: object
s=pd.Series([' jack', 'jill ', ' jesse ', 'frank'])
df = pd.DataFrame(np.random.randn(3, 2), columns=[' Column A ', ' Column B '],
index=range(3))
print(s)
print(df)
0 jack
1 jill
2 jesse
3 frank
dtype: object
Column A Column B
0 -0.510577 -1.234627
1 1.338442 0.466668
2 -0.776209 -0.036641
print(s.str.strip())
print(s.str.lstrip())
print(s.str.rstrip())
0 jack
1 jill
2 jesse
3 frank
dtype: object
0 jack
1 jill
2 jesse
3 frank
dtype: object
0 jack
1 jill
2 jesse
3 frank
dtype: object
df.columns=df.columns.str.strip()
print(df)
Column A Column B
0 -0.510577 -1.234627
1 1.338442 0.466668
2 -0.776209 -0.036641
df=pd.DataFrame(np.random.randn(3,2),
index=list('abc'), columns=[' Column A ', ' Column B '])
print(df)
Column A Column B
a -1.063125 -1.096488
b 0.434409 -1.770704
c -1.728158 0.045788
df.columns=df.columns.str.replace(' ','--')
print(df)
--Column--A-- --Column--B--
a -1.063125 -1.096488
b 0.434409 -1.770704
c -1.728158 0.045788
df.columns=df.columns.str.replace('--','aaa',n=1)
print(df)
aaaColumn--A-- aaaColumn--B--
a -1.063125 -1.096488
b 0.434409 -1.770704
c -1.728158 0.045788
s=pd.Series(['a,b,c','1,2,3',['a,,,c'],np.nan])
print(s)
0 a,b,c
1 1,2,3
2 [a,,,c]
3 NaN
dtype: object
print(s.str.split(','))
0 [a, b, c]
1 [1, 2, 3]
2 NaN
3 NaN
dtype: object
print(s.str.split(',')[0])
print(s.str.split(',')[1])
['a', 'b', 'c']
['1', '2', '3']
print(s.str.split(',', expand=True))
print(s.str.split(',', expand=True, n = 1))
print(s.str.rsplit(',', expand=True, n = 1))
0 1 2
0 a b c
1 1 2 3
2 NaN None None
3 NaN None None
0 1
0 a b,c
1 1 2,3
2 NaN None
3 NaN None
0 1
0 a,b c
1 1,2 3
2 NaN None
3 NaN None
-----
df = pd.DataFrame({
'key1':['a,b,c','1,2,3',[':,., ']],
'key2':['a-b-c','1-2-3',[':-.- ']]})
print(df['key2'].str.split('-'))
0 [a, b, c]
1 [1, 2, 3]
2 NaN
Name: key2, dtype: object
s = pd.Series(['A','b','C','bbhello','123',np.nan,'hj'])
df = pd.DataFrame({
'key1':list('abcdef'),
'key2':['hee','fv','w','hija','123',np.nan]})
print(s)
print(df)
0 A
1 b
2 C
3 bbhello
4 123
5 NaN
6 hj
dtype: object
key1 key2
0 a hee
1 b fv
2 c w
3 d hija
4 e 123
5 f NaN
print(s.str[0])
0 A
1 b
2 C
3 b
4 1
5 NaN
6 h
dtype: object
print(s.str[:2])
0 A
1 b
2 C
3 bb
4 12
5 NaN
6 hj
dtype: object
print(df['key2'].str[0])
0 h
1 f
2 w
3 h
4 1
5 NaN
Name: key2, dtype: object
'''
【课程2.16】 合并 merge、join
Pandas具有全功能的,高性能内存中连接操作,与SQL等关系数据库非常相似
pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
left_index=False, right_index=False, sort=True,
suffixes=('_x', '_y'), copy=True, indicator=False)
'''
df1 = pd.DataFrame({
'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
df2 = pd.DataFrame({
'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
df3 = pd.DataFrame({
'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
df4 = pd.DataFrame({
'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
print(pd.merge(df1,df2,on='key'))
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K2 C2 D2
3 A3 B3 K3 C3 D3
print(pd.merge(df3,df4,on=['key1','key2']))
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
print(pd.merge(df3,df4,on=['key1','key2'],how='inner'))
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
print(pd.merge(df3,df4,on=['key1','key2'],how='outer'))
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
5 NaN NaN K2 K0 C3 D3
print(pd.merge(df3, df4, on=['key1','key2'], how = 'left'))
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
------
print(pd.merge(df3,df4,on=['key1','key2'],how='right'))
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
3 NaN NaN K2 K0 C3 D3
df1 = pd.DataFrame({
'lkey':list('bbacaab'),
'data1':range(7)})
df2 = pd.DataFrame({
'rkey':list('abd'),
'date2':range(3)})
print(pd.merge(df1,df2,left_on='lkey',right_on='rkey'))
data1 lkey date2 rkey
0 0 b 1 b
1 1 b 1 b
2 6 b 1 b
3 2 a 0 a
4 4 a 0 a
5 5 a 0 a
df1 = pd.DataFrame({
'key':list('abcdfeg'),
'data1':range(7)})
df2 = pd.DataFrame({
'date2':range(100,105)},
index = list('abcde'))
print(pd.merge(df1,df2,left_on='key',right_index=True))
data1 key date2
0 0 a 100
1 1 b 101
2 2 c 102
3 3 d 103
5 5 e 104
df1 = pd.DataFrame({
'key':list('bbacaab'),
'data1':[1,3,2,4,5,9,7]})
df2 = pd.DataFrame({
'key':list('abd'),
'date2':[11,2,33]})
x1 = pd.merge(df1,df2, on = 'key', how = 'outer')
x2 = pd.merge(df1,df2, on = 'key', sort=True, how = 'outer')
print(x1)
print(x2)
data1 key date2
0 1.0 b 2.0
1 3.0 b 2.0
2 7.0 b 2.0
3 2.0 a 11.0
4 5.0 a 11.0
5 9.0 a 11.0
6 4.0 c NaN
7 NaN d 33.0
data1 key date2
0 2.0 a 11.0
1 5.0 a 11.0
2 9.0 a 11.0
3 1.0 b 2.0
4 3.0 b 2.0
5 7.0 b 2.0
6 4.0 c NaN
7 NaN d 33.0
print(x2.sort_values('data1'))
data1 key date2
3 1.0 b 2.0
0 2.0 a 11.0
4 3.0 b 2.0
6 4.0 c NaN
1 5.0 a 11.0
5 7.0 b 2.0
2 9.0 a 11.0
7 NaN d 33.0
left = pd.DataFrame({
'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({
'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
print(left)
print(right)
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
C D
K0 C0 D0
K2 C2 D2
K3 C3 D3
print(left.join(right))
print(left.join(right, how='outer'))
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
df1 = pd.DataFrame({
'key':list('bbacaab'),
'data1':[1,3,2,4,5,9,7]})
df2 = pd.DataFrame({
'key':list('abd'),
'date2':[11,2,33]})
print(df1)
print(df2)
data1 key
0 1 b
1 3 b
2 2 a
3 4 c
4 5 a
5 9 a
6 7 b
date2 key
0 11 a
1 2 b
2 33 d
print(pd.merge(df1, df2, left_index=True, right_index=True, suffixes=('_1', '_2')))
print(df1.join(df2['date2']))
print('-----')
data1 key_1 date2 key_2
0 1 b 11 a
1 3 b 2 b
2 2 a 33 d
data1 key date2
0 1 b 11.0
1 3 b 2.0
2 2 a 33.0
3 4 c NaN
4 5 a NaN
5 9 a NaN
6 7 b NaN
-----
left = pd.DataFrame({
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'key': ['K0', 'K1', 'K0', 'K1']})
right = pd.DataFrame({
'C': ['C0', 'C1'],
'D': ['D0', 'D1']},
index=['K0', 'K1'])
print(left)
print(right)
print(left.join(right, on = 'key'))
A B key
0 A0 B0 K0
1 A1 B1 K1
2 A2 B2 K0
3 A3 B3 K1
C D
K0 C0 D0
K1 C1 D1
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K0 C0 D0
3 A3 B3 K1 C1 D1
'''
【课程2.17】 连接与修补 concat、combine_first
连接 - 沿轴执行连接操作
pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
keys=None, levels=None, names=None, verify_integrity=False,
copy=True)
'''
import pandas as pd
import numpy as np
s1=pd.Series([1,2,3])
s2=pd.Series([2,3,4])
print(pd.concat([s1,s2]))
0 1
1 2
2 3
0 2
1 3
2 4
dtype: int64
s3 = pd.Series([1,2,3],index = ['a','c','h'])
s4 = pd.Series([2,3,4],index = ['b','e','d'])
print(pd.concat([s3,s4]).sort_index())
a 1
b 2
c 2
d 4
e 3
h 3
dtype: int64
s5 = pd.Series([1,2,3],index = ['a','b','c'])
s6 = pd.Series([2,3,4],index = ['b','c','d'])
print(pd.concat([s5,s6],axis=1))
0 1
a 1.0 NaN
b 2.0 2.0
c 3.0 3.0
d NaN 4.0
print([pd.concat([s5,s6],axis=1,join='inner')])
[ 0 1
b 2 2
c 3 3]
print(pd.concat([s5,s6], axis= 1, join_axes=[['a','b','d']]))
0 1
a 1.0 NaN
b 2.0 2.0
d NaN 4.0
sre=pd.concat([s5,s6],keys=['one','two'])
print(sre)
one a 1
b 2
c 3
two b 2
c 3
d 4
dtype: int64
sre = pd.concat([s5,s6], axis=1, keys = ['one','two'])
print(sre,type(sre))
one two
a 1.0 NaN
b 2.0 2.0
c 3.0 3.0
d NaN 4.0
df1 = pd.DataFrame([[np.nan, 3., 5.], [-4.6, np.nan, np.nan],[np.nan, 7., np.nan]])
df2 = pd.DataFrame([[-42.6, np.nan, -8.2], [-5., 1.6, 4]],index=[1, 2])
print(df1)
print(df2)
0 1 2
0 NaN 3.0 5.0
1 -4.6 NaN NaN
2 NaN 7.0 NaN
0 1 2
1 -42.6 NaN -8.2
2 -5.0 1.6 4.0
print(df1.combine_first(df2))
0 1 2
0 NaN 3.0 5.0
1 -4.6 NaN -8.2
2 -5.0 7.0 4.0
df1.update(df2)
print(df1)
0 1 2
0 NaN 3.0 5.0
1 -42.6 NaN -8.2
2 -5.0 1.6 4.0
'''
【课程2.18】 去重及替换
.duplicated / .replace
'''
s = pd.Series([1,1,1,1,2,2,2,3,4,5,5,5,5])
print(s)
0 1
1 1
2 1
3 1
4 2
5 2
6 2
7 3
8 4
9 5
10 5
11 5
12 5
dtype: int64
print(s.duplicated())
0 False
1 True
2 True
3 True
4 False
5 True
6 True
7 False
8 False
9 False
10 True
11 True
12 True
dtype: bool
print(s[s.duplicated()==False])
0 1
4 2
7 3
8 4
9 5
dtype: int64
s_re = s.drop_duplicates()
print(s_re)
0 1
4 2
7 3
8 4
9 5
dtype: int64
df = pd.DataFrame({
'key1':['a','a',3,4,5],
'key2':['a','a','b','b','c']})
print(df)
key1 key2
0 a a
1 a a
2 3 b
3 4 b
4 5 c
print(df.duplicated())
0 False
1 True
2 False
3 False
4 False
dtype: bool
print(df.duplicated())
print(df['key2'].duplicated())
0 False
1 True
2 False
3 False
4 False
dtype: bool
0 False
1 True
2 False
3 True
4 False
Name: key2, dtype: bool
s = pd.Series(list('ascaazsd'))
print(s)
0 a
1 s
2 c
3 a
4 a
5 z
6 s
7 d
dtype: object
print(s.replace('a',np.nan))
0 NaN
1 s
2 c
3 NaN
4 NaN
5 z
6 s
7 d
dtype: object
print(s.replace(['a','s'],np.nan))
0 NaN
1 NaN
2 c
3 NaN
4 NaN
5 z
6 NaN
7 d
dtype: object
print(s.replace({
'a':'hello world','s':'aaaaaaaaa'}))
0 hello world
1 aaaaaaaaa
2 c
3 hello world
4 hello world
5 z
6 aaaaaaaaa
7 d
dtype: object
'''
【课程2.19】 数据分组
分组统计 - groupby功能
① 根据某些条件将数据拆分成组
② 对每个组独立应用函数
③ 将结果合并到一个数据结构中
Dataframe在行(axis=0)或列(axis=1)上进行分组,将一个函数应用到各个分组并产生一个新值,然后函数执行结果被合并到最终的结果对象中。
df.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, **kwargs)
'''
df = pd.DataFrame({
'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
print(df)
A B C D
0 foo one -1.420779 2.205477
1 bar one 0.514083 1.440050
2 foo two -0.265449 -1.354243
3 bar three -0.011561 -0.482113
4 foo two 2.030139 0.009457
5 bar two -0.106477 -0.014610
6 foo one 0.938704 0.228842
7 foo three 0.194304 2.537552
print(df.groupby('A'))
print(df.groupby('A').mean())
C D
A
bar 0.132015 0.314442
foo 0.295384 0.725417
print(df.groupby(['A','B']).mean())
C D
A B
bar one 0.514083 1.440050
three -0.011561 -0.482113
two -0.106477 -0.014610
foo one -0.241037 1.217159
three 0.194304 2.537552
two 0.882345 -0.672393
print(df.groupby(['A'])['D'].mean())
A
bar 0.314442
foo 0.725417
Name: D, dtype: float64
df = pd.DataFrame({
'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
print(df)
X Y
0 A 1
1 B 4
2 A 3
3 B 2
print(df.groupby(['X']))
print(list(df.groupby(['X'])))
[('A', X Y
0 A 1
2 A 3), ('B', X Y
1 B 4
3 B 2)]
print(list(df.groupby('X'))[0])
('A', X Y
0 A 1
2 A 3)
for n,g in df.groupby('X'):
print(n)
print(g)
print('###')
print('-----')
A
X Y
0 A 1
2 A 3
###
B
X Y
1 B 4
3 B 2
###
-----
print(df.groupby(['X']).get_group('A'))
X Y
0 A 1
2 A 3
print(df.groupby(['X']).get_group('B'))
X Y
1 B 4
3 B 2
grouped = df.groupby(['X'])
print(grouped.groups)
print(grouped.groups['A'])
{'A': [0, 2], 'B': [1, 3]}
[0, 2]
sz = grouped.size()
print(sz,type(sz))
X
A 2
B 2
dtype: int64
df = pd.DataFrame({
'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
print(df)
A B C D
0 foo one 0.628069 2.369622
1 bar one 0.624670 -1.729496
2 foo two 1.748096 -0.674956
3 bar three -0.154640 -0.436490
4 foo two -0.604173 -0.081001
5 bar two 0.311972 -2.065370
6 foo one 0.723137 -0.125561
7 foo three 0.988974 -1.429884
group=df.groupby(['A','B']).groups
print(grouped)
print(group[('foo', 'three')])
[7]
df = pd.DataFrame({
'data1':np.random.rand(2),
'data2':np.random.rand(2),
'key1':['a','b'],
'key2':['one','two']})
print(df)
data1 data2 key1 key2
0 0.762760 0.661971 a one
1 0.785907 0.058554 b two
for n,p in df.groupby(df.dtypes, axis=1):
print(n)
print(p)
print('##')
float64
data1 data2
0 0.762760 0.661971
1 0.785907 0.058554
##
object
key1 key2
0 a one
1 b two
##
df = pd.DataFrame(np.arange(16).reshape(4,4),
columns = ['a','b','c','d'])
print(df)
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
mapping = {
'a':'one','b':'one','c':'two','d':'two','e':'three'}
by_column=df.groupby(mapping,axis=1)
print(by_column.sum())
one two
0 1 5
1 9 13
2 17 21
3 25 29
s = pd.Series(mapping)
print(s,'\n')
print(s.groupby(s).count())
a one
b one
c two
d two
e three
dtype: object
one 2
three 1
two 2
dtype: int64
df = pd.DataFrame(np.arange(16).reshape(4,4),
columns = ['a','b','c','d'],
index = ['abc','bcd','aa','b'])
print(df)
a b c d
abc 0 1 2 3
bcd 4 5 6 7
aa 8 9 10 11
b 12 13 14 15
print(df.groupby(len).sum())
a b c d
1 12 13 14 15
2 8 9 10 11
3 4 6 8 10
s = pd.Series([1, 2, 3, 10, 20, 30], index = [1, 2, 3, 1, 2, 3])
print(s)
1 1
2 2
3 3
1 10
2 20
3 30
dtype: int64
grouped = s.groupby(level=0)
print(grouped)
print(grouped.first(),'→ first:非NaN的第一个值\n')
print(grouped.last(),'→ last:非NaN的最后一个值\n')
print(grouped.sum(),'→ sum:非NaN的和\n')
print(grouped.mean(),'→ mean:非NaN的平均值\n')
print(grouped.median(),'→ median:非NaN的算术中位数\n')
print(grouped.count(),'→ count:非NaN的值\n')
print(grouped.min(),'→ min、max:非NaN的最小值、最大值\n')
print(grouped.std(),'→ std,var:非NaN的标准差和方差\n')
print(grouped.prod(),'→ prod:非NaN的积\n')
1 1
2 2
3 3
dtype: int64 → first:非NaN的第一个值
1 10
2 20
3 30
dtype: int64 → last:非NaN的最后一个值
1 11
2 22
3 33
dtype: int64 → sum:非NaN的和
1 5.5
2 11.0
3 16.5
dtype: float64 → mean:非NaN的平均值
1 5.5
2 11.0
3 16.5
dtype: float64 → median:非NaN的算术中位数
1 2
2 2
3 2
dtype: int64 → count:非NaN的值
1 1
2 2
3 3
dtype: int64 → min、max:非NaN的最小值、最大值
1 6.363961
2 12.727922
3 19.091883
dtype: float64 → std,var:非NaN的标准差和方差
1 10
2 40
3 90
dtype: int64 → prod:非NaN的积
df = pd.DataFrame({
'a':[1,1,2,2],
'b':np.random.rand(4),
'c':np.random.rand(4),
'd':np.random.rand(4),})
print(df)
a b c d
0 1 0.772478 0.417508 0.068269
1 1 0.087774 0.063274 0.991277
2 2 0.229447 0.062397 0.985243
3 2 0.025157 0.130426 0.664778
print(df.groupby('a').agg(['mean',np.sum]))
print(df.groupby('a')['b'].agg({
'result1':np.mean,
'result2':np.sum}))
b c d
mean sum mean sum mean sum
a
1 0.430126 0.860252 0.240391 0.480782 0.529773 1.059545
2 0.127302 0.254605 0.096412 0.192824 0.825010 1.650021
result2 result1
a
1 0.860252 0.430126
2 0.254605 0.127302
'''
【课程2.20】 分组转换及一般性“拆分-应用-合并”
transform / apply
'''
df = pd.DataFrame({
'data1':np.random.rand(5),
'data2':np.random.rand(5),
'key1':list('aabba'),
'key2':['one','two','one','two','one']})
print(df)
data1 data2 key1 key2
0 0.906503 0.944454 a one
1 0.608210 0.481486 a two
2 0.109676 0.276845 b one
3 0.817092 0.008300 b two
4 0.266458 0.540617 a one
k_mean=df.groupby('key1').mean()
print(k_mean)
data1 data2
key1
a 0.593724 0.655519
b 0.463384 0.142573
print(pd.merge(df,k_mean,left_on='key1',right_index=True).add_prefix('mean_'))
mean_data1_x mean_data2_x mean_key1 mean_key2 mean_data1_y mean_data2_y
0 0.906503 0.944454 a one 0.593724 0.655519
1 0.608210 0.481486 a two 0.593724 0.655519
4 0.266458 0.540617 a one 0.593724 0.655519
2 0.109676 0.276845 b one 0.463384 0.142573
3 0.817092 0.008300 b two 0.463384 0.142573
print(df.groupby('key2').mean())
print(df.groupby('key2').transform(np.mean))
data1 data2
key2
one 0.427546 0.587305
two 0.712651 0.244893
data1 data2
0 0.427546 0.587305
1 0.712651 0.244893
2 0.427546 0.587305
3 0.712651 0.244893
4 0.427546 0.587305
df = pd.DataFrame({
'data1':np.random.rand(5),
'data2':np.random.rand(5),
'key1':list('aabba'),
'key2':['one','two','one','two','one']})
print(df)
print(df.groupby('key1').apply(lambda x: x.describe()))
data1 data2 key1 key2
0 0.285345 0.497243 a one
1 0.525780 0.196928 a two
2 0.407405 0.829274 b one
3 0.329079 0.689100 b two
4 0.856305 0.436768 a one
data1 data2
key1
a count 3.000000 3.000000
mean 0.555810 0.376980
std 0.286662 0.158834
min 0.285345 0.196928
25% 0.405563 0.316848
50% 0.525780 0.436768
75% 0.691042 0.467006
max 0.856305 0.497243
b count 2.000000 2.000000
mean 0.368242 0.759187
std 0.055385 0.099118
min 0.329079 0.689100
25% 0.348660 0.724144
50% 0.368242 0.759187
75% 0.387824 0.794230
max 0.407405 0.829274
def f_df1(d,n):
return(d.sort_index()[:n])
def f_df2(d,k1):
return(d[k1])
print(df.groupby('key1').apply(f_df1,2),'\n')
print(df.groupby('key1').apply(f_df2,'data2'))
print(type(df.groupby('key1').apply(f_df2,'data2')))
data1 data2 key1 key2
key1
a 0 0.285345 0.497243 a one
1 0.525780 0.196928 a two
b 2 0.407405 0.829274 b one
3 0.329079 0.689100 b two
key1
a 0 0.497243
1 0.196928
4 0.436768
b 2 0.829274
3 0.689100
Name: data2, dtype: float64
'''
【课程2.21】 透视表及交叉表
类似excel数据透视 - pivot table / crosstab
'''
date = ['2017-5-1','2017-5-2','2017-5-3']*3
rng = pd.to_datetime(date)
df = pd.DataFrame({
'date':rng,
'key':list('abcdabcda'),
'values':np.random.rand(9)*10})
print(df)
date key values
0 2017-05-01 a 4.092407
1 2017-05-02 b 7.389150
2 2017-05-03 c 9.946409
3 2017-05-01 d 5.282872
4 2017-05-02 a 8.591360
5 2017-05-03 b 7.475860
6 2017-05-01 c 5.490445
7 2017-05-02 d 5.947843
8 2017-05-03 a 2.806096
print(pd.pivot_table(df, values = 'values', index = 'date', columns = 'key', aggfunc=np.sum))
print('-----')
print(pd.pivot_table(df, values = 'values', index = ['date','key'], aggfunc=len))
print('-----')
key a b c d
date
2017-05-01 4.092407 NaN 5.490445 5.282872
2017-05-02 8.591360 7.38915 NaN 5.947843
2017-05-03 2.806096 7.47586 9.946409 NaN
-----
date key
2017-05-01 a 1.0
c 1.0
d 1.0
2017-05-02 a 1.0
b 1.0
d 1.0
2017-05-03 a 1.0
b 1.0
c 1.0
Name: values, dtype: float64
-----
df = pd.DataFrame({
'A': [1, 2, 2, 2, 2],
'B': [3, 3, 4, 4, 4],
'C': [1, 1, np.nan, 1, 1]})
print(df)
A B C
0 1 3 1.0
1 2 3 1.0
2 2 4 NaN
3 2 4 1.0
4 2 4 1.0
print(pd.crosstab(df['A'],df['B']))
B 3 4
A
1 1 0
2 1 3
print(pd.crosstab(df['A'],df['B'],normalize=True))
print('-----')
B 3 4
A
1 0.2 0.0
2 0.2 0.6
-----
print(pd.crosstab(df['A'],df['B'],values=df['C'],aggfunc=np.sum))
print('-----')
B 3 4
A
1 1.0 NaN
2 1.0 2.0
-----
print(pd.crosstab(df['A'],df['B'],values=df['C'],aggfunc=np.sum, margins=True))
print('-----')
B 3 4 All
A
1 1.0 NaN 1.0
2 1.0 2.0 3.0
All 2.0 2.0 4.0
-----
'''
【课程2.22】 数据读取
核心:read_table, read_csv, read_excel
'''
import os
os.chdir('C:/Users/Hjx/Desktop/')
data1 = pd.read_table('data1.txt', delimiter=',',header = 0, index_col=1)
print(data1)
data2 = pd.read_csv('data2.csv',engine = 'python')
print(data2.head())
data3 = pd.read_excel('地市级党委书记数据库(2000-10).xlsx',sheetname='中国人民共和国地市级党委书记数据库(2000-10)',header=0)
print(data3)