导入库
import numpy as np
import pandas as pd
import time
import warnings
warnings.filterwarnings('ignore')
pandas基础
pandas数据结构
Series 一维数据
l = np.array([1,2,3,6,9])
s1 = pd.Series(data=l)
display(l,s1)
array([1, 2, 3, 6, 9])
0 1
1 2
2 3
3 6
4 9
dtype: int32
s2 = pd.Series(data = l,index = list('ABCDE'))
s2
A 1
B 2
C 3
D 6
E 9
dtype: int32
s3 = pd.Series(data = {'A':149,'B':130,'C':118,'D':99,'E':66})
s3
A 149
B 130
C 118
D 99
E 66
dtype: int64
DataFrame 多维数据
df1 = pd.DataFrame(data = np.random.randint(0,151,size=(10,3)),
index = list('ABCDEFHIJK'),
columns=['python','math','english'],
dtype = np.float16)
df1
|
python |
math |
english |
A |
120.0 |
76.0 |
39.0 |
B |
10.0 |
139.0 |
85.0 |
C |
69.0 |
14.0 |
33.0 |
D |
85.0 |
97.0 |
144.0 |
E |
132.0 |
11.0 |
35.0 |
F |
129.0 |
25.0 |
148.0 |
H |
45.0 |
66.0 |
95.0 |
I |
148.0 |
54.0 |
61.0 |
J |
56.0 |
14.0 |
81.0 |
K |
55.0 |
137.0 |
104.0 |
df2 = pd.DataFrame(data={'python':[66,99,128],'math':[88,65,137],'english':[100,121,45]})
df2
|
python |
math |
english |
0 |
66 |
88 |
100 |
1 |
99 |
65 |
121 |
2 |
128 |
137 |
45 |
数据查看
df = pd.DataFrame(data = np.random.randint(0,151,size=(100,3)),
columns=['python','math','english'])
df
|
python |
math |
english |
0 |
150 |
111 |
83 |
1 |
31 |
137 |
41 |
2 |
126 |
129 |
130 |
3 |
91 |
23 |
118 |
4 |
96 |
68 |
22 |
... |
... |
... |
... |
95 |
21 |
29 |
78 |
96 |
34 |
7 |
60 |
97 |
134 |
21 |
111 |
98 |
108 |
130 |
137 |
99 |
127 |
105 |
59 |
100 rows × 3 columns
df.shape
(100, 3)
df.head(n=3)
|
python |
math |
english |
0 |
150 |
111 |
83 |
1 |
31 |
137 |
41 |
2 |
126 |
129 |
130 |
df.tail(n=5)
|
python |
math |
english |
95 |
21 |
29 |
78 |
96 |
34 |
7 |
60 |
97 |
134 |
21 |
111 |
98 |
108 |
130 |
137 |
99 |
127 |
105 |
59 |
df.dtypes
python int32
math int32
english int32
dtype: object
df.info()
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 python 100 non-null int32
1 math 100 non-null int32
2 english 100 non-null int32
dtypes: int32(3)
memory usage: 1.3 KB
df.describe()
|
python |
math |
english |
count |
100.00000 |
100.000000 |
100.000000 |
mean |
75.48000 |
72.330000 |
78.180000 |
std |
41.94645 |
40.506404 |
42.564836 |
min |
1.00000 |
1.000000 |
3.000000 |
25% |
47.50000 |
40.500000 |
42.500000 |
50% |
75.00000 |
70.000000 |
81.500000 |
75% |
109.00000 |
101.250000 |
113.500000 |
max |
150.00000 |
146.000000 |
150.000000 |
df.values
array([[150, 111, 83],
[ 31, 137, 41],
[126, 129, 130],
[ 91, 23, 118],
[ 96, 68, 22],
[ 46, 12, 108],
[141, 100, 16],
[ 87, 6, 20],
[104, 133, 101],
[ 73, 108, 134],
[ 57, 146, 18],
[ 57, 133, 49],
[ 33, 70, 28],
[ 69, 37, 80],
[118, 30, 44],
[ 7, 43, 37],
[104, 42, 39],
[145, 58, 107],
[121, 17, 92],
[120, 80, 89],
[ 66, 69, 18],
[ 21, 107, 140],
[ 17, 1, 129],
[ 83, 65, 95],
[ 95, 61, 89],
[ 79, 129, 90],
[ 97, 54, 91],
[123, 14, 50],
[128, 76, 33],
[ 7, 41, 138],
[ 61, 141, 134],
[138, 145, 20],
[ 60, 27, 115],
[ 77, 139, 66],
[112, 83, 119],
[138, 142, 48],
[ 16, 99, 85],
[ 55, 21, 90],
[ 51, 75, 63],
[123, 34, 39],
[ 18, 39, 103],
[129, 73, 57],
[139, 10, 130],
[ 22, 79, 6],
[ 61, 127, 85],
[ 99, 78, 138],
[ 59, 60, 12],
[ 63, 41, 64],
[ 17, 69, 112],
[ 86, 42, 80],
[ 94, 18, 41],
[ 15, 42, 3],
[ 90, 57, 139],
[ 48, 91, 107],
[ 66, 92, 54],
[ 70, 81, 31],
[ 11, 8, 66],
[ 10, 96, 94],
[144, 92, 120],
[ 84, 69, 16],
[ 5, 52, 76],
[ 41, 74, 140],
[ 26, 75, 26],
[ 49, 118, 67],
[ 59, 72, 61],
[ 77, 26, 72],
[122, 52, 7],
[ 70, 34, 20],
[117, 16, 131],
[ 1, 89, 144],
[ 89, 24, 93],
[ 6, 83, 125],
[119, 117, 9],
[138, 89, 104],
[ 65, 110, 29],
[ 87, 126, 70],
[ 51, 118, 75],
[ 29, 82, 43],
[104, 136, 129],
[ 22, 65, 150],
[107, 139, 120],
[ 68, 95, 122],
[ 58, 53, 133],
[117, 70, 87],
[ 28, 131, 6],
[104, 68, 95],
[128, 32, 148],
[ 96, 138, 7],
[ 60, 65, 103],
[ 53, 11, 150],
[ 5, 86, 83],
[145, 53, 60],
[ 88, 70, 113],
[ 88, 32, 59],
[ 54, 70, 50],
[ 21, 29, 78],
[ 34, 7, 60],
[134, 21, 111],
[108, 130, 137],
[127, 105, 59]])
df.columns
Index(['python', 'math', 'english'], dtype='object')
df.index
RangeIndex(start=0, stop=100, step=1)
数据输入与输出
Csv
df = pd.DataFrame(data = np.random.randint(0,151,size = (100,3)),
columns=['Python','Math','English'])
df
|
Python |
Math |
English |
0 |
62 |
40 |
150 |
1 |
95 |
61 |
27 |
2 |
18 |
110 |
56 |
3 |
8 |
91 |
59 |
4 |
45 |
40 |
2 |
... |
... |
... |
... |
95 |
40 |
100 |
55 |
96 |
14 |
133 |
135 |
97 |
97 |
95 |
91 |
98 |
45 |
70 |
72 |
99 |
131 |
54 |
49 |
100 rows × 3 columns
df.to_csv('./data.csv',
sep=',',
index = True,
header = True)
pd.read_csv('./data.csv',
index_col = 0
)
|
Python |
Math |
English |
0 |
62 |
40 |
150 |
1 |
95 |
61 |
27 |
2 |
18 |
110 |
56 |
3 |
8 |
91 |
59 |
4 |
45 |
40 |
2 |
... |
... |
... |
... |
95 |
40 |
100 |
55 |
96 |
14 |
133 |
135 |
97 |
97 |
95 |
91 |
98 |
45 |
70 |
72 |
99 |
131 |
54 |
49 |
100 rows × 3 columns
Excel
df
|
Python |
Math |
English |
0 |
62 |
40 |
150 |
1 |
95 |
61 |
27 |
2 |
18 |
110 |
56 |
3 |
8 |
91 |
59 |
4 |
45 |
40 |
2 |
... |
... |
... |
... |
95 |
40 |
100 |
55 |
96 |
14 |
133 |
135 |
97 |
97 |
95 |
91 |
98 |
45 |
70 |
72 |
99 |
131 |
54 |
49 |
100 rows × 3 columns
df.to_excel('./data.xlsx')
pd.read_excel('./data.xlsx',
index_col = 0)
|
Python |
Math |
English |
0 |
62 |
40 |
150 |
1 |
95 |
61 |
27 |
2 |
18 |
110 |
56 |
3 |
8 |
91 |
59 |
4 |
45 |
40 |
2 |
... |
... |
... |
... |
95 |
40 |
100 |
55 |
96 |
14 |
133 |
135 |
97 |
97 |
95 |
91 |
98 |
45 |
70 |
72 |
99 |
131 |
54 |
49 |
100 rows × 3 columns
HDF5
df
|
Python |
Math |
English |
0 |
62 |
40 |
150 |
1 |
95 |
61 |
27 |
2 |
18 |
110 |
56 |
3 |
8 |
91 |
59 |
4 |
45 |
40 |
2 |
... |
... |
... |
... |
95 |
40 |
100 |
55 |
96 |
14 |
133 |
135 |
97 |
97 |
95 |
91 |
98 |
45 |
70 |
72 |
99 |
131 |
54 |
49 |
100 rows × 3 columns
df.to_hdf('./data.h5',
key = 'score')
df2 = pd.DataFrame(data = np.random.randint(6,100,size=(1000,5)),
columns = ['计算机','化工','生物','工程','教师'])
df2
|
计算机 |
化工 |
生物 |
工程 |
教师 |
0 |
58 |
60 |
64 |
34 |
15 |
1 |
25 |
36 |
94 |
25 |
55 |
2 |
33 |
15 |
68 |
58 |
85 |
3 |
30 |
81 |
98 |
55 |
66 |
4 |
83 |
15 |
35 |
37 |
33 |
... |
... |
... |
... |
... |
... |
995 |
96 |
8 |
77 |
34 |
8 |
996 |
40 |
42 |
23 |
13 |
76 |
997 |
35 |
33 |
41 |
23 |
71 |
998 |
80 |
72 |
21 |
18 |
29 |
999 |
19 |
77 |
44 |
30 |
18 |
1000 rows × 5 columns
df2.to_hdf('./data.h5',
key = 'salary')
pd.read_hdf('./data.h5',
key = 'salary')
|
计算机 |
化工 |
生物 |
工程 |
教师 |
0 |
58 |
60 |
64 |
34 |
15 |
1 |
25 |
36 |
94 |
25 |
55 |
2 |
33 |
15 |
68 |
58 |
85 |
3 |
30 |
81 |
98 |
55 |
66 |
4 |
83 |
15 |
35 |
37 |
33 |
... |
... |
... |
... |
... |
... |
995 |
96 |
8 |
77 |
34 |
8 |
996 |
40 |
42 |
23 |
13 |
76 |
997 |
35 |
33 |
41 |
23 |
71 |
998 |
80 |
72 |
21 |
18 |
29 |
999 |
19 |
77 |
44 |
30 |
18 |
1000 rows × 5 columns
SQL
from sqlalchemy import create_engine
engine = create_engine(
'mysql+pymysql://root:123456@localhost/cda?charset=utf8'
)
df2.to_sql('salary',
con = engine,
index = False
)
1000
pd.read_sql('select * from salary limit 10',
con=engine)
|
计算机 |
化工 |
生物 |
工程 |
教师 |
0 |
58 |
60 |
64 |
34 |
15 |
1 |
25 |
36 |
94 |
25 |
55 |
2 |
33 |
15 |
68 |
58 |
85 |
3 |
30 |
81 |
98 |
55 |
66 |
4 |
83 |
15 |
35 |
37 |
33 |
5 |
54 |
26 |
43 |
93 |
15 |
6 |
54 |
13 |
42 |
62 |
38 |
7 |
68 |
10 |
26 |
65 |
25 |
8 |
6 |
9 |
51 |
48 |
7 |
9 |
29 |
98 |
21 |
36 |
85 |
数据获取
获取数据
df = pd.DataFrame(np.random.randint(0,151,size=(10,3)),
index=list('ABCDEFGHIJ'),
columns=['python','math','en'])
df
|
python |
math |
en |
A |
17 |
4 |
109 |
B |
78 |
51 |
91 |
C |
52 |
29 |
121 |
D |
118 |
125 |
65 |
E |
72 |
104 |
144 |
F |
113 |
69 |
18 |
G |
86 |
114 |
97 |
H |
64 |
21 |
142 |
I |
102 |
72 |
118 |
J |
114 |
34 |
93 |
df['python']
A 27
B 126
C 91
D 66
E 103
F 53
G 45
H 34
I 2
J 26
Name: python, dtype: int32
df.python
A 27
B 126
C 91
D 66
E 103
F 53
G 45
H 34
I 2
J 26
Name: python, dtype: int32
df[['python','en']]
|
python |
en |
A |
27 |
38 |
B |
126 |
48 |
C |
91 |
37 |
D |
66 |
7 |
E |
103 |
72 |
F |
53 |
47 |
G |
45 |
4 |
H |
34 |
116 |
I |
2 |
1 |
J |
26 |
18 |
标签选择
df.loc['A']
python 27
math 148
en 38
Name: A, dtype: int32
df.loc[['A','B','D']]
|
python |
math |
en |
A |
27 |
148 |
38 |
B |
126 |
141 |
48 |
D |
66 |
16 |
7 |
df.loc['A','python']
27
df.loc[['A','C','F'],'python']
A 27
C 91
F 53
Name: python, dtype: int32
df.loc['A'::2,['math','en']]
|
math |
en |
A |
148 |
38 |
C |
0 |
37 |
E |
141 |
72 |
G |
28 |
4 |
I |
23 |
1 |
df.loc['A':'D',:]
|
python |
math |
en |
A |
27 |
148 |
38 |
B |
126 |
141 |
48 |
C |
91 |
0 |
37 |
D |
66 |
16 |
7 |
位置选择
df.iloc[0]
python 27
math 148
en 38
Name: A, dtype: int32
df.iloc[:4,:2]
|
python |
math |
A |
27 |
148 |
B |
126 |
141 |
C |
91 |
0 |
D |
66 |
16 |
布尔索引
cond = df.python > 80
df[cond]
|
python |
math |
en |
B |
126 |
141 |
48 |
C |
91 |
0 |
37 |
E |
103 |
141 |
72 |
df.mean()
python 57.3
math 71.7
en 38.8
dtype: float64
df.mean(axis=1)
A 71.000000
B 105.000000
C 42.666667
D 29.666667
E 105.333333
F 55.666667
G 25.666667
H 73.666667
I 8.666667
J 42.000000
dtype: float64
df[df.mean(axis=1) > 75]
|
python |
math |
en |
B |
126 |
141 |
48 |
E |
103 |
141 |
72 |
df[(df.python > 70) & (df.math > 70 )]
|
python |
math |
en |
B |
126 |
141 |
48 |
E |
103 |
141 |
72 |
df.index
Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'], dtype='object')
cond = df.index.isin(['C','E','H','K'])
df[cond]
|
python |
math |
en |
C |
91 |
0 |
37 |
E |
103 |
141 |
72 |
H |
34 |
71 |
116 |
赋值操作
df['python']['A'] = 150
df
|
python |
math |
en |
A |
150 |
148 |
38 |
B |
126 |
141 |
48 |
C |
91 |
0 |
37 |
D |
66 |
16 |
7 |
E |
103 |
141 |
72 |
F |
53 |
67 |
47 |
G |
45 |
28 |
4 |
H |
34 |
71 |
116 |
I |
2 |
23 |
1 |
J |
26 |
82 |
18 |
df['java'] = np.random.randint(1,151,size=10)
df
|
python |
math |
en |
java |
A |
17 |
4 |
109 |
88 |
B |
78 |
51 |
91 |
99 |
C |
52 |
29 |
121 |
54 |
D |
218 |
125 |
165 |
135 |
E |
72 |
104 |
144 |
73 |
F |
113 |
69 |
18 |
53 |
G |
186 |
114 |
197 |
22 |
H |
64 |
21 |
142 |
37 |
I |
102 |
72 |
118 |
39 |
J |
214 |
34 |
193 |
125 |
df.loc[['C','D','E'],'math'] = 147
df
|
python |
math |
en |
java |
A |
150 |
148 |
38 |
107 |
B |
126 |
141 |
48 |
6 |
C |
91 |
147 |
37 |
133 |
D |
66 |
147 |
7 |
145 |
E |
103 |
147 |
72 |
46 |
F |
53 |
67 |
47 |
89 |
G |
45 |
28 |
4 |
95 |
H |
34 |
71 |
116 |
27 |
I |
2 |
23 |
1 |
104 |
J |
26 |
82 |
18 |
117 |
df[df < 60] = 60
df
|
python |
math |
en |
java |
A |
150 |
148 |
60 |
107 |
B |
126 |
141 |
60 |
60 |
C |
91 |
147 |
60 |
133 |
D |
66 |
147 |
60 |
145 |
E |
103 |
147 |
72 |
60 |
F |
60 |
67 |
60 |
89 |
G |
60 |
60 |
60 |
95 |
H |
60 |
71 |
116 |
60 |
I |
60 |
60 |
60 |
104 |
J |
60 |
82 |
60 |
117 |
df.iloc[3::3,[0,2]] += 100
df
|
python |
math |
en |
java |
A |
17 |
4 |
109 |
88 |
B |
78 |
51 |
91 |
99 |
C |
52 |
29 |
121 |
54 |
D |
318 |
125 |
265 |
135 |
E |
72 |
104 |
144 |
73 |
F |
113 |
69 |
18 |
53 |
G |
286 |
114 |
297 |
22 |
H |
64 |
21 |
142 |
37 |
I |
102 |
72 |
118 |
39 |
J |
314 |
34 |
293 |
125 |
数据集成
concat行列合并
df1 = pd.DataFrame(np.random.randint(0,151,size=(10,3)),
index=list('ABCDEFGHIJ'),
columns=['python','math','en'])
df2 = pd.DataFrame(np.random.randint(0,151,size=(10,3)),
index=list('HILKLMNOPQ'),
columns=['python','math','en'])
df3 = pd.DataFrame(np.random.randint(0,151,size=(10,2)),
index=list('ABCDEFGHIJ'),
columns=['java','chinese'])
pd.concat([df1,df2],
axis=0)
|
python |
math |
en |
A |
107 |
52 |
125 |
B |
46 |
43 |
64 |
C |
141 |
147 |
118 |
D |
51 |
110 |
21 |
E |
135 |
62 |
46 |
F |
75 |
3 |
25 |
G |
43 |
29 |
120 |
H |
22 |
34 |
39 |
I |
82 |
92 |
0 |
J |
32 |
17 |
77 |
H |
58 |
141 |
110 |
I |
12 |
137 |
62 |
L |
29 |
140 |
131 |
K |
74 |
59 |
33 |
L |
30 |
29 |
7 |
M |
3 |
6 |
77 |
N |
50 |
135 |
12 |
O |
139 |
134 |
134 |
P |
140 |
115 |
119 |
Q |
141 |
141 |
21 |
pd.concat([df1,df3],
axis = 1)
|
python |
math |
en |
java |
chinese |
A |
107 |
52 |
125 |
12 |
26 |
B |
46 |
43 |
64 |
70 |
13 |
C |
141 |
147 |
118 |
143 |
130 |
D |
51 |
110 |
21 |
73 |
115 |
E |
135 |
62 |
46 |
145 |
2 |
F |
75 |
3 |
25 |
54 |
49 |
G |
43 |
29 |
120 |
61 |
119 |
H |
22 |
34 |
39 |
105 |
137 |
I |
82 |
92 |
0 |
114 |
114 |
J |
32 |
17 |
77 |
85 |
49 |
数据插入
df1.insert(loc = 1,
column='c++',
value = np.random.randint(0,151,size=10))
---------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [106], in ()
----> 1 df1.insert(loc = 1,
2 column='c++',
3 value = np.random.randint(0,151,size=10)) # 插入一列
4 df1
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\frame.py:4434, in DataFrame.insert(self, loc, column, value, allow_duplicates)
4428 raise ValueError(
4429 "Cannot specify 'allow_duplicates=True' when "
4430 "'self.flags.allows_duplicate_labels' is False."
4431 )
4432 if not allow_duplicates and column in self.columns:
4433 # Should this be a different kind of error??
-> 4434 raise ValueError(f"cannot insert {column}, already exists")
4435 if not isinstance(loc, int):
4436 raise TypeError("loc must be int")
ValueError: cannot insert c++, already exists
|
df1
|
python |
c++ |
math |
en |
A |
107 |
94 |
52 |
125 |
B |
46 |
92 |
43 |
64 |
C |
141 |
72 |
147 |
118 |
D |
51 |
7 |
110 |
21 |
E |
135 |
73 |
62 |
46 |
F |
75 |
15 |
3 |
25 |
G |
43 |
60 |
29 |
120 |
H |
22 |
142 |
34 |
39 |
I |
82 |
43 |
92 |
0 |
J |
32 |
94 |
17 |
77 |
pd.concat([df1,df3],
axis=0)
|
python |
c++ |
math |
en |
java |
chinese |
A |
107.0 |
94.0 |
52.0 |
125.0 |
NaN |
NaN |
B |
46.0 |
92.0 |
43.0 |
64.0 |
NaN |
NaN |
C |
141.0 |
72.0 |
147.0 |
118.0 |
NaN |
NaN |
D |
51.0 |
7.0 |
110.0 |
21.0 |
NaN |
NaN |
E |
135.0 |
73.0 |
62.0 |
46.0 |
NaN |
NaN |
F |
75.0 |
15.0 |
3.0 |
25.0 |
NaN |
NaN |
G |
43.0 |
60.0 |
29.0 |
120.0 |
NaN |
NaN |
H |
22.0 |
142.0 |
34.0 |
39.0 |
NaN |
NaN |
I |
82.0 |
43.0 |
92.0 |
0.0 |
NaN |
NaN |
J |
32.0 |
94.0 |
17.0 |
77.0 |
NaN |
NaN |
A |
NaN |
NaN |
NaN |
NaN |
12.0 |
26.0 |
B |
NaN |
NaN |
NaN |
NaN |
70.0 |
13.0 |
C |
NaN |
NaN |
NaN |
NaN |
143.0 |
130.0 |
D |
NaN |
NaN |
NaN |
NaN |
73.0 |
115.0 |
E |
NaN |
NaN |
NaN |
NaN |
145.0 |
2.0 |
F |
NaN |
NaN |
NaN |
NaN |
54.0 |
49.0 |
G |
NaN |
NaN |
NaN |
NaN |
61.0 |
119.0 |
H |
NaN |
NaN |
NaN |
NaN |
105.0 |
137.0 |
I |
NaN |
NaN |
NaN |
NaN |
114.0 |
114.0 |
J |
NaN |
NaN |
NaN |
NaN |
85.0 |
49.0 |
Join SQL风格合并
df1 = pd.DataFrame(data={'name':['softpo','brandon','ella','daniel','张三'],
'height':[175,180,169,177,168]})
df2 = pd.DataFrame(data={'name':['softpo','brandon','ella','daniel','张三'],
'weigth':[70,65,704,653,88]})
df3 = pd.DataFrame(data={'名字':['softpo','brandon','ella','daniel','张三'],
'salary':np.random.randint(20,100,size=5)})
display(df1,df2,df3)
|
name |
height |
0 |
softpo |
175 |
1 |
brandon |
180 |
2 |
ella |
169 |
3 |
daniel |
177 |
4 |
张三 |
168 |
|
name |
weigth |
0 |
softpo |
70 |
1 |
brandon |
65 |
2 |
ella |
704 |
3 |
daniel |
653 |
4 |
张三 |
88 |
|
名字 |
salary |
0 |
softpo |
74 |
1 |
brandon |
79 |
2 |
ella |
39 |
3 |
daniel |
97 |
4 |
张三 |
62 |
pd.concat([df1,df2],axis = 1)
|
name |
height |
name |
weigth |
0 |
softpo |
175 |
softpo |
70 |
1 |
brandon |
180 |
brandon |
65 |
2 |
ella |
169 |
ella |
704 |
3 |
daniel |
177 |
daniel |
653 |
4 |
张三 |
168 |
张三 |
88 |
pd.merge(df1,df2,how='inner')
|
name |
height |
weigth |
0 |
softpo |
175 |
70 |
1 |
brandon |
180 |
65 |
2 |
ella |
169 |
704 |
3 |
daniel |
177 |
653 |
4 |
张三 |
168 |
88 |
pd.merge(df1,df2,how='outer')
|
name |
height |
weigth |
0 |
softpo |
175 |
70 |
1 |
brandon |
180 |
65 |
2 |
ella |
169 |
704 |
3 |
daniel |
177 |
653 |
4 |
张三 |
168 |
88 |
pd.merge(df1,df2,how='left')
|
name |
height |
weigth |
0 |
softpo |
175 |
70 |
1 |
brandon |
180 |
65 |
2 |
ella |
169 |
704 |
3 |
daniel |
177 |
653 |
4 |
张三 |
168 |
88 |
pd.merge(df1,df3,left_on='name',right_on='名字')
|
name |
height |
名字 |
salary |
0 |
softpo |
175 |
softpo |
74 |
1 |
brandon |
180 |
brandon |
79 |
2 |
ella |
169 |
ella |
39 |
3 |
daniel |
177 |
daniel |
97 |
4 |
张三 |
168 |
张三 |
62 |
df4 = pd.DataFrame(data=np.random.randint(0,151,size=(10,3)),
columns=['python','math','en'],
index=list('ABCDEFGHIJ'))
df4
|
python |
math |
en |
A |
83 |
7 |
108 |
B |
34 |
48 |
128 |
C |
92 |
99 |
132 |
D |
112 |
126 |
98 |
E |
107 |
5 |
83 |
F |
77 |
33 |
136 |
G |
58 |
39 |
12 |
H |
67 |
109 |
69 |
I |
115 |
29 |
48 |
J |
19 |
55 |
92 |
score_mean = df4.mean(axis=1).round(1)
score_mean
A 66.0
B 70.0
C 107.7
D 112.0
E 65.0
F 82.0
G 36.3
H 81.7
I 64.0
J 55.3
dtype: float64
df4.insert(loc=3,column='平均分',value=score_mean)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [32], in ()
----> 1 df4.insert(loc=3,column='平均分',value=score_mean)
2 df4
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\frame.py:4434, in DataFrame.insert(self, loc, column, value, allow_duplicates)
4428 raise ValueError(
4429 "Cannot specify 'allow_duplicates=True' when "
4430 "'self.flags.allows_duplicate_labels' is False."
4431 )
4432 if not allow_duplicates and column in self.columns:
4433 # Should this be a different kind of error??
-> 4434 raise ValueError(f"cannot insert {column}, already exists")
4435 if not isinstance(loc, int):
4436 raise TypeError("loc must be int")
ValueError: cannot insert 平均分, already exists
|
df4
|
python |
math |
平均分 |
en |
A |
83 |
7 |
66.0 |
108 |
B |
34 |
48 |
70.0 |
128 |
C |
92 |
99 |
107.7 |
132 |
D |
112 |
126 |
112.0 |
98 |
E |
107 |
5 |
65.0 |
83 |
F |
77 |
33 |
82.0 |
136 |
G |
58 |
39 |
36.3 |
12 |
H |
67 |
109 |
81.7 |
69 |
I |
115 |
29 |
64.0 |
48 |
J |
19 |
55 |
55.3 |
92 |
df5 = df4.iloc[:,[0,1,3]]
df5
|
python |
math |
en |
A |
83 |
7 |
108 |
B |
34 |
48 |
128 |
C |
92 |
99 |
132 |
D |
112 |
126 |
98 |
E |
107 |
5 |
83 |
F |
77 |
33 |
136 |
G |
58 |
39 |
12 |
H |
67 |
109 |
69 |
I |
115 |
29 |
48 |
J |
19 |
55 |
92 |
score_mean.name='平均分'
score_mean
A 66.0
B 70.0
C 107.7
D 112.0
E 65.0
F 82.0
G 36.3
H 81.7
I 64.0
J 55.3
Name: 平均分, dtype: float64
df5
|
python |
math |
en |
A |
83 |
7 |
108 |
B |
34 |
48 |
128 |
C |
92 |
99 |
132 |
D |
112 |
126 |
98 |
E |
107 |
5 |
83 |
F |
77 |
33 |
136 |
G |
58 |
39 |
12 |
H |
67 |
109 |
69 |
I |
115 |
29 |
48 |
J |
19 |
55 |
92 |
pd.merge(df5,
score_mean,
left_index = True,
right_index = True)
|
python |
math |
en |
平均分 |
A |
83 |
7 |
108 |
66.0 |
B |
34 |
48 |
128 |
70.0 |
C |
92 |
99 |
132 |
107.7 |
D |
112 |
126 |
98 |
112.0 |
E |
107 |
5 |
83 |
65.0 |
F |
77 |
33 |
136 |
82.0 |
G |
58 |
39 |
12 |
36.3 |
H |
67 |
109 |
69 |
81.7 |
I |
115 |
29 |
48 |
64.0 |
J |
19 |
55 |
92 |
55.3 |
数据清洗
df = pd.DataFrame(data = {'color':['red','blue','red','green','green','blue',None,np.NaN,'green'],
'price':[20,15,20,18,18,22,30,30,22]})
df
|
color |
price |
0 |
red |
20 |
1 |
blue |
15 |
2 |
red |
20 |
3 |
green |
18 |
4 |
green |
18 |
5 |
blue |
22 |
6 |
None |
30 |
7 |
NaN |
30 |
8 |
green |
22 |
df.drop_duplicates()
|
color |
price |
0 |
red |
20 |
1 |
blue |
15 |
3 |
green |
18 |
5 |
blue |
22 |
6 |
None |
30 |
8 |
green |
22 |
df.dropna()
|
color |
price |
0 |
red |
20 |
1 |
blue |
15 |
2 |
red |
20 |
3 |
green |
18 |
4 |
green |
18 |
5 |
blue |
22 |
8 |
green |
22 |
df.drop(labels=[2,4,6,8])
|
color |
price |
0 |
red |
20 |
1 |
blue |
15 |
3 |
green |
18 |
5 |
blue |
22 |
7 |
NaN |
30 |
df.drop(labels=['color'],axis = 1)
|
price |
0 |
20 |
1 |
15 |
2 |
20 |
3 |
18 |
4 |
18 |
5 |
22 |
6 |
30 |
7 |
30 |
8 |
22 |
df.filter(items=['price'])
|
price |
0 |
20 |
1 |
15 |
2 |
20 |
3 |
18 |
4 |
18 |
5 |
22 |
6 |
30 |
7 |
30 |
8 |
22 |
df
|
color |
price |
0 |
red |
20 |
1 |
blue |
15 |
2 |
red |
20 |
3 |
green |
18 |
4 |
green |
18 |
5 |
blue |
22 |
6 |
None |
30 |
7 |
NaN |
30 |
8 |
green |
22 |
df['size'] = 1024
df
|
color |
price |
size |
0 |
red |
20 |
1024 |
1 |
blue |
15 |
1024 |
2 |
red |
20 |
1024 |
3 |
green |
18 |
1024 |
4 |
green |
18 |
1024 |
5 |
blue |
22 |
1024 |
6 |
None |
30 |
1024 |
7 |
NaN |
30 |
1024 |
8 |
green |
22 |
1024 |
df.filter(like = 'i')
|
price |
size |
0 |
20 |
1024 |
1 |
15 |
1024 |
2 |
20 |
1024 |
3 |
18 |
1024 |
4 |
18 |
1024 |
5 |
22 |
1024 |
6 |
30 |
1024 |
7 |
30 |
1024 |
8 |
22 |
1024 |
df.filter(regex = 'e$')
|
price |
size |
0 |
20 |
1024 |
1 |
15 |
1024 |
2 |
20 |
1024 |
3 |
18 |
1024 |
4 |
18 |
1024 |
5 |
22 |
1024 |
6 |
30 |
1024 |
7 |
30 |
1024 |
8 |
22 |
1024 |
a = np.random.randint(1,200,size = 1000)
a
array([ 95, 17, 106, 14, 77, 43, 35, 82, 169, 84, 135, 61, 107,
10, 57, 88, 181, 147, 160, 141, 99, 190, 148, 101, 33, 150,
37, 16, 90, 104, 56, 179, 7, 120, 172, 180, 161, 76, 175,
137, 33, 52, 157, 62, 49, 46, 49, 78, 110, 97, 188, 87,
185, 45, 106, 29, 173, 188, 71, 195, 84, 47, 13, 68, 35,
39, 79, 86, 67, 132, 58, 27, 48, 118, 190, 60, 66, 118,
36, 76, 109, 130, 75, 38, 190, 158, 45, 153, 136, 20, 30,
106, 136, 83, 182, 170, 51, 118, 95, 50, 77, 135, 65, 196,
129, 183, 115, 77, 27, 106, 24, 67, 56, 150, 139, 100, 6,
160, 36, 184, 191, 19, 189, 15, 163, 140, 123, 151, 86, 39,
39, 197, 193, 122, 62, 21, 25, 182, 29, 101, 154, 23, 195,
68, 174, 148, 154, 199, 33, 168, 45, 162, 90, 152, 182, 65,
199, 51, 83, 75, 197, 72, 159, 123, 152, 4, 175, 187, 77,
53, 136, 35, 106, 98, 135, 57, 189, 84, 4, 108, 20, 41,
7, 16, 45, 168, 156, 114, 32, 147, 24, 105, 198, 57, 16,
192, 128, 37, 156, 53, 44, 142, 160, 198, 154, 75, 109, 22,
16, 12, 80, 146, 153, 144, 192, 99, 97, 199, 111, 27, 59,
127, 112, 160, 166, 198, 41, 196, 170, 38, 147, 64, 199, 93,
1, 152, 150, 132, 181, 57, 56, 58, 196, 171, 163, 150, 8,
197, 38, 135, 176, 196, 19, 185, 29, 9, 15, 101, 177, 126,
7, 197, 58, 56, 149, 182, 172, 60, 106, 130, 174, 198, 185,
6, 124, 53, 95, 92, 57, 78, 18, 192, 93, 190, 38, 111,
14, 94, 47, 180, 79, 86, 37, 44, 15, 166, 42, 124, 137,
152, 173, 180, 142, 19, 110, 21, 7, 83, 3, 158, 21, 112,
183, 65, 193, 84, 171, 106, 100, 118, 43, 183, 142, 143, 188,
109, 136, 59, 84, 28, 72, 1, 73, 166, 104, 73, 16, 28,
55, 29, 76, 13, 110, 83, 134, 23, 160, 195, 175, 47, 27,
148, 100, 96, 169, 64, 45, 162, 113, 52, 45, 170, 127, 91,
141, 13, 128, 121, 167, 103, 66, 134, 80, 46, 171, 91, 53,
75, 7, 190, 35, 48, 164, 130, 104, 16, 31, 181, 68, 162,
1, 95, 174, 163, 183, 78, 1, 103, 12, 41, 50, 125, 49,
4, 61, 81, 142, 157, 119, 11, 70, 121, 191, 28, 140, 63,
144, 190, 51, 184, 111, 16, 3, 150, 1, 69, 79, 143, 16,
136, 90, 27, 107, 78, 123, 4, 18, 36, 80, 32, 103, 19,
79, 181, 83, 167, 20, 29, 150, 92, 26, 123, 57, 39, 115,
69, 75, 44, 60, 88, 99, 191, 191, 82, 122, 156, 164, 41,
150, 3, 184, 15, 104, 144, 77, 49, 157, 129, 29, 56, 188,
113, 171, 118, 63, 132, 177, 27, 8, 164, 43, 22, 78, 89,
188, 168, 148, 140, 125, 140, 97, 21, 185, 4, 91, 176, 93,
165, 81, 124, 144, 137, 86, 133, 45, 193, 86, 158, 91, 137,
100, 55, 95, 33, 30, 179, 3, 77, 26, 113, 197, 49, 105,
3, 160, 139, 106, 94, 5, 16, 104, 145, 77, 16, 14, 98,
137, 196, 53, 105, 54, 157, 66, 23, 11, 197, 140, 138, 61,
155, 138, 187, 138, 175, 143, 66, 79, 25, 26, 165, 49, 53,
128, 2, 4, 194, 49, 17, 4, 89, 107, 75, 99, 162, 166,
173, 122, 87, 159, 75, 107, 143, 85, 41, 108, 180, 120, 3,
29, 16, 3, 186, 86, 135, 89, 61, 172, 74, 178, 38, 49,
61, 93, 96, 114, 170, 110, 136, 40, 72, 102, 186, 120, 147,
158, 128, 71, 110, 59, 87, 99, 171, 3, 55, 163, 181, 188,
80, 194, 75, 38, 173, 137, 97, 171, 113, 132, 165, 20, 37,
141, 124, 40, 98, 38, 61, 136, 3, 47, 1, 117, 143, 190,
162, 181, 115, 181, 100, 92, 180, 165, 26, 183, 65, 43, 64,
102, 120, 102, 100, 38, 9, 49, 95, 126, 185, 18, 146, 64,
47, 151, 191, 113, 64, 119, 49, 51, 25, 6, 49, 171, 98,
112, 134, 15, 135, 144, 7, 69, 113, 89, 190, 179, 129, 182,
9, 157, 55, 95, 181, 44, 144, 193, 109, 3, 183, 48, 194,
80, 39, 173, 194, 195, 55, 156, 158, 29, 21, 119, 77, 142,
97, 41, 50, 14, 70, 197, 43, 93, 51, 190, 105, 130, 162,
33, 153, 190, 90, 123, 144, 71, 80, 138, 167, 128, 176, 95,
37, 148, 193, 169, 175, 131, 198, 188, 39, 26, 178, 79, 50,
190, 148, 156, 56, 3, 10, 112, 136, 135, 16, 54, 16, 105,
102, 197, 96, 30, 80, 47, 144, 150, 3, 96, 39, 72, 128,
146, 96, 156, 37, 33, 123, 180, 103, 82, 26, 144, 9, 81,
144, 163, 53, 128, 133, 160, 104, 135, 6, 92, 188, 112, 57,
120, 182, 151, 95, 73, 19, 137, 25, 179, 150, 44, 162, 30,
165, 108, 43, 16, 84, 195, 3, 61, 59, 15, 5, 185, 143,
131, 28, 61, 133, 51, 93, 83, 51, 39, 132, 195, 93, 152,
90, 68, 166, 44, 38, 50, 129, 74, 111, 139, 79, 97, 135,
57, 139, 103, 75, 33, 176, 27, 147, 114, 76, 2, 21, 1,
187, 58, 191, 139, 171, 146, 189, 184, 28, 74, 173, 57, 78,
166, 82, 115, 78, 140, 171, 45, 86, 20, 101, 19, 169, 7,
130, 182, 164, 54, 175, 188, 44, 2, 31, 171, 135, 110, 113,
152, 45, 98, 82, 48, 158, 4, 31, 193, 123, 69, 48, 49,
169, 3, 105, 178, 45, 59, 34, 120, 179, 163, 154, 163, 62,
2, 4, 34, 71, 144, 55, 93, 100, 168, 144, 19, 161, 156,
115, 76, 126, 48, 92, 137, 129, 142, 83, 90, 192, 76, 162,
54, 92, 31, 62, 32, 126, 72, 41, 38, 172, 93, 99])
cond = (a <= 800) & (a >= 100)
a[cond]
array([106, 169, 135, 107, 181, 147, 160, 141, 190, 148, 101, 150, 104,
179, 120, 172, 180, 161, 175, 137, 157, 110, 188, 185, 106, 173,
188, 195, 132, 118, 190, 118, 109, 130, 190, 158, 153, 136, 106,
136, 182, 170, 118, 135, 196, 129, 183, 115, 106, 150, 139, 100,
160, 184, 191, 189, 163, 140, 123, 151, 197, 193, 122, 182, 101,
154, 195, 174, 148, 154, 199, 168, 162, 152, 182, 199, 197, 159,
123, 152, 175, 187, 136, 106, 135, 189, 108, 168, 156, 114, 147,
105, 198, 192, 128, 156, 142, 160, 198, 154, 109, 146, 153, 144,
192, 199, 111, 127, 112, 160, 166, 198, 196, 170, 147, 199, 152,
150, 132, 181, 196, 171, 163, 150, 197, 135, 176, 196, 185, 101,
177, 126, 197, 149, 182, 172, 106, 130, 174, 198, 185, 124, 192,
190, 111, 180, 166, 124, 137, 152, 173, 180, 142, 110, 158, 112,
183, 193, 171, 106, 100, 118, 183, 142, 143, 188, 109, 136, 166,
104, 110, 134, 160, 195, 175, 148, 100, 169, 162, 113, 170, 127,
141, 128, 121, 167, 103, 134, 171, 190, 164, 130, 104, 181, 162,
174, 163, 183, 103, 125, 142, 157, 119, 121, 191, 140, 144, 190,
184, 111, 150, 143, 136, 107, 123, 103, 181, 167, 150, 123, 115,
191, 191, 122, 156, 164, 150, 184, 104, 144, 157, 129, 188, 113,
171, 118, 132, 177, 164, 188, 168, 148, 140, 125, 140, 185, 176,
165, 124, 144, 137, 133, 193, 158, 137, 100, 179, 113, 197, 105,
160, 139, 106, 104, 145, 137, 196, 105, 157, 197, 140, 138, 155,
138, 187, 138, 175, 143, 165, 128, 194, 107, 162, 166, 173, 122,
159, 107, 143, 108, 180, 120, 186, 135, 172, 178, 114, 170, 110,
136, 102, 186, 120, 147, 158, 128, 110, 171, 163, 181, 188, 194,
173, 137, 171, 113, 132, 165, 141, 124, 136, 117, 143, 190, 162,
181, 115, 181, 100, 180, 165, 183, 102, 120, 102, 100, 126, 185,
146, 151, 191, 113, 119, 171, 112, 134, 135, 144, 113, 190, 179,
129, 182, 157, 181, 144, 193, 109, 183, 194, 173, 194, 195, 156,
158, 119, 142, 197, 190, 105, 130, 162, 153, 190, 123, 144, 138,
167, 128, 176, 148, 193, 169, 175, 131, 198, 188, 178, 190, 148,
156, 112, 136, 135, 105, 102, 197, 144, 150, 128, 146, 156, 123,
180, 103, 144, 144, 163, 128, 133, 160, 104, 135, 188, 112, 120,
182, 151, 137, 179, 150, 162, 165, 108, 195, 185, 143, 131, 133,
132, 195, 152, 166, 129, 111, 139, 135, 139, 103, 176, 147, 114,
187, 191, 139, 171, 146, 189, 184, 173, 166, 115, 140, 171, 101,
169, 130, 182, 164, 175, 188, 171, 135, 110, 113, 152, 158, 193,
123, 169, 105, 178, 120, 179, 163, 154, 163, 144, 100, 168, 144,
161, 156, 115, 126, 137, 129, 142, 192, 162, 126, 172])
b = np.random.randn(100000)
b
array([ 9.30519601e-01, -3.44360662e-01, 6.65514977e-05, ...,
-2.30123357e+00, 2.17474374e-01, 2.86950376e-01])
cond = np.abs(b) > 3*1
b[cond]
array([-3.113988 , -3.43316674, -3.28413448, -4.18671551, 3.01815086,
-3.28350433, 3.39229337, -3.8349722 , 3.16620524, -3.01368522,
-3.15619888, -3.02706183, -3.20469248, -3.05930093, -3.25541995,
-3.16289873, 3.16185476, -3.04808409, 3.54344272, -3.2585841 ,
3.08115957, -3.17921412, 3.30126439, -3.18372694, -3.06651084,
3.58377405, -3.07022682, 3.16360384, -3.34376458, -3.81579603,
-3.99645404, 3.58753363, 3.35206183, -3.74457891, -3.33198327,
-3.33993979, -3.17078523, 3.02754635, -3.17691131, -3.12715306,
-3.38487724, 3.3550324 , 3.5759783 , 3.48937298, 3.36398366,
-3.53495624, -3.5156431 , 3.32001882, -3.12741237, 3.02816144,
-3.13263251, 3.76773434, -3.3346886 , 3.40044899, 3.29585109,
3.01097665, -3.06049384, 3.14896826, 3.11488174, 3.15971499,
-3.63196786, 3.21199513, 3.06839176, 3.06794524, 3.06776471,
3.24618032, 3.67058793, 3.00866597, 3.66160146, -3.19700386,
-3.09563525, -3.2564589 , 3.34902208, 3.33267903, 3.46037542,
3.3830624 , -3.10813257, -3.27485335, -3.09740389, 3.0953684 ,
3.53897655, -3.21084279, 3.54120948, 3.15640238, -3.14993374,
3.03442456, -3.10751423, 3.28260234, 3.19101812, 3.15943105,
3.25920238, -3.03706885, -3.30509468, -3.6464602 , 3.26733257,
3.01876074, 3.01606638, 3.21088046, 3.65200452, -3.08139884,
-3.07168219, -3.18855195, -3.06266893, 3.01535252, -3.03137453,
3.12828081, 3.03530021, 3.60066559, -3.10464509, 3.30509352,
3.37102098, 3.75526796, 3.09714856, 3.08665134, -3.4107222 ,
-3.26908219, -3.18781645, 3.63833634, -3.008857 , 3.72641975,
-3.01805091, -3.11050402, -3.31183682, -3.29071569, -3.19408484,
3.1183616 , 3.02554176, 3.21680547, -3.12981011, -3.73713851,
3.24781689, -3.25066575, 3.13496805, 3.17233319, -3.2934357 ,
3.10123582, -3.15880969, -3.74877105, 3.09063324, 3.63149354,
-3.19359058, -3.00209698, 3.15207181, -3.03289971, 3.10833683,
3.72043919, 3.19074043, 3.25338012, -3.02472805, 3.06074732,
3.07540501, -3.73763831, 3.45588068, -3.14578358, -3.24339571,
-3.59522797, -3.53748643, -3.06930718, -3.83786393, 3.23984478,
-3.17985666, -3.16852988, -3.31971537, -3.2654107 , -3.12220388,
3.88867571, 3.08592098, -3.53951333, -3.27504561, 3.30145401,
3.00455928, -3.00767388, 3.00140698, 3.1611202 , -3.28482905,
-4.00614279, -3.32878319, -3.01924921, -4.09578766, 3.09062415,
3.02481019, -3.47085658, 3.13053125, -3.12608597, -3.29770516,
-3.1034331 , 3.03759591, 3.10319671, -3.23967661, -3.08915652,
-3.10352916, 3.06012689, 3.40541064, -3.20405873, 3.46436661,
3.09252034, 3.76867128, 3.08367434, -3.05931862, 3.46965336,
3.22184177, 3.28171723, -3.21577232, 3.20512585, -3.2546693 ,
3.18373904, -3.22062146, -3.01107575, -3.33173298, -3.74574667,
-3.76695662, 3.21289624, 3.46063306, -3.09351324, 3.69547389,
3.07973201, 3.00144524, -3.97588352, 3.15801451, -3.91660008,
-3.00728368, -3.31921622, -3.07473275, -3.18439241, 3.04325623,
-3.11538206, 3.22550755, 3.08726614, 3.1049817 , 3.04386415,
3.17139888, -3.33516451, 3.26625014, 3.25838664, -3.13951313,
3.04181862, -3.01509484, -3.10914859, 3.12164542, -3.44755551,
3.18598457, 3.96503764, -3.09481686, 3.11598421, 3.05800474,
3.04178915, -3.0661321 , -3.13813002, -3.23868465, -3.27279316,
-3.19480029, -3.17663568, 3.06456787, -3.60160737, 3.4677848 ,
3.21025512, -3.11477108, 3.43266798, 3.21094261, -3.0864423 ,
3.09832142, -3.00932592, 3.02609391, 3.28907182, 3.93589522,
-3.25151322, 3.19567292, 3.36226978, 3.57918895, 3.01461819,
3.3557285 , 3.33894176, -3.11794324, -3.66518483, 3.06824796,
3.29518237, -3.28611762, -3.28105128, 3.1483141 , 3.1262014 ,
3.27454636, -3.13272194, 3.12399007, -3.52308132, 3.63540754,
3.29869533, 3.00378091, 3.08858567, 3.6191856 ])
数据转换
轴和元素转换
df = pd.DataFrame(np.random.randint(0,10,size=(10,3)),
columns = ['python','tensorflow','keras'],
index = list('ABCDEFGHIJ'))
df.T
|
A |
B |
C |
D |
E |
F |
G |
H |
I |
J |
python |
4 |
4 |
7 |
4 |
8 |
0 |
5 |
8 |
4 |
2 |
tensorflow |
9 |
5 |
8 |
7 |
6 |
1 |
5 |
1 |
7 |
0 |
keras |
3 |
1 |
5 |
1 |
7 |
7 |
7 |
6 |
9 |
9 |
df.rename(index={'A':'X','K':'Y'},
columns = {'python':'人工智能'},
inplace = False)
|
人工智能 |
tensorflow |
keras |
X |
4 |
9 |
3 |
B |
4 |
5 |
1 |
C |
7 |
8 |
5 |
D |
4 |
7 |
1 |
E |
8 |
6 |
7 |
F |
0 |
1 |
7 |
G |
5 |
5 |
7 |
H |
8 |
1 |
6 |
I |
4 |
7 |
9 |
J |
2 |
0 |
9 |
df.replace(5,
50,
inplace = False)
|
python |
tensorflow |
keras |
A |
4 |
9 |
3 |
B |
4 |
50 |
1 |
C |
7 |
8 |
50 |
D |
4 |
7 |
1 |
E |
8 |
6 |
7 |
F |
0 |
1 |
7 |
G |
50 |
50 |
7 |
H |
8 |
1 |
6 |
I |
4 |
7 |
9 |
J |
2 |
0 |
9 |
df.replace([2,7],
1024,
inplace=False)
|
python |
tensorflow |
keras |
A |
4 |
9 |
3 |
B |
4 |
5 |
1 |
C |
1024 |
8 |
5 |
D |
4 |
1024 |
1 |
E |
8 |
6 |
1024 |
F |
0 |
1 |
1024 |
G |
5 |
5 |
1024 |
H |
8 |
1 |
6 |
I |
4 |
1024 |
9 |
J |
1024 |
0 |
9 |
df.iloc[4,2] = np.NaN
df
|
python |
tensorflow |
keras |
A |
4 |
9 |
3.0 |
B |
4 |
5 |
1.0 |
C |
7 |
8 |
5.0 |
D |
4 |
7 |
1.0 |
E |
8 |
6 |
NaN |
F |
0 |
1 |
7.0 |
G |
5 |
5 |
7.0 |
H |
8 |
1 |
6.0 |
I |
4 |
7 |
9.0 |
J |
2 |
0 |
9.0 |
df.replace({0:2048,np.NaN:-100},
inplace = True)
df
|
python |
tensorflow |
keras |
A |
4 |
9 |
3.0 |
B |
4 |
5 |
1.0 |
C |
7 |
8 |
5.0 |
D |
4 |
7 |
1.0 |
E |
8 |
6 |
-100.0 |
F |
2048 |
1 |
7.0 |
G |
5 |
5 |
7.0 |
H |
8 |
1 |
6.0 |
I |
4 |
7 |
9.0 |
J |
2 |
2048 |
9.0 |
df.replace({'tensorflow':1024},
2048)
|
python |
tensorflow |
keras |
A |
4 |
9 |
3.0 |
B |
4 |
5 |
1.0 |
C |
7 |
8 |
5.0 |
D |
4 |
7 |
1.0 |
E |
8 |
6 |
-100.0 |
F |
2048 |
1 |
7.0 |
G |
5 |
5 |
7.0 |
H |
8 |
1 |
6.0 |
I |
4 |
7 |
9.0 |
J |
2 |
2048 |
9.0 |
map映射元素转变
df['人工智能'] = np.random.choice([1024,2048,6],
10,
p=[1/3,1/3,1/3])
df
|
python |
tensorflow |
keras |
人工智能 |
A |
4 |
9 |
3.0 |
6 |
B |
4 |
5 |
1.0 |
2048 |
C |
7 |
8 |
5.0 |
2048 |
D |
4 |
7 |
1.0 |
1024 |
E |
8 |
6 |
-100.0 |
2048 |
F |
2048 |
1 |
7.0 |
1024 |
G |
5 |
5 |
7.0 |
2048 |
H |
8 |
1 |
6.0 |
1024 |
I |
4 |
7 |
9.0 |
6 |
J |
2 |
2048 |
9.0 |
2048 |
df['人工智能'].map({1024:3.14,2048:2.718,6:1108})
A 1108.000
B 2.718
C 2.718
D 3.140
E 2.718
F 3.140
G 2.718
H 3.140
I 1108.000
J 2.718
Name: 人工智能, dtype: float64
df['keras'].map(lambda x:True if x > 0 else False)
A True
B True
C True
D True
E False
F True
G True
H True
I True
J True
Name: keras, dtype: bool
def convert(x):
if x >= 5:
return True
else:
return False
df['level'] = df['tensorflow'].map(convert)
df
|
python |
tensorflow |
keras |
人工智能 |
level |
A |
4 |
9 |
3.0 |
6 |
True |
B |
4 |
5 |
1.0 |
2048 |
True |
C |
7 |
8 |
5.0 |
2048 |
True |
D |
4 |
7 |
1.0 |
1024 |
True |
E |
8 |
6 |
-100.0 |
2048 |
True |
F |
2048 |
1 |
7.0 |
1024 |
False |
G |
5 |
5 |
7.0 |
2048 |
True |
H |
8 |
1 |
6.0 |
1024 |
False |
I |
4 |
7 |
9.0 |
6 |
True |
J |
2 |
2048 |
9.0 |
2048 |
True |
apply映射元素转变
df['人工智能'].apply(lambda x:x+100)
A 106
B 2148
C 2148
D 1124
E 2148
F 1124
G 2148
H 1124
I 106
J 2148
Name: 人工智能, dtype: int64
df['level'].apply(lambda x:1 if x else 0)
A 1
B 1
C 1
D 1
E 1
F 0
G 1
H 0
I 1
J 1
Name: level, dtype: int64
df.apply(lambda x:x+1000)
|
python |
tensorflow |
keras |
人工智能 |
level |
A |
1004 |
1009 |
1003.0 |
1006 |
1001 |
B |
1004 |
1005 |
1001.0 |
3048 |
1001 |
C |
1007 |
1008 |
1005.0 |
3048 |
1001 |
D |
1004 |
1007 |
1001.0 |
2024 |
1001 |
E |
1008 |
1006 |
900.0 |
3048 |
1001 |
F |
3048 |
1001 |
1007.0 |
2024 |
1000 |
G |
1005 |
1005 |
1007.0 |
3048 |
1001 |
H |
1008 |
1001 |
1006.0 |
2024 |
1000 |
I |
1004 |
1007 |
1009.0 |
1006 |
1001 |
J |
1002 |
3048 |
1009.0 |
3048 |
1001 |
def convert(x):
return (x.median(),
x.count(),
x.min(),
x.max(),
x.std()
)
df.apply(convert).round(1)
|
python |
tensorflow |
keras |
人工智能 |
level |
0 |
4.5 |
6.5 |
5.5 |
1536.0 |
1.0 |
1 |
10.0 |
10.0 |
10.0 |
10.0 |
10 |
2 |
2.0 |
1.0 |
-100.0 |
6.0 |
False |
3 |
2048.0 |
2048.0 |
9.0 |
2048.0 |
True |
4 |
646.0 |
645.9 |
33.4 |
840.9 |
0.421637 |
df.apply(convert,axis = 1)
A (4.0, 5, True, 9, 3.0495901363953815)
B (4.0, 5, 1.0, 2048, 914.6653486385062)
C (7.0, 5, True, 2048, 913.5495060477018)
D (4.0, 5, 1.0, 1024, 456.5000547645093)
E (6.0, 5, -100.0, 2048, 926.5164866315116)
F (7.0, 5, False, 2048, 914.555903157374)
G (5.0, 5, True, 2048, 913.8835812071469)
H (6.0, 5, False, 1024, 456.2819303895345)
I (6.0, 5, True, 9.0, 3.0495901363953815)
J (9.0, 5, True, 2048, 1119.5491503279345)
dtype: object
transform元素转变
df = pd.DataFrame(np.random.randint(0,10,size=(10,3)),
columns=['python','tensorflow','keras'],
index=list('ABCDEFGHIJ'))
df
|
python |
tensorflow |
keras |
A |
4 |
9 |
4 |
B |
9 |
4 |
9 |
C |
4 |
4 |
6 |
D |
3 |
1 |
9 |
E |
1 |
3 |
5 |
F |
5 |
6 |
7 |
G |
4 |
2 |
3 |
H |
1 |
1 |
2 |
I |
7 |
6 |
3 |
J |
4 |
2 |
9 |
df['python'].transform(lambda x:1024 if x > 5 else -1024)
A -1024
B 1024
C -1024
D -1024
E -1024
F -1024
G -1024
H -1024
I 1024
J -1024
Name: python, dtype: int64
df['tensorflow'].transform([np.sqrt,
np.square,
np.cumsum]
)
|
sqrt |
square |
cumsum |
A |
3.000000 |
81 |
9 |
B |
2.000000 |
16 |
13 |
C |
2.000000 |
16 |
17 |
D |
1.000000 |
1 |
18 |
E |
1.732051 |
9 |
21 |
F |
2.449490 |
36 |
27 |
G |
1.414214 |
4 |
29 |
H |
1.000000 |
1 |
30 |
I |
2.449490 |
36 |
36 |
J |
1.414214 |
4 |
38 |
def convert(x):
if x>5 :
return True
else:
return False
df.transform({'python':np.cumsum,
'tensorflow':np.square,
'keras':convert})
|
python |
tensorflow |
keras |
A |
4 |
81 |
False |
B |
13 |
16 |
True |
C |
17 |
16 |
True |
D |
20 |
1 |
True |
E |
21 |
9 |
False |
F |
26 |
36 |
True |
G |
30 |
4 |
False |
H |
31 |
1 |
False |
I |
38 |
36 |
False |
J |
42 |
4 |
True |
重排随机抽样哑变量
df
|
python |
tensorflow |
keras |
A |
4 |
9 |
4 |
B |
9 |
4 |
9 |
C |
4 |
4 |
6 |
D |
3 |
1 |
9 |
E |
1 |
3 |
5 |
F |
5 |
6 |
7 |
G |
4 |
2 |
3 |
H |
1 |
1 |
2 |
I |
7 |
6 |
3 |
J |
4 |
2 |
9 |
index = np.random.permutation(10)
index
array([0, 4, 6, 2, 7, 8, 5, 3, 1, 9])
df.take(index)
|
python |
tensorflow |
keras |
A |
4 |
9 |
4 |
E |
1 |
3 |
5 |
G |
4 |
2 |
3 |
C |
4 |
4 |
6 |
H |
1 |
1 |
2 |
I |
7 |
6 |
3 |
F |
5 |
6 |
7 |
D |
3 |
1 |
9 |
B |
9 |
4 |
9 |
J |
4 |
2 |
9 |
df.take(np.random.randint(0,10,size=20))
|
python |
tensorflow |
keras |
B |
9 |
4 |
9 |
I |
7 |
6 |
3 |
F |
5 |
6 |
7 |
H |
1 |
1 |
2 |
F |
5 |
6 |
7 |
E |
1 |
3 |
5 |
J |
4 |
2 |
9 |
B |
9 |
4 |
9 |
E |
1 |
3 |
5 |
A |
4 |
9 |
4 |
C |
4 |
4 |
6 |
H |
1 |
1 |
2 |
B |
9 |
4 |
9 |
J |
4 |
2 |
9 |
F |
5 |
6 |
7 |
B |
9 |
4 |
9 |
C |
4 |
4 |
6 |
G |
4 |
2 |
3 |
F |
5 |
6 |
7 |
H |
1 |
1 |
2 |
df2 = pd.DataFrame(data={'key':['a','a','b','b','c','b','c']})
df2
|
key |
0 |
a |
1 |
a |
2 |
b |
3 |
b |
4 |
c |
5 |
b |
6 |
c |
pd.get_dummies(df2,
prefix='',
prefix_sep='')
|
a |
b |
c |
0 |
1 |
0 |
0 |
1 |
1 |
0 |
0 |
2 |
0 |
1 |
0 |
3 |
0 |
1 |
0 |
4 |
0 |
0 |
1 |
5 |
0 |
1 |
0 |
6 |
0 |
0 |
1 |
数据重塑
df.T
|
A |
B |
C |
D |
E |
F |
G |
H |
I |
J |
python |
4 |
9 |
4 |
3 |
1 |
5 |
4 |
1 |
7 |
4 |
tensorflow |
9 |
4 |
4 |
1 |
3 |
6 |
2 |
1 |
6 |
2 |
keras |
4 |
9 |
6 |
9 |
5 |
7 |
3 |
2 |
3 |
9 |
df2 = pd.DataFrame(np.random.randint(0,10,size=(20,3)),
columns=['python','math','en'],
index= pd.MultiIndex.from_product([list('ABCDEFGHIJ'),
['期中','期末']]))
df2
|
|
python |
math |
en |
A |
期中 |
9 |
7 |
8 |
期末 |
9 |
2 |
2 |
B |
期中 |
1 |
1 |
5 |
期末 |
5 |
6 |
4 |
C |
期中 |
5 |
8 |
5 |
期末 |
5 |
6 |
9 |
D |
期中 |
4 |
1 |
2 |
期末 |
0 |
1 |
7 |
E |
期中 |
4 |
3 |
2 |
期末 |
4 |
6 |
6 |
F |
期中 |
0 |
1 |
0 |
期末 |
9 |
7 |
3 |
G |
期中 |
9 |
9 |
5 |
期末 |
6 |
9 |
5 |
H |
期中 |
4 |
5 |
2 |
期末 |
8 |
1 |
1 |
I |
期中 |
3 |
5 |
4 |
期末 |
5 |
4 |
8 |
J |
期中 |
8 |
4 |
3 |
期末 |
9 |
7 |
8 |
df2.unstack(level = 1)
|
python |
math |
en |
|
期中 |
期末 |
期中 |
期末 |
期中 |
期末 |
A |
9 |
9 |
7 |
2 |
8 |
2 |
B |
1 |
5 |
1 |
6 |
5 |
4 |
C |
5 |
5 |
8 |
6 |
5 |
9 |
D |
4 |
0 |
1 |
1 |
2 |
7 |
E |
4 |
4 |
3 |
6 |
2 |
6 |
F |
0 |
9 |
1 |
7 |
0 |
3 |
G |
9 |
6 |
9 |
9 |
5 |
5 |
H |
4 |
8 |
5 |
1 |
2 |
1 |
I |
3 |
5 |
5 |
4 |
4 |
8 |
J |
8 |
9 |
4 |
7 |
3 |
8 |
df2.unstack(level = 0)
|
python |
... |
en |
|
A |
B |
C |
D |
E |
F |
G |
H |
I |
J |
... |
A |
B |
C |
D |
E |
F |
G |
H |
I |
J |
期中 |
9 |
1 |
5 |
4 |
4 |
0 |
9 |
4 |
3 |
8 |
... |
8 |
5 |
5 |
2 |
2 |
0 |
5 |
2 |
4 |
3 |
期末 |
9 |
5 |
5 |
0 |
4 |
9 |
6 |
8 |
5 |
9 |
... |
2 |
4 |
9 |
7 |
6 |
3 |
5 |
1 |
8 |
8 |
2 rows × 30 columns
df2.unstack(level = -1)
|
python |
math |
en |
|
期中 |
期末 |
期中 |
期末 |
期中 |
期末 |
A |
9 |
9 |
7 |
2 |
8 |
2 |
B |
1 |
5 |
1 |
6 |
5 |
4 |
C |
5 |
5 |
8 |
6 |
5 |
9 |
D |
4 |
0 |
1 |
1 |
2 |
7 |
E |
4 |
4 |
3 |
6 |
2 |
6 |
F |
0 |
9 |
1 |
7 |
0 |
3 |
G |
9 |
6 |
9 |
9 |
5 |
5 |
H |
4 |
8 |
5 |
1 |
2 |
1 |
I |
3 |
5 |
5 |
4 |
4 |
8 |
J |
8 |
9 |
4 |
7 |
3 |
8 |
df2.stack()
A 期中 python 9
math 7
en 8
期末 python 9
math 2
en 2
B 期中 python 1
math 1
en 5
期末 python 5
math 6
en 4
C 期中 python 5
math 8
en 5
期末 python 5
math 6
en 9
D 期中 python 4
math 1
en 2
期末 python 0
math 1
en 7
E 期中 python 4
math 3
en 2
期末 python 4
math 6
en 6
F 期中 python 0
math 1
en 0
期末 python 9
math 7
en 3
G 期中 python 9
math 9
en 5
期末 python 6
math 9
en 5
H 期中 python 4
math 5
en 2
期末 python 8
math 1
en 1
I 期中 python 3
math 5
en 4
期末 python 5
math 4
en 8
J 期中 python 8
math 4
en 3
期末 python 9
math 7
en 8
dtype: int32
df2.unstack().stack(level = 0)
|
|
期中 |
期末 |
A |
en |
8 |
2 |
math |
7 |
2 |
python |
9 |
9 |
B |
en |
5 |
4 |
math |
1 |
6 |
python |
1 |
5 |
C |
en |
5 |
9 |
math |
8 |
6 |
python |
5 |
5 |
D |
en |
2 |
7 |
math |
1 |
1 |
python |
4 |
0 |
E |
en |
2 |
6 |
math |
3 |
6 |
python |
4 |
4 |
F |
en |
0 |
3 |
math |
1 |
7 |
python |
0 |
9 |
G |
en |
5 |
5 |
math |
9 |
9 |
python |
9 |
6 |
H |
en |
2 |
1 |
math |
5 |
1 |
python |
4 |
8 |
I |
en |
4 |
8 |
math |
5 |
4 |
python |
3 |
5 |
J |
en |
3 |
8 |
math |
4 |
7 |
python |
8 |
9 |
df2.mean(axis = 0)
python 5.35
math 4.65
en 4.45
dtype: float64
df2.mean(axis = 1)
A 期中 8.000000
期末 4.333333
B 期中 2.333333
期末 5.000000
C 期中 6.000000
期末 6.666667
D 期中 2.333333
期末 2.666667
E 期中 3.000000
期末 5.333333
F 期中 0.333333
期末 6.333333
G 期中 7.666667
期末 6.666667
H 期中 3.666667
期末 3.333333
I 期中 4.000000
期末 5.666667
J 期中 5.000000
期末 8.000000
dtype: float64
df2.groupby(level=1).mean()
|
python |
math |
en |
期中 |
4.7 |
4.4 |
3.6 |
期末 |
6.0 |
4.9 |
5.3 |
df2.groupby(level=0).mean()
|
python |
math |
en |
A |
9.0 |
4.5 |
5.0 |
B |
3.0 |
3.5 |
4.5 |
C |
5.0 |
7.0 |
7.0 |
D |
2.0 |
1.0 |
4.5 |
E |
4.0 |
4.5 |
4.0 |
F |
4.5 |
4.0 |
1.5 |
G |
7.5 |
9.0 |
5.0 |
H |
6.0 |
3.0 |
1.5 |
I |
4.0 |
4.5 |
6.0 |
J |
8.5 |
5.5 |
5.5 |
pandas高级应用
数学和统计方法
简单统计指标
df =pd.DataFrame(np.random.randint(0,10,size=(20,3)),
columns=['python','math','en'],
index=list('QWERTYUIOPLKJHGFDSAZ'))
df
|
python |
math |
en |
Q |
6 |
1 |
5 |
W |
3 |
1 |
1 |
E |
2 |
7 |
3 |
R |
0 |
2 |
0 |
T |
6 |
2 |
6 |
Y |
5 |
0 |
5 |
U |
9 |
7 |
5 |
I |
7 |
7 |
8 |
O |
5 |
3 |
3 |
P |
0 |
8 |
2 |
L |
7 |
0 |
3 |
K |
4 |
8 |
5 |
J |
2 |
0 |
7 |
H |
4 |
6 |
3 |
G |
5 |
9 |
8 |
F |
7 |
6 |
1 |
D |
9 |
1 |
6 |
S |
8 |
2 |
3 |
A |
8 |
7 |
4 |
Z |
3 |
7 |
1 |
df.iloc[6,2] = np.NaN
df
|
python |
math |
en |
Q |
6 |
1 |
5.0 |
W |
3 |
1 |
1.0 |
E |
2 |
7 |
3.0 |
R |
0 |
2 |
0.0 |
T |
6 |
2 |
6.0 |
Y |
5 |
0 |
5.0 |
U |
9 |
7 |
NaN |
I |
7 |
7 |
8.0 |
O |
5 |
3 |
3.0 |
P |
0 |
8 |
2.0 |
L |
7 |
0 |
3.0 |
K |
4 |
8 |
5.0 |
J |
2 |
0 |
7.0 |
H |
4 |
6 |
3.0 |
G |
5 |
9 |
8.0 |
F |
7 |
6 |
1.0 |
D |
9 |
1 |
6.0 |
S |
8 |
2 |
3.0 |
A |
8 |
7 |
4.0 |
Z |
3 |
7 |
1.0 |
df.count()
python 20
math 20
en 19
dtype: int64
df.mean()
python 5.000000
math 4.200000
en 3.894737
dtype: float64
df.median()
python 5.0
math 4.5
en 3.0
dtype: float64
df.max()
python 9.0
math 9.0
en 8.0
dtype: float64
df.min()
python 0.0
math 0.0
en 0.0
dtype: float64
df['python'].unique()
array([6, 3, 2, 0, 5, 9, 7, 4, 8])
df['math'].value_counts()
7 5
1 3
2 3
0 3
8 2
6 2
3 1
9 1
Name: math, dtype: int64
df.quantile(q=[0,0.25,0.5,0.75,1])
|
python |
math |
en |
0.00 |
0.0 |
0.0 |
0.0 |
0.25 |
3.0 |
1.0 |
2.5 |
0.50 |
5.0 |
4.5 |
3.0 |
0.75 |
7.0 |
7.0 |
5.5 |
1.00 |
9.0 |
9.0 |
8.0 |
df.describe().round()
|
python |
math |
en |
count |
20.0 |
20.0 |
19.0 |
mean |
5.0 |
4.0 |
4.0 |
std |
3.0 |
3.0 |
2.0 |
min |
0.0 |
0.0 |
0.0 |
25% |
3.0 |
1.0 |
2.0 |
50% |
5.0 |
4.0 |
3.0 |
75% |
7.0 |
7.0 |
6.0 |
max |
9.0 |
9.0 |
8.0 |
索引标签、位置获取
df['python'].argmax()
6
df.idxmax()
python U
math G
en I
dtype: object
更多统计指标
df.cumsum()
|
python |
math |
en |
Q |
6 |
1 |
5.0 |
W |
9 |
2 |
6.0 |
E |
11 |
9 |
9.0 |
R |
11 |
11 |
9.0 |
T |
17 |
13 |
15.0 |
Y |
22 |
13 |
20.0 |
U |
31 |
20 |
NaN |
I |
38 |
27 |
28.0 |
O |
43 |
30 |
31.0 |
P |
43 |
38 |
33.0 |
L |
50 |
38 |
36.0 |
K |
54 |
46 |
41.0 |
J |
56 |
46 |
48.0 |
H |
60 |
52 |
51.0 |
G |
65 |
61 |
59.0 |
F |
72 |
67 |
60.0 |
D |
81 |
68 |
66.0 |
S |
89 |
70 |
69.0 |
A |
97 |
77 |
73.0 |
Z |
100 |
84 |
74.0 |
df.cumprod()
|
python |
math |
en |
Q |
6 |
1 |
5.0 |
W |
18 |
1 |
5.0 |
E |
36 |
7 |
15.0 |
R |
0 |
14 |
0.0 |
T |
0 |
28 |
0.0 |
Y |
0 |
0 |
0.0 |
U |
0 |
0 |
NaN |
I |
0 |
0 |
0.0 |
O |
0 |
0 |
0.0 |
P |
0 |
0 |
0.0 |
L |
0 |
0 |
0.0 |
K |
0 |
0 |
0.0 |
J |
0 |
0 |
0.0 |
H |
0 |
0 |
0.0 |
G |
0 |
0 |
0.0 |
F |
0 |
0 |
0.0 |
D |
0 |
0 |
0.0 |
S |
0 |
0 |
0.0 |
A |
0 |
0 |
0.0 |
Z |
0 |
0 |
0.0 |
df.cummin()
|
python |
math |
en |
Q |
6 |
1 |
5.0 |
W |
3 |
1 |
1.0 |
E |
2 |
1 |
1.0 |
R |
0 |
1 |
0.0 |
T |
0 |
1 |
0.0 |
Y |
0 |
0 |
0.0 |
U |
0 |
0 |
NaN |
I |
0 |
0 |
0.0 |
O |
0 |
0 |
0.0 |
P |
0 |
0 |
0.0 |
L |
0 |
0 |
0.0 |
K |
0 |
0 |
0.0 |
J |
0 |
0 |
0.0 |
H |
0 |
0 |
0.0 |
G |
0 |
0 |
0.0 |
F |
0 |
0 |
0.0 |
D |
0 |
0 |
0.0 |
S |
0 |
0 |
0.0 |
A |
0 |
0 |
0.0 |
Z |
0 |
0 |
0.0 |
df.std()
python 2.733804
math 3.221637
en 2.401267
dtype: float64
df.var()
python 7.473684
math 10.378947
en 5.766082
dtype: float64
df.diff()
|
python |
math |
en |
Q |
NaN |
NaN |
NaN |
W |
-3.0 |
0.0 |
-4.0 |
E |
-1.0 |
6.0 |
2.0 |
R |
-2.0 |
-5.0 |
-3.0 |
T |
6.0 |
0.0 |
6.0 |
Y |
-1.0 |
-2.0 |
-1.0 |
U |
4.0 |
7.0 |
NaN |
I |
-2.0 |
0.0 |
NaN |
O |
-2.0 |
-4.0 |
-5.0 |
P |
-5.0 |
5.0 |
-1.0 |
L |
7.0 |
-8.0 |
1.0 |
K |
-3.0 |
8.0 |
2.0 |
J |
-2.0 |
-8.0 |
2.0 |
H |
2.0 |
6.0 |
-4.0 |
G |
1.0 |
3.0 |
5.0 |
F |
2.0 |
-3.0 |
-7.0 |
D |
2.0 |
-5.0 |
5.0 |
S |
-1.0 |
1.0 |
-3.0 |
A |
0.0 |
5.0 |
1.0 |
Z |
-5.0 |
0.0 |
-3.0 |
df.pct_change().round(2)
|
python |
math |
en |
Q |
NaN |
NaN |
NaN |
W |
-0.50 |
0.00 |
-0.80 |
E |
-0.33 |
6.00 |
2.00 |
R |
-1.00 |
-0.71 |
-1.00 |
T |
inf |
0.00 |
inf |
Y |
-0.17 |
-1.00 |
-0.17 |
U |
0.80 |
inf |
0.00 |
I |
-0.22 |
0.00 |
0.60 |
O |
-0.29 |
-0.57 |
-0.62 |
P |
-1.00 |
1.67 |
-0.33 |
L |
inf |
-1.00 |
0.50 |
K |
-0.43 |
inf |
0.67 |
J |
-0.50 |
-1.00 |
0.40 |
H |
1.00 |
inf |
-0.57 |
G |
0.25 |
0.50 |
1.67 |
F |
0.40 |
-0.33 |
-0.88 |
D |
0.29 |
-0.83 |
5.00 |
S |
-0.11 |
1.00 |
-0.50 |
A |
0.00 |
2.50 |
0.33 |
Z |
-0.62 |
0.00 |
-0.75 |
高级统计指标
df.cov()
|
python |
math |
en |
python |
7.473684 |
-0.684211 |
2.421053 |
math |
-0.684211 |
10.378947 |
0.172515 |
en |
2.421053 |
0.172515 |
5.766082 |
df.var()
python 7.473684
math 10.378947
en 5.766082
dtype: float64
df['python'].cov(df['math'])
-0.6842105263157894
df.corr()
|
python |
math |
en |
python |
1.000000 |
-0.077687 |
0.382359 |
math |
-0.077687 |
1.000000 |
0.022174 |
en |
0.382359 |
0.022174 |
1.000000 |
df.corrwith(df['en'])
python 0.382359
math 0.022174
en 1.000000
dtype: float64
排序
df = pd.DataFrame(np.random.randint(0,20,size=(20,3)),
columns=['python','math','en'],
index=list('QWERTYUIOPLKJHGDSAZM'))
df
|
python |
math |
en |
Q |
9 |
16 |
17 |
W |
5 |
3 |
5 |
E |
1 |
0 |
4 |
R |
0 |
12 |
14 |
T |
6 |
8 |
3 |
Y |
16 |
14 |
0 |
U |
17 |
17 |
8 |
I |
5 |
4 |
8 |
O |
13 |
5 |
12 |
P |
4 |
5 |
10 |
L |
15 |
3 |
7 |
K |
17 |
4 |
10 |
J |
8 |
14 |
12 |
H |
10 |
3 |
13 |
G |
7 |
16 |
16 |
D |
0 |
3 |
8 |
S |
15 |
16 |
14 |
A |
3 |
11 |
11 |
Z |
18 |
15 |
4 |
M |
10 |
14 |
14 |
df.sort_index(axis = 0,
ascending = False)
|
python |
math |
en |
Z |
18 |
15 |
4 |
Y |
16 |
14 |
0 |
W |
5 |
3 |
5 |
U |
17 |
17 |
8 |
T |
6 |
8 |
3 |
S |
15 |
16 |
14 |
R |
0 |
12 |
14 |
Q |
9 |
16 |
17 |
P |
4 |
5 |
10 |
O |
13 |
5 |
12 |
M |
10 |
14 |
14 |
L |
15 |
3 |
7 |
K |
17 |
4 |
10 |
J |
8 |
14 |
12 |
I |
5 |
4 |
8 |
H |
10 |
3 |
13 |
G |
7 |
16 |
16 |
E |
1 |
0 |
4 |
D |
0 |
3 |
8 |
A |
3 |
11 |
11 |
df.sort_values(by='python',
ascending=True)
|
python |
math |
en |
R |
0 |
12 |
14 |
D |
0 |
3 |
8 |
E |
1 |
0 |
4 |
A |
3 |
11 |
11 |
P |
4 |
5 |
10 |
W |
5 |
3 |
5 |
I |
5 |
4 |
8 |
T |
6 |
8 |
3 |
G |
7 |
16 |
16 |
J |
8 |
14 |
12 |
Q |
9 |
16 |
17 |
H |
10 |
3 |
13 |
M |
10 |
14 |
14 |
O |
13 |
5 |
12 |
S |
15 |
16 |
14 |
L |
15 |
3 |
7 |
Y |
16 |
14 |
0 |
K |
17 |
4 |
10 |
U |
17 |
17 |
8 |
Z |
18 |
15 |
4 |
df.sort_values(by=['python','math'],
ascending=True)
|
python |
math |
en |
D |
0 |
3 |
8 |
R |
0 |
12 |
14 |
E |
1 |
0 |
4 |
A |
3 |
11 |
11 |
P |
4 |
5 |
10 |
W |
5 |
3 |
5 |
I |
5 |
4 |
8 |
T |
6 |
8 |
3 |
G |
7 |
16 |
16 |
J |
8 |
14 |
12 |
Q |
9 |
16 |
17 |
H |
10 |
3 |
13 |
M |
10 |
14 |
14 |
O |
13 |
5 |
12 |
L |
15 |
3 |
7 |
S |
15 |
16 |
14 |
Y |
16 |
14 |
0 |
K |
17 |
4 |
10 |
U |
17 |
17 |
8 |
Z |
18 |
15 |
4 |
df.nlargest(n=5,
columns='python')
|
python |
math |
en |
Z |
18 |
15 |
4 |
U |
17 |
17 |
8 |
K |
17 |
4 |
10 |
Y |
16 |
14 |
0 |
L |
15 |
3 |
7 |
df.nsmallest(n=5,
columns='python')
|
python |
math |
en |
R |
0 |
12 |
14 |
D |
0 |
3 |
8 |
E |
1 |
0 |
4 |
A |
3 |
11 |
11 |
P |
4 |
5 |
10 |
分箱操作
df = pd.DataFrame(np.random.randint(0,151,size=(100,3)),
columns=['python','math','en'])
df
|
python |
math |
en |
0 |
127 |
124 |
50 |
1 |
150 |
91 |
130 |
2 |
24 |
113 |
69 |
3 |
90 |
2 |
123 |
4 |
133 |
113 |
60 |
... |
... |
... |
... |
95 |
45 |
134 |
58 |
96 |
74 |
21 |
75 |
97 |
100 |
36 |
93 |
98 |
130 |
21 |
10 |
99 |
20 |
66 |
87 |
100 rows × 3 columns
pd.cut(df.python,
bins = 4,
labels=['不及格','及格','中等','优秀'],
right=True
)
0 优秀
1 优秀
2 不及格
3 中等
4 优秀
...
95 及格
96 及格
97 中等
98 优秀
99 不及格
Name: python, Length: 100, dtype: category
Categories (4, object): ['不及格' < '及格' < '中等' < '优秀']
df['等级'] = pd.cut(df.python,
bins = [0,30,60,90,120,150],
labels=['极差','不及格','及格','中等','优秀'])
df
|
python |
math |
en |
等级 |
0 |
127 |
124 |
50 |
优秀 |
1 |
150 |
91 |
130 |
优秀 |
2 |
24 |
113 |
69 |
极差 |
3 |
90 |
2 |
123 |
及格 |
4 |
133 |
113 |
60 |
优秀 |
... |
... |
... |
... |
... |
95 |
45 |
134 |
58 |
不及格 |
96 |
74 |
21 |
75 |
及格 |
97 |
100 |
36 |
93 |
中等 |
98 |
130 |
21 |
10 |
优秀 |
99 |
20 |
66 |
87 |
极差 |
100 rows × 4 columns
pd.qcut(df.python,
q = 4,
labels=['不及格','及格','中等','优秀'],
)
0 优秀
1 优秀
2 不及格
3 中等
4 优秀
...
95 及格
96 中等
97 优秀
98 优秀
99 不及格
Name: python, Length: 100, dtype: category
Categories (4, object): ['不及格' < '及格' < '中等' < '优秀']
pd.qcut(df.python,
q = 4,
labels=['不及格','及格','中等','优秀'],
).value_counts()
不及格 25
及格 25
中等 25
优秀 25
Name: python, dtype: int64
分组聚合
分组
df = pd.DataFrame(data={'sex':np.random.randint(0,2,size = 300),
'class':np.random.randint(1,9,size = 300),
'python':np.random.randint(0,151,size = 300),
'math':np.random.randint(0,151,size = 300),
'en':np.random.randint(0,151,size = 300)})
df['sex'] = df['sex'].map({0:'男',1:'女'})
df
|
sex |
class |
python |
math |
en |
0 |
男 |
8 |
16 |
70 |
85 |
1 |
男 |
1 |
37 |
8 |
106 |
2 |
男 |
4 |
25 |
106 |
63 |
3 |
男 |
3 |
60 |
55 |
101 |
4 |
男 |
2 |
121 |
114 |
14 |
... |
... |
... |
... |
... |
... |
295 |
男 |
4 |
33 |
84 |
66 |
296 |
男 |
8 |
142 |
134 |
89 |
297 |
女 |
2 |
150 |
88 |
130 |
298 |
男 |
8 |
62 |
127 |
1 |
299 |
男 |
1 |
98 |
101 |
80 |
300 rows × 5 columns
for name,group in df.groupby(by = 'sex'):
print(name)
print(group)
女
sex class python math en
6 女 8 102 26 26
11 女 2 94 140 71
12 女 5 109 27 127
18 女 6 65 86 52
21 女 7 147 58 55
.. .. ... ... ... ...
289 女 3 139 62 26
290 女 2 11 52 1
291 女 4 37 14 42
293 女 4 30 21 47
297 女 2 150 88 130
[136 rows x 5 columns]
男
sex class python math en
0 男 8 16 70 85
1 男 1 37 8 106
2 男 4 25 106 63
3 男 3 60 55 101
4 男 2 121 114 14
.. .. ... ... ... ...
294 男 6 124 90 107
295 男 4 33 84 66
296 男 8 142 134 89
298 男 8 62 127 1
299 男 1 98 101 80
[164 rows x 5 columns]
for name,group in df.groupby(by = ['sex','class']):
print(name)
print(group)
('女', 1)
sex class python math en
27 女 1 128 67 30
34 女 1 10 116 91
35 女 1 3 22 144
73 女 1 102 143 120
84 女 1 29 23 120
99 女 1 53 77 144
143 女 1 104 84 103
157 女 1 145 77 92
173 女 1 61 118 143
174 女 1 114 26 130
178 女 1 85 21 0
186 女 1 26 46 7
192 女 1 76 115 79
198 女 1 61 132 128
217 女 1 58 53 89
230 女 1 61 104 144
234 女 1 50 150 128
238 女 1 115 103 13
277 女 1 83 17 82
('女', 2)
sex class python math en
11 女 2 94 140 71
37 女 2 9 71 129
47 女 2 31 5 14
79 女 2 49 142 113
82 女 2 105 67 17
122 女 2 23 143 96
142 女 2 142 144 29
147 女 2 53 107 0
163 女 2 66 87 1
171 女 2 104 80 118
206 女 2 16 100 140
222 女 2 38 12 112
226 女 2 118 85 20
242 女 2 1 3 136
260 女 2 50 115 146
268 女 2 70 93 22
272 女 2 71 74 47
290 女 2 11 52 1
297 女 2 150 88 130
('女', 3)
sex class python math en
30 女 3 24 90 78
40 女 3 89 134 99
65 女 3 48 81 73
69 女 3 0 0 66
72 女 3 147 65 95
104 女 3 143 122 54
140 女 3 140 119 97
144 女 3 125 96 14
145 女 3 84 112 42
249 女 3 85 59 122
283 女 3 69 34 27
286 女 3 129 10 73
289 女 3 139 62 26
('女', 4)
sex class python math en
32 女 4 116 5 119
43 女 4 60 96 103
48 女 4 13 147 133
64 女 4 126 140 10
92 女 4 122 49 71
118 女 4 125 34 23
139 女 4 84 58 66
164 女 4 59 64 80
167 女 4 47 109 94
183 女 4 96 14 97
187 女 4 123 26 21
203 女 4 6 74 105
274 女 4 148 10 109
291 女 4 37 14 42
293 女 4 30 21 47
('女', 5)
sex class python math en
12 女 5 109 27 127
25 女 5 6 124 98
55 女 5 78 63 10
70 女 5 30 71 29
90 女 5 20 52 78
152 女 5 60 116 35
240 女 5 49 27 33
264 女 5 137 95 142
275 女 5 101 69 80
276 女 5 49 88 5
('女', 6)
sex class python math en
18 女 6 65 86 52
26 女 6 112 25 89
31 女 6 86 88 92
42 女 6 134 37 111
66 女 6 25 119 113
85 女 6 29 123 36
91 女 6 109 46 87
98 女 6 100 132 10
102 女 6 27 113 107
141 女 6 35 22 144
159 女 6 108 13 136
185 女 6 77 116 103
218 女 6 120 137 45
219 女 6 132 95 42
232 女 6 110 51 47
241 女 6 77 57 40
259 女 6 61 104 58
('女', 7)
sex class python math en
21 女 7 147 58 55
28 女 7 141 69 22
33 女 7 53 70 146
51 女 7 120 64 42
59 女 7 89 30 107
74 女 7 64 145 40
89 女 7 36 100 61
96 女 7 66 82 49
106 女 7 77 65 28
127 女 7 103 9 115
133 女 7 31 138 106
138 女 7 130 33 120
151 女 7 83 82 43
161 女 7 22 127 52
165 女 7 47 51 3
170 女 7 100 23 56
194 女 7 118 146 142
199 女 7 125 132 87
213 女 7 64 46 41
216 女 7 43 99 114
228 女 7 140 91 70
243 女 7 76 103 44
245 女 7 9 89 12
250 女 7 139 81 40
288 女 7 115 148 79
('女', 8)
sex class python math en
6 女 8 102 26 26
88 女 8 19 44 134
114 女 8 133 19 74
124 女 8 91 77 120
130 女 8 141 135 21
166 女 8 3 114 91
168 女 8 136 69 119
172 女 8 69 147 32
177 女 8 37 81 49
195 女 8 66 69 4
224 女 8 18 29 141
233 女 8 117 63 23
239 女 8 68 71 71
248 女 8 31 71 9
252 女 8 141 113 147
267 女 8 125 136 17
271 女 8 102 137 97
279 女 8 137 110 51
('男', 1)
sex class python math en
1 男 1 37 8 106
8 男 1 8 48 113
23 男 1 116 43 13
24 男 1 79 115 123
29 男 1 24 112 126
39 男 1 112 115 133
68 男 1 131 138 80
95 男 1 86 125 5
101 男 1 49 0 67
137 男 1 59 5 62
176 男 1 91 75 68
191 男 1 90 91 15
196 男 1 129 59 9
211 男 1 78 97 13
215 男 1 31 27 66
229 男 1 62 21 28
237 男 1 140 66 126
282 男 1 5 67 12
299 男 1 98 101 80
('男', 2)
sex class python math en
4 男 2 121 114 14
7 男 2 55 125 62
14 男 2 36 142 77
16 男 2 80 143 49
36 男 2 105 101 88
38 男 2 51 55 18
45 男 2 80 17 13
62 男 2 23 35 20
105 男 2 115 106 59
128 男 2 86 18 78
149 男 2 81 34 12
158 男 2 80 24 133
169 男 2 55 100 47
175 男 2 138 87 43
182 男 2 111 62 54
210 男 2 69 128 67
227 男 2 16 34 79
246 男 2 29 29 86
253 男 2 130 117 119
265 男 2 8 21 18
284 男 2 22 66 16
('男', 3)
sex class python math en
3 男 3 60 55 101
5 男 3 85 69 89
9 男 3 69 42 34
13 男 3 91 142 86
17 男 3 36 50 15
50 男 3 116 60 68
54 男 3 34 144 96
58 男 3 102 126 95
61 男 3 127 97 78
77 男 3 16 74 7
81 男 3 130 54 112
86 男 3 92 3 92
93 男 3 51 83 25
103 男 3 118 122 64
108 男 3 41 109 124
109 男 3 84 123 71
111 男 3 15 32 94
117 男 3 18 62 86
126 男 3 8 76 64
129 男 3 145 9 5
204 男 3 134 5 54
207 男 3 61 22 143
212 男 3 66 146 49
251 男 3 132 109 147
254 男 3 10 146 117
280 男 3 89 109 101
287 男 3 24 24 128
('男', 4)
sex class python math en
2 男 4 25 106 63
46 男 4 24 149 108
49 男 4 133 134 64
52 男 4 46 37 134
60 男 4 137 40 45
87 男 4 64 6 19
112 男 4 136 73 114
120 男 4 133 18 63
190 男 4 45 25 111
200 男 4 38 41 92
202 男 4 19 121 72
221 男 4 29 132 83
236 男 4 54 100 53
263 男 4 40 14 48
270 男 4 125 103 132
295 男 4 33 84 66
('男', 5)
sex class python math en
53 男 5 114 92 128
63 男 5 91 105 125
76 男 5 14 18 56
97 男 5 42 43 95
110 男 5 83 20 49
113 男 5 139 46 146
115 男 5 93 49 142
125 男 5 31 85 95
131 男 5 51 18 12
150 男 5 46 90 33
162 男 5 78 123 127
208 男 5 3 130 94
209 男 5 106 92 92
214 男 5 39 67 119
223 男 5 7 102 137
257 男 5 79 44 88
262 男 5 109 34 96
266 男 5 107 72 149
273 男 5 145 2 58
('男', 6)
sex class python math en
15 男 6 147 53 58
20 男 6 18 24 53
116 男 6 124 92 52
119 男 6 117 60 33
134 男 6 36 17 136
135 男 6 66 19 48
148 男 6 102 26 42
154 男 6 12 18 31
155 男 6 36 19 136
181 男 6 48 51 142
184 男 6 72 114 74
188 男 6 136 82 26
189 男 6 120 116 36
193 男 6 42 21 69
201 男 6 144 124 24
225 男 6 107 140 18
235 男 6 83 117 99
294 男 6 124 90 107
('男', 7)
sex class python math en
10 男 7 35 134 96
19 男 7 112 127 64
22 男 7 106 60 8
44 男 7 132 15 124
67 男 7 103 40 93
71 男 7 57 64 150
78 男 7 131 122 136
83 男 7 12 149 83
94 男 7 76 54 118
123 男 7 53 109 50
136 男 7 15 64 51
146 男 7 12 101 65
153 男 7 58 110 107
156 男 7 109 49 138
180 男 7 50 73 86
197 男 7 25 71 74
205 男 7 136 80 50
220 男 7 103 48 80
244 男 7 58 55 99
261 男 7 75 136 83
269 男 7 69 86 149
278 男 7 66 70 81
292 男 7 144 149 11
('男', 8)
sex class python math en
0 男 8 16 70 85
41 男 8 91 118 70
56 男 8 32 30 146
57 男 8 85 129 98
75 男 8 145 99 135
80 男 8 56 94 57
100 男 8 85 100 53
107 男 8 146 133 64
121 男 8 69 21 9
132 男 8 66 41 43
160 男 8 24 125 136
179 男 8 16 44 92
231 男 8 96 6 1
247 男 8 81 139 133
255 男 8 12 13 113
256 男 8 64 101 20
258 男 8 138 27 103
281 男 8 66 23 17
285 男 8 20 118 54
296 男 8 142 134 89
298 男 8 62 127 1
for name,group in df[['python','math']].groupby(df['sex']):
print(name)
print(group)
女
python math
6 102 26
11 94 140
12 109 27
18 65 86
21 147 58
.. ... ...
289 139 62
290 11 52
291 37 14
293 30 21
297 150 88
[136 rows x 2 columns]
男
python math
0 16 70
1 37 8
2 25 106
3 60 55
4 121 114
.. ... ...
294 124 90
295 33 84
296 142 134
298 62 127
299 98 101
[164 rows x 2 columns]
for name,group in df[['python','math']].groupby([df['sex'],df['class']]):
print(name)
print(group)
('女', 1)
python math
27 128 67
34 10 116
35 3 22
73 102 143
84 29 23
99 53 77
143 104 84
157 145 77
173 61 118
174 114 26
178 85 21
186 26 46
192 76 115
198 61 132
217 58 53
230 61 104
234 50 150
238 115 103
277 83 17
('女', 2)
python math
11 94 140
37 9 71
47 31 5
79 49 142
82 105 67
122 23 143
142 142 144
147 53 107
163 66 87
171 104 80
206 16 100
222 38 12
226 118 85
242 1 3
260 50 115
268 70 93
272 71 74
290 11 52
297 150 88
('女', 3)
python math
30 24 90
40 89 134
65 48 81
69 0 0
72 147 65
104 143 122
140 140 119
144 125 96
145 84 112
249 85 59
283 69 34
286 129 10
289 139 62
('女', 4)
python math
32 116 5
43 60 96
48 13 147
64 126 140
92 122 49
118 125 34
139 84 58
164 59 64
167 47 109
183 96 14
187 123 26
203 6 74
274 148 10
291 37 14
293 30 21
('女', 5)
python math
12 109 27
25 6 124
55 78 63
70 30 71
90 20 52
152 60 116
240 49 27
264 137 95
275 101 69
276 49 88
('女', 6)
python math
18 65 86
26 112 25
31 86 88
42 134 37
66 25 119
85 29 123
91 109 46
98 100 132
102 27 113
141 35 22
159 108 13
185 77 116
218 120 137
219 132 95
232 110 51
241 77 57
259 61 104
('女', 7)
python math
21 147 58
28 141 69
33 53 70
51 120 64
59 89 30
74 64 145
89 36 100
96 66 82
106 77 65
127 103 9
133 31 138
138 130 33
151 83 82
161 22 127
165 47 51
170 100 23
194 118 146
199 125 132
213 64 46
216 43 99
228 140 91
243 76 103
245 9 89
250 139 81
288 115 148
('女', 8)
python math
6 102 26
88 19 44
114 133 19
124 91 77
130 141 135
166 3 114
168 136 69
172 69 147
177 37 81
195 66 69
224 18 29
233 117 63
239 68 71
248 31 71
252 141 113
267 125 136
271 102 137
279 137 110
('男', 1)
python math
1 37 8
8 8 48
23 116 43
24 79 115
29 24 112
39 112 115
68 131 138
95 86 125
101 49 0
137 59 5
176 91 75
191 90 91
196 129 59
211 78 97
215 31 27
229 62 21
237 140 66
282 5 67
299 98 101
('男', 2)
python math
4 121 114
7 55 125
14 36 142
16 80 143
36 105 101
38 51 55
45 80 17
62 23 35
105 115 106
128 86 18
149 81 34
158 80 24
169 55 100
175 138 87
182 111 62
210 69 128
227 16 34
246 29 29
253 130 117
265 8 21
284 22 66
('男', 3)
python math
3 60 55
5 85 69
9 69 42
13 91 142
17 36 50
50 116 60
54 34 144
58 102 126
61 127 97
77 16 74
81 130 54
86 92 3
93 51 83
103 118 122
108 41 109
109 84 123
111 15 32
117 18 62
126 8 76
129 145 9
204 134 5
207 61 22
212 66 146
251 132 109
254 10 146
280 89 109
287 24 24
('男', 4)
python math
2 25 106
46 24 149
49 133 134
52 46 37
60 137 40
87 64 6
112 136 73
120 133 18
190 45 25
200 38 41
202 19 121
221 29 132
236 54 100
263 40 14
270 125 103
295 33 84
('男', 5)
python math
53 114 92
63 91 105
76 14 18
97 42 43
110 83 20
113 139 46
115 93 49
125 31 85
131 51 18
150 46 90
162 78 123
208 3 130
209 106 92
214 39 67
223 7 102
257 79 44
262 109 34
266 107 72
273 145 2
('男', 6)
python math
15 147 53
20 18 24
116 124 92
119 117 60
134 36 17
135 66 19
148 102 26
154 12 18
155 36 19
181 48 51
184 72 114
188 136 82
189 120 116
193 42 21
201 144 124
225 107 140
235 83 117
294 124 90
('男', 7)
python math
10 35 134
19 112 127
22 106 60
44 132 15
67 103 40
71 57 64
78 131 122
83 12 149
94 76 54
123 53 109
136 15 64
146 12 101
153 58 110
156 109 49
180 50 73
197 25 71
205 136 80
220 103 48
244 58 55
261 75 136
269 69 86
278 66 70
292 144 149
('男', 8)
python math
0 16 70
41 91 118
56 32 30
57 85 129
75 145 99
80 56 94
100 85 100
107 146 133
121 69 21
132 66 41
160 24 125
179 16 44
231 96 6
247 81 139
255 12 13
256 64 101
258 138 27
281 66 23
285 20 118
296 142 134
298 62 127
df.dtypes
sex object
class int32
python int32
math int32
en int32
dtype: object
for name,group in df.groupby(df.dtypes,axis = 1):
print(name,group)
int32 class python math en
0 8 16 70 85
1 1 37 8 106
2 4 25 106 63
3 3 60 55 101
4 2 121 114 14
.. ... ... ... ...
295 4 33 84 66
296 8 142 134 89
297 2 150 88 130
298 8 62 127 1
299 1 98 101 80
[300 rows x 4 columns]
object sex
0 男
1 男
2 男
3 男
4 男
.. ..
295 男
296 男
297 女
298 男
299 男
[300 rows x 1 columns]
m = {'sex':'category','class':'category','python':'科目','math':'科目','en':'科目'}
for name,group in df.groupby(by = m,axis = 1):
print(name,group)
category sex class
0 男 8
1 男 1
2 男 4
3 男 3
4 男 2
.. .. ...
295 男 4
296 男 8
297 女 2
298 男 8
299 男 1
[300 rows x 2 columns]
科目 python math en
0 16 70 85
1 37 8 106
2 25 106 63
3 60 55 101
4 121 114 14
.. ... ... ...
295 33 84 66
296 142 134 89
297 150 88 130
298 62 127 1
299 98 101 80
[300 rows x 3 columns]
分组聚合
df.groupby(by='sex')[['python','math']].max() 3 数学和python根据性别分组,quit解的最大值
|
python |
math |
sex |
|
|
女 |
150 |
150 |
男 |
147 |
149 |
df.groupby(by=['sex','class']).mean().round(1)
|
|
python |
math |
en |
sex |
class |
|
|
|
女 |
1 |
71.8 |
78.6 |
94.1 |
2 |
63.2 |
84.6 |
70.6 |
3 |
94.0 |
75.7 |
66.6 |
4 |
79.5 |
57.4 |
74.7 |
5 |
63.9 |
73.2 |
63.7 |
6 |
82.8 |
80.2 |
77.2 |
7 |
85.5 |
83.2 |
67.0 |
8 |
85.3 |
83.9 |
68.1 |
男 |
1 |
75.0 |
69.1 |
65.5 |
2 |
71.0 |
74.2 |
54.9 |
3 |
72.4 |
77.5 |
79.4 |
4 |
67.6 |
73.9 |
79.2 |
5 |
72.5 |
64.8 |
96.9 |
6 |
85.2 |
65.7 |
65.8 |
7 |
75.5 |
85.5 |
86.8 |
8 |
72.0 |
80.6 |
72.3 |
df.groupby(by=['sex','class']).size()
sex class
女 1 19
2 19
3 13
4 15
5 10
6 17
7 25
8 18
男 1 19
2 21
3 27
4 16
5 19
6 18
7 23
8 21
dtype: int64
df.groupby(by=['sex','class']).describe().round(1)
|
|
python |
math |
en |
|
|
count |
mean |
std |
min |
25% |
50% |
75% |
max |
count |
mean |
... |
75% |
max |
count |
mean |
std |
min |
25% |
50% |
75% |
max |
sex |
class |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
女 |
1 |
19.0 |
71.8 |
39.6 |
3.0 |
51.5 |
61.0 |
103.0 |
145.0 |
19.0 |
78.6 |
... |
115.5 |
150.0 |
19.0 |
94.1 |
48.5 |
0.0 |
80.5 |
103.0 |
129.0 |
144.0 |
2 |
19.0 |
63.2 |
45.2 |
1.0 |
27.0 |
53.0 |
99.0 |
150.0 |
19.0 |
84.6 |
... |
111.0 |
144.0 |
19.0 |
70.6 |
55.8 |
0.0 |
18.5 |
71.0 |
123.5 |
146.0 |
3 |
13.0 |
94.0 |
48.5 |
0.0 |
69.0 |
89.0 |
139.0 |
147.0 |
13.0 |
75.7 |
... |
112.0 |
134.0 |
13.0 |
66.6 |
32.7 |
14.0 |
42.0 |
73.0 |
95.0 |
122.0 |
4 |
15.0 |
79.5 |
46.4 |
6.0 |
42.0 |
84.0 |
122.5 |
148.0 |
15.0 |
57.4 |
... |
85.0 |
147.0 |
15.0 |
74.7 |
38.6 |
10.0 |
44.5 |
80.0 |
104.0 |
133.0 |
5 |
10.0 |
63.9 |
42.0 |
6.0 |
34.8 |
54.5 |
95.2 |
137.0 |
10.0 |
73.2 |
... |
93.2 |
124.0 |
10.0 |
63.7 |
48.4 |
5.0 |
30.0 |
56.5 |
93.5 |
142.0 |
6 |
17.0 |
82.8 |
37.2 |
25.0 |
61.0 |
86.0 |
110.0 |
134.0 |
17.0 |
80.2 |
... |
116.0 |
137.0 |
17.0 |
77.2 |
38.8 |
10.0 |
45.0 |
87.0 |
107.0 |
144.0 |
7 |
25.0 |
85.5 |
41.3 |
9.0 |
53.0 |
83.0 |
120.0 |
147.0 |
25.0 |
83.2 |
... |
103.0 |
148.0 |
25.0 |
67.0 |
39.9 |
3.0 |
41.0 |
55.0 |
106.0 |
146.0 |
8 |
18.0 |
85.3 |
47.8 |
3.0 |
44.2 |
96.5 |
131.0 |
141.0 |
18.0 |
83.9 |
... |
113.8 |
147.0 |
18.0 |
68.1 |
49.0 |
4.0 |
23.8 |
61.0 |
113.5 |
147.0 |
男 |
1 |
19.0 |
75.0 |
41.5 |
5.0 |
43.0 |
79.0 |
105.0 |
140.0 |
19.0 |
69.1 |
... |
106.5 |
138.0 |
19.0 |
65.5 |
46.4 |
5.0 |
14.0 |
67.0 |
109.5 |
133.0 |
2 |
21.0 |
71.0 |
39.6 |
8.0 |
36.0 |
80.0 |
105.0 |
138.0 |
21.0 |
74.2 |
... |
114.0 |
143.0 |
21.0 |
54.9 |
35.4 |
12.0 |
18.0 |
54.0 |
78.0 |
133.0 |
3 |
27.0 |
72.4 |
43.6 |
8.0 |
35.0 |
69.0 |
109.0 |
145.0 |
27.0 |
77.5 |
... |
115.5 |
146.0 |
27.0 |
79.4 |
39.0 |
5.0 |
59.0 |
86.0 |
101.0 |
147.0 |
4 |
16.0 |
67.6 |
46.8 |
19.0 |
32.0 |
45.5 |
127.0 |
137.0 |
16.0 |
73.9 |
... |
109.8 |
149.0 |
16.0 |
79.2 |
33.1 |
19.0 |
60.5 |
69.0 |
108.8 |
134.0 |
5 |
19.0 |
72.5 |
43.1 |
3.0 |
40.5 |
79.0 |
106.5 |
145.0 |
19.0 |
64.8 |
... |
92.0 |
130.0 |
19.0 |
96.9 |
40.0 |
12.0 |
73.0 |
95.0 |
127.5 |
149.0 |
6 |
18.0 |
85.2 |
45.0 |
12.0 |
43.5 |
92.5 |
123.0 |
147.0 |
18.0 |
65.7 |
... |
108.5 |
140.0 |
18.0 |
65.8 |
41.0 |
18.0 |
33.8 |
52.5 |
92.8 |
142.0 |
7 |
23.0 |
75.5 |
41.4 |
12.0 |
51.5 |
69.0 |
107.5 |
144.0 |
23.0 |
85.5 |
... |
116.0 |
149.0 |
23.0 |
86.8 |
38.8 |
8.0 |
64.5 |
83.0 |
112.5 |
150.0 |
8 |
21.0 |
72.0 |
43.8 |
12.0 |
32.0 |
66.0 |
91.0 |
146.0 |
21.0 |
80.6 |
... |
125.0 |
139.0 |
21.0 |
72.3 |
46.4 |
1.0 |
43.0 |
70.0 |
103.0 |
146.0 |
16 rows × 24 columns
分组聚合apply、transform
df
|
sex |
class |
python |
math |
en |
0 |
男 |
8 |
16 |
70 |
85 |
1 |
男 |
1 |
37 |
8 |
106 |
2 |
男 |
4 |
25 |
106 |
63 |
3 |
男 |
3 |
60 |
55 |
101 |
4 |
男 |
2 |
121 |
114 |
14 |
... |
... |
... |
... |
... |
... |
295 |
男 |
4 |
33 |
84 |
66 |
296 |
男 |
8 |
142 |
134 |
89 |
297 |
女 |
2 |
150 |
88 |
130 |
298 |
男 |
8 |
62 |
127 |
1 |
299 |
男 |
1 |
98 |
101 |
80 |
300 rows × 5 columns
df.groupby(by=['sex','class'])[['python','en']].apply(np.mean).round(1)
C:\Users\Panda\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\core\fromnumeric.py:3472: FutureWarning: In a future version, DataFrame.mean(axis=None) will return a scalar mean over the entire DataFrame. To retain the old behavior, use 'frame.mean(axis=0)' or just 'frame.mean()'
return mean(axis=axis, dtype=dtype, out=out, **kwargs)
|
|
python |
en |
sex |
class |
|
|
女 |
1 |
71.8 |
94.1 |
2 |
63.2 |
70.6 |
3 |
94.0 |
66.6 |
4 |
79.5 |
74.7 |
5 |
63.9 |
63.7 |
6 |
82.8 |
77.2 |
7 |
85.5 |
67.0 |
8 |
85.3 |
68.1 |
男 |
1 |
75.0 |
65.5 |
2 |
71.0 |
54.9 |
3 |
72.4 |
79.4 |
4 |
67.6 |
79.2 |
5 |
72.5 |
96.9 |
6 |
85.2 |
65.8 |
7 |
75.5 |
86.8 |
8 |
72.0 |
72.3 |
df.groupby(by=['sex','class'])[['python','en']].transform(np.mean).round(1)
|
python |
en |
0 |
72.0 |
72.3 |
1 |
75.0 |
65.5 |
2 |
67.6 |
79.2 |
3 |
72.4 |
79.4 |
4 |
71.0 |
54.9 |
... |
... |
... |
295 |
67.6 |
79.2 |
296 |
72.0 |
72.3 |
297 |
63.2 |
70.6 |
298 |
72.0 |
72.3 |
299 |
75.0 |
65.5 |
300 rows × 2 columns
def _mean(x):
return np.round(x.mean(),1)
df.groupby(by=['sex','class'])[['python','en']].transform(_mean)
|
python |
en |
0 |
72.0 |
72.3 |
1 |
75.0 |
65.5 |
2 |
67.6 |
79.2 |
3 |
72.4 |
79.4 |
4 |
71.0 |
54.9 |
... |
... |
... |
295 |
67.6 |
79.2 |
296 |
72.0 |
72.3 |
297 |
63.2 |
70.6 |
298 |
72.0 |
72.3 |
299 |
75.0 |
65.5 |
300 rows × 2 columns
分组聚合agg
df.groupby(by = ['sex','class']).agg([np.mean,np.max,np.min])
|
|
python |
math |
en |
|
|
mean |
amax |
amin |
mean |
amax |
amin |
mean |
amax |
amin |
sex |
class |
|
|
|
|
|
|
|
|
|
女 |
1 |
71.789474 |
145 |
3 |
78.631579 |
150 |
17 |
94.052632 |
144 |
0 |
2 |
63.210526 |
150 |
1 |
84.631579 |
144 |
3 |
70.631579 |
146 |
0 |
3 |
94.000000 |
147 |
0 |
75.692308 |
134 |
0 |
66.615385 |
122 |
14 |
4 |
79.466667 |
148 |
6 |
57.400000 |
147 |
5 |
74.666667 |
133 |
10 |
5 |
63.900000 |
137 |
6 |
73.200000 |
124 |
27 |
63.700000 |
142 |
5 |
6 |
82.764706 |
134 |
25 |
80.235294 |
137 |
13 |
77.176471 |
144 |
10 |
7 |
85.520000 |
147 |
9 |
83.240000 |
148 |
9 |
66.960000 |
146 |
3 |
8 |
85.333333 |
141 |
3 |
83.944444 |
147 |
19 |
68.111111 |
147 |
4 |
男 |
1 |
75.000000 |
140 |
5 |
69.105263 |
138 |
0 |
65.526316 |
133 |
5 |
2 |
71.000000 |
138 |
8 |
74.190476 |
143 |
17 |
54.857143 |
133 |
12 |
3 |
72.370370 |
145 |
8 |
77.518519 |
146 |
3 |
79.444444 |
147 |
5 |
4 |
67.562500 |
137 |
19 |
73.937500 |
149 |
6 |
79.187500 |
134 |
19 |
5 |
72.473684 |
145 |
3 |
64.842105 |
130 |
2 |
96.894737 |
149 |
12 |
6 |
85.222222 |
147 |
12 |
65.722222 |
140 |
17 |
65.777778 |
142 |
18 |
7 |
75.521739 |
144 |
12 |
85.478261 |
149 |
15 |
86.782609 |
150 |
8 |
8 |
72.000000 |
146 |
12 |
80.571429 |
139 |
6 |
72.333333 |
146 |
1 |
df.groupby(by = ['sex','class']).agg([('平均值',np.mean),
('最大值',np.max),
('最小值',np.min)]).round(1)
|
|
python |
math |
en |
|
|
平均值 |
最大值 |
最小值 |
平均值 |
最大值 |
最小值 |
平均值 |
最大值 |
最小值 |
sex |
class |
|
|
|
|
|
|
|
|
|
女 |
1 |
71.8 |
145 |
3 |
78.6 |
150 |
17 |
94.1 |
144 |
0 |
2 |
63.2 |
150 |
1 |
84.6 |
144 |
3 |
70.6 |
146 |
0 |
3 |
94.0 |
147 |
0 |
75.7 |
134 |
0 |
66.6 |
122 |
14 |
4 |
79.5 |
148 |
6 |
57.4 |
147 |
5 |
74.7 |
133 |
10 |
5 |
63.9 |
137 |
6 |
73.2 |
124 |
27 |
63.7 |
142 |
5 |
6 |
82.8 |
134 |
25 |
80.2 |
137 |
13 |
77.2 |
144 |
10 |
7 |
85.5 |
147 |
9 |
83.2 |
148 |
9 |
67.0 |
146 |
3 |
8 |
85.3 |
141 |
3 |
83.9 |
147 |
19 |
68.1 |
147 |
4 |
男 |
1 |
75.0 |
140 |
5 |
69.1 |
138 |
0 |
65.5 |
133 |
5 |
2 |
71.0 |
138 |
8 |
74.2 |
143 |
17 |
54.9 |
133 |
12 |
3 |
72.4 |
145 |
8 |
77.5 |
146 |
3 |
79.4 |
147 |
5 |
4 |
67.6 |
137 |
19 |
73.9 |
149 |
6 |
79.2 |
134 |
19 |
5 |
72.5 |
145 |
3 |
64.8 |
130 |
2 |
96.9 |
149 |
12 |
6 |
85.2 |
147 |
12 |
65.7 |
140 |
17 |
65.8 |
142 |
18 |
7 |
75.5 |
144 |
12 |
85.5 |
149 |
15 |
86.8 |
150 |
8 |
8 |
72.0 |
146 |
12 |
80.6 |
139 |
6 |
72.3 |
146 |
1 |
df.groupby(by=['sex','class']).agg({'python':np.max,
'math':np.min,
'en':np.mean}).round(1)
|
|
python |
math |
en |
sex |
class |
|
|
|
女 |
1 |
145 |
17 |
94.1 |
2 |
150 |
3 |
70.6 |
3 |
147 |
0 |
66.6 |
4 |
148 |
5 |
74.7 |
5 |
137 |
27 |
63.7 |
6 |
134 |
13 |
77.2 |
7 |
147 |
9 |
67.0 |
8 |
141 |
19 |
68.1 |
男 |
1 |
140 |
0 |
65.5 |
2 |
138 |
17 |
54.9 |
3 |
145 |
3 |
79.4 |
4 |
137 |
6 |
79.2 |
5 |
145 |
2 |
96.9 |
6 |
147 |
17 |
65.8 |
7 |
144 |
15 |
86.8 |
8 |
146 |
6 |
72.3 |
透视表pivot_table
df
|
sex |
class |
python |
math |
en |
0 |
男 |
8 |
16 |
70 |
85 |
1 |
男 |
1 |
37 |
8 |
106 |
2 |
男 |
4 |
25 |
106 |
63 |
3 |
男 |
3 |
60 |
55 |
101 |
4 |
男 |
2 |
121 |
114 |
14 |
... |
... |
... |
... |
... |
... |
295 |
男 |
4 |
33 |
84 |
66 |
296 |
男 |
8 |
142 |
134 |
89 |
297 |
女 |
2 |
150 |
88 |
130 |
298 |
男 |
8 |
62 |
127 |
1 |
299 |
男 |
1 |
98 |
101 |
80 |
300 rows × 5 columns
df.pivot_table(values = ['python','en'],
index=['sex','class'],
aggfunc='mean').round(1)
|
|
en |
python |
sex |
class |
|
|
女 |
1 |
94.1 |
71.8 |
2 |
70.6 |
63.2 |
3 |
66.6 |
94.0 |
4 |
74.7 |
79.5 |
5 |
63.7 |
63.9 |
6 |
77.2 |
82.8 |
7 |
67.0 |
85.5 |
8 |
68.1 |
85.3 |
男 |
1 |
65.5 |
75.0 |
2 |
54.9 |
71.0 |
3 |
79.4 |
72.4 |
4 |
79.2 |
67.6 |
5 |
96.9 |
72.5 |
6 |
65.8 |
85.2 |
7 |
86.8 |
75.5 |
8 |
72.3 |
72.0 |
df1 = df.pivot_table(values = ['python','math','en'],
index=['sex','class'],
aggfunc={'python':[('平均值',np.mean)],
'math':[('最大值',np.max),
('最小值',np.min)],
'en':[('标准差',np.std),
('方差',np.var),
('计数',np.size)]}).round(1)
df1
|
|
en |
math |
python |
|
|
方差 |
标准差 |
计数 |
最大值 |
最小值 |
平均值 |
sex |
class |
|
|
|
|
|
|
女 |
1 |
2356.2 |
48.5 |
19 |
150 |
17 |
71.8 |
2 |
3115.6 |
55.8 |
19 |
144 |
3 |
63.2 |
3 |
1067.4 |
32.7 |
13 |
134 |
0 |
94.0 |
4 |
1487.4 |
38.6 |
15 |
147 |
5 |
79.5 |
5 |
2342.7 |
48.4 |
10 |
124 |
27 |
63.9 |
6 |
1508.8 |
38.8 |
17 |
137 |
13 |
82.8 |
7 |
1593.5 |
39.9 |
25 |
148 |
9 |
85.5 |
8 |
2398.1 |
49.0 |
18 |
147 |
19 |
85.3 |
男 |
1 |
2150.3 |
46.4 |
19 |
138 |
0 |
75.0 |
2 |
1252.5 |
35.4 |
21 |
143 |
17 |
71.0 |
3 |
1524.5 |
39.0 |
27 |
146 |
3 |
72.4 |
4 |
1097.1 |
33.1 |
16 |
149 |
6 |
67.6 |
5 |
1600.5 |
40.0 |
19 |
130 |
2 |
72.5 |
6 |
1684.1 |
41.0 |
18 |
140 |
17 |
85.2 |
7 |
1507.1 |
38.8 |
23 |
149 |
15 |
75.5 |
8 |
2151.7 |
46.4 |
21 |
139 |
6 |
72.0 |
时间序列
时间戳操作
pd.Timestamp('2020.09.15 20')
Timestamp('2020-09-15 20:00:00')
pd.Period('2020.09.15',freq='M')
Period('2020-09', 'M')
index = pd.date_range('2020.09.15',
freq='D',
period