1. 数据索引
import pandas as pd
left = pd.DataFrame( {
'key':['k1', 'k2', 'k3', 'k4'],
'name':['n1', 'n2', 'n3', 'n4'],
'id':['id1', 'id2', 'id3', 'id4']
}
)
right = pd.DataFrame( {
'key':['k221', 'k222', 'k223', 'k224'],
'name':['n221', 'n222', 'n223', 'n224'],
'id':['id221', 'id222', 'id223', 'id224']
}
)
print(left)
print(right)
id key name
0 id1 k1 n1
1 id2 k2 n2
2 id3 k3 n3
3 id4 k4 n4
id key name
0 id221 k221 n221
1 id222 k222 n222
2 id223 k223 n223
3 id224 k224 n224
A 选取行
left.loc[2]
id id3
key k3
name n3
Name: 2, dtype: object
left.loc[ [1,2,3]]
|
id |
key |
name |
1 |
id2 |
k2 |
n2 |
2 |
id3 |
k3 |
n3 |
3 |
id4 |
k4 |
n4 |
left.loc[ 1:2 ]
|
id |
key |
name |
1 |
id2 |
k2 |
n2 |
2 |
id3 |
k3 |
n3 |
mask = left['name'] > 'n1'
mask
0 False
1 True
2 True
3 True
Name: name, dtype: bool
left.loc[ mask]
|
id |
key |
name |
1 |
id2 |
k2 |
n2 |
2 |
id3 |
k3 |
n3 |
3 |
id4 |
k4 |
n4 |
left.iloc[3]
id id4
key k4
name n4
Name: 3, dtype: object
left.iloc[ [0,1]]
|
id |
key |
name |
0 |
id1 |
k1 |
n1 |
1 |
id2 |
k2 |
n2 |
left.iloc[ 0:2]
|
id |
key |
name |
0 |
id1 |
k1 |
n1 |
1 |
id2 |
k2 |
n2 |
left.loc[ 0:2]
|
id |
key |
name |
0 |
id1 |
k1 |
n1 |
1 |
id2 |
k2 |
n2 |
2 |
id3 |
k3 |
n3 |
left.set_index('key')
|
id |
name |
key |
|
|
k1 |
id1 |
n1 |
k2 |
id2 |
n2 |
k3 |
id3 |
n3 |
k4 |
id4 |
n4 |
left.set_index('name')
|
id |
key |
name |
|
|
n1 |
id1 |
k1 |
n2 |
id2 |
k2 |
n3 |
id3 |
k3 |
n4 |
id4 |
k4 |
left.loc[ 0:2]
|
id |
key |
name |
0 |
id1 |
k1 |
n1 |
1 |
id2 |
k2 |
n2 |
2 |
id3 |
k3 |
n3 |
left_reset_index = left.set_index("key")
left_reset_index
|
id |
name |
key |
|
|
k1 |
id1 |
n1 |
k2 |
id2 |
n2 |
k3 |
id3 |
n3 |
k4 |
id4 |
n4 |
left_reset_index.loc[ ['k1']]
left_reset_index.iloc[ [0,2]]
|
id |
name |
key |
|
|
k1 |
id1 |
n1 |
k3 |
id3 |
n3 |
B. 选取列
left[ 'name']
0 n1
1 n2
2 n3
3 n4
Name: name, dtype: object
left[ ['name', 'id']]
|
name |
id |
0 |
n1 |
id1 |
1 |
n2 |
id2 |
2 |
n3 |
id3 |
3 |
n4 |
id4 |
left[ 0:2]
|
id |
key |
name |
0 |
id1 |
k1 |
n1 |
1 |
id2 |
k2 |
n2 |
left[ left['name'] > 'n2']
|
id |
key |
name |
2 |
id3 |
k3 |
n3 |
3 |
id4 |
k4 |
n4 |
C. 选取行和列
left
left.loc[0, "key"]
'k1'
left.loc[ [1,2], ['name', 'id']]
|
name |
id |
1 |
n2 |
id2 |
2 |
n3 |
id3 |
left.loc[ 0:2, :]
|
id |
key |
name |
0 |
id1 |
k1 |
n1 |
1 |
id2 |
k2 |
n2 |
2 |
id3 |
k3 |
n3 |
left.iloc[ [0,1],[0,2]]
|
id |
name |
0 |
id1 |
n1 |
1 |
id2 |
n2 |
left.iloc[ [0,1],[0,1]]
基本操作
left.columns
Index(['id', 'key', 'name'], dtype='object')
left.columns = ["xuhao", 'guanjianzhi', 'mingzi']
left
|
xuhao |
guanjianzhi |
mingzi |
0 |
id1 |
k1 |
n1 |
1 |
id2 |
k2 |
n2 |
2 |
id3 |
k3 |
n3 |
3 |
id4 |
k4 |
n4 |
left.xuhao
0 id1
1 id2
2 id3
3 id4
Name: xuhao, dtype: object
left.rename(columns={
"xuhao":"ID", "guanjianzhi":"KEY", "mingzi":"NAME"})
|
ID |
KEY |
NAME |
0 |
id1 |
k1 |
n1 |
1 |
id2 |
k2 |
n2 |
2 |
id3 |
k3 |
n3 |
3 |
id4 |
k4 |
n4 |
left
|
xuhao |
guanjianzhi |
mingzi |
0 |
id1 |
k1 |
n1 |
1 |
id2 |
k2 |
n2 |
2 |
id3 |
k3 |
n3 |
3 |
id4 |
k4 |
n4 |
left.rename( columns={
"xuhao":"ID", "guanjianzhi":"KEY", "mingzi":"NAME"}, inplace=True)
left
|
ID |
KEY |
NAME |
0 |
id1 |
k1 |
n1 |
1 |
id2 |
k2 |
n2 |
2 |
id3 |
k3 |
n3 |
3 |
id4 |
k4 |
n4 |
left.replace( {
"ID":{
'id1':"ID1", 'id2':'ID2'}}, inplace=True);
left
|
ID |
KEY |
NAME |
0 |
ID1 |
k1 |
n1 |
1 |
ID2 |
k2 |
n2 |
2 |
id3 |
k3 |
n3 |
3 |
id4 |
k4 |
n4 |
基本的排序和增删
left.ID.unique()
array(['ID1', 'ID2', 'id3', 'id4'], dtype=object)
left.ID.value_counts()
ID1 1
id3 1
ID2 1
id4 1
Name: ID, dtype: int64
left.sort_values('ID')
|
ID |
KEY |
NAME |
0 |
ID1 |
k1 |
n1 |
1 |
ID2 |
k2 |
n2 |
2 |
id3 |
k3 |
n3 |
3 |
id4 |
k4 |
n4 |
left.sort_values(['ID', 'NAME'], ascending=[True, False])
|
ID |
KEY |
NAME |
0 |
ID1 |
k1 |
n1 |
1 |
ID2 |
k2 |
n2 |
2 |
id3 |
k3 |
n3 |
3 |
id4 |
k4 |
n4 |
left.sort_values( ['ID', "NAME"], ascending=[False, False])
|
ID |
KEY |
NAME |
3 |
id4 |
k4 |
n4 |
2 |
id3 |
k3 |
n3 |
1 |
ID2 |
k2 |
n2 |
0 |
ID1 |
k1 |
n1 |
left.replace( {
"ID":{
'id3':'ID3', 'ID2':'id2'}}, inplace=True)
left
|
ID |
KEY |
NAME |
0 |
ID1 |
k1 |
n1 |
1 |
id2 |
k2 |
n2 |
2 |
ID3 |
k3 |
n3 |
3 |
id4 |
k4 |
n4 |
left.sort_values(['ID'], ascending=[True])
|
ID |
KEY |
NAME |
0 |
ID1 |
k1 |
n1 |
2 |
ID3 |
k3 |
n3 |
1 |
id2 |
k2 |
n2 |
3 |
id4 |
k4 |
n4 |
left['mailema'] = 1
left
|
ID |
KEY |
NAME |
mailema |
0 |
ID1 |
k1 |
n1 |
1 |
1 |
id2 |
k2 |
n2 |
1 |
2 |
ID3 |
k3 |
n3 |
1 |
3 |
id4 |
k4 |
n4 |
1 |
left['rongyuceshilie'] = 1
left
|
ID |
KEY |
NAME |
mailema |
rongyuceshilie |
0 |
ID1 |
k1 |
n1 |
1 |
1 |
1 |
id2 |
k2 |
n2 |
1 |
1 |
2 |
ID3 |
k3 |
n3 |
1 |
1 |
3 |
id4 |
k4 |
n4 |
1 |
1 |
del left['rongyuceshilie']
left
|
ID |
KEY |
NAME |
mailema |
0 |
ID1 |
k1 |
n1 |
1 |
1 |
id2 |
k2 |
n2 |
1 |
2 |
ID3 |
k3 |
n3 |
1 |
3 |
id4 |
k4 |
n4 |
1 |
ret = left.drop(labels=2)
print(left)
print(ret)
ID KEY NAME mailema
0 ID1 k1 n1 1
1 id2 k2 n2 1
2 ID3 k3 n3 1
3 id4 k4 n4 1
ID KEY NAME mailema
0 ID1 k1 n1 1
1 id2 k2 n2 1
3 id4 k4 n4 1
left.NAME
ret = left.NAME.map({
'n1':'NNN1','n2':"N2", 'n3':'NN3', 'n4':"NN4"})
print(ret)
print(left.NAME)
0 NNN1
1 N2
2 NN3
3 NN4
Name: NAME, dtype: object
0 n1
1 n2
2 n3
3 n4
Name: NAME, dtype: object
import numpy as np
left.NAME.map("hello my name is {}".format)
test_series = left.NAME
yy = pd.Series(['name_ext', np.nan])
print(yy)
test_series = test_series.append(yy)
test_series = test_series.map("my name is {}".format, na_action='ignore')
print(test_series)
0 name_ext
1 NaN
dtype: object
0 my name is n1
1 my name is n2
2 my name is n3
3 my name is n4
0 my name is name_ext
1 NaN
dtype: object
from collections import Counter
counter = Counter()
counter['bar'] += 1
counter
Counter({'bar': 1})
left.min()
ID ID1
KEY k1
NAME n1
mailema 1
rongyuceshilie 1
dtype: object
left.max()
ID id4
KEY k4
NAME n4
mailema 1
rongyuceshilie 1
dtype: object
left.sum()
ID ID1id2ID3id4
KEY k1k2k3k4
NAME n1n2n3n4
mailema 4
rongyuceshilie 4
dtype: object
left.cumsum()
|
ID |
KEY |
NAME |
mailema |
rongyuceshilie |
0 |
ID1 |
k1 |
n1 |
1 |
1 |
1 |
ID1id2 |
k1k2 |
n1n2 |
2 |
2 |
2 |
ID1id2ID3 |
k1k2k3 |
n1n2n3 |
3 |
3 |
3 |
ID1id2ID3id4 |
k1k2k3k4 |
n1n2n3n4 |
4 |
4 |
pandas 常用操作矩阵运算
df = pd.DataFrame(np.random.random( (5, 10)), columns = list('abcdefghij'))
df
|
a |
b |
c |
d |
e |
f |
g |
h |
i |
j |
0 |
0.920187 |
0.468342 |
0.251023 |
0.012371 |
0.410920 |
0.823567 |
0.644462 |
0.971567 |
0.363926 |
0.125066 |
1 |
0.388891 |
0.017500 |
0.400027 |
0.907302 |
0.747884 |
0.850026 |
0.871119 |
0.361581 |
0.313365 |
0.517795 |
2 |
0.858049 |
0.818444 |
0.429782 |
0.953707 |
0.362399 |
0.597594 |
0.583865 |
0.363218 |
0.671535 |
0.887737 |
3 |
0.981348 |
0.884835 |
0.333632 |
0.930583 |
0.990436 |
0.264681 |
0.349709 |
0.938117 |
0.729409 |
0.371021 |
4 |
0.409097 |
0.270208 |
0.093192 |
0.985581 |
0.389709 |
0.891675 |
0.551112 |
0.897221 |
0.573714 |
0.854950 |
df.idxmax()
a 3
b 3
c 2
d 4
e 3
f 4
g 1
h 0
i 3
j 2
dtype: int64
df*10
|
a |
b |
c |
d |
e |
f |
g |
h |
i |
j |
0 |
9.201869 |
4.683421 |
2.510227 |
0.123709 |
4.109205 |
8.235670 |
6.444622 |
9.715667 |
3.639265 |
1.250659 |
1 |
3.888908 |
0.174995 |
4.000269 |
9.073024 |
7.478841 |
8.500256 |
8.711186 |
3.615811 |
3.133648 |
5.177951 |
2 |
8.580488 |
8.184435 |
4.297818 |
9.537071 |
3.623993 |
5.975941 |
5.838652 |
3.632183 |
6.715348 |
8.877372 |
3 |
9.813483 |
8.848354 |
3.336317 |
9.305834 |
9.904355 |
2.646808 |
3.497094 |
9.381173 |
7.294089 |
3.710212 |
4 |
4.090968 |
2.702080 |
0.931915 |
9.855806 |
3.897086 |
8.916751 |
5.511120 |
8.972206 |
5.737144 |
8.549502 |
df*df
|
a |
b |
c |
d |
e |
f |
g |
h |
i |
j |
0 |
0.846744 |
0.219344 |
0.063012 |
0.000153 |
0.168856 |
0.678263 |
0.415332 |
0.943942 |
0.132442 |
0.015641 |
1 |
0.151236 |
0.000306 |
0.160022 |
0.823198 |
0.559331 |
0.722544 |
0.758848 |
0.130741 |
0.098197 |
0.268112 |
2 |
0.736248 |
0.669850 |
0.184712 |
0.909557 |
0.131333 |
0.357119 |
0.340899 |
0.131928 |
0.450959 |
0.788077 |
3 |
0.963045 |
0.782934 |
0.111310 |
0.865985 |
0.980962 |
0.070056 |
0.122297 |
0.880064 |
0.532037 |
0.137657 |
4 |
0.167360 |
0.073012 |
0.008685 |
0.971369 |
0.151873 |
0.795084 |
0.303724 |
0.805005 |
0.329148 |
0.730940 |
df.dot(df.T)
|
0 |
1 |
2 |
3 |
4 |
0 |
3.483729 |
2.576565 |
3.018221 |
3.486335 |
2.975670 |
1 |
2.576565 |
3.672533 |
3.474288 |
3.405148 |
3.571698 |
2 |
3.018221 |
3.474288 |
4.700682 |
4.478348 |
4.018175 |
3 |
3.486335 |
3.405148 |
4.478348 |
5.446347 |
3.980907 |
4 |
2.975670 |
3.571698 |
4.018175 |
3.980907 |
4.336201 |
df.T.dot(df)
|
a |
b |
c |
d |
e |
f |
g |
h |
i |
j |
a |
2.864633 |
2.118905 |
1.120862 |
2.498977 |
2.111316 |
2.225693 |
2.001426 |
2.633968 |
1.983464 |
1.792030 |
b |
2.118905 |
1.745446 |
0.796707 |
1.891951 |
1.483817 |
1.364820 |
1.253284 |
1.831142 |
1.525968 |
1.353505 |
c |
1.120862 |
0.796707 |
0.527741 |
1.178257 |
0.924835 |
0.975005 |
0.929213 |
0.941231 |
0.802141 |
0.823518 |
d |
2.498977 |
1.891951 |
1.178257 |
3.570262 |
2.335035 |
2.476474 |
2.223776 |
2.443766 |
2.173484 |
2.505874 |
e |
2.111316 |
1.483817 |
0.924835 |
2.335035 |
1.992355 |
1.800352 |
1.689049 |
2.080087 |
1.573283 |
1.461012 |
f |
2.225693 |
1.364820 |
0.975005 |
2.476474 |
1.800352 |
2.623065 |
2.204120 |
2.372892 |
1.672019 |
1.934186 |
g |
2.001426 |
1.253284 |
0.929213 |
2.223776 |
1.689049 |
2.204120 |
1.941099 |
1.975726 |
1.470863 |
1.650903 |
h |
2.633968 |
1.831142 |
0.941231 |
2.443766 |
2.080087 |
2.372892 |
1.975726 |
2.891679 |
1.909819 |
1.746318 |
i |
1.983464 |
1.525968 |
0.802141 |
2.173484 |
1.573283 |
1.672019 |
1.470863 |
1.909819 |
1.542785 |
1.565043 |
j |
1.792030 |
1.353505 |
0.823518 |
2.505874 |
1.461012 |
1.934186 |
1.650903 |
1.746318 |
1.565043 |
1.940427 |