import pandas as pd
import numpy as np
Series
s = pd.Series([1, 2, 3, 6, np.nan, 6, 8, 10])
s
0 1.0
1 2.0
2 3.0
3 6.0
4 NaN
5 6.0
6 8.0
7 10.0
dtype: float64
DataFrame
利用 date_range 创建时间索引,并用 numpy 创建一个随机数组,据此创建一个 DataFrame:
dates = pd.date_range("20230101", periods=10)
df = pd.DataFrame(np.random.randn(10,4), index=dates, columns=['A','B','C','D'])
df
A B C D
2023-01-01 -0.153925 -0.723386 0.392443 -0.745368
2023-01-02 -0.257983 1.627218 -0.362061 0.891725
2023-01-03 -1.199357 1.456383 0.264737 -1.653872
2023-01-04 1.619186 -0.114296 0.391061 0.762666
2023-01-05 -0.388519 -0.347260 0.368026 0.856233
2023-01-06 -0.214157 0.090970 1.585380 -2.145105
2023-01-07 -0.347972 -0.421805 0.456455 -0.647794
2023-01-08 -1.111556 0.644303 -1.621916 -0.713999
2023-01-09 -1.934481 -0.299281 0.664085 -0.719971
2023-01-10 -1.080188 -1.709461 -0.215850 -0.170179
利用字典数据创建 DataFrame:
df2 = pd.DataFrame(
{
"A": 1.0,
"B": pd.Timestamp("20230101"),
"C": pd.Series(1, index=list(range(5)), dtype='float32'),
"D": np.array([5] * 5, dtype='int32'),
"E": pd.Categorical(["test", "train", "test", "train", "val"]),
"F": "any"
}
)
df2
A B C D E F
0 1.0 2023-01-01 1.0 5 test any
1 1.0 2023-01-01 1.0 5 train any
2 1.0 2023-01-01 1.0 5 test any
3 1.0 2023-01-01 1.0 5 train any
4 1.0 2023-01-01 1.0 5 val any
DataFrame.head(), DataFrame.tail()
查看头部数据和尾部数据:
df.head()
如果没有指定要显示的行数,默认显示 5 行。
A B C D
2023-01-01 -0.153925 -0.723386 0.392443 -0.745368
2023-01-02 -0.257983 1.627218 -0.362061 0.891725
2023-01-03 -1.199357 1.456383 0.264737 -1.653872
2023-01-04 1.619186 -0.114296 0.391061 0.762666
2023-01-05 -0.388519 -0.347260 0.368026 0.856233
df.tail(3)
A B C D
2023-01-08 -1.111556 0.644303 -1.621916 -0.713999
2023-01-09 -1.934481 -0.299281 0.664085 -0.719971
2023-01-10 -1.080188 -1.709461 -0.215850 -0.170179
df.index
DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
'2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',
'2023-01-09', '2023-01-10'],
dtype='datetime64[ns]', freq='D')
df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
DataFrame.to_numpy()
该方法给出底层数据的 NumPy 表示。当 DataFrame 的各列具有不同的数据类型时,这可能是一个比较耗时的操作,根本原因在于 pandas 和 NumPy 之间的区别:NumPy 数组对整个数组只有一个 dtype,而 pandas 的 DataFrame 每列有一个 dtype。调用 DataFrame.to_numpy() 时,pandas 会寻找一个能容纳 DataFrame 中所有 dtype 的 NumPy dtype,最终可能只能选用 object,这就需要将每个值都强制转换为 Python 对象。
对于 df 这种所有值都是浮点数的 DataFrame,DataFrame.to_numpy() 很快,不需要复制数据。
df.to_numpy()
array([[-0.153925 , -0.7233858 , 0.39244256, -0.74536788],
[-0.2579828 , 1.62721786, -0.3620608 , 0.89172504],
[-1.19935721, 1.4563827 , 0.2647369 , -1.65387242],
[ 1.61918622, -0.11429613, 0.39106094, 0.76266589],
[-0.3885188 , -0.34726003, 0.36802601, 0.85623267],
[-0.2141565 , 0.09097039, 1.58537994, -2.1451055 ],
[-0.34797235, -0.42180524, 0.45645503, -0.64779373],
[-1.1115556 , 0.6443028 , -1.62191554, -0.713999 ],
[-1.93448125, -0.29928057, 0.66408539, -0.719971 ],
[-1.08018766, -1.70946083, -0.21584991, -0.1701788 ]])
df2.to_numpy()
array([[1.0, Timestamp('2023-01-01 00:00:00'), 1.0, 5, 'test', 'any'],
[1.0, Timestamp('2023-01-01 00:00:00'), 1.0, 5, 'train', 'any'],
[1.0, Timestamp('2023-01-01 00:00:00'), 1.0, 5, 'test', 'any'],
[1.0, Timestamp('2023-01-01 00:00:00'), 1.0, 5, 'train', 'any'],
[1.0, Timestamp('2023-01-01 00:00:00'), 1.0, 5, 'val', 'any']],
dtype=object)
注意:DataFrame.to_numpy() 的输出不包含索引和列标签。
describe()
显示数据的快速统计摘要:
df.describe()
A B C D
count 10.000000 10.000000 10.000000 10.000000
mean -0.506895 0.020339 0.192236 -0.428566
std 0.945370 1.001934 0.824217 1.035072
min -1.934481 -1.709461 -1.621916 -2.145105
25% -1.103714 -0.403169 -0.095703 -0.739019
50% -0.368246 -0.206788 0.379543 -0.680896
75% -0.225113 0.505970 0.440452 0.529455
max 1.619186 1.627218 1.585380 0.891725
df.T
2023-01-01 2023-01-02 2023-01-03 2023-01-04 2023-01-05 2023-01-06 \
A -0.153925 -0.257983 -1.199357 1.619186 -0.388519 -0.214157
B -0.723386 1.627218 1.456383 -0.114296 -0.347260 0.090970
C 0.392443 -0.362061 0.264737 0.391061 0.368026 1.585380
D -0.745368 0.891725 -1.653872 0.762666 0.856233 -2.145105
2023-01-07 2023-01-08 2023-01-09 2023-01-10
A -0.347972 -1.111556 -1.934481 -1.080188
B -0.421805 0.644303 -0.299281 -1.709461
C 0.456455 -1.621916 0.664085 -0.215850
D -0.647794 -0.713999 -0.719971 -0.170179
DataFrame.sort_index()
df.sort_index(axis=1, ascending=False)
D C B A
2023-01-01 -0.745368 0.392443 -0.723386 -0.153925
2023-01-02 0.891725 -0.362061 1.627218 -0.257983
2023-01-03 -1.653872 0.264737 1.456383 -1.199357
2023-01-04 0.762666 0.391061 -0.114296 1.619186
2023-01-05 0.856233 0.368026 -0.347260 -0.388519
2023-01-06 -2.145105 1.585380 0.090970 -0.214157
2023-01-07 -0.647794 0.456455 -0.421805 -0.347972
2023-01-08 -0.713999 -1.621916 0.644303 -1.111556
2023-01-09 -0.719971 0.664085 -0.299281 -1.934481
2023-01-10 -0.170179 -0.215850 -1.709461 -1.080188
DataFrame.sort_values()
df.sort_values(by='A')
A B C D
2023-01-09 -1.934481 -0.299281 0.664085 -0.719971
2023-01-03 -1.199357 1.456383 0.264737 -1.653872
2023-01-08 -1.111556 0.644303 -1.621916 -0.713999
2023-01-10 -1.080188 -1.709461 -0.215850 -0.170179
2023-01-05 -0.388519 -0.347260 0.368026 0.856233
2023-01-07 -0.347972 -0.421805 0.456455 -0.647794
2023-01-02 -0.257983 1.627218 -0.362061 0.891725
2023-01-06 -0.214157 0.090970 1.585380 -2.145105
2023-01-01 -0.153925 -0.723386 0.392443 -0.745368
2023-01-04 1.619186 -0.114296 0.391061 0.762666
虽然标准的 Python/NumPy 选择和赋值表达式更加直观,在交互式使用中也很方便,但对于生产代码,推荐使用经过优化的 pandas 数据访问方式:DataFrame.at、DataFrame.iat、DataFrame.loc 和 DataFrame.iloc(它们是用方括号索引的访问器,而不是方法,例如 df.loc[...])。
选择单独一列会得到一个 Series,df['A'] 等价于 df.A:
df['A']
2023-01-01 -0.153925
2023-01-02 -0.257983
2023-01-03 -1.199357
2023-01-04 1.619186
2023-01-05 -0.388519
2023-01-06 -0.214157
2023-01-07 -0.347972
2023-01-08 -1.111556
2023-01-09 -1.934481
2023-01-10 -1.080188
Freq: D, Name: A, dtype: float64
通过 [] 对行进行切片:
df[0:3]
A B C D
2023-01-01 -0.153925 -0.723386 0.392443 -0.745368
2023-01-02 -0.257983 1.627218 -0.362061 0.891725
2023-01-03 -1.199357 1.456383 0.264737 -1.653872
df["20230104":"20230107"]
A B C D
2023-01-04 1.619186 -0.114296 0.391061 0.762666
2023-01-05 -0.388519 -0.347260 0.368026 0.856233
2023-01-06 -0.214157 0.090970 1.585380 -2.145105
2023-01-07 -0.347972 -0.421805 0.456455 -0.647794
df.loc[dates[0]]
A -0.153925
B -0.723386
C 0.392443
D -0.745368
Name: 2023-01-01 00:00:00, dtype: float64
选取 A、B 两列数据:
df.loc[:, ["A", "B"]]
A B
2023-01-01 -0.153925 -0.723386
2023-01-02 -0.257983 1.627218
2023-01-03 -1.199357 1.456383
2023-01-04 1.619186 -0.114296
2023-01-05 -0.388519 -0.347260
2023-01-06 -0.214157 0.090970
2023-01-07 -0.347972 -0.421805
2023-01-08 -1.111556 0.644303
2023-01-09 -1.934481 -0.299281
2023-01-10 -1.080188 -1.709461
df.loc["20230106":"20230109", ["C", "D"]]
C D
2023-01-06 1.585380 -2.145105
2023-01-07 0.456455 -0.647794
2023-01-08 -1.621916 -0.713999
2023-01-09 0.664085 -0.719971
df.loc["20230108", ["C", "D"]]
C -1.621916
D -0.713999
Name: 2023-01-08 00:00:00, dtype: float64
df.loc[dates[0], "B"]
-0.7233858038855415
df.at[dates[0], "B"]
上面的 df.at[dates[0], "B"] 与 df.loc[dates[0], "B"] 等价。
df.iloc[3]
A 1.619186
B -0.114296
C 0.391061
D 0.762666
Name: 2023-01-04 00:00:00, dtype: float64
df.iloc[3:6, 2:4]
C D
2023-01-04 0.391061 0.762666
2023-01-05 0.368026 0.856233
2023-01-06 1.585380 -2.145105
df.iloc[[0, 3, 5], [0, 2]]
A C
2023-01-01 -0.153925 0.392443
2023-01-04 1.619186 0.391061
2023-01-06 -0.214157 1.585380
df.iloc[0:3, :]
A B C D
2023-01-01 -0.153925 -0.723386 0.392443 -0.745368
2023-01-02 -0.257983 1.627218 -0.362061 0.891725
2023-01-03 -1.199357 1.456383 0.264737 -1.653872
df.iloc[:, 0:2]
A B
2023-01-01 -0.153925 -0.723386
2023-01-02 -0.257983 1.627218
2023-01-03 -1.199357 1.456383
2023-01-04 1.619186 -0.114296
2023-01-05 -0.388519 -0.347260
2023-01-06 -0.214157 0.090970
2023-01-07 -0.347972 -0.421805
2023-01-08 -1.111556 0.644303
2023-01-09 -1.934481 -0.299281
2023-01-10 -1.080188 -1.709461
df.iloc[1, 1]
1.627217858284936
df.iat[1, 1]
上面的 df.iat[1, 1] 与 df.iloc[1, 1] 等价。
df[df["B"]>0]
A B C D
2023-01-02 -0.257983 1.627218 -0.362061 0.891725
2023-01-03 -1.199357 1.456383 0.264737 -1.653872
2023-01-06 -0.214157 0.090970 1.585380 -2.145105
2023-01-08 -1.111556 0.644303 -1.621916 -0.713999
选取 DataFrame 里大于 0 的数据:
df[df > 0]
A B C D
2023-01-01 NaN NaN 0.392443 NaN
2023-01-02 NaN 1.627218 NaN 0.891725
2023-01-03 NaN 1.456383 0.264737 NaN
2023-01-04 1.619186 NaN 0.391061 0.762666
2023-01-05 NaN NaN 0.368026 0.856233
2023-01-06 NaN 0.090970 1.585380 NaN
2023-01-07 NaN NaN 0.456455 NaN
2023-01-08 NaN 0.644303 NaN NaN
2023-01-09 NaN NaN 0.664085 NaN
2023-01-10 NaN NaN NaN NaN
结果里的 NaN 代表该处的值不满足条件(即不大于 0);比较是逐元素进行的,不满足条件的位置会被置为 NaN。
df2[df2["E"].isin(["test", "val"])]
A B C D E F
0 1.0 2023-01-01 1.0 5 test any
2 1.0 2023-01-01 1.0 5 test any
4 1.0 2023-01-01 1.0 5 val any
其中 df2 的数据即前文用字典创建的 df2(原文此处的截图未能保留,可参见上文 df2 的输出)。
df.at[dates[2], "B"] = 0
df
A B C D
2023-01-01 -0.153925 -0.723386 0.392443 -0.745368
2023-01-02 -0.257983 1.627218 -0.362061 0.891725
2023-01-03 -1.199357 0.000000 0.264737 -1.653872
2023-01-04 1.619186 -0.114296 0.391061 0.762666
2023-01-05 -0.388519 -0.347260 0.368026 0.856233
2023-01-06 -0.214157 0.090970 1.585380 -2.145105
2023-01-07 -0.347972 -0.421805 0.456455 -0.647794
2023-01-08 -1.111556 0.644303 -1.621916 -0.713999
2023-01-09 -1.934481 -0.299281 0.664085 -0.719971
2023-01-10 -1.080188 -1.709461 -0.215850 -0.170179
df.iat[1, 3] = 0
df
A B C D
2023-01-01 -0.153925 -0.723386 0.392443 -0.745368
2023-01-02 -0.257983 1.627218 -0.362061 0.000000
2023-01-03 -1.199357 0.000000 0.264737 -1.653872
2023-01-04 1.619186 -0.114296 0.391061 0.762666
2023-01-05 -0.388519 -0.347260 0.368026 0.856233
2023-01-06 -0.214157 0.090970 1.585380 -2.145105
2023-01-07 -0.347972 -0.421805 0.456455 -0.647794
2023-01-08 -1.111556 0.644303 -1.621916 -0.713999
2023-01-09 -1.934481 -0.299281 0.664085 -0.719971
2023-01-10 -1.080188 -1.709461 -0.215850 -0.170179
用 NumPy 数组给 DataFrame 的一列赋值:
df.loc[:, "A"] = np.array([109] * len(df) )
df
A B C D
2023-01-01 109 -0.723386 0.392443 -0.745368
2023-01-02 109 1.627218 -0.362061 0.000000
2023-01-03 109 0.000000 0.264737 -1.653872
2023-01-04 109 -0.114296 0.391061 0.762666
2023-01-05 109 -0.347260 0.368026 0.856233
2023-01-06 109 0.090970 1.585380 -2.145105
2023-01-07 109 -0.421805 0.456455 -0.647794
2023-01-08 109 0.644303 -1.621916 -0.713999
2023-01-09 109 -0.299281 0.664085 -0.719971
2023-01-10 109 -1.709461 -0.215850 -0.170179
df3 = df.copy()
df3[df3 < 0] = -df3
df3
A B C D
2023-01-01 109 0.723386 0.392443 0.745368
2023-01-02 109 1.627218 0.362061 0.000000
2023-01-03 109 0.000000 0.264737 1.653872
2023-01-04 109 0.114296 0.391061 0.762666
2023-01-05 109 0.347260 0.368026 0.856233
2023-01-06 109 0.090970 1.585380 2.145105
2023-01-07 109 0.421805 0.456455 0.647794
2023-01-08 109 0.644303 1.621916 0.713999
2023-01-09 109 0.299281 0.664085 0.719971
2023-01-10 109 1.709461 0.215850 0.170179
s1 = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], index=pd.date_range("20230103", periods=10))
# s1
df["F"] = s1
df
A B C D F
2023-01-01 109 -0.723386 0.392443 -0.745368 NaN
2023-01-02 109 1.627218 -0.362061 0.000000 NaN
2023-01-03 109 0.000000 0.264737 -1.653872 1.0
2023-01-04 109 -0.114296 0.391061 0.762666 2.0
2023-01-05 109 -0.347260 0.368026 0.856233 3.0
2023-01-06 109 0.090970 1.585380 -2.145105 4.0
2023-01-07 109 -0.421805 0.456455 -0.647794 5.0
2023-01-08 109 0.644303 -1.621916 -0.713999 6.0
2023-01-09 109 -0.299281 0.664085 -0.719971 7.0
2023-01-10 109 -1.709461 -0.215850 -0.170179 8.0
先用 reindex 构造一个含有缺失值的 df1,再用 dropna 删除掉含有 NaN 的行:
df1 = df.reindex(index=dates[0:8], columns=list(df.columns) + ["E"])
df1.loc[dates[0]:dates[4], "E"] = 1
df1.dropna( how = "any" )
A B C D F E
2023-01-03 109 0.000000 0.264737 -1.653872 1.0 1.0
2023-01-04 109 -0.114296 0.391061 0.762666 2.0 1.0
2023-01-05 109 -0.347260 0.368026 0.856233 3.0 1.0
df1.fillna(value=9)
A B C D F E
2023-01-01 109 -0.723386 0.392443 -0.745368 9.0 1.0
2023-01-02 109 1.627218 -0.362061 0.000000 9.0 1.0
2023-01-03 109 0.000000 0.264737 -1.653872 1.0 1.0
2023-01-04 109 -0.114296 0.391061 0.762666 2.0 1.0
2023-01-05 109 -0.347260 0.368026 0.856233 3.0 1.0
2023-01-06 109 0.090970 1.585380 -2.145105 4.0 9.0
2023-01-07 109 -0.421805 0.456455 -0.647794 5.0 9.0
2023-01-08 109 0.644303 -1.621916 -0.713999 6.0 9.0
pd.isna(df1)
A B C D F E
2023-01-01 False False False False True False
2023-01-02 False False False False True False
2023-01-03 False False False False False False
2023-01-04 False False False False False False
2023-01-05 False False False False False False
2023-01-06 False False False False False True
2023-01-07 False False False False False True
2023-01-08 False False False False False True
df.mean()
A 109.000000
B -0.125300
C 0.192236
D -0.517739
F 4.500000
dtype: float64
缺失值不计入个数统计和值统计
df.mean(1)
2023-01-01 26.980922
2023-01-02 27.566289
2023-01-03 21.722173
2023-01-04 22.407886
2023-01-05 22.575400
2023-01-06 22.506249
2023-01-07 22.677371
2023-01-08 22.661678
2023-01-09 23.128967
2023-01-10 22.980902
Freq: D, dtype: float64
s = pd.Series([1, 3, 5, np.nan, 6, 8, 9, -1, 2, 3], index=dates).shift(1)
df.sub(s, axis="index")
A B C D F
2023-01-01 NaN NaN NaN NaN NaN
2023-01-02 108.0 0.627218 -1.362061 -1.000000 NaN
2023-01-03 106.0 -3.000000 -2.735263 -4.653872 -2.0
2023-01-04 104.0 -5.114296 -4.608939 -4.237334 -3.0
2023-01-05 NaN NaN NaN NaN NaN
2023-01-06 103.0 -5.909030 -4.414620 -8.145105 -2.0
2023-01-07 101.0 -8.421805 -7.543545 -8.647794 -3.0
2023-01-08 100.0 -8.355697 -10.621916 -9.713999 -3.0
2023-01-09 110.0 0.700719 1.664085 0.280029 8.0
2023-01-10 107.0 -3.709461 -2.215850 -2.170179 6.0
df.apply(np.cumsum)
A B C D F
2023-01-01 109 -0.723386 0.392443 -0.745368 NaN
2023-01-02 218 0.903832 0.030382 -0.745368 NaN
2023-01-03 327 0.903832 0.295119 -2.399240 1.0
2023-01-04 436 0.789536 0.686180 -1.636574 3.0
2023-01-05 545 0.442276 1.054206 -0.780342 6.0
2023-01-06 654 0.533246 2.639586 -2.925447 10.0
2023-01-07 763 0.111441 3.096041 -3.573241 15.0
2023-01-08 872 0.755744 1.474125 -4.287240 21.0
2023-01-09 981 0.456463 2.138210 -5.007211 28.0
2023-01-10 1090 -1.252998 1.922361 -5.177390 36.0
其中 df 即上文添加 F 列之后的 df(原文此处的截图未能保留)。
df.apply(lambda x: x.max() - x.min())
A 0.000000
B 3.336679
C 3.207295
D 3.001338
F 7.000000
dtype: float64
s = pd.Series(np.random.randint(0, 5, size=10))
s
0 1
1 0
2 3
3 1
4 4
5 3
6 2
7 4
8 3
9 2
dtype: int32
s.value_counts()
3 3
1 2
4 2
2 2
0 1
dtype: int64
Series 在 str 属性中配备了一组字符串处理方法,可以很容易地对数组的每个元素进行操作,如下面的代码所示。注意,str 中的模式匹配默认情况下通常使用正则表达式(在某些情况下总是使用它们)。
s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])
s.str.lower()
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 caba
7 dog
8 cat
dtype: object
pandas 提供了各种工具,可以轻松地将 Series 和 DataFrame 对象结合在一起:在连接(join)/合并(merge)类操作中,既可以对索引使用各种集合逻辑,也具备关系代数的功能。
使用 concat() 将 pandas 对象沿某个轴连接在一起:
df = pd.DataFrame(np.random.randn(10, 4))
df
0 1 2 3
0 -1.442477 -1.043943 -0.774844 -0.898237
1 -0.303733 -1.234733 -0.009566 0.481622
2 -1.704948 0.407194 -2.204521 0.357874
3 0.563955 -0.640291 0.809071 -2.357275
4 0.882155 0.267593 -2.405315 1.027958
5 1.025973 -0.043243 -0.371149 0.854007
6 0.406032 -0.284852 0.517701 1.675895
7 -1.137980 1.626351 -0.669841 1.725479
8 0.465242 -0.986676 -0.067137 -0.778349
9 1.349740 -1.668186 -0.421517 -0.024897
pieces = [df[:3], df[7:], df[5:7], df[3:5]]
pd.concat(pieces)
0 1 2 3
0 -1.442477 -1.043943 -0.774844 -0.898237
1 -0.303733 -1.234733 -0.009566 0.481622
2 -1.704948 0.407194 -2.204521 0.357874
7 -1.137980 1.626351 -0.669841 1.725479
8 0.465242 -0.986676 -0.067137 -0.778349
9 1.349740 -1.668186 -0.421517 -0.024897
5 1.025973 -0.043243 -0.371149 0.854007
6 0.406032 -0.284852 0.517701 1.675895
3 0.563955 -0.640291 0.809071 -2.357275
4 0.882155 0.267593 -2.405315 1.027958
向 DataFrame 中添加列是相对较快的。但是,添加一行需要复制一份数据,成本可能很高。建议把预先构建好的记录列表一次性传给 DataFrame 构造函数,而不是通过迭代地追加记录来构建 DataFrame。
merge() 支持按指定列进行 SQL 风格的连接(join)。详见 pandas 用户指南中"数据库风格的连接"一节。
left = pd.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "foo"], "rval": [3, 4]})
pd.merge(left, right, on="key")
key lval rval
0 foo 1 3
1 foo 1 4
2 foo 2 3
3 foo 2 4
left = pd.DataFrame({"key": ["foo1", "foo2"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo1", "foo2"], "rval": [3, 4]})
pd.merge(left, right, on="key")
# output
# key lval rval
#0 foo1 1 3
#1 foo2 2 4