运算通常会排除缺失值(NaN)
df
=>
A B C D F
2013-01-01 0.000000 0.000000 1.110245 5 NaN
2013-01-02 -1.142750 -0.604850 1.583275 5 1.0
2013-01-03 0.024858 -1.279228 0.637824 5 2.0
2013-01-04 -0.722681 -1.083921 1.132138 5 3.0
2013-01-05 1.726373 -0.151729 0.391834 5 4.0
2013-01-06 0.508772 -0.751476 -0.699193 5 5.0
求每列平均值
df.mean() #每列平均值
=>
A 0.065762
B -0.645201
C 0.692687
D 5.000000
F 3.000000
dtype: float64
求指定轴上的平均值:
df.mean(axis=1) #算每行的平均值(忽略NaN)
=>
2013-01-01 1.527561 #因F列为空,第一行算平均值除4
2013-01-02 1.167135
2013-01-03 1.276691
2013-01-04 1.465107
2013-01-05 2.193295
2013-01-06 1.811621
Freq: D, dtype: float64
维度不同的 pandas 对象之间做运算时,会按索引自动对齐;shift 用来把数据整体移位。
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2) #整体向下移动2位,前两个位置补 NaN
s
=>
2013-01-01 NaN
2013-01-02 NaN
2013-01-03 1.0
2013-01-04 3.0
2013-01-05 5.0
2013-01-06 NaN
Freq: D, dtype: float64
对不同维度的 pandas 对象进行减法操作:
df.sub(s, axis='index') #df的每一列都按行索引减去s,与NaN相减的结果为NaN
=>
A B C D F
2013-01-01 NaN NaN NaN NaN NaN
2013-01-02 NaN NaN NaN NaN NaN
2013-01-03 -0.975142 -2.279228 -0.362176 4.0 1.0
2013-01-04 -3.722681 -4.083921 -1.867862 2.0 0.0
2013-01-05 -3.273627 -5.151729 -4.608166 0.0 -1.0
2013-01-06 NaN NaN NaN NaN NaN
对数据应用 numpy 函数:
df.apply(np.cumsum) #累加,最后一行的值是前面所有行的累加和
=>
A B C D F
2013-01-01 0.000000 0.000000 1.110245 5 NaN
2013-01-02 -1.142750 -0.604850 2.693520 10 1.0
2013-01-03 -1.117892 -1.884078 3.331344 15 3.0
2013-01-04 -1.840573 -2.967999 4.463482 20 6.0
2013-01-05 -0.114200 -3.119728 4.855316 25 10.0
2013-01-06 0.394572 -3.871204 4.156123 30 15.0
应用自定义函数:
df.apply(lambda x: x.max() - x.min())
=>
A 2.869123
B 1.279228
C 2.282468
D 0.000000
F 4.000000
dtype: float64
对不同值的数量进行统计
s = pd.Series(np.random.randint(0, 7, size=10))
s
=>
0 4
1 2
2 1
3 2
4 6
5 4
6 4
7 6
8 4
9 4
dtype: int32
s.value_counts()
=>
4 5 #4出现5次,6出现2次,依此类推
6 2
2 2
1 1
dtype: int64
String Methods
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()
=>
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 caba
7 dog
8 cat
dtype: object
使用 concat() 连接 pandas 对象:
df = pd.DataFrame(np.random.randn(10, 4))
df
=>
0 1 2 3
0 -0.548702 1.467327 -1.015962 -0.483075
1 1.637550 -1.217659 -0.291519 -1.745505
2 -0.263952 0.991460 -0.919069 0.266046
3 -0.709661 1.669052 1.037882 -1.705775
4 -0.919854 -0.042379 1.247642 -0.009920
5 0.290213 0.495767 0.362949 1.548106
6 -1.131345 -0.089329 0.337863 -0.945867
7 -0.932132 1.956030 0.017587 -0.016692
8 -0.575247 0.254161 -1.143704 0.215897
9 1.193555 -0.077118 -0.408530 -0.862495
pieces = [df[:3], df[7:]] #切片
pd.concat(pieces)
=>
0 1 2 3
0 -0.548702 1.467327 -1.015962 -0.483075
1 1.637550 -1.217659 -0.291519 -1.745505
2 -0.263952 0.991460 -0.919069 0.266046
7 -0.932132 1.956030 0.017587 -0.016692
8 -0.575247 0.254161 -1.143704 0.215897
9 1.193555 -0.077118 -0.408530 -0.862495
SQL 风格的合并
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
left
=>
key lval
0 foo 1
1 foo 2
right
=>
key rval
0 foo 4
1 foo 5
pd.merge(left, right, on='key')
=>
key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5
在 dataframe 数据后追加行
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df
=>
A B C D
0 1.346061 1.511763 1.627081 -0.990582
1 -0.441652 1.211526 0.268520 0.024580
2 -1.577585 0.396823 -0.105381 -0.532532
3 1.453749 1.208843 -0.080952 -0.264610
4 -0.727965 -0.589346 0.339969 -0.693205
5 -0.339355 0.593616 0.884345 1.591431
6 0.141809 0.220390 0.435589 0.192451
7 -0.096701 0.803351 1.715071 -0.708758
s = df.iloc[3]
df.append(s, ignore_index=True) #最后再加上第四行;注:pandas 2.0 已移除 append,可改用 pd.concat([df, s.to_frame().T], ignore_index=True)
=>
A B C D
0 1.346061 1.511763 1.627081 -0.990582
1 -0.441652 1.211526 0.268520 0.024580
2 -1.577585 0.396823 -0.105381 -0.532532
3 1.453749 1.208843 -0.080952 -0.264610
4 -0.727965 -0.589346 0.339969 -0.693205
5 -0.339355 0.593616 0.884345 1.591431
6 0.141809 0.220390 0.435589 0.192451
7 -0.096701 0.803351 1.715071 -0.708758
8 1.453749 1.208843 -0.080952 -0.264610
分组常常意味着可能包含以下的几种的操作中一个或多个:
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
df
=>
A B C D
0 foo one -1.202872 -0.055224
1 bar one -1.814470 2.395985
2 foo two 1.018601 1.552825
3 bar three -0.595447 0.166599
4 foo two 1.395433 0.047609
5 bar two -0.392670 -0.136473
6 foo one 0.007207 -0.561757
7 foo three 1.928123 -1.623033
#对单个分组应用函数,数据被分成了 bar 组与 foo 组,分别计算总和。
df.groupby('A').sum()
=>
C D
A
bar -2.802588 2.42611
foo 3.146492 -0.63958
#依据多个列分组会构成一个分级索引:
df.groupby(['A','B']).sum()
=>
C D
A B
bar one -1.814470 2.395985
three -0.595447 0.166599
two -0.392670 -0.136473
foo one -1.195665 -0.616981
three 1.928123 -1.623033
two 2.414034 1.600434
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
'B' : ['A', 'B', 'C'] * 4,
'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
'D' : np.random.randn(12),
'E' : np.random.randn(12)})
df
=>
A B C D E
0 one A foo 1.418757 -0.179666
1 one B foo -1.879024 1.291836
2 two C foo 0.536826 -0.009614
3 three A bar 1.006160 0.392149
4 one B bar -0.029716 0.264599
5 one C bar -1.146178 -0.057409
6 two A foo 0.100900 -1.425638
7 three B foo -1.035018 1.024098
8 one C foo 0.314665 -0.106062
9 one A bar -0.773723 1.824375
10 two B bar -1.170653 0.595974
11 three C bar 0.648740 1.167115
#生成数据透视表
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
=>
C bar foo
A B
one A -0.773723 1.418757
B -0.029716 -1.879024
C -1.146178 0.314665
three A 1.006160 NaN
B NaN -1.035018
C 0.648740 NaN
two A NaN 0.100900
B -1.170653 NaN
C NaN 0.536826
pandas 拥有既简单又强大的频率变换重新采样功能,下面的例子从 1次/秒 转换到了 1次/5分钟:
rng = pd.date_range('1/1/2012', periods=100, freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.resample('5Min').sum()
=>
2012-01-01 25083
Freq: 5T, dtype: int32
#本地化时区表示
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
ts = pd.Series(np.random.randn(len(rng)), rng)
ts
=>
2012-03-06 0.464000
2012-03-07 0.227371
2012-03-08 -0.496922
2012-03-09 0.306389
2012-03-10 -2.290613
Freq: D, dtype: float64
ts_utc = ts.tz_localize('UTC')
ts_utc
=>
2012-03-06 00:00:00+00:00 0.464000
2012-03-07 00:00:00+00:00 0.227371
2012-03-08 00:00:00+00:00 -0.496922
2012-03-09 00:00:00+00:00 0.306389
2012-03-10 00:00:00+00:00 -2.290613
Freq: D, dtype: float64
#转化成其他时区
ts_utc.tz_convert('US/Eastern')
=>
2012-03-05 19:00:00-05:00 0.464000
2012-03-06 19:00:00-05:00 0.227371
2012-03-07 19:00:00-05:00 -0.496922
2012-03-08 19:00:00-05:00 0.306389
2012-03-09 19:00:00-05:00 -2.290613
Freq: D, dtype: float64
#时间跨度的转换
rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
=>
2012-01-31 -1.134623
2012-02-29 -1.561819
2012-03-31 -0.260838
2012-04-30 0.281957
2012-05-31 1.523962
Freq: M, dtype: float64
#转换为周期
ps = ts.to_period()
ps
=>
2012-01 -1.134623
2012-02 -1.561819
2012-03 -0.260838
2012-04 0.281957
2012-05 1.523962
Freq: M, dtype: float64
#转换为时间戳
ps.to_timestamp()
=>
2012-01-01 -1.134623
2012-02-01 -1.561819
2012-03-01 -0.260838
2012-04-01 0.281957
2012-05-01 1.523962
Freq: MS, dtype: float64
#在周期与时间戳之间进行转换这一功能对一些算术函数很有用,下面的例子改变时间序列的相位。
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
ts = pd.Series(np.random.randn(len(prng)), prng)
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9
ts.head()
=>
1990-03-01 09:00 -0.902937
1990-06-01 09:00 0.068159
1990-09-01 09:00 -0.057873
1990-12-01 09:00 -0.368204
1991-03-01 09:00 -1.144073
Freq: H, dtype: float64
df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
df
=>
id raw_grade
0 1 a
1 2 b
2 3 b
3 4 a
4 5 a
5 6 e
#将 raw_grades 转换成 Categoricals 类型。
df["grade"] = df["raw_grade"].astype("category")
df["grade"]
=>
0 a
1 b
2 b
3 a
4 a
5 e
Name: grade, dtype: category
Categories (3, object): [a, b, e]
#重命名分类
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"]) #新版 pandas 已移除对 cat.categories 的直接赋值
=>
#对分类进行重排序,同时加入新的分类。
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
df["grade"]
=>
0 very good
1 good
2 good
3 very good
4 very good
5 very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]
#根据分类的顺序对数据进行排序
df.sort_values(by="grade") #注:df.sort() 已在 pandas 0.20 中移除,应使用 sort_values
=>
id raw_grade grade
5 6 e very bad
1 2 b good
2 3 b good
0 1 a very good
3 4 a very good
4 5 a very good
#按类别分组
df.groupby("grade").size()
=>
grade
very bad 1
bad NaN
medium NaN
good 2
very good 3
dtype: float64
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()
在 DataFrame 中, plot() 可以很方便地为所有列作图:
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
plt.figure();
df.plot();
plt.legend(loc='best')
#保存到 csv 文件
df.to_csv('foo.csv')
#从 csv 文件读取数据
pd.read_csv('foo.csv')
#保存到 excel 文件
df.to_excel('foo.xlsx', sheet_name='Sheet1')
#读取 excel 文件
pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])