# Mean of each column.
df.mean(axis=0)
# Mean of each row.
df.mean(axis=1)
# Subtract a Series from the frame, aligning the Series on the row index.
s = pd.Series(
    [1, 3, 5, np.nan, 6, 8],
    index=dates,
).shift(2)
s
df.sub(s, axis='index')
A B C D F
2013-01-01 NaN NaN NaN NaN NaN
2013-01-02 NaN NaN NaN NaN NaN
2013-01-03 -1.158085 -1.262675 -1.465764 -6.0 -3.0
2013-01-04 -3.679138 -3.191328 -4.159281 -8.0 -6.0
2013-01-05 -5.007158 -6.672655 -5.091954 -10.0 -9.0
2013-01-06 NaN NaN NaN NaN NaN
# Apply a cumulative-sum function to every column.
df.apply(np.cumsum)
A B C D F
2013-01-01 -0.001431 -0.908440 -0.851724 -5 NaN
2013-01-02 -1.093717 -2.312200 -1.815194 -10 -1.0
2013-01-03 -1.251802 -2.574875 -2.280958 -15 -3.0
2013-01-04 -1.930940 -2.766203 -3.440239 -20 -6.0
2013-01-05 -1.938097 -4.438858 -3.532193 -25 -10.0
2013-01-06 -2.051573 -4.438876 -5.427721 -30 -15.0
# Range (maximum minus minimum) of every column.
df.apply(lambda col: col.max() - col.min())
A 1.090854
B 1.672638
C 1.803573
D 0.000000
F 4.000000
dtype: float64
# Ten random integers drawn from 0..6; used below to count how often
# each distinct value occurs.
s = pd.Series(
    np.random.randint(0, 7, size=10),
)
s
0 2
1 0
2 4
3 5
4 0
5 2
6 6
7 3
8 3
9 5
dtype: int32
s.value_counts()  # number of occurrences of each distinct value
5 2
3 2
2 2
0 2
6 1
4 1
dtype: int64
# Vectorised string methods live on the Series `.str` accessor; a DataFrame
# has no `.str`, so the example uses its own Series of strings.
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()  # lowercase every element (NaN stays NaN)
s.str.upper()  # uppercase every element
# Split a 10x4 random frame into three row-wise chunks, then concatenate
# the chunks back together.
df = pd.DataFrame(np.random.randn(10, 4))
pieces = [
    df.iloc[start:stop]
    for start, stop in ((0, 3), (3, 7), (7, None))
]
pd.concat(pieces)
# Compare two kinds of merge: duplicate join keys vs. unique join keys.
# Both frames repeat the same join key 'foo' on every row.
left = pd.DataFrame({
    'key': ['foo'] * 2,
    'lval': [1, 2],
})
right = pd.DataFrame({
    'key': ['foo'] * 2,
    'rval': [4, 5],
})
left
key lval
0 foo 1
1 foo 2
right
key rval
0 foo 4
1 foo 5
# Duplicate keys on both sides: every left 'foo' row pairs with every
# right 'foo' row, giving 2 x 2 = 4 result rows.
pd.merge(left, right, on='key')
key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5
# Each join key ('foo', 'bar') appears exactly once in each frame.
left = pd.DataFrame({
    'key': ['foo', 'bar'],
    'lval': [1, 2],
})
right = pd.DataFrame({
    'key': ['foo', 'bar'],
    'rval': [4, 5],
})
left
key lval
0 foo 1
1 bar 2
right
key rval
0 foo 4
1 bar 5
# Unique keys: rows match one-to-one, so the result has one row per key.
pd.merge(left, right, on='key')
key lval rval
0 foo 1 4
1 bar 2 5
# 8x4 frame of standard-normal samples with columns A..D.
df = pd.DataFrame(
    np.random.randn(8, 4),
    columns=list('ABCD'),
)
df
A B C D
0 -1.221865 -0.313737 0.813024 -2.067007
1 -0.833239 -1.123765 -0.580756 -1.618360
2 0.780570 0.057091 1.610320 1.198047
3 1.306492 -0.657629 0.946997 0.064994
4 -0.104776 -0.300427 -0.226296 -0.638638
5 -0.215063 -0.443774 1.900574 -0.392732
6 -0.108958 0.813018 -0.316127 -1.677159
7 0.678901 0.164350 -1.391680 0.434714
# Take row 3 as a Series and add it to the frame as a new last row.
# The original trailing backslash glued these two statements into one
# invalid line, and DataFrame.append was removed in pandas 2.0, so the row
# is turned into a one-row frame and concatenated instead.
s = df.iloc[3]
pd.concat([df, s.to_frame().T], ignore_index=True)
A B C D
0 -1.221865 -0.313737 0.813024 -2.067007
1 -0.833239 -1.123765 -0.580756 -1.618360
2 0.780570 0.057091 1.610320 1.198047
3 1.306492 -0.657629 0.946997 0.064994
4 -0.104776 -0.300427 -0.226296 -0.638638
5 -0.215063 -0.443774 1.900574 -0.392732
6 -0.108958 0.813018 -0.316127 -1.677159
7 0.678901 0.164350 -1.391680 0.434714
8 1.306492 -0.657629 0.946997 0.064994
# Frame with two label columns (A, B) and two random numeric columns (C, D),
# used for the groupby examples below.
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar',
          'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three',
          'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
df
A B C D
0 foo one 1.981136 1.652507
1 bar one 2.676476 -1.424416
2 foo two -0.975054 -0.711273
3 bar three -0.366664 1.363469
4 foo two -1.447261 -0.122510
5 bar two 0.138113 -0.559464
6 foo one -1.292988 -0.375974
7 foo three -0.533342 1.218957
# Sum the numeric columns within each group of A. numeric_only=True keeps
# the string column B out of the result (pandas >= 2.0 would otherwise
# concatenate its strings), matching the C/D-only output shown below.
df.groupby('A').sum(numeric_only=True)
C D
A
bar 2.447925 -0.620411
foo -2.267508 1.661708
# Group by two keys: the result is indexed by the (A, B) pairs.
df.groupby(['A', 'B']).sum()
C D
A B
bar one 2.676476 -1.424416
three -0.366664 1.363469
two 0.138113 -0.559464
foo one 0.688148 1.276533
three -0.533342 1.218957
two -2.422314 -0.833782
# Build a two-level MultiIndex from (first, second) label pairs and use it
# to index a random 8x2 frame; df2 keeps the first four rows.
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
                     'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two',
                     'one', 'two', 'one', 'two']]))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df2 = df[:4]
df2
Out[99]:
A B
first second
bar one 0.029399 -0.542108
two 0.282696 -0.087302
baz one -1.575170 1.771208
two 0.816482 1.100230
# stack() moves the column labels (A, B) into a new innermost index level,
# producing a Series.
stacked = df2.stack()
stacked
Out[101]:
first second
bar one A 0.029399
B -0.542108
two A 0.282696
B -0.087302
baz one A -1.575170
B 1.771208
two A 0.816482
B 1.100230
dtype: float64
# With no argument, unstack() moves the last index level back to columns.
stacked.unstack()
Out[102]:
A B
first second
bar one 0.029399 -0.542108
two 0.282696 -0.087302
baz one -1.575170 1.771208
two 0.816482 1.100230
# Move index level 1 ('second') to the columns.
stacked.unstack(1)
Out[103]:
second one two
first
bar A 0.029399 0.282696
B -0.542108 -0.087302
baz A -1.575170 0.816482
B 1.771208 1.100230
# Move index level 0 ('first') to the columns.
stacked.unstack(0)
Out[104]:
first bar baz
second
one A 0.029399 -1.575170
B -0.542108 1.771208
two A 0.282696 0.816482
B -0.087302 1.100230
# 12-row frame with three label columns (A, B, C) and two random numeric
# columns (D, E), used for the pivot-table example below.
df = pd.DataFrame({
    'A': ['one', 'one', 'two', 'three'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    'D': np.random.randn(12),
    'E': np.random.randn(12),
})
df
Out[106]:
A B C D E
0 one A foo 1.418757 -0.179666
1 one B foo -1.879024 1.291836
2 two C foo 0.536826 -0.009614
3 three A bar 1.006160 0.392149
4 one B bar -0.029716 0.264599
5 one C bar -1.146178 -0.057409
6 two A foo 0.100900 -1.425638
7 three B foo -1.035018 1.024098
8 one C foo 0.314665 -0.106062
9 one A bar -0.773723 1.824375
10 two B bar -1.170653 0.595974
11 three C bar 0.648740 1.167115
# Pivot table: rows indexed by (A, B), one column per distinct value of C,
# cells filled from D.
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
Out[107]:
C bar foo
A B
one A -0.773723 1.418757
B -0.029716 -1.879024
C -1.146178 0.314665
three A 1.006160 NaN
B NaN -1.035018
C 0.648740 NaN
two A NaN 0.100900
B -1.170653 NaN
C NaN 0.536826