滤除缺失数据
data.dropna(axis=1,how=’all’) #pandas的dropna函数里的axis=0是对行进行操作
import numpy as np
from pandas import Series,DataFrame #约定俗成的导入方法
import pandas as pd
df=DataFrame(np.random.randn(7,3))
df
0 1 2
0 1.233293 0.808366 -1.395037
1 -2.119901 1.020265 -1.838668
2 0.036282 -1.194296 1.062114
3 -0.005518 -0.508407 2.166345
4 -0.168671 1.506642 1.294891
5 -0.278522 0.016571 -1.090438
6 -0.258114 0.027558 0.122468
df.dropna(thresh=3) ###一行中至少三个非NAN的值
0 1 2
5 -1.596875 1.574546 0.312099
6 -0.963191 -0.304063 0.429177
填充缺失数据
df:
0 1 2
0 0.734225 NaN NaN
1 0.508512 NaN NaN
2 -1.814688 NaN NaN
3 0.914939 NaN 0.486078
4 -1.417042 NaN 1.061631
5 0.327197 -1.658661 0.573605
6 1.516876 -0.322871 -1.013175
df.fillna({1:0.5,3:-1}) #注意:填充具体值处一定是原本缺失数据的地方!!对这些地方中的:第一列赋值0.5,第3列赋值-1(没有第三列)
df
0 1 2
0 0.734225 0.500000 NaN
1 0.508512 0.500000 NaN
2 -1.814688 0.500000 NaN
3 0.914939 0.500000 0.486078
4 -1.417042 0.500000 1.061631
5 0.327197 -1.658661 0.573605
6 1.516876 -0.322871 -1.013175
层次化索引
出现分层索引,即层次化索引
IN:
data=Series(np.random.randn(10),index=[['a','a','a','b','b','b','c','c','d','d'],[1,2,3,1,2,3,1,2,2,3]])
data
OUT:
a 1 2.138609
2 0.608215
3 0.368269
b 1 -0.709574
2 0.671569
3 1.421320
c 1 -0.384030
2 -1.273858
d 2 0.828642
3 -2.359443
dtype: float64
IN:
data.index
OUT:
MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
In:
data.unstack() #把分层的索引排布到列上
OUT:
1 2 3
a 2.138609 0.608215 0.368269
b -0.709574 0.671569 1.421320
c -0.384030 -1.273858 NaN
d NaN 0.828642 -2.359443
IN:
data.unstack().stack()
Out[51]:
a 1 2.138609
2 0.608215
3 0.368269
b 1 -0.709574
2 0.671569
3 1.421320
c 1 -0.384030
2 -1.273858
d 2 0.828642
3 -2.359443
dtype: float64
IN:
frame=DataFrame(np.arange(12).reshape((4,3)),
index=[['a','a','b','b'],[1,2,1,2,]],
columns=[['Ohio','Ohio','Colorado'], ['Green','Red','Green']])
frame