pandas缺失值处理

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html

import pandas as pd
import numpy as np
df=pd.DataFrame(np.random.rand(5,6))
df
0 1 2 3 4 5
0 0.399916 0.546635 0.729255 0.992735 0.747917 0.157603
1 0.200017 0.710527 0.361684 0.094026 0.895336 0.848446
2 0.552851 0.456602 0.410653 0.835223 0.769456 0.803724
3 0.392067 0.397841 0.393220 0.745361 0.360859 0.383625
4 0.820769 0.893594 0.312887 0.378115 0.584261 0.214013
# Make a few areas have NaN values
df.iloc[1:3,1] = np.nan
df.iloc[3:,3] = np.nan
df.iloc[2,5]=np.nan
df
0 1 2 3 4 5
0 0.399916 0.546635 0.729255 0.992735 0.747917 0.157603
1 0.200017 NaN 0.361684 0.094026 0.895336 0.848446
2 0.552851 NaN 0.410653 0.835223 0.769456 NaN
3 0.392067 0.397841 0.393220 NaN 0.360859 0.383625
4 0.820769 0.893594 0.312887 NaN 0.584261 0.214013
df.isnull()
0 1 2 3 4 5
0 False False False False False False
1 False True False False False False
2 False True False False False True
3 False False False True False False
4 False False False True False False
#显示存在缺失值的行,便于清楚地确定缺失值的位置(注意:一行中有多个缺失值时该行会重复出现,如下面的第2行;若不想重复可改用 df[df.isnull().any(axis=1)])
df[df.isnull().values==True]
0 1 2 3 4 5
1 0.200017 NaN 0.361684 0.094026 0.895336 0.848446
2 0.552851 NaN 0.410653 0.835223 0.769456 NaN
2 0.552851 NaN 0.410653 0.835223 0.769456 NaN
3 0.392067 0.397841 0.393220 NaN 0.360859 0.383625
4 0.820769 0.893594 0.312887 NaN 0.584261 0.214013
#填充缺失数据
df.fillna(0)
0 1 2 3 4 5
0 0.399916 0.546635 0.729255 0.992735 0.747917 0.157603
1 0.200017 0.000000 0.361684 0.094026 0.895336 0.848446
2 0.552851 0.000000 0.410653 0.835223 0.769456 0.000000
3 0.392067 0.397841 0.393220 0.000000 0.360859 0.383625
4 0.820769 0.893594 0.312887 0.000000 0.584261 0.214013
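#沿指定轴用前一个有效值填充缺失值(axis=0:沿列方向,取上一行的值;axis=1:沿行方向,取左边一列的值)。method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None;'pad'/'ffill' 为向前填充,'backfill'/'bfill' 为向后填充。注:自 pandas 2.1 起 fillna 的 method 参数已弃用,推荐改用 df.ffill() / df.bfill()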
df.fillna(axis=0,method='ffill')
0 1 2 3 4 5
0 0.399916 0.546635 0.729255 0.992735 0.747917 0.157603
1 0.200017 0.546635 0.361684 0.094026 0.895336 0.848446
2 0.552851 0.546635 0.410653 0.835223 0.769456 0.848446
3 0.392067 0.397841 0.393220 0.835223 0.360859 0.383625
4 0.820769 0.893594 0.312887 0.835223 0.584261 0.214013
df.columns=['A','B','C','D','E','F']
df
A B C D E F
0 0.399916 0.546635 0.729255 0.992735 0.747917 0.157603
1 0.200017 NaN 0.361684 0.094026 0.895336 0.848446
2 0.552851 NaN 0.410653 0.835223 0.769456 NaN
3 0.392067 0.397841 0.393220 NaN 0.360859 0.383625
4 0.820769 0.893594 0.312887 NaN 0.584261 0.214013
#Replace all NaN elements in columns 'A', 'B', 'C', 'D', 'E', and 'F' with 0, 1, 2, 3, 4, and 5 respectively.
values = {'A': 0, 'B': 1, 'C': 2, 'D': 3,'E':4,'F':5}
df.fillna(value=values)
A B C D E F
0 0.399916 0.546635 0.729255 0.992735 0.747917 0.157603
1 0.200017 1.000000 0.361684 0.094026 0.895336 0.848446
2 0.552851 1.000000 0.410653 0.835223 0.769456 5.000000
3 0.392067 0.397841 0.393220 3.000000 0.360859 0.383625
4 0.820769 0.893594 0.312887 3.000000 0.584261 0.214013

你可能感兴趣的:(数据分析)