pandas:填充缺失值 fillna("missing") 和fillna("missing",inplace=True)的区别

当数据中存在NaN缺失值时,我们可以用其他数值替代NaN,主要用到了DataFrame.fillna()方法,下面我们来看看具体的用法:

1.先来创建一个带有缺失值的DataFrame:

# coding=utf-8
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import csv
from pandas import DataFrame

# Demo: the different ways of filling NaN values in a DataFrame.
# Build a 5x3 frame of random floats with row labels 'a'..'e'.
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=list('abcde'),
    columns=['one', 'two', 'three'],
)
print(df)  # the original data
print('\n')

# `.ix` was removed in pandas 1.0; `.iloc` is the positional equivalent.
# Positions are 0-based and the slice end is exclusive.
df.iloc[1, :-1] = np.nan   # row 1, every column except the last one
df.iloc[1:-1, 2] = np.nan  # rows 1 up to (not including) the last row, column 2
print(df)  # the data now containing NaN
print('\n')

print(df.fillna("missing"))  # prints a filled copy
print('\n')
print(df)  # df itself still holds the NaN values
print('\n')

# Forward fill: replace NaN with the previous valid value in the column.
# Equivalent to the older spelling fillna(method='pad'), which is deprecated.
print(df.ffill())
print('\n')

# Backward fill: replace NaN with the next valid value in the column;
# limit=1 fills at most one consecutive NaN.
# Equivalent to the older spelling fillna(method='bfill', limit=1).
print(df.bfill(limit=1))
print('\n')

# Fill each column with its own mean (any other descriptive statistic works).
print(df.fillna(df.mean()))
print('\n')

# Restrict the fill to selected columns: the label slice 'one':'two' keeps
# only those two means, so column 'three' is left untouched.
print(df.fillna(df.mean().loc['one':'two']))

结果:
        one       two     three
a  0.348287 -0.579763 -0.687073
b -0.128967  1.734375 -1.530778
c  0.448428 -0.791999  0.620952
d  0.808736 -0.554402 -0.094709
e  1.553160 -1.336362 -0.159426


        one       two     three
a  0.348287 -0.579763 -0.687073
b       NaN       NaN       NaN
c  0.448428 -0.791999       NaN
d  0.808736 -0.554402       NaN
e  1.553160 -1.336362 -0.159426


        one       two     three
a  0.348287 -0.579763 -0.687073
b   missing   missing   missing
c  0.448428 -0.791999   missing
d  0.808736 -0.554402   missing
e   1.55316  -1.33636 -0.159426


        one       two     three
a  0.348287 -0.579763 -0.687073
b       NaN       NaN       NaN
c  0.448428 -0.791999       NaN
d  0.808736 -0.554402       NaN
e  1.553160 -1.336362 -0.159426


        one       two     three
a  0.348287 -0.579763 -0.687073
b  0.348287 -0.579763 -0.687073
c  0.448428 -0.791999 -0.687073
d  0.808736 -0.554402 -0.687073
e  1.553160 -1.336362 -0.159426


        one       two     three
a  0.348287 -0.579763 -0.687073
b  0.448428 -0.791999       NaN
c  0.448428 -0.791999       NaN
d  0.808736 -0.554402 -0.159426
e  1.553160 -1.336362 -0.159426


        one       two     three
a  0.348287 -0.579763 -0.687073
b  0.789653 -0.815631 -0.423249
c  0.448428 -0.791999 -0.423249
d  0.808736 -0.554402 -0.423249
e  1.553160 -1.336362 -0.159426


        one       two     three
a  0.348287 -0.579763 -0.687073
b  0.789653 -0.815631       NaN
c  0.448428 -0.791999       NaN
d  0.808736 -0.554402       NaN
e  1.553160 -1.336362 -0.159426

2. fillna("missing") 和 fillna("missing", inplace=True) 的区别:前者返回一个填充后的新 DataFrame,原来的 df 不变;后者直接在原来的 df 上进行修改。
# coding=utf-8
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import csv
from pandas import DataFrame

# Demo: fillna(value) versus fillna(value, inplace=True).
# Build a 5x3 frame of random floats with row labels 'a'..'e'.
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=list('abcde'),
    columns=['one', 'two', 'three'],
)
print(df)  # the original data
print('\n')

# `.ix` was removed in pandas 1.0; `.iloc` is the positional equivalent.
# Positions are 0-based and the slice end is exclusive.
df.iloc[1, :-1] = np.nan   # row 1, every column except the last one
df.iloc[1:-1, 2] = np.nan  # rows 1 up to (not including) the last row, column 2
print(df)  # the data now containing NaN
print('\n')

# Without inplace, only the printed copy is filled.
print(df.fillna("missing"))
print('\n')
print(df)  # df is unchanged and still holds the NaN values
print('\n')

# With inplace=True, df itself is modified.
df.fillna("missing", inplace=True)
print(df)  # df now holds "missing" where the NaN values were
print("----------")

结果:
        one       two     three
a  0.428457 -0.797473 -0.448647
b -1.744598 -0.944395  0.952140
c  1.096071  0.812616  1.980379
d -1.120961  1.193119  0.455609
e  1.039164 -0.384459  0.289628


        one       two     three
a  0.428457 -0.797473 -0.448647
b       NaN       NaN       NaN
c  1.096071  0.812616       NaN
d -1.120961  1.193119       NaN
e  1.039164 -0.384459  0.289628


        one       two     three
a  0.428457 -0.797473 -0.448647
b   missing   missing   missing
c   1.09607  0.812616   missing
d  -1.12096   1.19312   missing
e   1.03916 -0.384459  0.289628


        one       two     three
a  0.428457 -0.797473 -0.448647
b       NaN       NaN       NaN
c  1.096071  0.812616       NaN
d -1.120961  1.193119       NaN
e  1.039164 -0.384459  0.289628


        one       two     three
a  0.428457 -0.797473 -0.448647
b   missing   missing   missing
c   1.09607  0.812616   missing
d  -1.12096   1.19312   missing
e   1.03916 -0.384459  0.289628
----------








你可能感兴趣的:(python,学习)