本文主要介绍一下 Pandas 数据清洗,处理缺失值。更多 Python 进阶系列文章,请参考 Python 进阶学习 玩转数据系列
内容提要:
对象 | 描述 |
---|---|
NA | Not Available |
NaN | Not a Number |
None | Pythonic missing data, a Python object that can be used in arrays with data type object |
inf | positive infinity |
-inf | negative infinity |
注意:NaN 是浮点类型的值;如果把包含 NaN 的数据按其它类型(例如 int)读取或转换,会抛出异常。
import io

import pandas as pd

# Pipe-delimited sample: row 3 has no Weight and row 4 has no Age.
data_pd_na = '''Gender|Age|Weight
M | 22 | 72.0
F | 29 | 55.0
M | 24 |
F || 57.0
'''
# Wrap the text in a file-like buffer so pandas can parse it; the empty
# fields are loaded as missing values instead of raising an error.
na_buffer = io.StringIO(data_pd_na)
df_na = pd.read_table(na_buffer, sep='|')
# NumPy: can NOT handle missing (NA) fields when loading text.
import io

import numpy as np

# The second row has only two fields, so it cannot be mapped onto the
# three-field structured dtype requested below.
d2 = '''M 22 72.0
F 29
M 24 78.0
F 25 57.0
'''
data_np_na = io.StringIO(d2)

# NOTE(review): the third field is named 'Color' although the values look
# like the weights used in the pandas example -- confirm the intended name.
# The ValueError is the point of this demo, so it is caught and reported
# instead of aborting the rest of the script.
loadtxt_error = None
try:
    np.loadtxt(data_np_na,
               dtype={'names': ('Gender', 'Age', 'Color'),
                      'formats': ('S1', 'i4', 'f4')})
except ValueError as exc:
    loadtxt_error = exc
    print("np.loadtxt failed as expected: {}".format(exc))
# Missing values written explicitly as NaN.
import io

import numpy as np

d3 = '''M 22 72.0
F 29 NaN
M 24 78.0
F 25 57.0
'''
data_np_nan = io.StringIO(d3)

# Works because the gap is spelled out as NaN and the third field is a float.
# NOTE(review): the third field is named 'Color' although the values look
# like the weights used in the pandas example -- confirm the intended name.
np_array_nan = np.loadtxt(data_np_nan,
                          dtype={'names': ('Gender', 'Age', 'Color'),
                                 'formats': ('S1', 'i4', 'f4')})

# A structured dtype yields a 1-D array of records (one tuple per row);
# view it as a 4-by-1 column and keep the result so it can be inspected.
np_column_nan = np_array_nan.reshape((4, 1))
print(np_column_nan)
# Missing values written explicitly as NaN, all-numeric data.
import io

import numpy as np

d3 = '''1 22 1
1 29 NaN
1 24 1
1 25 1
'''
data_np_nan = io.StringIO(d3)

# State explicitly that the values are floating point so NaN can be stored.
# The builtin `float` is used because the `np.float` alias was removed in
# NumPy 1.24; both mean float64.
np_array_nan = np.loadtxt(data_np_nan, dtype=float)
print(np_array_nan)
print(np_array_nan.shape)  # 4-by-3 array
# should fail if try to treat nan as int:
d4 = '''1 22 1
1 29 NaN
1 24 1
1 25 1
'''
data_np_nan = io.StringIO(d4)
np_array_nan_int = np.loadtxt(data_np_nan, dtype=np.int)
import numpy as np

# Positive infinity: the builtin float, NumPy's constant, and the legacy alias.
p_inf = float("inf")
np_posinf = np.inf
# np.PINF was removed in NumPy 2.0; fall back to np.inf where it is missing.
np_posinf_1 = getattr(np, "PINF", np.inf)

# Negative infinity, same three spellings.
n_inf = float("-inf")
np_neginf = -np.inf
# np.NINF was removed in NumPy 2.0; fall back to -np.inf where it is missing.
np_neginf_1 = getattr(np, "NINF", -np.inf)

print("p_inf:{}".format(p_inf))
print("np_posinf:{}".format(np_posinf))
print("np_posinf_1:{}".format(np_posinf_1))
print("n_inf:{}".format(n_inf))
print("np_neginf:{}".format(np_neginf))
print("np_neginf_1:{}".format(np_neginf_1))
输出:
p_inf:inf
np_posinf:inf
np_posinf_1:inf
n_inf:-inf
np_neginf:-inf
np_neginf_1:-inf
import io

import numpy as np
import pandas as pd

# np.isinf flags both positive and negative infinity element-wise.
x = np.array([-np.inf, 0., np.inf])
print("x:\n{}".format(np.isinf(x)))

# 'inf' / '-inf' are written without surrounding spaces in the sample data.
data_pd_na = '''Gender|Age|Weight
M | 22 | 72.0
F || 55.0
M | 24 |-inf
F |inf| 57.0
'''
df = pd.read_table(io.StringIO(data_pd_na), sep='|')

# Boolean mask: True wherever a cell is +inf or -inf.
inf_df = df.isin([np.inf, -np.inf])

# Remove rows with inf. The mask is reused instead of being recomputed, and
# the axis is passed by keyword because recent pandas versions no longer
# accept it positionally.
no_inf_df = df[~inf_df.any(axis=1)]

print("inf_df:\n{}".format(inf_df))
print("no_inf_df:\n{}".format(no_inf_df))
通过 `pd.set_option('mode.use_inf_as_na', True)` 让 pandas 把 inf 和 -inf 视为缺失值(NaN)
# Demo: print the same DataFrame before and after switching the global
# pandas option 'mode.use_inf_as_na' from False to True.
# NOTE(review): this option appears to be deprecated in recent pandas
# releases -- confirm against the installed version before relying on it.
import pandas as pd
import io
# Tip: in IPython/Jupyter, `pd.set_option?` shows the help for this function.
# Start from a known state: set the option to False (stated to be the default).
pd.set_option ('mode.use_inf_as_na', False)
pd_option = pd.get_option ('mode.use_inf_as_na')
# Pipe-delimited sample containing one empty field, one -inf and one inf.
data_pd_inf = '''Gender|Age|Weight
M | 22 | 72.0
F || 55.0
M | 24 |-inf
F |inf| 57.0
'''
df_inf = pd.read_table (io.StringIO(data_pd_inf), sep = '|')
# First report: the option value read back above, then the frame as loaded.
print("options mode.use_inf_as_na:{}".format(pd_option))
print("df_inf:\n{}".format(df_inf))
# Switch the option on. This is a global setting and it is not restored
# afterwards, so it stays enabled for any code that runs later.
pd.set_option ('mode.use_inf_as_na', True)
pd_option = pd.get_option ('mode.use_inf_as_na')
# Second report: same frame object, printed again with the option enabled.
print("options mode.use_inf_as_na:{}".format(pd_option))
print("df_inf:\n{}".format(df_inf))
步骤:
下面通过一个例子来实战一把。
通过 shape,columns,head() 等大概了解一下数据
describe() 了解一下统计信息,只支持数字类的统计,但是没有缺失数据的报告。
df.isin([np.nan, np.inf, -np.inf]) 判断数据是否缺失
df.info() 用来了解非缺失数据的数量
还可以通过下面几种方式更直观的了解缺失数据的数量。
df.isnull().sum() 和 len(df.index) - df.count()
df.isnull().sum().sum() 和 df.isnull().sum() / len(df) * 100
● 用 df.isin([np.nan, np.inf, -np.inf]) 方法
● 对上一步的结果调用 any(axis=1) 方法:只要某一行包含任意一个缺失的值,该行就被标记为 True
● 最后,用 布尔数组来进行切分
# Keep only the rows that contain no NaN / inf / -inf (boolean-mask slicing).
# The axis is passed by keyword because recent pandas versions no longer
# accept it positionally.
df[~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)].head(5)

# Alternative formulation: turn inf into NaN first, then keep the rows in
# which every column is non-null.
df[df.replace([np.inf, -np.inf], np.nan).notnull().all(axis=1)].head(5)

# Drop every *column* (axis=1) that contains a missing value.
df.replace([np.inf, -np.inf], np.nan).dropna(axis=1).head(5)

# Fill the remaining NaN cells with a per-column default value.
# Plain ASCII quotes are required; typographic quotes are a SyntaxError.
values = {"NonD": 111, "Dream": 222, "Span": 333}
df.fillna(value=values).head()