import pandas as pd
data= pd.read_csv(r'G:\kaggle\melb_data.csv')
# Count the missing values in each column
missing_val_count_by_column= data.isnull().sum()
missing_val_count_by_column  # a pandas Series
Suburb 0
Address 0
Rooms 0
Type 0
Price 0
Method 0
SellerG 0
Date 0
Distance 0
Postcode 0
Bedroom2 0
Bathroom 0
Car 62
Landsize 0
BuildingArea 6450
YearBuilt 5375
CouncilArea 1369
Lattitude 0
Longtitude 0
Regionname 0
Propertycount 0
dtype: int64
type(missing_val_count_by_column)
pandas.core.series.Series
# Print only the columns that actually have missing values
print(missing_val_count_by_column[missing_val_count_by_column > 0])
Car 62
BuildingArea 6450
YearBuilt 5375
CouncilArea 1369
dtype: int64
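As a side note, raw counts are easier to judge as a fraction of the dataset; a minimal sketch, reusing the data frame loaded above:
# share of missing values per column, shown only for columns that have any
missing_fraction = data.isnull().mean()
print(missing_fraction[missing_fraction > 0].sort_values(ascending=False))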
# Does the 'Car' column of data contain any missing values?
data['Car'].isnull().any()  # returns True if the column has at least one missing value
True
# How many missing values does that column have?
data['Car'].isnull().sum()
62
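value_counts with dropna=False is another quick way to see the NaN count alongside the regular values; a small sketch of that:
# counts of each 'Car' value, with NaN counted as its own category
print(data['Car'].value_counts(dropna=False).head())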
# Inspect the missing-value mask for the 'Car' column
data['Car'].isnull()
0        False
1        False
2        False
3        False
4        False
         ...
13550     True
         ...
13578    False
13579    False
Name: Car, Length: 13580, dtype: bool
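The boolean mask can also be used directly as a row filter, e.g. to pull out the records whose 'Car' value is missing; a minimal sketch:
# rows of data where 'Car' is missing
rows_missing_car = data[data['Car'].isnull()]
print(rows_missing_car.shape)   # should be (62, 21) given the counts above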
Dropping whole columns (as below) is mainly useful when most of the values in a column are missing.
data_without_missing_values = original_data.dropna(axis=1)
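dropna is more flexible than the single call above; these are the common variants, as a sketch, with original_data standing for any DataFrame as in the line above:
rows_complete = original_data.dropna(axis=0)               # drop rows that contain any missing value
rows_with_price = original_data.dropna(subset=['Price'])   # drop rows only when 'Price' is missing (column assumed present)
cols_mostly_full = original_data.dropna(axis=1, thresh=int(0.8 * len(original_data)))  # keep columns that are at least 80% non-missing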
If you have a training dataset and a test dataset and want to drop the same columns from both DataFrames:
col_with_missing= [col for col in original_data.columns
                   if original_data[col].isnull().any()]  # True when the column has missing values
reduced_original_data= original_data.drop(col_with_missing, axis=1)  # note: drop, not dropna
reduced_test_data= test_data.drop(col_with_missing, axis=1)
The following imputes the mean value by default:
from sklearn.impute import SimpleImputer
my_imputer= SimpleImputer()  # defaults: missing_values=np.nan, strategy='mean'; for sparse matrices you can use e.g. missing_values=-1; other strategies, e.g. strategy="most_frequent"
data_with_imputed_values= my_imputer.fit_transform(original_data)
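fit_transform returns a plain NumPy array, so the column names are lost, and when there is a separate test set the imputer should be fitted on the training data only. A minimal sketch of that pattern, where train_X and test_X are hypothetical numeric DataFrames:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
# train_X / test_X: hypothetical numeric DataFrames (placeholders, not defined above)
my_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# learn the column means on the training data only...
imputed_train_X = pd.DataFrame(my_imputer.fit_transform(train_X), columns=train_X.columns)
# ...and reuse the same means for the test data
imputed_test_X = pd.DataFrame(my_imputer.transform(test_X), columns=test_X.columns)
The next block is an extension of the same idea: before imputing, it adds '_was_missing' indicator columns so the model can also see which values were originally missing.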
# make copy to avoid changing original data (when Imputing)
new_data = original_data.copy()
# make new columns indicating what will be imputed
cols_with_missing = (col for col in new_data.columns
                     if new_data[col].isnull().any())
for col in cols_with_missing:
    new_data[col + '_was_missing'] = new_data[col].isnull()
# Imputation (fit_transform returns an array, so save the column names,
# which now include the '_was_missing' columns, and restore them afterwards)
my_imputer = SimpleImputer()
imputed_columns = new_data.columns
new_data = pd.DataFrame(my_imputer.fit_transform(new_data))
new_data.columns = imputed_columns
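To see what the extension above does, here is a self-contained toy example (hypothetical data, not the Melbourne dataset):
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
toy = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [4.0, 5.0, np.nan]})   # hypothetical data
for col in ['a', 'b']:
    toy[col + '_was_missing'] = toy[col].isnull()          # indicator columns added before imputing
filled = pd.DataFrame(SimpleImputer().fit_transform(toy), columns=toy.columns)
print(filled)   # NaNs replaced by column means; the indicator columns become 0.0 / 1.0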
import pandas as pd
melb_data= pd.read_csv(r'G:\kaggle\melb_data.csv')
#target
y= melb_data.Price
# drop the Price column
melb_predictors= melb_data.drop(['Price'], axis=1)
# drop non-numeric features
melb_numeric_predictors= melb_predictors.select_dtypes(exclude=['object'])
# Approach 1: drop the columns that have missing values
# find the columns with missing values
col_with_missing= [col for col in melb_numeric_predictors
                   if melb_numeric_predictors[col].isnull().any()]
# drop those columns
reduced_melb_numeric_predictors= melb_numeric_predictors.drop(col_with_missing, axis=1)
# Approach 2: impute the missing values (estimates), strategy='mean'
from sklearn.impute import SimpleImputer
my_imputer= SimpleImputer()
melb_numeric_predictors_with_imputed_values = my_imputer.fit_transform(melb_numeric_predictors)
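Here too fit_transform returns an array; wrapping it back into a DataFrame keeps the column names and makes it easy to verify nothing is missing any more (a small sketch of the same step):
melb_imputed_df = pd.DataFrame(melb_numeric_predictors_with_imputed_values,
                               columns=melb_numeric_predictors.columns)
print(melb_imputed_df.isnull().sum().sum())   # 0 -> no missing values left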
# The two approaches produce different training data,
# so define a function to compare the model score obtained with each one.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
def score(X, y):
    # split
    X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=0)
    # model
    melb_model= RandomForestRegressor()
    # fit
    clf= melb_model.fit(X_train, y_train)
    # score
    return clf.score(X_test, y_test)
#test
score_drop_approach= score(reduced_melb_numeric_predictors, y)
score_impute_values= score(melb_numeric_predictors_with_imputed_values, y)
print("drop approach:",score_drop_approach)
print('impute approach:',score_impute_values)
('drop approach:', 0.7251907026905651)
('impute approach:', 0.74245443764218)
The experiment shows that, for this dataset, imputation gives a better score than simply dropping the columns with missing values.
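Because RandomForestRegressor is randomized and a single train/test split is noisy, the exact numbers will vary a little between runs. A cross-validated variant of the comparison, as a sketch that fixes the seed and reuses the same data as above:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
def cv_score(X, y):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    return cross_val_score(model, X, y, cv=5, scoring='r2').mean()   # mean R^2 over 5 folds
print('drop approach (CV):', cv_score(reduced_melb_numeric_predictors, y))
print('impute approach (CV):', cv_score(melb_numeric_predictors_with_imputed_values, y))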