import pandas as pd
# Load the Melbourne housing data and count the missing values per column.
data = pd.read_csv(r'G:\kaggle\melb_data.csv')
missing_val_count_by_column = data.isna().sum()
missing_val_count_by_column  # a pandas Series (one count per column)
Suburb 0
Address 0
Rooms 0
Type 0
Price 0
Method 0
SellerG 0
Date 0
Distance 0
Postcode 0
Bedroom2 0
Bathroom 0
Car 62
Landsize 0
BuildingArea 6450
YearBuilt 5375
CouncilArea 1369
Lattitude 0
Longtitude 0
Regionname 0
Propertycount 0
dtype: int64
# Show only the columns whose missing-value count is greater than zero.
print(missing_val_count_by_column.loc[lambda counts: counts > 0])
Car 62
BuildingArea 6450
YearBuilt 5375
CouncilArea 1369
dtype: int64
# Evaluates to True when the 'Car' column contains at least one missing value.
data['Car'].isna().any()
Name: Car, Length: 13580, dtype: bool
# Drop every column that contains at least one missing value.
# NOTE(review): original_data is not defined in the visible source -- it is
# presumably the DataFrame loaded earlier; confirm.
data_without_missing_values = original_data.dropna(axis='columns')
# With a training dataset and a test dataset, to drop the same columns from both DataFrames:
# Names of the columns of original_data that contain at least one missing
# value (the condition is True exactly when the column has a missing entry).
col_with_missing = [
    col
    for col in original_data.columns
    if original_data[col].isnull().any()
]

# Remove those columns from both frames. Note: this is drop, not dropna.
reduced_original_data = original_data.drop(columns=col_with_missing)
reduced_test_data = test_data.drop(columns=col_with_missing)
# The imputer below fills missing values with the column mean by default.
from sklearn.impute import SimpleImputer
# The defaults are missing_values=np.nan and strategy='mean'; the strategy is
# spelled out here for clarity.
# Author's notes: for sparse matrices use missing_values=-1; another available
# strategy is strategy="most_frequent".
my_imputer = SimpleImputer(strategy='mean')
data_with_imputed_values = my_imputer.fit_transform(original_data)
# Make a copy so that adding columns and imputing does not modify original_data.
new_data = original_data.copy()

# Add one boolean "<col>_was_missing" column for every column that has missing
# values, recording which entries are about to be imputed. A list (rather than
# a single-use generator) keeps the column names available after the loop.
cols_with_missing = [col for col in new_data.columns
                     if new_data[col].isnull().any()]
for col in cols_with_missing:
    new_data[col + '_was_missing'] = new_data[col].isnull()

# Imputation. The result of fit_transform is wrapped in a new DataFrame, so
# the column labels have to be supplied again. They must come from new_data,
# not original_data: the indicator columns added above make new_data wider
# than original_data, so assigning original_data.columns would fail with a
# length mismatch.
# NOTE(review): a column that is entirely missing may be dropped by the
# imputer, which would change the number of output columns -- confirm.
my_imputer = SimpleImputer()
new_data = pd.DataFrame(my_imputer.fit_transform(new_data),
                        columns=new_data.columns)
import pandas as pd
# Load the data, then separate the target (Price) from the predictors.
melb_data = pd.read_csv(r'G:\kaggle\melb_data.csv')
y = melb_data['Price']
melb_predictors = melb_data.drop(columns=['Price'])

# Keep only the non-object (numeric) predictor columns.
melb_numeric_predictors = melb_predictors.select_dtypes(exclude=['object'])

# Approach 1: drop every numeric column that contains a missing value.
col_with_missing = [
    col
    for col in melb_numeric_predictors.columns
    if melb_numeric_predictors[col].isnull().any()
]
reduced_melb_numeric_predictors = melb_numeric_predictors.drop(
    columns=col_with_missing)
# Approach 2: impute (estimate) the missing values, strategy='mean'
from sklearn.impute import SimpleImputer
# Fill the missing numeric values with the column mean ('mean' is the default
# strategy; it is spelled out here for clarity).
my_imputer = SimpleImputer(strategy='mean')
melb_numeric_predictors_with_imputed_values = my_imputer.fit_transform(
    melb_numeric_predictors)
# Define a function that evaluates the model score obtained from different training data.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
def score(X, y):
    """Return the test-split score of a random forest trained on X and y.

    The data is split into 70% train / 30% test with ``random_state=0``, a
    default ``RandomForestRegressor`` is fitted on the training part, and the
    value of its ``score`` method on the test part is returned.

    Args:
        X: Feature matrix (DataFrame or array) accepted by scikit-learn.
        y: Target values aligned with the rows of ``X``.

    Returns:
        The result of ``RandomForestRegressor.score(X_test, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)
    melb_model = RandomForestRegressor()
    # The original line was truncated to "clf=, y_train)"; the fit call on the
    # training split is restored here.
    melb_model.fit(X_train, y_train)
    # Return directly instead of binding a local named `score`, which would
    # shadow this function's own name.
    return melb_model.score(X_test, y_test)
# Score both feature sets with the same helper: first the variant where the
# columns with missing values were dropped, then the mean-imputed variant.
score_drop_approach= score(reduced_melb_numeric_predictors, y)
score_impute_values= score(melb_numeric_predictors_with_imputed_values, y)
# Print the two scores so the approaches can be compared.
print("drop approach:",score_drop_approach)
print('impute approach:',score_impute_values)
('drop approach:', 0.7251907026905651)
('impute approach:', 0.74245443764218)