Kaggle房价预测(House Prices 竞赛,Ames Housing 数据集,原文称“波士顿房价”)数据预处理部分

题目略 ~

  1. 导库导数据
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
%matplotlib inline

# Load the training and test sets. The CSV paths are elided ("...") in the
# original post and must be filled in before this cell can run.
train = pd.read_csv(...)
test = pd.read_csv(...)
# Quick look at the training frame: first rows, shape, then column summary.
train.head()
train.shape
train.info()   # Basic information about the train/test sets (only train is shown here)
  2. 查看标签数据的分布
# Summary statistics and distribution plot of the target column.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases
# (histplot/displot replace it) — confirm against the installed version.
sale_price = train['SalePrice']
sale_price.describe()
sns.distplot(sale_price)

Kaggle波士顿房价预测数据预处理部分_第1张图片

# Take the natural log of the target and plot the transformed distribution.
# The transformed series is kept under both names; train['SalePrice'] itself
# is left untouched.
SalePriceLog = np.log(train['SalePrice'])
sns.distplot(SalePriceLog)
SalePrice = SalePriceLog

Kaggle波士顿房价预测数据预处理部分_第2张图片

  3. 相关性分析
# Draw the correlation matrix.
# Restrict to numeric columns explicitly: on pandas >= 2.0, DataFrame.corr()
# no longer drops string columns silently and raises instead. select_dtypes
# works on old and new pandas alike.
corrdata = train.select_dtypes(include=[np.number]).corr()
fig, ax = plt.subplots(figsize=(30, 20))
# Pass the axes explicitly so the heatmap is tied to the figure created above.
sns.heatmap(corrdata, annot=True, ax=ax)
# Extend the y-limits by half a cell at both ends (presumably the workaround
# for heatmap rows being cut off in matplotlib 3.1.1 — TODO confirm it is
# still needed on the installed version).
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
# Rank every feature by its correlation with SalePrice, strongest first.
corr_rank = corrdata['SalePrice'].sort_values(ascending=False)
corr_rank

Kaggle波士顿房价预测数据预处理部分_第3张图片

# Draw a finer-grained correlation matrix restricted to the features whose
# absolute correlation with SalePrice exceeds 0.5.
strong_mask = corrdata['SalePrice'].abs() > 0.5
max_feature_corr = corrdata.index[strong_mask]
strong_corr = train[max_feature_corr].corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(strong_corr, annot=True, cmap='RdYlGn')
# Same half-cell y-limit extension as used for the full matrix.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
  4. 缺失值处理
# Inspect missing values in the training set.
# isnull() alone yields a boolean frame; it has to be reduced with sum() to a
# per-column count before it can be sorted (DataFrame.sort_values without
# `by` raises TypeError).
null = train.isnull().sum().sort_values(ascending=False)
# One-column frame named 'Sum', indexed by feature name.
missingdata = null.to_frame(name='Sum')
missingdata.head(20)
# The same steps apply to the test set.
# Bar chart of the per-column missing-value counts (kept deliberately simple).
plt.figure(figsize=(20, 8))
plt.bar(missingdata.index, missingdata['Sum'])
# Rotate the tick labels after plotting so the setting applies to the final
# labels. The angle is passed as a number: matplotlib documents rotation as a
# float or 'vertical'/'horizontal', and the string '90' is rejected by
# current releases.
plt.xticks(rotation=90)
# Axis label texts are elided ("...") in the original post.
plt.xlabel(...)
plt.ylabel(...)
plt.show()
# Features with too many missing values (more than 50% according to the
# original note) are dropped outright, from both frames.
high_missing_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
for frame in (train, test):
    frame.drop(columns=high_missing_cols, inplace=True)
# Impute missing values in quantitative features with the column mean.
# Each column is listed once (the original tuple repeated 'LotFrontage' and
# 'GarageYrBlt'; the second pass was a no-op).
# NOTE(review): each frame is filled with its own mean; using the training
# mean for the test set would keep the two frames' preprocessing identical —
# confirm which is intended.
numeric_fill_cols = (
    'LotFrontage', 'GarageYrBlt', 'GarageCars', 'BsmtFinSF1', 'TotalBsmtSF',
    'GarageArea', 'BsmtFinSF2', 'BsmtUnfSF', 'BsmtFullBath', 'BsmtHalfBath',
)
for col in numeric_fill_cols:
    train[col] = train[col].fillna(train[col].mean())
    test[col] = test[col].fillna(test[col].mean())
# Impute missing values in qualitative features with the most frequent value
# (mode). Each column is listed once; the original tuple repeated seven names.
# NOTE(review): 'MasVnrArea' is mode-filled here although this file later
# lists it as a quantitative feature, while 'MasVnrType' and the Garage*
# categorical columns are not imputed at all — confirm whether that is
# intended.
mode_fill_cols = (
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrArea', 'Electrical', 'Exterior2nd', 'Exterior1st', 'KitchenQual',
    'Functional', 'SaleType', 'Utilities', 'MSZoning',
)
for col in mode_fill_cols:
    # mode() returns a Series of the most frequent value(s); take the first.
    test[col] = test[col].fillna(test[col].mode()[0])
    train[col] = train[col].fillna(train[col].mode()[0])
# Check whether any missing values remain: per-column NaN counts, largest
# first, for each frame. (Fixes the `pd.DateFrame` typo, which raised
# AttributeError.)
pd.DataFrame(train.isnull().sum().sort_values(ascending=False)).head()
pd.DataFrame(test.isnull().sum().sort_values(ascending=False)).head()
  5. 变量类型转换
# Categorical variables stored as numbers must not be treated as numeric:
# cast these code-like columns to strings in both frames.
for frame in (train, test):
    for col in ('MSSubClass', 'YrSold'):
        frame[col] = frame[col].astype(str)
# Manually split the features into quantitative and qualitative groups.
# 'MSSubClass' and 'YrSold' are cast to str in this file precisely because
# they are categorical, so they belong in qualitative_list (they were
# previously listed as quantitative and therefore never encoded).
quantitative_list = [
    'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
    'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
    '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
    'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
    'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold',
]
qualitative_list = [
    'MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
    'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
    'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType',
    'SaleCondition', 'MSSubClass', 'YrSold',
]
  6. 标签编码
# Ordinal-encode the qualitative features.
# A single encoder is fitted on the categories of both frames so that a given
# category maps to the same integer in train and test. Fitting a separate
# encoder per frame assigns codes by each frame's own sorted category set,
# which makes the codes disagree whenever the two sets differ.
encoder = OrdinalEncoder()
encoder.fit(pd.concat([train[qualitative_list], test[qualitative_list]], axis=0))
train[qualitative_list] = encoder.transform(train[qualitative_list])
test[qualitative_list] = encoder.transform(test[qualitative_list])

你可能感兴趣的:(Kaggle波士顿房价预测数据预处理部分)