I selected the California Housing dataset for this analysis.
I downloaded it from https://www.kaggle.com/camnugent/california-housing-prices and saved it as housing.csv.
Then I load it:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer  # Imputer was removed from sklearn.preprocessing; SimpleImputer replaces it
housing_data = pd.read_csv('housing.csv')
Then I split the data into a training set and a test set:
train_set, test_set = train_test_split(housing_data, test_size=0.2, random_state=42)
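As a quick sanity check (my own addition, not part of the original workflow), we can confirm the 80/20 split and that random_state makes it reproducible:
# With 20640 rows we expect 16512 train / 4128 test
print(len(train_set), len(test_set))
# Re-running with the same random_state returns the identical split
train_set2, test_set2 = train_test_split(housing_data, test_size=0.2, random_state=42)
print(train_set.index.equals(train_set2.index))  # True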
I explore the dataset using histograms.
First, copy the training data so that exploration does not modify the original split:
housing = train_set.copy()
Now plot a histogram for each numerical feature:
housing.hist(bins=50, figsize=(20, 15))
plt.show()
The histograms give a quick view of how every numerical feature is distributed.
Next, I visualize the data with matplotlib.
Let's look at how longitude and latitude relate to house prices:
plt.scatter(x=housing['longitude'], y=housing['latitude'], alpha=0.1)
plt.show()
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
             s=housing["population"]/100, label="population",
             c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
             sharex=False)
plt.legend()
plt.show()
Let's look at the correlation between each pair of features. Since ocean_proximity is not numeric, it has to be excluded from the correlation matrix (recent pandas versions require numeric_only=True):
corr_matrix = housing.corr(numeric_only=True)
print(corr_matrix['median_house_value'].sort_values(ascending=False))
median_house_value 1.000000
median_income 0.688075
total_rooms 0.134153
housing_median_age 0.105623
households 0.065843
total_bedrooms 0.049686
population -0.024650
longitude -0.045967
latitude -0.144160
Name: median_house_value, dtype: float64
attributes = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']
pd.plotting.scatter_matrix(housing[attributes], figsize=(12, 8))
plt.show()
From the scatter matrix we can see that median_income is probably the most important feature for predicting median_house_value.
Let's look at it more closely:
housing.plot(kind="scatter",x="median_income",y="median_house_value",alpha=0.1)
plt.show()
Now revert to a clean training set and separate the predictors from the labels:
housing = train_set.drop("median_house_value", axis=1)
housing_labels = train_set["median_house_value"].copy()
Let's check the information of the full dataset:
print(housing_data.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude 20640 non-null float64
latitude 20640 non-null float64
housing_median_age 20640 non-null float64
total_rooms 20640 non-null float64
total_bedrooms 20433 non-null float64
population 20640 non-null float64
households 20640 non-null float64
median_income 20640 non-null float64
median_house_value 20640 non-null float64
ocean_proximity 20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
None
We notice that all the features are numerical ("float64") and fully populated ("non-null") across the 20640 records, except for two:
total_bedrooms has only 20433 non-null values, meaning 207 records are missing it.
ocean_proximity is a text tag, not a number.
These two features need special handling before we can train a model.
(I still don't have a good grasp of how to handle missing data, and the code for this part comes from the internet.)
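For context, here is a minimal sketch (my own summary, not from the original source) of the three standard pandas options for missing values; the write-up below uses Option 3:
# Option 1: drop the rows that are missing total_bedrooms
housing_opt1 = housing.dropna(subset=["total_bedrooms"])
# Option 2: drop the whole column
housing_opt2 = housing.drop("total_bedrooms", axis=1)
# Option 3: fill the holes with the column median
median = housing["total_bedrooms"].median()
housing_opt3 = housing.fillna({"total_bedrooms": median})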
Filling with the median, the manual way:
median = housing["total_bedrooms"].median()
housing["total_bedrooms"].fillna(median, inplace=True)
The same thing with scikit-learn's imputer (the old Imputer class is now SimpleImputer in sklearn.impute):
imputer = SimpleImputer(strategy="median")
housing_num = housing.drop("ocean_proximity", axis=1)  # the imputer only works on numerical columns
imputer.fit(housing_num)
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns)
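As a quick check (my addition), the fitted imputer stores the learned median of every column, which should match what pandas computes directly:
print(imputer.statistics_)            # medians learned by the imputer
print(housing_num.median().values)    # same values, computed with pandas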
Now handle the text feature. There are several equivalent ways to one-hot encode ocean_proximity:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
housing_cat = housing["ocean_proximity"]
# 1) Encode the categories as integers first (pandas factorize() does the same job)...
encoder = LabelEncoder()
housing_cat_encoded = encoder.fit_transform(housing_cat)
# ...then one-hot encode the integer codes
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
# 2) LabelBinarizer does both steps in one shot
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
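As an aside (my addition, not in the original): pandas can do the same encoding in a single call, and modern scikit-learn's OneHotEncoder accepts the string column directly, with no integer-encoding step:
from sklearn.preprocessing import OneHotEncoder
# pandas one-liner: one indicator column per category
housing_cat_dummies = pd.get_dummies(housing_cat)
# modern sklearn: feed the string column straight in
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat.to_frame())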
Next, build a conversion pipeline. First, a custom transformer that adds combined attributes:
from sklearn.base import BaseEstimator, TransformerMixin

# column indices of total_rooms, total_bedrooms, population, households in housing_num
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
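To check whether the engineered attributes help, one could (my addition) rebuild a DataFrame and recompute the correlations against the label; rooms_per_household may correlate somewhat more strongly with median_house_value than the raw total_rooms does:
# add_bedrooms_per_room=False above, so only two columns were appended
extra_cols = ["rooms_per_household", "population_per_household"]
housing_extra_df = pd.DataFrame(housing_extra_attribs,
                                columns=list(housing.columns) + extra_cols)
check = housing_extra_df.drop("ocean_proximity", axis=1).astype(float)
check["median_house_value"] = housing_labels.values
print(check.corr()["median_house_value"].sort_values(ascending=False))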
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)
from sklearn.pipeline import FeatureUnion

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Selects the given columns from a DataFrame and returns them as a numpy array
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

# LabelBinarizer's fit/fit_transform no longer accept a second positional
# argument, which breaks it inside a Pipeline; this wrapper fixes the signature
class LabelBinarizerPipelineFriendly(LabelBinarizer):
    def fit(self, X, y=None):
        super(LabelBinarizerPipelineFriendly, self).fit(X)
        return self  # fit must return self for Pipeline compatibility
    def transform(self, X, y=None):
        return super(LabelBinarizerPipelineFriendly, self).transform(X)
    def fit_transform(self, X, y=None):
        return super(LabelBinarizerPipelineFriendly, self).fit(X).transform(X)
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', LabelBinarizerPipelineFriendly()),
])
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
housing_prepared = full_pipeline.fit_transform(housing)
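For what it's worth, since scikit-learn 0.20 ColumnTransformer can replace the whole DataFrameSelector/FeatureUnion pattern and the LabelBinarizer wrapper; a minimal sketch of the equivalent pipeline (my addition, not part of the original code):
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# No selector step needed; ColumnTransformer routes the columns itself
num_pipeline_ct = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
full_pipeline_ct = ColumnTransformer([
    ("num", num_pipeline_ct, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
housing_prepared_ct = full_pipeline_ct.fit_transform(housing)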
I train a linear regression model.
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
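To sanity-check the model on those few instances (a usage example of my own, using the variables defined above), compare its predictions against the true labels:
print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))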
Evaluate the linear regression model using RMSE on the training set:
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)
The linear model leaves a large training error, so next I try a more powerful model, a decision tree:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)
The printed training RMSE is essentially zero. Obviously the tree is overfitting: it has memorized the training set, so evaluating on the same data says nothing about how it generalizes. I don't yet know how to handle this; I have to learn how to deal with overfitting before I can measure the real error.
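One standard way to get an honest error estimate (my suggestion, not from the original write-up) is K-fold cross-validation, which evaluates the model on held-out folds instead of the training data:
from sklearn.model_selection import cross_val_score

# 10-fold CV: train on 9 folds, evaluate on the held-out fold, 10 times.
# scikit-learn expects a utility (higher is better), hence the negated MSE.
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
print("Mean RMSE:", tree_rmse_scores.mean())
print("Std:", tree_rmse_scores.std())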