import os
import tarfile
from six.moves import urllib
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets","housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
def fetch_housing_data(housing_url = HOUSING_URL, housing_path = HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path = housing_path)
    housing_tgz.close()
fetch_housing_data()
import pandas as pd
def load_housing_data(housing_path = HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
housing = load_housing_data()
housing.head() # DataFrame.head(): show the top five rows: how many attributes there are and what they are
housing.info() # DataFrame.info(): get a quick description of the data
housing["ocean_proximity"].value_counts() # value_counts(): count how many instances belong to each category
housing.describe() # describe(): show a summary of the numerical attributes
import matplotlib.pyplot as plt
housing.hist(bins = 50, figsize = (20,15)) # plot a histogram for each numerical attribute
plt.show()
Generating a test set can be done with the following approaches (an overview of what follows):
(The approaches build on one another; each later one fixes some of the problems of the one before.)
1) Write a split_train_test() function:
the main idea is to shuffle the data and pick a given proportion of it.
The problem is that the test set changes on every run; fixing the random seed seems to solve this,
but as soon as new data is added, the chosen test set can no longer be kept stable.
2) Set up an identifier and select the test set by id, via a split_train_test_by_id() function.
The problem is that the housing data has no column that can serve as an identifier, so one has to be
constructed; the main extra work is building a suitable identifier column, e.g. via reset_index().
3) Alternatively, use the functions that ship with Scikit-Learn:
train_test_split() from sklearn.model_selection generates the test and training sets directly.
The remaining problem with all the methods above: with lots of data the effect of sampling bias is small,
but when the dataset is small the bias gets worse. This can be solved with stratified sampling:
4) Stratified sampling. The idea: decide what to stratify on -> form the strata -> sample within each stratum.
import numpy as np
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data)*test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
# use function split_train_test
# train_set, test_set = split_train_test(housing, 0.2)
# len(train_set)
# len(test_set)
Problems & Solutions in this part
problem-1:
if you run the program again, it'll generate a different test set.
Over time, the machine learning algorithm will get to see the whole dataset, which is exactly what you want to avoid.
solution-1-1:
save the test set on the first run, and load it in subsequent runs (see the sketch below).
solution-1-2:
set the random number generator's seed (e.g., np.random.seed(42)) before
calling np.random.permutation().
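A minimal sketch of both fixes (my own illustration; the file name housing_test_set.csv is hypothetical):
np.random.seed(42)  # solution-1-2: make the shuffle reproducible
if os.path.exists('housing_test_set.csv'):
    test_set = pd.read_csv('housing_test_set.csv')  # solution-1-1: reuse the saved split
else:
    train_set, test_set = split_train_test(housing, 0.2)
    test_set.to_csv('housing_test_set.csv', index=False)  # save it for subsequent runs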
problem-2:
both solutions above will break the next time you fetch an updated dataset.
solution-2-1:
use each instance's identifier to decide whether or not it should go in the test set.
e.g., you can compute a hash of each instance's identifier and put that instance in
the test set if the hash is lower than or equal to 20% of the maximum hash value.
from zlib import crc32
def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32
P.S. the check crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32 took me a while to understand:
crc32 computes a 32-bit checksum of the identifier; the & 0xffffffff mask forces an unsigned value
(on Python 2, crc32 could return a negative int); and since the checksums are roughly uniform over
[0, 2**32), comparing against test_ratio * 2**32 puts about test_ratio of all instances in the test set.
def split_train_test_by_id(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))
    return data.loc[~in_test_set], data.loc[in_test_set]
Since I couldn't quite understand data.loc[~in_test_set] at first,
I wrote a short snippet to verify what this function does:
import pandas as pd
import numpy as np
# create a small DataFrame
data = pd.DataFrame(np.arange(9).reshape(3,3), index=list('012'), columns=list('ABC'))
# add a boolean column to data
data['D'] = True, False, True
# inspect the DataFrame we just created
data
Out[5]:
A B C D
0 0 1 2 True
1 3 4 5 False
2 6 7 8 True
# now check what data.loc[check] and data.loc[~check] return
check=data['D']
data.loc[check]
Out[7]:
A B C D
0 0 1 2 True
2 6 7 8 True
data.loc[~check]
Out[8]:
A B C D
1 3 4 5 False
problem-3:
the housing dataset does not have an identifier column
solution-3-1:
use the row index as the ID
housing_with_id = housing.reset_index() # add an 'index' column
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, 'index')
'''
ATTENTION:
if you use the row index as a unique identifier, you need to make sure that
new data always gets appended to the end of the dataset and that no row ever gets deleted.
'''
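If append-only cannot be guaranteed, one workaround (a sketch, assuming a district's longitude and latitude never change) is to build an ID out of stable features:
# combine stable features into an identifier
housing_with_id['id'] = housing['longitude'] * 1000 + housing['latitude']
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, 'id')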
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size = 0.2, random_state = 42)
problem-4:
so far we have only considered purely random sampling (generally fine when the dataset is large);
when it is not large, we run the risk of introducing a significant sampling bias.
solution-4-1:
stratified sampling
# create a new column to store the income category
housing['income_cat'] = pd.cut(housing['median_income'],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])
# visualize the income categories
housing['income_cat'].hist()
# Stratified sampling based on the income_cat
# Use Scikit-Learn's StratifiedShuffleSplit class
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['income_cat']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
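As a sanity check (my own addition), the income-category proportions in the stratified test set should closely match those of the full dataset:
# proportions in the stratified test set vs. the full dataset
strat_test_set['income_cat'].value_counts() / len(strat_test_set)
housing['income_cat'].value_counts() / len(housing)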
# remove the income_cat attribute to restore the data to its original state
for set_ in (strat_train_set, strat_test_set):
    set_.drop('income_cat', axis=1, inplace=True)
So far, we have spent a lot of time on test set generation; many of these ideas will be useful when we discuss cross-validation later.
What we have already done:
take a quick look at the data --> get a general understanding of the data.
The new goal:
make sure you have put the test set aside and that you are only exploring the training set. If the training set is very large, sample an exploration set. Create a copy so you can play with it without harming the training set.
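Following that advice, a one-line sketch (assuming the stratified split from above):
housing = strat_train_set.copy()  # explore the copy; the real training set stays untouched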
# create a scatterplot
housing.plot(kind='scatter', x='longitude', y='latitude')
Problems & Solutions in this part
problem-1:
hard to see any particular pattern
solution-1-1:
set the alpha option to highlight high-density areas
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.1)
# have a better look at the data
'''
radius of the circle -- the district's population
color -- the price
use a predefined color map -- option cmap -- here 'jet'
'''
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.4,
s=housing['population']/100, label='population', figsize=(10,7),
c='median_house_value', cmap=plt.cm.get_cmap('jet'), colorbar=True)
plt.legend()  # show the legend for the 'population' marker size
# way 1 to look for correlations: the standard correlation coefficient between every pair of numerical attributes
corr_matrix = housing.corr()  # on pandas >= 2.0, use housing.corr(numeric_only=True) to skip ocean_proximity
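A small addition to the notes: sorting the correlations against the target makes the strongest relationships easy to read off.
corr_matrix['median_house_value'].sort_values(ascending=False)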
# way 2 to look for correlations: plot every numerical attribute against every other one
from pandas.plotting import scatter_matrix
attributes = ['median_house_value', 'median_income',
              'total_rooms', 'housing_median_age']
scatter_matrix(housing[attributes], figsize=(12,8))
# zoom in on the most promising attribute: median_income
housing.plot(kind='scatter', x='median_income',
             y='median_house_value', alpha=0.1)
need2do-1:
clear out some data quirks you found (see the sketch after this list)
need2do-2:
try out various attribute combinations; create new attributes
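A hedged sketch for need2do-1: the median_income scatterplot shows a horizontal line at $500,000, which looks like a price cap in the data; if that reading is right, the capped districts could be filtered out like this (an illustration, not necessarily the right call):
# drop districts at the apparent $500,000 price cap (assumption: the horizontal line is a cap)
housing = housing[housing['median_house_value'] < 500000]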
# Create new attributes
housing['rooms_per_household'] = housing['total_rooms']/housing['households']
housing['bedrooms_per_room'] = housing['total_bedrooms']/housing['total_rooms']
housing['population_per_household'] = housing['population']/housing['households']
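To judge whether the new attributes are informative (a small addition; the same numeric_only caveat applies on newer pandas):
corr_matrix = housing.corr()
corr_matrix['median_house_value'].sort_values(ascending=False)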