This project uses data collected from the 1994 U.S. Census and applies several supervised learning algorithms to model each individual's income as accurately as possible.
You will then select the best candidate algorithm based on the preliminary results and optimize it further to model the data as well as possible.
数据来源:https://archive.ics.uci.edu/ml/datasets/Census+Income
# Import libraries needed for this project
import numpy as np
import pandas as pd
from time import time
from IPython.display import display # allows the use of display() for DataFrames
# Import the supplementary visualization code visuals.py
import visuals as vs
# Pretty display for notebooks
%matplotlib inline
# Load the census data
data = pd.read_csv("census.csv")
# Success - display the first ten records
display(data.head(n=10))
# Total number of records
n_records = data.shape[0]
# Number of individuals making more than $50,000
n_greater_50k = data[data.income.str.contains('>50K')].shape[0]
# Number of individuals making at most $50,000
n_at_most_50k = data[data.income.str.contains('<=50K')].shape[0]
# Percentage of individuals making more than $50,000
greater_percent = (n_greater_50k / n_records) * 100
# Print the results
print("Total number of records: {}".format(n_records))
print("Individuals making more than $50,000: {}".format(n_greater_50k))
print("Individuals making at most $50,000: {}".format(n_at_most_50k))
print("Percentage of individuals making more than $50,000: {:.2f}%".format(greater_percent))
Total number of records: 45222
Individuals making more than $50,000: 11208
Individuals making at most $50,000: 34014
Percentage of individuals making more than $50,000: 24.78%
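As a quick sanity check (a minimal sketch, not part of the original analysis), the same counts can be reproduced with pandas value_counts(); the exact label strings may carry surrounding whitespace depending on how the CSV was prepared, which is why the code above uses str.contains():
# Illustrative sanity check: class counts via value_counts()
income_counts = data['income'].value_counts()
print(income_counts)  # expected: 34014 records at most $50,000 and 11208 above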
# Split the data into features and the target label
income_raw = data['income']
features_raw = data.drop('income', axis = 1)
# Visualize the 'capital-gain' and 'capital-loss' features
vs.distribution(features_raw)
A dataset may contain features whose values are unevenly distributed, with a small number of extremely large or extremely small values. Learning algorithms can be quite sensitive to such distributions and may perform poorly when the data are not properly normalized; 'capital-gain' and 'capital-loss' are two such features.
For highly skewed features such as 'capital-gain' and 'capital-loss', a common practice is to apply a logarithmic transformation, so that the very large and very small values no longer hurt the learning algorithm's performance.
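Before applying the transform to the data below, here is a minimal sketch (with made-up values) of what log(x + 1) does to a heavily skewed range; the +1 keeps zero-valued entries defined:
# Illustrative only: log(x + 1) compresses very large values into a narrow range
sample_gains = np.array([0, 100, 5000, 99999])
print(np.log(sample_gains + 1))  # roughly [0.0, 4.62, 8.52, 11.51]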
# Log-transform the skewed features
skewed = ['capital-gain', 'capital-loss']
features_raw[skewed] = data[skewed].apply(lambda x: np.log(x + 1))
# Visualize the 'capital-gain' and 'capital-loss' features after the log transform
vs.distribution(features_raw, transformed = True)
from sklearn.preprocessing import MinMaxScaler
# Initialize a scaler, then apply it to the numerical features
scaler = MinMaxScaler()
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
# Scale the log-transformed features so the transform above is preserved
features_raw[numerical] = scaler.fit_transform(features_raw[numerical])
# Show an example of a record with scaling applied
display(features_raw.head(n = 1))
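MinMaxScaler rescales each column to the [0, 1] range using (x - min) / (max - min); a minimal sketch with made-up ages illustrates the formula:
# Illustrative only: min-max scaling of a toy column
toy_ages = np.array([[17.0], [35.0], [90.0]])
print(MinMaxScaler().fit_transform(toy_ages))  # [[0.0], [(35-17)/(90-17) = 0.2466...], [1.0]]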
# One-hot encode the 'features_raw' data using pandas.get_dummies()
features = pd.get_dummies(features_raw)
# Encode the 'income_raw' labels as numerical values
income = income_raw.map({'<=50K': 0, '>50K': 1})
# Print the number of features after one-hot encoding
encoded = list(features.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))
# Uncomment the following line to see the encoded feature names
# print (encoded)
103 total features after one-hot encoding.
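pd.get_dummies() turns every categorical column into one indicator column per distinct category, which is why the column count grows this much after encoding; a minimal sketch with a made-up column:
# Illustrative only: one-hot encoding a small categorical column
toy = pd.DataFrame({'workclass': ['Private', 'State-gov', 'Private']})
print(pd.get_dummies(toy))  # yields workclass_Private and workclass_State-gov indicator columns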
# Import train_test_split
from sklearn.model_selection import train_test_split
# Split the 'features' and 'income' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, income, test_size = 0.2, random_state = 0,
stratify = income)
# Split 'X_train' and 'y_train' further into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0,
stratify = y_train)
# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Validation set has {} samples.".format(X_val.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))
Training set has 28941 samples.
Validation set has 7236 samples.
Testing set has 9045 samples.
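Since both splits pass stratify, each subset should keep roughly the same share of '>50K' labels (about 24.78%); a quick illustrative check, not part of the original notebook:
# Illustrative check: stratified splitting preserves the positive-class ratio
for name, y in [('train', y_train), ('val', y_val), ('test', y_test)]:
    print("{} set: {:.2%} of labels are >50K".format(name, y.mean()))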