深入浅出python机器学习_6.3.1_随机森林实例——要不要和相亲对象进一步发展

# 6.3.1 数据集的准备

import pandas as pd

# Column labels for adult.csv, which is read without a header row.
all_columns = ['年龄', '单位性质', '权重', '学历', '受教育时长',
               '婚姻状况', '职业', '家庭状况', '种族', '性别',
               '资产所得', '资产损失', '周工作时长', '原籍',
               '收入']

# Columns kept for the rest of the walkthrough.
kept_columns = ['年龄', '单位性质', '学历', '性别', '周工作时长', '职业', '收入']

data = pd.read_csv('adult.csv', header=None, index_col=False,
                   names=all_columns)
data_lite = data[kept_columns]

# Preview the first five rows. `display` is not defined in this snippet;
# presumably it is the IPython/Jupyter builtin.
display(data_lite.head())
年龄 单位性质 学历 性别 周工作时长 职业 收入
0 39 State-gov Bachelors Male 40 Adm-clerical <=50K
1 50 Self-emp-not-inc Bachelors Male 13 Exec-managerial <=50K
2 38 Private HS-grad Male 40 Handlers-cleaners <=50K
3 53 Private 11th Male 40 Handlers-cleaners <=50K
4 28 Private Bachelors Female 40 Prof-specialty <=50K
# 6.3.2 用get_dummies处理数据

# One-hot encode the string columns of data_lite; the numeric columns
# ('年龄', '周工作时长') are carried over unchanged.
# dtype=int pins the indicator columns to integer 0/1. Without it the dtype
# depends on the installed pandas version (bool in recent releases), which
# turns the indicators into True/False and makes `.values` an object array.
data_dummies = pd.get_dummies(data_lite, dtype=int)

# Column names before encoding.
print('样本原始特征:\n',list(data_lite.columns),'\n')

# Column names after encoding (one indicator column per category value).
print('虚拟变量特征:\n',list(data_dummies.columns),'\n')

# Rows x columns of the encoded frame.
print('data_dummies.shape:\n',data_dummies.shape,'\n')

# Container type returned by get_dummies.
print('data_dummies的类型:\n',type(data_dummies))
样本原始特征:
 ['年龄', '单位性质', '学历', '性别', '周工作时长', '职业', '收入'] 

虚拟变量特征:
 ['年龄', '周工作时长', '单位性质_ ?', '单位性质_ Federal-gov', '单位性质_ Local-gov', '单位性质_ Never-worked', '单位性质_ Private', '单位性质_ Self-emp-inc', '单位性质_ Self-emp-not-inc', '单位性质_ State-gov', '单位性质_ Without-pay', '学历_ 10th', '学历_ 11th', '学历_ 12th', '学历_ 1st-4th', '学历_ 5th-6th', '学历_ 7th-8th', '学历_ 9th', '学历_ Assoc-acdm', '学历_ Assoc-voc', '学历_ Bachelors', '学历_ Doctorate', '学历_ HS-grad', '学历_ Masters', '学历_ Preschool', '学历_ Prof-school', '学历_ Some-college', '性别_ Female', '性别_ Male', '职业_ ?', '职业_ Adm-clerical', '职业_ Armed-Forces', '职业_ Craft-repair', '职业_ Exec-managerial', '职业_ Farming-fishing', '职业_ Handlers-cleaners', '职业_ Machine-op-inspct', '职业_ Other-service', '职业_ Priv-house-serv', '职业_ Prof-specialty', '职业_ Protective-serv', '职业_ Sales', '职业_ Tech-support', '职业_ Transport-moving', '收入_ <=50K', '收入_ >50K'] 

data_dummies.shape:
 (32561, 46) 

data_dummies的类型:
 <class 'pandas.core.frame.DataFrame'>
# 显示数据集中的前五行

# Lift pandas' display limits so that neither columns nor rows are elided
# when a frame is shown.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# First five rows of the encoded frame.
data_dummies.head()
年龄 周工作时长 单位性质_ ? 单位性质_ Federal-gov 单位性质_ Local-gov 单位性质_ Never-worked 单位性质_ Private 单位性质_ Self-emp-inc 单位性质_ Self-emp-not-inc 单位性质_ State-gov 单位性质_ Without-pay 学历_ 10th 学历_ 11th 学历_ 12th 学历_ 1st-4th 学历_ 5th-6th 学历_ 7th-8th 学历_ 9th 学历_ Assoc-acdm 学历_ Assoc-voc 学历_ Bachelors 学历_ Doctorate 学历_ HS-grad 学历_ Masters 学历_ Preschool 学历_ Prof-school 学历_ Some-college 性别_ Female 性别_ Male 职业_ ? 职业_ Adm-clerical 职业_ Armed-Forces 职业_ Craft-repair 职业_ Exec-managerial 职业_ Farming-fishing 职业_ Handlers-cleaners 职业_ Machine-op-inspct 职业_ Other-service 职业_ Priv-house-serv 职业_ Prof-specialty 职业_ Protective-serv 职业_ Sales 职业_ Tech-support 职业_ Transport-moving 收入_ <=50K 收入_ >50K
0 39 40 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
1 50 13 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0
2 38 40 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0
3 53 40 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0
4 28 40 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0
# The two '收入_*' indicator columns encode the label; every other column is
# a feature. Dropping them by name avoids depending on the order of the dummy
# columns or on one particular occupation category being the last feature.
label_columns = ['收入_ <=50K', '收入_ >50K']
features = data_dummies.drop(columns=label_columns)

print('features的类型:\n',type(features),'\n')

# Plain array view of the feature frame for the estimator.
X = features.values

print('X的类型:\n',type(X),'\n')

print('打印X的前五行:\n',X[:5,:])

# Label vector: the '收入_ >50K' indicator column.
y = data_dummies['收入_ >50K'].values

print('特征形态:{} 标签形态{}'.format(X.shape,y.shape))


# 避免显示不全:
# import numpy as np
# np.set_printoptions(threshold=np.inf)
features的类型:
 <class 'pandas.core.frame.DataFrame'> 

X的类型:
 <class 'numpy.ndarray'> 

打印X的前五行:
 [[39 40  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  1  0  0  0
   0  0  0  0  1  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [50 13  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0
   0  0  0  0  1  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0]
 [38 40  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0
   0  0  0  0  1  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0]
 [53 40  0  0  0  0  1  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  1  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0]
 [28 40  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0
   0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0]]
特征形态:(32561, 44) 标签形态(32561,)
# Evaluate the column labels of the feature frame. As a bare expression this
# is only shown if the interactive shell echoes intermediate expressions.
features.columns

# Type of the column-label container; the recorded output shows a pandas Index.
type(features.columns)
pandas.core.indexes.base.Index
# 用决策树建模并作出预测
from sklearn.model_selection import train_test_split
from sklearn import tree

# Hold out a test split; random_state fixes the shuffle so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Depth-limited decision tree. random_state is fixed here as well: the
# estimator permutes features at each split, so ties between equally good
# splits can otherwise resolve differently from run to run.
go_dating_tree = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
go_dating_tree.fit(X_train, y_train)

# Mean accuracy on the held-out split.
print('模型的分:{:.2f}'.format(go_dating_tree.score(X_test,y_test)))
模型的分:0.80

你可能感兴趣的:(深入浅出,python机器学习)