import pandas as pd
# Column labels assigned to adult.csv; the file is read with header=None,
# so pandas takes every row as data and uses these names instead.
column_names = [
    '年龄', '单位性质', '权重', '学历', '受教育时长',
    '婚姻状况', '职业', '家庭状况', '种族', '性别',
    '资产所得', '资产损失', '周工作时长', '原籍',
    '收入',
]
data = pd.read_csv(
    'adult.csv',
    header=None,
    index_col=False,  # do not promote the first column to the index
    names=column_names,
)

# Keep only the columns used in the rest of the analysis.
# NOTE(review): string values appear to keep a leading space (the dummy
# column names printed later look like '单位性质_ ?'); later cells select
# columns by those exact labels, so the values are left untouched here.
selected_columns = ['年龄', '单位性质', '学历', '性别', '周工作时长', '职业', '收入']
data_lite = data[selected_columns]

# `display` is provided by the IPython/Jupyter environment.
display(data_lite.head())
|   | 年龄 | 单位性质 | 学历 | 性别 | 周工作时长 | 职业 | 收入 |
|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | Bachelors | Male | 40 | Adm-clerical | <=50K |
| 1 | 50 | Self-emp-not-inc | Bachelors | Male | 13 | Exec-managerial | <=50K |
| 2 | 38 | Private | HS-grad | Male | 40 | Handlers-cleaners | <=50K |
| 3 | 53 | Private | 11th | Male | 40 | Handlers-cleaners | <=50K |
| 4 | 28 | Private | Bachelors | Female | 40 | Prof-specialty | <=50K |
# One-hot encode every string column of data_lite; numeric columns pass
# through unchanged. dtype=int pins the indicator columns to 0/1 integers:
# pandas >= 2.0 emits bool indicators by default, which would turn the
# mixed feature matrix taken from this frame into an object array of
# True/False instead of the all-integer matrix this analysis expects.
data_dummies = pd.get_dummies(data_lite, dtype=int)

# Show the column names before and after encoding, then the encoded
# frame's shape and Python type.
print('样本原始特征:\n', list(data_lite.columns), '\n')
print('虚拟变量特征:\n', list(data_dummies.columns), '\n')
print('data_dummies.shape:\n', data_dummies.shape, '\n')
print('data_dummies的类型:\n', type(data_dummies))
样本原始特征:
['年龄', '单位性质', '学历', '性别', '周工作时长', '职业', '收入']
虚拟变量特征:
['年龄', '周工作时长', '单位性质_ ?', '单位性质_ Federal-gov', '单位性质_ Local-gov', '单位性质_ Never-worked', '单位性质_ Private', '单位性质_ Self-emp-inc', '单位性质_ Self-emp-not-inc', '单位性质_ State-gov', '单位性质_ Without-pay', '学历_ 10th', '学历_ 11th', '学历_ 12th', '学历_ 1st-4th', '学历_ 5th-6th', '学历_ 7th-8th', '学历_ 9th', '学历_ Assoc-acdm', '学历_ Assoc-voc', '学历_ Bachelors', '学历_ Doctorate', '学历_ HS-grad', '学历_ Masters', '学历_ Preschool', '学历_ Prof-school', '学历_ Some-college', '性别_ Female', '性别_ Male', '职业_ ?', '职业_ Adm-clerical', '职业_ Armed-Forces', '职业_ Craft-repair', '职业_ Exec-managerial', '职业_ Farming-fishing', '职业_ Handlers-cleaners', '职业_ Machine-op-inspct', '职业_ Other-service', '职业_ Priv-house-serv', '职业_ Prof-specialty', '职业_ Protective-serv', '职业_ Sales', '职业_ Tech-support', '职业_ Transport-moving', '收入_ <=50K', '收入_ >50K']
data_dummies.shape:
(32561, 46)
data_dummies的类型:
<class 'pandas.core.frame.DataFrame'>
# Lift pandas' display truncation so every column (and row) is rendered.
# These are process-wide settings: after this, displaying a large frame
# prints it in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Last expression of the cell: the notebook renders the first five rows.
data_dummies.head()
|
年龄 |
周工作时长 |
单位性质_ ? |
单位性质_ Federal-gov |
单位性质_ Local-gov |
单位性质_ Never-worked |
单位性质_ Private |
单位性质_ Self-emp-inc |
单位性质_ Self-emp-not-inc |
单位性质_ State-gov |
单位性质_ Without-pay |
学历_ 10th |
学历_ 11th |
学历_ 12th |
学历_ 1st-4th |
学历_ 5th-6th |
学历_ 7th-8th |
学历_ 9th |
学历_ Assoc-acdm |
学历_ Assoc-voc |
学历_ Bachelors |
学历_ Doctorate |
学历_ HS-grad |
学历_ Masters |
学历_ Preschool |
学历_ Prof-school |
学历_ Some-college |
性别_ Female |
性别_ Male |
职业_ ? |
职业_ Adm-clerical |
职业_ Armed-Forces |
职业_ Craft-repair |
职业_ Exec-managerial |
职业_ Farming-fishing |
职业_ Handlers-cleaners |
职业_ Machine-op-inspct |
职业_ Other-service |
职业_ Priv-house-serv |
职业_ Prof-specialty |
职业_ Protective-serv |
职业_ Sales |
职业_ Tech-support |
职业_ Transport-moving |
收入_ <=50K |
收入_ >50K |
0 |
39 |
40 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
50 |
13 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
2 |
38 |
40 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
3 |
53 |
40 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
4 |
28 |
40 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
1 |
0 |
# Feature columns: everything from '年龄' through the last occupation
# indicator (label slices in .loc include both end points). This relies on
# the two '收入_*' indicator columns sitting after that label in
# data_dummies, so they are excluded.
first_feature, last_feature = '年龄', '职业_ Transport-moving'
features = data_dummies.loc[:, first_feature:last_feature]
print('features的类型:\n', type(features), '\n')

# Plain NumPy arrays for scikit-learn: X is the feature matrix and
# y is the '收入_ >50K' indicator column.
X = features.to_numpy()
print('X的类型:\n', type(X), '\n')
print('打印X的前五行:\n', X[:5, :])

y = data_dummies['收入_ >50K'].to_numpy()
print(f'特征形态:{X.shape} 标签形态{y.shape}')
features的类型:
<class 'pandas.core.frame.DataFrame'>
X的类型:
<class 'numpy.ndarray'>
打印X的前五行:
[[39 40 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0
0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
[50 13 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
[38 40 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
[53 40 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
[28 40 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]]
特征形态:(32561, 44) 标签形态(32561,)
# Inspect the labels of the selected feature columns.
features.columns
# Inspect the Python type of that column-label container.
type(features.columns)
pandas.core.indexes.base.Index
from sklearn.model_selection import train_test_split
from sklearn import tree

# Hold out part of the data for evaluation, using scikit-learn's default
# split proportions; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Depth-limited decision tree. random_state is fixed because scikit-learn
# permutes the candidate features at every split, so without a seed, ties
# between equally good splits can be broken differently from run to run.
go_dating_tree = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
go_dating_tree.fit(X_train, y_train)

# score() reports mean accuracy on the held-out samples.
print('模型的分:{:.2f}'.format(go_dating_tree.score(X_test, y_test)))
模型的分:0.80