标签有两种:>50K, <=50K.
import pandas as pd
import numpy as np
from sklearn import svm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
dataframe = pd.read_table('datasets/Adult/adult.data',sep=',',header=None)
dataframe.columns=["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
"occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
"hours-per-week", "native-country","salary"]
age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | salary | |
0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
(32561, 15)
(dataframe==" ?").sum()
age 0
workclass 1836
fnlwgt 0
education 0
education-num 0
marital-status 0
occupation 1843
relationship 0
race 0
sex 0
capital-gain 0
capital-loss 0
hours-per-week 0
native-country 583
salary 0
dtype: int64
Private 22696
Self-emp-not-inc 2541
Local-gov 2093
? 1836
State-gov 1298
Self-emp-inc 1116
Federal-gov 960
Without-pay 14
Never-worked 7
Name: workclass, dtype: int64
dataframe.workclass.replace(" ?", " Private", inplace=True)
Private 24532
Self-emp-not-inc 2541
Local-gov 2093
State-gov 1298
Self-emp-inc 1116
Federal-gov 960
Without-pay 14
Never-worked 7
Name: workclass, dtype: int64
Prof-specialty 4140
Craft-repair 4099
Exec-managerial 4066
Adm-clerical 3770
Sales 3650
Other-service 3295
Machine-op-inspct 2002
? 1843
Transport-moving 1597
Handlers-cleaners 1370
Farming-fishing 994
Tech-support 928
Protective-serv 649
Priv-house-serv 149
Armed-Forces 9
Name: occupation, dtype: int64
dataframe.occupation.replace(" ?", " Other", inplace=True)
Prof-specialty 4140
Craft-repair 4099
Exec-managerial 4066
Adm-clerical 3770
Sales 3650
Other-service 3295
Machine-op-inspct 2002
Other 1843
Transport-moving 1597
Handlers-cleaners 1370
Farming-fishing 994
Tech-support 928
Protective-serv 649
Priv-house-serv 149
Armed-Forces 9
Name: occupation, dtype: int64
United-States 29170
Mexico 643
? 583
Philippines 198
Germany 137
Canada 121
Puerto-Rico 114
El-Salvador 106
India 100
Cuba 95
England 90
Jamaica 81
South 80
China 75
Italy 73
Dominican-Republic 70
Vietnam 67
Guatemala 64
Japan 62
Poland 60
Columbia 59
Taiwan 51
Haiti 44
Iran 43
Portugal 37
Nicaragua 34
Peru 31
Greece 29
France 29
Ecuador 28
Ireland 24
Hong 20
Trinadad&Tobago 19
Cambodia 19
Thailand 18
Laos 18
Yugoslavia 16
Outlying-US(Guam-USVI-etc) 14
Hungary 13
Honduras 13
Scotland 12
Holand-Netherlands 1
Name: native-country, dtype: int64
dataframe["native-country"].replace(" ?", " United-States", inplace=True)
United-States 29753
Mexico 643
Philippines 198
Germany 137
Canada 121
Puerto-Rico 114
El-Salvador 106
India 100
Cuba 95
England 90
Jamaica 81
South 80
China 75
Italy 73
Dominican-Republic 70
Vietnam 67
Guatemala 64
Japan 62
Poland 60
Columbia 59
Taiwan 51
Haiti 44
Iran 43
Portugal 37
Nicaragua 34
Peru 31
France 29
Greece 29
Ecuador 28
Ireland 24
Hong 20
Cambodia 19
Trinadad&Tobago 19
Laos 18
Thailand 18
Yugoslavia 16
Outlying-US(Guam-USVI-etc) 14
Honduras 13
Hungary 13
Scotland 12
Holand-Netherlands 1
Name: native-country, dtype: int64
# sns.set(style="whitegrid")
sns.countplot(dataframe.salary, palette="rocket")
plt.title("distribution of salary")
Text(0.5, 1.0, 'distribution of salary')
plt.title("countgrid of education level over salary")
sns.countplot(x="education", data=dataframe[dataframe["salary"]==" <=50K"], palette="rocket")
sns.countplot(x="education", data=dataframe[dataframe["salary"]==" >50K"], palette="rocket")
# plt.legend(loc="upper right")
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
sns.kdeplot(dataframe[dataframe["salary"]==" <=50K"].age, shade=True)
sns.kdeplot(dataframe[dataframe["salary"]==" >50K"].age, shade=True)
plt.title("age distribution over salary")
Text(0.5, 1.0, 'age distribution over salary')
plt.title("countgrid of gender over salary")
sns.countplot(x="sex", data=dataframe[dataframe["salary"]==" <=50K"], palette="rocket")
sns.countplot(x="sex", data=dataframe[dataframe["salary"]==" >50K"], palette="rocket")
Text(0.5, 0, '>50K')
sns.distplot(dataframe[dataframe["salary"]==" <=50K"]["hours-per-week"],vertical=True)
sns.distplot(dataframe[dataframe["salary"]==" >50K"]["hours-per-week"],vertical=True)
plt.title("age distribution over salary")
Text(0.5, 1.0, 'age distribution over salary')
cmap = sns.cubehelix_palette(rot=-.2, as_cmap=True)
ax = sns.scatterplot(x="age", y="hours-per-week",
# 对标记变量转换成哑变量 使用pd.get_dummies方法,将salary转换成 <=50K:0,>50K:1
X = dataframe.join(pd.get_dummies(dataframe.loc[:,["workclass","education","occupation","marital-status","relationship",
"race","sex","native-country"], axis=1)
X.salary.replace(" <=50K",0,inplace=True)
X.salary.replace(" >50K",1,inplace=True)
age | fnlwgt | education-num | capital-gain | capital-loss | hours-per-week | salary | workclass_ Federal-gov | workclass_ Local-gov | workclass_ Never-worked | ... | native-country_ Portugal | native-country_ Puerto-Rico | native-country_ Scotland | native-country_ South | native-country_ Taiwan | native-country_ Thailand | native-country_ Trinadad&Tobago | native-country_ United-States | native-country_ Vietnam | native-country_ Yugoslavia | |
0 | 39 | 77516 | 13 | 2174 | 0 | 40 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
1 | 50 | 83311 | 13 | 0 | 0 | 13 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 38 | 215646 | 9 | 0 | 0 | 40 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 rows × 107 columns
X=X.drop(["salary"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y)
age | fnlwgt | education-num | capital-gain | capital-loss | hours-per-week | workclass_ Federal-gov | workclass_ Local-gov | workclass_ Never-worked | workclass_ Private | ... | native-country_ Portugal | native-country_ Puerto-Rico | native-country_ Scotland | native-country_ South | native-country_ Taiwan | native-country_ Thailand | native-country_ Trinadad&Tobago | native-country_ United-States | native-country_ Vietnam | native-country_ Yugoslavia | |
16834 | 26 | 131686 | 13 | 0 | 0 | 40 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
14311 | 33 | 136331 | 9 | 0 | 0 | 50 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
32421 | 31 | 298995 | 9 | 0 | 0 | 35 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 rows × 106 columns
clf = svm.SVC(C=1.0, kernel="linear", verbose=True, max_iter=10000, cache_size=500,class_weight="balanced")
clf.fit(X_train, y_train)
SVC(C=1.0, cache_size=500, class_weight='balanced', coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
kernel='linear', max_iter=10000, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=True)
# 进行预测
y_pred_linear = clf.predict(X_test)
# 查看得分
clf = svm.SVC(C=1.0, kernel="poly", degree=2, verbose=True, max_iter=10000, cache_size=500,class_weight="balanced")
clf.fit(X_train, y_train)
SVC(C=1.0, cache_size=500, class_weight='balanced', coef0=0.0,
decision_function_shape='ovr', degree=2, gamma='auto_deprecated',
kernel='poly', max_iter=10000, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=True)
y_pred_poly = clf.predict(X_test)
clf = svm.SVC(C=1.0, kernel="rbf", verbose=True, gamma='auto', max_iter=10000, cache_size=1000)
clf.fit(X_train, y_train)
SVC(C=1.0, cache_size=1000, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
max_iter=10000, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=True)
y_pred_rbf = clf.predict(X_test)
clf = svm.SVC(C=1.0, kernel="sigmoid", verbose=True, max_iter=10000, cache_size=500)
clf.fit(X_train, y_train)
SVC(C=1.0, cache_size=500, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
kernel='sigmoid', max_iter=10000, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=True)
y_pred_sigmoid = clf.predict(X_test)
iter_num=[x for x in range(1000,11000,1000)]
for iters in iter_num:
clf1 = svm.SVC(C=1.0, kernel="linear", max_iter=iters, cache_size=500)
clf1.fit(X_train, y_train)
clf2 = svm.SVC(C=1.0, kernel="poly", degree=2, max_iter=iters, cache_size=500)
clf2.fit(X_train, y_train)
clf3 = svm.SVC(C=0.1, kernel="rbf", gamma='scale', max_iter=iters, cache_size=500)
clf4 = svm.SVC(C=1.0, kernel="sigmoid", max_iter=iters, cache_size=500)
data = pd.DataFrame({"linear":linear,"poly":poly, "rbf": rbf, "sigmoid": sigmoid},index=iter_num)
sns.lineplot(data=data, palette="tab10", linewidth=2.5)
plt.title("score variation over rounds of four kinds of kernels")
Text(0.5, 1.0, 'score variation over rounds of four kinds of kernels')
from sklearn import metrics
# 线性核的报告
print(metrics.classification_report(y_test, y_pred_linear))
precision recall f1-score support
0 0.76 0.93 0.84 6181
1 0.21 0.06 0.09 1960
accuracy 0.72 8141
macro avg 0.48 0.49 0.46 8141
weighted avg 0.63 0.72 0.66 8141
# 多项式核的报告
print(metrics.classification_report(y_test, y_pred_poly))
precision recall f1-score support
0 0.76 0.97 0.85 6181
1 0.20 0.02 0.04 1960
accuracy 0.74 8141
macro avg 0.48 0.50 0.45 8141
weighted avg 0.62 0.74 0.66 8141
# 高斯核的报告
print(metrics.classification_report(y_test, y_pred_rbf))
precision recall f1-score support
0 0.77 0.97 0.86 6181
1 0.42 0.07 0.12 1960
accuracy 0.75 8141
macro avg 0.59 0.52 0.49 8141
weighted avg 0.68 0.75 0.68 8141
# sigmoid核的报告
print(metrics.classification_report(y_test, y_pred_sigmoid))
precision recall f1-score support
0 0.76 1.00 0.86 6181
1 0.00 0.00 0.00 1960
accuracy 0.76 8141
macro avg 0.38 0.50 0.43 8141
weighted avg 0.58 0.76 0.66 8141