实训六 回顾与学习
使用美国人口普查数据进行收入预测分类(可以参考:https://www.jianshu.com/p/a6d615f272f6)
# pandas supplies the DataFrame that holds the census records.
import pandas as pd
# header=None: the file is read without a header row, so the columns get integer labels.
df = pd.read_csv("data/adult.data", header=None)
# Preview the first rows of the loaded frame.
df.head()
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
| 1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
| 2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
| 3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
| 4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
# Summarise each column's dtype and non-null count to tell text (object) columns from numeric ones.
df.info()
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
0 32561 non-null int64
1 32561 non-null object
2 32561 non-null int64
3 32561 non-null object
4 32561 non-null int64
5 32561 non-null object
6 32561 non-null object
7 32561 non-null object
8 32561 non-null object
9 32561 non-null object
10 32561 non-null int64
11 32561 non-null int64
12 32561 non-null int64
13 32561 non-null object
14 32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
import numpy as np

# Positions of the columns that df.info() reports with object (string) dtype.
str_cols = [1, 3, 5, 6, 7, 8, 9, 13, 14]
for col in str_cols:
    # Trim surrounding whitespace so the comparison with "?" below matches exactly.
    df.iloc[:, col] = df.iloc[:, col].str.strip()

# Treat the "?" placeholder as a missing value, then drop every row containing one.
df.replace("?", np.nan, inplace=True)
df.dropna(inplace=True)
from sklearn.preprocessing import LabelEncoder

# One entry per column of df: the fitted LabelEncoder for an object column,
# None for a column that is copied through unchanged.
label_encoder = []
# Float array with the same shape as df; np.empty leaves it uninitialised,
# and the loop below fills every column.
encoded_set = np.empty(df.shape)
for col in range(df.shape[1]):
    encoder = None
    if df.iloc[:, col].dtype == object:
        # Text column: map each distinct string to an integer code.
        encoder = LabelEncoder()
        encoded_set[:, col] = encoder.fit_transform(df.iloc[:, col])
    else:
        # Non-object column: copy the values as they are.
        encoded_set[:, col] = df.iloc[:, col]
    # Appended on every iteration so that label_encoder[i] lines up with column i.
    label_encoder.append(encoder)

# Remove columns 2, 10 and 11 before modelling.
# NOTE(review): presumably fnlwgt, capital-gain and capital-loss in the UCI Adult
# column order — confirm against the dataset description.
data = np.delete(encoded_set, [2, 10, 11], axis=1)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# The last column of `data` is the target; every column before it is a feature.
X = data[:, :-1]
y = data[:, -1]

# Hold out 30% of the rows; the fixed random_state makes the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# fit() returns the estimator itself, so construction and training are chained.
gaussianNB = GaussianNB().fit(train_X, train_y)
# cross_val_score is provided by sklearn.model_selection (the module this file
# already uses for train_test_split); the old sklearn.cross_validation module
# has been removed from scikit-learn.
from sklearn.model_selection import cross_val_score

# Number of cross-validation folds used for every metric below.
num_validations = 5

# NOTE(review): the scores are cross-validated on the held-out split
# (test_X / test_y), not on the training split — confirm this is intended.
accuracy = cross_val_score(gaussianNB, test_X, test_y,
                           scoring='accuracy', cv=num_validations)
print('准确率:{:.2f}%'.format(accuracy.mean()*100))

precision = cross_val_score(gaussianNB, test_X, test_y,
                            scoring='precision_weighted', cv=num_validations)
print('精确度:{:.2f}%'.format(precision.mean()*100))

recall = cross_val_score(gaussianNB, test_X, test_y,
                         scoring='recall_weighted', cv=num_validations)
print('召回率:{:.2f}%'.format(recall.mean()*100))

f1 = cross_val_score(gaussianNB, test_X, test_y,
                     scoring='f1_weighted', cv=num_validations)
print('F1 值:{:.2f}%'.format(f1.mean()*100))
from sklearn.metrics import classification_report, confusion_matrix

# Predictions of the fitted classifier on the held-out rows.
y_pred = gaussianNB.predict(test_X)

# Counts of true class versus predicted class.
confusion_mat = confusion_matrix(test_y, y_pred)
print(confusion_mat)

# Display names for the two encoded target classes, in label order.
target_names = ['<=50K', '>50K']
print(classification_report(test_y, y_pred, target_names=target_names))
准确率:76.41%
精确度:79.53%
召回率:76.41%
F1 值:77.40%
[[5359 1408]
[ 672 1610]]
precision recall f1-score support
<=50K 0.89 0.79 0.84 6767
>50K 0.53 0.71 0.61 2282
avg / total 0.80 0.77 0.78 9049
# Show the first 100 values of column 1 as a NumPy array.
df[1].values[:100]
array(['State-gov', 'Self-emp-not-inc', 'Private', 'Private', 'Private',
'Private', 'Private', 'Self-emp-not-inc', 'Private', 'Private',
'Private', 'State-gov', 'Private', 'Private', 'Private',
'Self-emp-not-inc', 'Private', 'Private', 'Self-emp-not-inc',
'Private', 'Private', 'Federal-gov', 'Private', 'Private',
'Local-gov', 'Private', 'Private', 'Private', 'Local-gov',
'Private', 'Private', 'Federal-gov', 'State-gov', 'Private',
'Private', 'Private', 'Self-emp-not-inc', 'Private',
'Self-emp-not-inc', 'Private', 'Private', 'Private', 'Federal-gov',
'Private', 'Private', 'State-gov', 'Private', 'Private', 'Private',
'Federal-gov', 'Self-emp-inc', 'Private', 'Private', 'Private',
'Private', 'Private', 'Private', 'Private', 'Private', 'Private',
'Private', 'Private', 'Private', 'Self-emp-inc', 'Private',
'Private', 'Self-emp-not-inc', 'Private', 'Private', 'Private',
'Private', 'Private', 'Local-gov', 'Private', 'Private', 'Private',
'Private', 'Private', 'Private', 'Local-gov', 'Private', 'Private',
'Federal-gov', 'Private', 'Private', 'Private', 'Local-gov',
'Local-gov', 'Self-emp-not-inc', 'Private', 'Private',
'Federal-gov', 'Private', 'Private', 'Self-emp-not-inc', 'Private',
'Private', 'Self-emp-inc', 'Private', 'Local-gov'], dtype=object)
# Show the first 100 values of column 10. This reads from the DataFrame `df`:
# `data` is a NumPy array (the result of np.delete) and has no `.values` attribute.
df[10].values[:100]
array([ 2174, 0, 0, 0, 0, 0, 0, 0, 14084,
5178, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 5013, 2407, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 14344, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0], dtype=int64)