上课笔记-机器学习(5)-使用美国人口普查数据进行收入预测分类

实训六 回顾与学习

使用美国人口普查数据进行收入预测分类(可参考:https://www.jianshu.com/p/a6d615f272f6)

# Load the data. The file has no header row (header=None), so the columns
# are labelled with the integers 0..14 and are addressed by position below.
import pandas as pd
df = pd.read_csv("data/adult.data", header=None)
df.head()
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
# Summarize the frame: number of rows, per-column non-null counts and dtypes.
df.info()

RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
0     32561 non-null int64
1     32561 non-null object
2     32561 non-null int64
3     32561 non-null object
4     32561 non-null int64
5     32561 non-null object
6     32561 non-null object
7     32561 non-null object
8     32561 non-null object
9     32561 non-null object
10    32561 non-null int64
11    32561 non-null int64
12    32561 non-null int64
13    32561 non-null object
14    32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
# Preprocessing step 1: remove the surrounding whitespace from the values of
# every string column (the raw text values carry a leading space).
str_cols = [1, 3, 5, 6, 7, 8, 9, 13, 14]
for col in str_cols:
    # The vectorized .str accessor replaces a per-cell Python lambda and
    # passes missing values through instead of raising AttributeError on them.
    df.iloc[:, col] = df.iloc[:, col].str.strip()
# Preprocessing step 2: discard the samples that have missing values.
# Missing fields are written as the literal string "?"; convert those to NaN
# so that pandas recognises them as missing.
import numpy as np
df.replace(to_replace="?", value=np.nan, inplace=True)
# Drop, in place, every row that contains at least one NaN.
df.dropna(axis=0, how="any", inplace=True)
# Preprocessing step 3: turn the text columns into integer labels.
# Result:
#   encoded_set   - float array with the same shape as df; numeric columns are
#                   copied through, text columns hold LabelEncoder codes.
#   label_encoder - one entry per column: the fitted LabelEncoder for a text
#                   column, or None for a numeric column.
from sklearn.preprocessing import LabelEncoder
label_encoder = []
encoded_set = np.empty(df.shape)
for col in range(df.shape[1]):
    column = df.iloc[:, col]
    encoder = None
    # Branch on "is numeric" rather than "dtype == object": text columns are
    # not guaranteed to have object dtype (pandas may infer a dedicated string
    # dtype), and such a column must never be copied into the float array.
    if pd.api.types.is_numeric_dtype(column):
        encoded_set[:, col] = column
    else:
        encoder = LabelEncoder()
        encoded_set[:, col] = encoder.fit_transform(column)
    label_encoder.append(encoder)
# Remove the columns at positions 2, 10 and 11 (the instructor's approach).
keep_mask = np.ones(encoded_set.shape[1], dtype=bool)
keep_mask[[2, 10, 11]] = False
data = encoded_set[:, keep_mask]
# Split the samples into a training set (70%) and a test set (30%).
from sklearn.model_selection import train_test_split
X = data[:, :-1]  # every column except the last one is a feature
y = data[:, -1]   # the last column is the target
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Build the Gaussian naive Bayes classifier and train it on the training set
# (fit() returns the estimator itself, so construction and training chain).
from sklearn.naive_bayes import GaussianNB
gaussianNB = GaussianNB().fit(train_X, train_y)

# 2. Check the model with 5-fold cross-validation, run on the test set only.
# cross_val_score must be imported from sklearn.model_selection: the old
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so importing from it raises ModuleNotFoundError.
# NOTE: cross_val_score refits clones of the estimator on folds of the data it
# is given, so these figures describe models trained on parts of the test set,
# not the gaussianNB instance fitted on train_X above.
from sklearn.model_selection import cross_val_score
num_validations = 5
accuracy = cross_val_score(gaussianNB, test_X, test_y,
                           scoring='accuracy', cv=num_validations)
print('准确率:{:.2f}%'.format(accuracy.mean()*100))
precision = cross_val_score(gaussianNB, test_X, test_y,
                            scoring='precision_weighted', cv=num_validations)
print('精确度:{:.2f}%'.format(precision.mean()*100))
recall = cross_val_score(gaussianNB, test_X, test_y,
                         scoring='recall_weighted', cv=num_validations)
print('召回率:{:.2f}%'.format(recall.mean()*100))
f1 = cross_val_score(gaussianNB, test_X, test_y,
                     scoring='f1_weighted', cv=num_validations)
print('F1  值:{:.2f}%'.format(f1.mean()*100))
                   
# 3. Print the performance report of the fitted model on the test set.
from sklearn.metrics import classification_report, confusion_matrix

y_pred = gaussianNB.predict(test_X)

# Show the raw confusion matrix first.
confusion_mat = confusion_matrix(test_y, y_pred)
print(confusion_mat)

# Let sklearn tabulate precision, recall and F1 per class.
target_names = ['<=50K', '>50K']
report = classification_report(test_y, y_pred, target_names=target_names)
print(report)

准确率:76.41%
精确度:79.53%
召回率:76.41%
F1  值:77.40%
[[5359 1408]
 [ 672 1610]]
             precision    recall  f1-score   support

      <=50K       0.89      0.79      0.84      6767
       >50K       0.53      0.71      0.61      2282

avg / total       0.80      0.77      0.78      9049
# Peek at the first 100 values of column 1 of the cleaned frame.
df.iloc[:100, 1].values
array(['State-gov', 'Self-emp-not-inc', 'Private', 'Private', 'Private',
       'Private', 'Private', 'Self-emp-not-inc', 'Private', 'Private',
       'Private', 'State-gov', 'Private', 'Private', 'Private',
       'Self-emp-not-inc', 'Private', 'Private', 'Self-emp-not-inc',
       'Private', 'Private', 'Federal-gov', 'Private', 'Private',
       'Local-gov', 'Private', 'Private', 'Private', 'Local-gov',
       'Private', 'Private', 'Federal-gov', 'State-gov', 'Private',
       'Private', 'Private', 'Self-emp-not-inc', 'Private',
       'Self-emp-not-inc', 'Private', 'Private', 'Private', 'Federal-gov',
       'Private', 'Private', 'State-gov', 'Private', 'Private', 'Private',
       'Federal-gov', 'Self-emp-inc', 'Private', 'Private', 'Private',
       'Private', 'Private', 'Private', 'Private', 'Private', 'Private',
       'Private', 'Private', 'Private', 'Self-emp-inc', 'Private',
       'Private', 'Self-emp-not-inc', 'Private', 'Private', 'Private',
       'Private', 'Private', 'Local-gov', 'Private', 'Private', 'Private',
       'Private', 'Private', 'Private', 'Local-gov', 'Private', 'Private',
       'Federal-gov', 'Private', 'Private', 'Private', 'Local-gov',
       'Local-gov', 'Self-emp-not-inc', 'Private', 'Private',
       'Federal-gov', 'Private', 'Private', 'Self-emp-not-inc', 'Private',
       'Private', 'Self-emp-inc', 'Private', 'Local-gov'], dtype=object)
# Peek at the first 100 values of column 10 of the cleaned frame.
# This must read from df: `data` is a NumPy array (no `.values` attribute,
# and data[10] would select a row), and column 10 was removed from it.
df[10].values[:100]
array([ 2174,     0,     0,     0,     0,     0,     0,     0, 14084,
        5178,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,  5013,  2407,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0, 14344,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0], dtype=int64)

你可能感兴趣的:(机器学习,机器学习)