这是一个相对简单而且有趣的分类案例,此次比赛意在区分食尸鬼(Ghoul)、地精(Goblin)和幽灵(Ghost)三类怪物。每个样本的特征由骨骼长度、腐烂的严重程度、头发长度、无灵魂的程度和颜色构成。详细内容请看下图:
由上图可以看出,这是一个典型的机器学习分类问题,下面来看下具体怎么实现吧!
# -*- coding: utf-8 -*-
#导入一些将会被用到的库
import warnings

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from matplotlib import pyplot as plt
# %matplotlib inline  (only meaningful inside a Jupyter notebook)
from sklearn import metrics
from sklearn import preprocessing
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import GridSearchCV
# StratifiedShuffleSplit lives in model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
# Silence every warning for the rest of the script.
warnings.filterwarnings("ignore")

# Read the raw training and test files.
train_data_orig = pd.read_csv('C:/Users/new/Desktop/data/train.csv')
test_data_orig = pd.read_csv('C:/Users/new/Desktop/data/test.csv')

# Drop the id column from both frames before using them as model input.
train_data = train_data_orig.drop(columns=['id'])
test_data = test_data_orig.drop(columns=['id'])

# Ids of the test rows; needed again when the submission file is written.
test_ids = test_data_orig['id']

# Encode the string class names in the `type` column as integer labels.
le = preprocessing.LabelEncoder()
labels = le.fit_transform(train_data['type'])

# Class names in the order of their integer codes; used further down as the
# column names of the prediction table.
classes = list(le.classes_)
# Convert the categorical `color` strings into integer codes so that the
# classifiers can consume them.
#
# One encoder is fitted on the colors of BOTH sets. Fitting a separate encoder
# per set would give each set its own string -> integer mapping, so the same
# color could receive different codes in train and test whenever the two sets
# do not contain exactly the same colors.
color_le = preprocessing.LabelEncoder()
color_le.fit(pd.concat([train_data['color'], test_data['color']]))
# Alias kept so that any reference to the test-side encoder keeps working.
color_test_le = color_le

# Training data: add the encoded color, then drop the raw string columns
# (`type` has already been captured in `labels`).
train_data['color_int'] = color_le.transform(train_data['color'])
train_data = train_data.drop(['color', 'type'], axis=1)

# Test data: same encoding, same mapping.
test_data['color_int'] = color_le.transform(test_data['color'])
test_data = test_data.drop(['color'], axis=1)
# Split the labelled data into a training part and a held-out 20% part that is
# used to evaluate the models.
#
# Uses the sklearn.model_selection API: the splitter is configured first and
# the data is passed to .split(). Ten shuffles are generated, but every
# iteration overwrites the previous assignment, so only the last split is the
# one actually used by the code that follows.
features = train_data.values
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=23)
for train_index, test_index in sss.split(features, labels):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
# The ten candidate classifiers that are compared on the held-out split.
# Both SVM variants are built with probability=True because predict_proba is
# called on every candidate during evaluation.
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    SVC(C=0.025, kernel="rbf", probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
]
# Fit every candidate on the training split, report accuracy and log loss on
# the held-out split, and collect the numbers in `log` for the plots below.
log_cols = ["Classifier", "Accuracy", "Log Loss"]
# Rows are gathered in a plain list and turned into a DataFrame once at the
# end; DataFrame.append was removed in pandas 2.0.
log_rows = []
for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    print("=" * 30)
    print(name)
    print('****Results****')
    # Hard class predictions on the held-out rows -> accuracy.
    held_out_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, held_out_predictions)
    print("Accuracy: {:.4%}".format(acc))
    # Per-class probabilities on the held-out rows -> log loss.
    held_out_probabilities = clf.predict_proba(X_test)
    ll = log_loss(y_test, held_out_probabilities)
    print("Log Loss: {}".format(ll))
    # Accuracy is stored as a percentage to match the plot's axis label.
    log_rows.append([name, acc * 100, ll])
print("=" * 30)
log = pd.DataFrame(log_rows, columns=log_cols)
# Draw one horizontal bar chart per metric, with one bar per classifier.
# Each tuple is (column in `log`, bar color code, x-axis label, chart title).
for metric, bar_color, axis_label, chart_title in (
    ('Accuracy', "b", 'Accuracy %', 'Classifier Accuracy'),
    ('Log Loss', "g", 'Log Loss', 'Classifier Log Loss'),
):
    sns.set_color_codes("muted")
    sns.barplot(x=metric, y='Classifier', data=log, color=bar_color)
    plt.xlabel(axis_label)
    plt.title(chart_title)
    plt.show()
下面是不同模型的测试结果。可以发现,最后三个模型(GaussianNB、LinearDiscriminantAnalysis、QuadraticDiscriminantAnalysis)的准确率相同且最高(82.6667%),在这里选取LinearDiscriminantAnalysis作为分类器模型。
==============================
KNeighborsClassifier
****Results****
Accuracy: 69.3333%
Log Loss: 3.6023867502177156
==============================
SVC
****Results****
Accuracy: 34.6667%
Log Loss: 1.1088815337732691
==============================
NuSVC
****Results****
Accuracy: 78.6667%
Log Loss: 0.5199398109239035
==============================
DecisionTreeClassifier
****Results****
Accuracy: 58.6667%
Log Loss: 14.276027576563083
==============================
RandomForestClassifier
****Results****
Accuracy: 80.0000%
Log Loss: 0.4967723491575394
==============================
AdaBoostClassifier
****Results****
Accuracy: 65.3333%
Log Loss: 0.9052712341643282
==============================
GradientBoostingClassifier
****Results****
Accuracy: 78.6667%
Log Loss: 0.49213620556208354
==============================
GaussianNB
****Results****
Accuracy: 82.6667%
Log Loss: 0.3855068195590601
==============================
LinearDiscriminantAnalysis
****Results****
Accuracy: 82.6667%
Log Loss: 0.48017310243083433
==============================
QuadraticDiscriminantAnalysis
****Results****
Accuracy: 82.6667%
Log Loss: 0.3934316190564318
==============================
# Predict Test Set
# Fit the chosen model and predict class probabilities for the real test set.
# NOTE(review): the model is fitted on the training split only (the held-out
# 20% is not used); consider refitting on all labelled rows before predicting.
favorite_clf = LinearDiscriminantAnalysis()
favorite_clf.fit(X_train, y_train)
# Pass a plain array, matching the array input the model was fitted on.
test_predictions = favorite_clf.predict_proba(test_data.values)

# Format DataFrame: one probability column per class, preceded by the test ids.
# NOTE(review): this writes per-class probabilities; confirm that this matches
# the submission format the competition expects.
submission = pd.DataFrame(test_predictions, columns=classes)
submission.insert(0, 'id', test_ids)

# Export Submission
submission.to_csv('C:/Users/new/Desktop/data/submission.csv', index=False)
# Show the last rows as a quick sanity check; a bare `submission.tail()`
# expression displays nothing when this runs as a script.
print(submission.tail())
实验数据:Ghouls, Goblins, and Ghosts