Iris Classification (KNN, SVM, XGBoost)

https://www.kaggle.com/uciml/iris#


Distinguishing the three species (50 samples of each):


data: the measurements of sepal length, sepal width, petal length, and petal width, e.g. [4.6 3.1 1.5 0.2]

target: the classification result (three classes in total, labeled 0, 1, and 2)

target_names: the names of the three classes, ['setosa' 'versicolor' 'virginica']

DESCR: a description of the dataset

filename: the path to the data file

feature_names: descriptions of the features, ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
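
These fields can be inspected directly after loading the dataset. A minimal sketch (assuming scikit-learn is installed; load_iris returns a Bunch that supports dict-style access):

from sklearn.datasets import load_iris

data = load_iris()
print(data['data'].shape)        # (150, 4): 150 samples, 4 features
print(data['data'][2])           # one sample's four measurements
print(data['target_names'])      # ['setosa' 'versicolor' 'virginica']
print(data['feature_names'])     # the four measurement names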


1. Shuffle and randomly split the data:

train_test_split is a commonly used function for model validation: it randomly splits the samples into train_data and test_data according to a given proportion, 75%:25% by default.

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data['data'],data['target'],random_state=0)

print('x_train length is:', len(x_train))
print('x_test length is:', len(x_test))
print('y_train length is:', len(y_train))
print('y_test length is:', len(y_test))
# x_train length is: 112
# x_test length is: 38
# y_train length is: 112
# y_test length is: 38

2. Several approaches

1) Building a k-nearest neighbors classifier

from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=1)

Next, call the fit method of the knn object, passing x_train and y_train as arguments:

knn.fit(x_train,y_train)
  • Making predictions

We can now use this model to make predictions on new data. Suppose we have a new iris whose sepal length and width are 5 cm and 2.9 cm, and whose petal length and width are 1 cm and 0.2 cm. We store the measurements in a NumPy array:

import numpy as np
X_new=np.array([[5,2.9,1,0.2]])

The flower's measurements are placed in one row of a two-dimensional NumPy array, because scikit-learn expects its input data to be two-dimensional. Next, call the predict method of the knn object to make the prediction:

prediction=knn.predict(X_new)
print("Prediction: {}".format(prediction))
print("Predicted target name: {}".format(data['target_names'][prediction]))

According to the model's prediction, this flower belongs to the setosa species. But we do not yet know whether the model is accurate or reliable.

  • Evaluating the model

We evaluate the model by predicting each iris in the test set and comparing the predictions with the known labels, computing the accuracy, i.e. the fraction of flowers whose species was predicted correctly:

y_pred=knn.predict(x_test)
print(np.mean(y_pred==y_test))
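
scikit-learn's classifiers also provide a score method that computes the same test-set accuracy in a single call; a one-line equivalent of the comparison above, using the knn object and split from earlier:

print("Test set accuracy: {:.2f}".format(knn.score(x_test, y_test)))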

My code:

import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

data = load_iris()
# Split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(data['data'], data['target'], random_state=0)

# Fit a KNN model for a given k
def train(n_neighbor, x_train, y_train):
    knn = KNeighborsClassifier(n_neighbors=n_neighbor)
    knn.fit(x_train, y_train)
    return knn

# Compute accuracy on the test set
def test(x_test, y_test, knn):
    prediction = knn.predict(x_test)
    num = 0
    for i in range(len(prediction)):
        if prediction[i] == y_test[i]:
            num += 1
    return num / len(prediction)

best_k, best_accuracy = 1, 0
# Try k from 1 to 19 and keep the best
for k in range(1, 20):
    knn = train(k, x_train, y_train)
    accuracy = test(x_test, y_test, knn)
    print(accuracy)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_k = k
print("Final:", best_accuracy)
# Final: 0.9736842105263158
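
One caveat with this loop: it selects k by accuracy on the test set itself, so the final number is an optimistic estimate. A sketch of a cleaner variant (my suggestion, not from the original post) that picks k by cross-validation on the training set and only then touches the test set:

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Choose k by 5-fold cross-validation on the training data only
best_k, best_cv = 1, 0
for k in range(1, 20):
    cv_acc = cross_val_score(KNeighborsClassifier(n_neighbors=k),
                             x_train, y_train, cv=5).mean()
    if cv_acc > best_cv:
        best_cv, best_k = cv_acc, k

# Evaluate the chosen k once on the held-out test set
knn = KNeighborsClassifier(n_neighbors=best_k).fit(x_train, y_train)
print("best k:", best_k, "test accuracy:", knn.score(x_test, y_test))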

An improved version based on a KD-tree:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree  # import the KD-tree class

data = load_iris()
x_train, x_test, y_train, y_test = train_test_split(data['data'], data['target'], random_state=0)

# Build a KD-tree over the training points
def train(x_train):
    tree = KDTree(x_train)
    return tree

# ind: indices of the k nearest neighbors
# dist: distances to the k nearest neighbors
def test(tree, y_train, x_test, y_test, k):
    counts = 0
    for i in range(len(x_test)):
        X = x_test[i].reshape(1, -1)
        dist, ind = tree.query(X, k)
        # Count the labels of the k neighbors and take a majority vote
        a = {}
        for j in range(len(ind[0])):
            if y_train[ind[0][j]] in a:
                a[y_train[ind[0][j]]] += 1
            else:
                a[y_train[ind[0][j]]] = 1
        a = sorted(a.items(), key=lambda item: item[1])
        if a[-1][0] == y_test[i]:
            counts += 1
    return counts / len(x_test)

tree = train(x_train)
for k in range(1, 20):
    acc = test(tree, y_train, x_test, y_test, k)
    print(acc)
# Accuracy drops as k increases:
# 0.9736842105263158
# 0.9473684210526315
# 0.9210526315789473
# 0.8947368421052632
# 0.868421052631579
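
For comparison, KNeighborsClassifier can be told to use a KD-tree internally via its algorithm parameter, so the manual query-and-vote above can be reproduced in a few lines (a sketch; the result should roughly match the k=3 case, up to tie-breaking):

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3, algorithm='kd_tree')
knn.fit(x_train, y_train)
print(knn.score(x_test, y_test))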

2) Using XGBoost

import xgboost as xgb
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

data = load_iris()
x_train, x_test, y_train, y_test = train_test_split(data['data'], data['target'], random_state=0)
# Wrap the splits in XGBoost's DMatrix format (missing=0 marks zeros as missing values)
data_train = xgb.DMatrix(x_train, label=y_train, missing=0)
data_test = xgb.DMatrix(x_test, label=y_test, missing=0)

params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',  # multiclass classification
    'num_class': 3,                # number of classes; used together with multi:softmax
    'gamma': 0.1,                  # minimum loss reduction to split; larger is more conservative, typically 0.1 or 0.2
    'max_depth': 12,               # tree depth; deeper trees overfit more easily
    'lambda': 2,                   # L2 regularization on weights; larger values make the model less prone to overfitting
    'subsample': 0.7,              # row subsampling of the training data
    'colsample_bytree': 0.7,       # column subsampling when building each tree
    'min_child_weight': 3,
    'silent': 1,                   # 1 suppresses training output; set to 0 to see it
    'eta': 0.007,                  # learning rate (shrinkage)
    'seed': 1000,
    'nthread': 4,                  # number of CPU threads
}
num_round = 100
bst = xgb.train(params, data_train, num_round)

# Predict on the test set
dtest = xgb.DMatrix(x_test)
ans = bst.predict(dtest)

# Compute accuracy
cnt1 = 0
cnt2 = 0
for i in range(len(y_test)):
    if ans[i] == y_test[i]:
        cnt1 += 1
    else:
        cnt2 += 1
print("acc:",cnt1 / (cnt1 + cnt2))
# acc: 0.9736842105263158
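
xgboost also ships a scikit-learn-compatible wrapper that avoids constructing DMatrix objects by hand. A minimal sketch (the hyperparameters here are illustrative, not tuned; the multiclass objective is inferred from the labels):

from xgboost import XGBClassifier

clf = XGBClassifier(n_estimators=100, max_depth=4, learning_rate=0.1)
clf.fit(x_train, y_train)
print("acc:", clf.score(x_test, y_test))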

3) Using SVM

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn import svm  # import the svm module

data = load_iris()
x_train, x_test, y_train, y_test = train_test_split(data['data'], data['target'], random_state=0)
# Initialize an SVC with an rbf kernel, spelling out the parameters explicitly
model = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
                max_iter=-1, probability=False, random_state=None, shrinking=True,
                tol=0.001, verbose=False)
model.fit(x_train, y_train)  # train
a = model.score(x_test, y_test)  # mean accuracy on the test set
print(a)
# 0.9736842105263158
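
Rather than accepting the default C and gamma, the rbf kernel can be tuned with a grid search; a sketch using GridSearchCV (the parameter grid is illustrative):

from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.01, 0.1, 1]}
grid = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=5)
grid.fit(x_train, y_train)
print(grid.best_params_, grid.score(x_test, y_test))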
