https://www.kaggle.com/uciml/iris#
Distinguishing three species of iris (50 samples per species):
data: the measurements of sepal length, sepal width, petal length, and petal width, e.g. [4.6 3.1 1.5 0.2]
target: the classification result (three classes here, labeled 0, 1, and 2)
target_names: the names of the three classes, ['setosa' 'versicolor' 'virginica']
DESCR: a description of the dataset
filename: the path to the data file
feature_names: descriptions of the features, ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
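For reference, a minimal sketch of loading the dataset and inspecting these fields (the exact set of keys can vary slightly between scikit-learn versions):

from sklearn.datasets import load_iris

data = load_iris()
print(data.keys())           # the fields listed above
print(data['data'].shape)    # (150, 4): 150 samples, 4 features each
print(data['target_names'])  # ['setosa' 'versicolor' 'virginica']
print(data['data'][3])       # one sample's measurements, [4.6 3.1 1.5 0.2]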
1. Shuffling and splitting the data:
train_test_split is a function commonly used in cross-validation; it randomly splits the samples into train_data and test_data in a given proportion, 75%:25% by default (for the 150 iris samples, that is 112 training and 38 test samples).
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

data = load_iris()
x_train, x_test, y_train, y_test = train_test_split(data['data'], data['target'], random_state=0)
print('x_train length is:', len(x_train))
print('x_test length is:', len(x_test))
print('y_train length is:', len(y_train))
print('y_test length is:', len(y_test))
2. Several approaches
1) Building a k-nearest-neighbors classifier
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=1)
Next, call the knn object's fit method, passing x_train and y_train as arguments:
knn.fit(x_train, y_train)
- Making a prediction
The model can now be used to predict new data. Suppose a new iris has a sepal length of 5 cm, a sepal width of 2.9 cm, a petal length of 1 cm, and a petal width of 0.2 cm. Put the measurements into a NumPy array:
import numpy as np
X_new=np.array([[5,2.9,1,0.2]])
The measurements of this single flower become one row of a two-dimensional NumPy array, because scikit-learn requires its input to be a two-dimensional array. Then call the knn object's predict method:
prediction=knn.predict(X_new)
print("Preiction: {}".format(prediction))
print("Predicted target name : {}".format(iris_dataset['target_names'][prediction]))
According to the model, this flower belongs to the setosa species. But we do not yet know whether the model is accurate or reliable.
- Evaluating the model
Predict every iris in the test set and compare the predictions with the true labels; the accuracy (the fraction of flowers whose species is predicted correctly) measures how good the model is.
y_pred = knn.predict(x_test)
print(np.mean(y_pred==y_test))
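Equivalently, the classifier's built-in score method computes the same test-set accuracy in one call:

print("Test set score: {}".format(knn.score(x_test, y_test)))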
My code:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
data = load_iris()
# Split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(data['data'], data['target'], random_state=0)

# Fit a classifier on the training set for a chosen k
def train(n_neighbor, x_train, y_train):
    knn = KNeighborsClassifier(n_neighbors=n_neighbor)
    knn.fit(x_train, y_train)
    return knn

# Compute the accuracy on the test set
def test(x_test, y_test, knn):
    prediction = knn.predict(x_test)
    num = 0
    for i in range(len(prediction)):
        if prediction[i] == y_test[i]:
            num += 1
    return num / len(prediction)

choosed_k = 1
choosed_accurate = 0
# Try k from 1 to 19 and keep the best one
for k in range(1, 20):
    knn = train(k, x_train, y_train)
    accurate = test(x_test, y_test, knn)
    print(accurate)
    if accurate > choosed_accurate:
        choosed_accurate = accurate
        choosed_k = k
print("Final:", choosed_accurate)
# Final: 0.9736842105263158
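Note that the loop above selects k using the test set, so the reported test accuracy is no longer an unbiased estimate. A minimal sketch of choosing k by cross-validation on the training set instead (cross_val_score is a standard scikit-learn helper; the 5-fold setting is my own choice):

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

best_k, best_score = 1, 0
for k in range(1, 20):
    # Average accuracy over 5 folds of the training set
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), x_train, y_train, cv=5)
    if scores.mean() > best_score:
        best_score, best_k = scores.mean(), k
print("best k:", best_k, "cv accuracy:", best_score)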
An improved version based on a KD-tree:
from sklearn.neighbors import KDTree  # KD-tree class from sklearn.neighbors
import numpy as np
from sklearn.datasets import load_iris  # built-in datasets
from sklearn.model_selection import train_test_split

data = load_iris()
# x_train, x_test, y_train, y_test = train_test_split(data['data'],data['target'],random_state=random.randint())
x_train, x_test, y_train, y_test = train_test_split(data['data'], data['target'], random_state=0)

# Build a KD-tree over the training points
def train(x_train):
    tree = KDTree(x_train)
    return tree

# ind: indices of the k nearest neighbors
# dist: distances to the k nearest neighbors
def test(tree, y_train, x_test, y_test, k):
    counts = 0
    for i in range(len(x_test)):
        X = x_test[i].reshape(1, -1)
        dist, ind = tree.query(X, k)
        # Majority vote over the labels of the k nearest neighbors
        a = {}
        for j in range(len(ind[0])):
            label = y_train[ind[0][j]]
            if label in a:
                a[label] += 1
            else:
                a[label] = 1
        a = sorted(a.items(), key=lambda item: item[1])
        if a[-1][0] == y_test[i]:
            counts += 1
    return counts / len(x_test)

for k in range(1, 20):
    tree = train(x_train)
    acc = test(tree, y_train, x_test, y_test, k)
    print(acc)
# Accuracy drops as k increases
0.9736842105263158
0.9473684210526315
0.9210526315789473
0.8947368421052632
0.868421052631579
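The same "KD-tree plus majority vote" logic is already built into KNeighborsClassifier via its algorithm parameter, so the hand-written voting above can be replaced by a few lines (a sketch, reusing the x_train/x_test split from above):

from sklearn.neighbors import KNeighborsClassifier

for k in range(1, 6):
    knn = KNeighborsClassifier(n_neighbors=k, algorithm='kd_tree')  # force the KD-tree backend
    knn.fit(x_train, y_train)
    print(k, knn.score(x_test, y_test))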
2) Using XGBoost
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
data = load_iris()
# x_train, x_test, y_train, y_test = train_test_split(data['data'],data['target'],random_state=random.randint())
x_train, x_test, y_train, y_test = train_test_split(data['data'], data['target'], random_state=0)
data_train = xgb.DMatrix(x_train, label=y_train, missing=0)
data_test = xgb.DMatrix(x_test, label=y_test, missing=0)
params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',  # multi-class classification
    'num_class': 3,                # number of classes, used together with multi:softmax
    'gamma': 0.1,                  # minimum loss reduction for pruning a split; larger is more conservative, typically 0.1 or 0.2
    'max_depth': 12,               # tree depth; larger values overfit more easily
    'lambda': 2,                   # L2 regularization on the weights; larger values make the model less prone to overfitting
    'subsample': 0.7,              # random row subsampling of the training data
    'colsample_bytree': 0.7,       # column subsampling when building each tree
    'min_child_weight': 3,
    'silent': 1,                   # 1 suppresses training output; set to 0 to see it
    'eta': 0.007,                  # works like a learning rate
    'seed': 1000,
    'nthread': 4,                  # number of CPU threads
}
num_round = 100
bst = xgb.train(params, data_train, num_round)
# Predict on the test set
dtest = xgb.DMatrix(x_test)
ans = bst.predict(dtest)
# Compute the accuracy
cnt1 = 0
cnt2 = 0
for i in range(len(y_test)):
    if ans[i] == y_test[i]:
        cnt1 += 1
    else:
        cnt2 += 1
print("acc:", cnt1 / (cnt1 + cnt2))
# acc: 0.9736842105263158
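The same model can also be trained through xgboost's scikit-learn wrapper, which avoids building DMatrix objects and the manual accuracy loop; a minimal sketch with illustrative, untuned hyperparameters:

from xgboost import XGBClassifier

clf = XGBClassifier(n_estimators=100, max_depth=4, learning_rate=0.1)  # hyperparameters are illustrative, not tuned
clf.fit(x_train, y_train)
print("acc:", clf.score(x_test, y_test))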
3) Using SVM
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn import svm  # import the svm module
data = load_iris()
# x_train, x_test, y_train, y_test = train_test_split(data['data'],data['target'],random_state=random.randint())
x_train, x_test, y_train, y_test = train_test_split(data['data'],data['target'],random_state=0)
model = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
                max_iter=-1, probability=False, random_state=None, shrinking=True,
                tol=0.001, verbose=False)  # initialize the model ('ovr' is the default; newer scikit-learn rejects None here)
model.fit(x_train, y_train)  # train
a=model.score(x_test, y_test)
print(a)
#0.9736842105263158
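SVMs are sensitive to feature scales, so it is common practice to standardize the features first; a sketch using a scikit-learn Pipeline (on iris, where all features are in centimeters, the effect is likely small):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm

model = make_pipeline(StandardScaler(), svm.SVC(kernel='rbf', gamma='auto'))  # scale, then fit the SVM
model.fit(x_train, y_train)
print(model.score(x_test, y_test))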