KNN实现小麦种子分类问题

1、KNN 算法实现(请保存为 KNN.py,第 2 部分的脚本通过 `import KNN` 调用其中的 knn 函数)

# -*- coding: utf-8 -*-
"""
Created on Tue Dec 21 12:16:41 2021

@author: Administrator
"""


import numpy as np
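# inX     - the unknown sample to classify (one row of the test set)
# dataSet - the training samples (feature matrix of the training set)
# labels  - class label of each training sample
# k       - number of nearest neighbours that take part in the vote
# returns - the predicted class label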

def knn(inX, dataSet,labels,k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Parameters
    ----------
    inX : array-like, shape (n_features,)
        The sample to classify.
    dataSet : array-like, shape (n_samples, n_features)
        Training feature matrix; one row per training sample.
    labels : sequence of length n_samples
        Class label of each training row (must be hashable).
    k : int
        Number of nearest neighbours that vote.  A value larger than the
        number of training rows is clamped to that number instead of
        raising ``IndexError``.

    Returns
    -------
    The label with the most votes.  On a tie the label that received its
    first vote from the nearer neighbour wins.  If nobody votes (``k <= 0``)
    the function returns ``0``, as it always has.
    """
    # Accept plain lists/tuples as well as ndarrays.
    trainX = np.asarray(dataSet, dtype=float)
    query = np.asarray(inX, dtype=float)

    # Euclidean distance from the query to every training row.
    dist = (((trainX - query) ** 2).sum(1)) ** 0.5
    nearestFirst = dist.argsort()

    # Never ask for more neighbours than there are training rows.
    voters = max(0, min(k, nearestFirst.size))

    classCount = {}
    for idx in nearestFirst[:voters]:
        voteLabel = labels[idx]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    if not classCount:
        return 0
    # max() returns the first maximal key in insertion order, i.e. the
    # label first seen closest to the query, matching a strict ">" scan.
    return max(classCount, key=classCount.get)

2、具体实现

# -*- coding: utf-8 -*-
"""
Created on Tue Dec 21 12:16:51 2021

@author: Administrator
"""

#%%
import numpy as np
import random
import matplotlib.pyplot as plt
import KNN
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['axes.unicode_minus'] = False
#%%
#读取数据集,将样本数据的格式转换为分类模型可以接收的格式,需要将数据分为特征值矩阵和对应的分类标签向量
#打开并解析文件
#returnMat-特征矩阵
#classLabelVector-分类标签向量
def file2Matrix(filename):
    """Load a comma-separated numeric file and split it into features/labels.

    The rows are shuffled in place after seeding NumPy's global random
    generator with 1, so the row order is reproducible between runs.
    Note that this reseeds the global ``np.random`` state as a side effect.

    Parameters
    ----------
    filename : str or path-like
        File readable by ``np.genfromtxt`` with ``delimiter=','``.  Columns
        0-6 are taken as features and column 7 as the class label.

    Returns
    -------
    returnMat : ndarray, shape (n_rows, 7)
        Feature matrix (shuffled row order).
    classLabelVector : ndarray of int, shape (n_rows,)
        Class label of each row, in the same shuffled order.
    """
    # atleast_2d keeps the column slicing below valid for a one-row file,
    # for which genfromtxt returns a 1-D array.
    data = np.atleast_2d(np.genfromtxt(filename, delimiter=','))
    np.random.seed(1)
    np.random.shuffle(data)
    returnMat = data[:, :7]
    # np.int was removed in NumPy 1.24; the builtin int is the exact
    # replacement.  data[:, 7] is already 1-D, so no flatten() is needed.
    classLabelVector = data[:, 7].astype(int)

    return returnMat, classLabelVector
#%%
#数据归一化处理(0-1标准化)
#归一化方法(0-1标准化,Z-score标准化,sigmoid压缩法)
#dataSet-特征矩阵
#normDataSet-归一化后的特征矩阵
def autoNorm(dataSet):
    """Min-max scale every column of ``dataSet`` into the range [0, 1].

    Parameters
    ----------
    dataSet : array-like, shape (n_samples, n_features)
        Feature matrix to normalise.

    Returns
    -------
    normDataSet : ndarray of float, same shape as ``dataSet``
        ``(x - column_min) / (column_max - column_min)`` for every entry.
        A column whose values are all equal has zero range; it is mapped
        to all zeros instead of producing NaN from 0/0.
    """
    values = np.asarray(dataSet, dtype=float)
    minVals = values.min(0)              # per-column minimum
    ranges = values.max(0) - minVals     # per-column span
    # Replace zero spans by 1 so constant columns divide cleanly to 0.
    safeRanges = np.where(ranges == 0, 1.0, ranges)
    normDataSet = (values - minVals) / safeRanges

    return normDataSet

#%%
#读取文件
# Load the feature matrix and the label vector, then echo both for a quick
# sanity check.
datingDataMat,datingLabels = file2Matrix('./seeds_dataset.xls')
print(datingDataMat)
print(datingLabels)
#%%

# Relationship between features: one scatter plot per pair of adjacent
# feature columns (i, i+1), coloured by class label.
# The last column has no right-hand neighbour, so the loop stops one short
# of the column count; iterating over all 7 columns would index column 7,
# which does not exist in the 7-column feature matrix.
for i in range(datingDataMat.shape[1] - 1):
    plt.scatter(datingDataMat[:,i],datingDataMat[:,i+1],c=datingLabels)
    plt.show()
#%%

#归一化特征矩阵
# Normalise the feature matrix.
# NOTE(review): scaling uses min/max of the whole data set, i.e. including
# the rows later used for testing.
dataSet = autoNorm(datingDataMat)
print(dataSet)
#%%

# Hold-out split: the first m share of the rows is the training set, all
# remaining rows form the test set.  The rows were already shuffled when
# the file was loaded, so a plain positional split is sufficient.
m = 0.8
dataSize = dataSet.shape[0]
print("数据集总行数:",dataSize)
trainSize = int(m*dataSize)
# Derive the test size from the training size so the two always add up to
# dataSize; computing it from (1-m) is subject to float rounding.
testSize = dataSize - trainSize
print(trainSize,testSize)

#%%
# Evaluate KNN on the held-out rows.
k=5
predictlist = []
truthlist = []
correct = 0
trainData = dataSet[0:trainSize,:]
trainLabels = datingLabels[0:trainSize]
# Test rows start at index trainSize.  Starting at trainSize-1 would test
# on the last training row, whose nearest neighbour is itself.
for i in range(trainSize, dataSize):
    predict = KNN.knn(dataSet[i,:],trainData,trainLabels,k)
    predictlist.append(predict)
    truth = datingLabels[i]
    truthlist.append(truth)
    print("预测值为:{0} 真实值为:{1}".format(predict,truth))
    if predict == truth:
        correct = correct + 1
print('正确率:',correct/testSize)

#%%
#并对模型的准确性进行计算、分析(评价指标至少2种)
#准确率、查准率、查全率、F1值
# scikit-learn metrics take (y_true, y_pred) in that order.  Passing the
# predictions first swaps precision with recall and weights the averages
# by the predicted class counts instead of the true ones.
accuracy = accuracy_score(truthlist, predictlist)
F1_Measure = f1_score(truthlist, predictlist, average='weighted')
precision = precision_score(truthlist, predictlist, average='weighted')
recall = recall_score(truthlist, predictlist, average='weighted')
print("准确率(accuracy):", accuracy)
print("召回率(recall):", recall)
print("精确率(Precision):", precision)
print("F1-Measure:", F1_Measure)


3、数据集

链接: https://pan.baidu.com/s/1OZz3lpWXSU_Gc1fMVWv2Og?pwd=iwak 提取码: iwak。下载后请将数据文件命名为 seeds_dataset.xls 并放在脚本所在目录;脚本按逗号分隔的文本格式读取该文件(前 7 列为特征,第 8 列为类别标签)。

你可能感兴趣的:(分类,机器学习,人工智能)