进阶分析:【$k$ 近邻法】
数据集来源 | UCI 数据库 Iris |
---|---|
概述 | UCI Iris 数据集原始数据,根据鸢尾花萼片和花瓣的 4 项指标对花的种类进行分类。 |
数据介绍 | UCI Iris是常用分类建模数据集,通过花萼长度,花萼宽度,花瓣长度,花瓣宽度4个属性预测鸢尾花卉属于(Setosa,Versicolour,Virginica)三个种类中的哪一类。 |
属性数 | 5 |
记录数 | 150 |
无缺失值记录数 | 150 |
名称 | 数据类型 |
---|---|
萼片宽度 | float |
萼片长度 | float |
花瓣长度 | float |
花瓣宽度 | float |
鸢花种类 | string |
中文 | 英文 |
---|---|
萼片 | $Calyx$ |
花瓣 | $Petal$ |
长度 | $Length$ |
宽度 | $Width$ |
鸢尾花 | $Iris$ |
# 导包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the raw data file: one comma-separated record per line.
fname = 'iris.data'
# Read-only mode is sufficient; the file is never written back.
with open(fname, 'r', encoding='utf-8') as f:
    # Strip the line terminator and skip blank lines so that an empty
    # trailing line does not become a spurious all-None row.
    s = [line.strip().split(',') for line in f if line.strip()]
# Build the DataFrame (50 samples per species); every value is still a
# string at this point and is cast to float where needed.
names = ['slength', 'swidth', 'plength', 'pwidth', 'name']
iris = pd.DataFrame(data=s, columns=names)
iris
索引 | slength | swidth | plength | pwidth | name |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
… | … | … | … | … | … |
146 | 6.3 | 2.5 | 5.0 | 1.9 | Iris-virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | Iris-virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | Iris-virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | Iris-virginica |
150 | None | None | None | None | None |
151 rows × 5 columns
# The three species: 50 consecutive rows each, first four (feature)
# columns only, cast from string to float.
# NOTE(review): relies on the rows being grouped by species in file
# order -- confirm before reusing with a differently ordered file.
Setosa, Versicolour, Virginica = (
    iris.iloc[first:first + 50, :4].astype('float')
    for first in (0, 50, 100)
)
# Count how many samples each species has
iris['name'].value_counts()
整体样本数:
Iris-versicolor 50
Iris-virginica 50
Iris-setosa 50
Name: name, dtype: int64
统计每个类别中每个数值出现的次数:
# For each species, build a 4-tuple holding the value frequencies of
# every feature column (sepal length/width, petal length/width).
# setosa
tuple(Setosa[col].value_counts() for col in ('slength', 'swidth', 'plength', 'pwidth'))
# versicolour
tuple(Versicolour[col].value_counts() for col in ('slength', 'swidth', 'plength', 'pwidth'))
# virginica
tuple(Virginica[col].value_counts() for col in ('slength', 'swidth', 'plength', 'pwidth'))
分组样本数:
# Setosa:
(5.0 8
5.1 8
4.8 5
5.4 5
4.9 4
4.6 4
5.2 3
4.4 3
4.7 2
5.7 2
5.5 2
4.5 1
5.8 1
4.3 1
5.3 1
Name: slength, dtype: int64,
3.4 9
3.5 6
3.0 6
3.1 5
3.2 5
3.8 4
3.7 3
3.9 2
3.3 2
3.6 2
2.9 1
4.1 1
4.4 1
4.2 1
2.3 1
4.0 1
Name: swidth, dtype: int64,
1.5 14
1.4 12
1.6 7
1.3 7
1.7 4
1.9 2
1.2 2
1.1 1
1.0 1
Name: plength, dtype: int64,
0.2 28
0.4 7
0.3 7
0.1 6
0.5 1
0.6 1
Name: pwidth, dtype: int64)
# Versicolour:
(5.6 5
5.7 5
5.5 5
6.0 4
6.1 4
6.3 3
6.7 3
5.8 3
6.2 2
5.0 2
6.4 2
6.6 2
5.9 2
5.2 1
5.4 1
5.1 1
7.0 1
6.8 1
4.9 1
6.5 1
6.9 1
Name: slength, dtype: int64,
3.0 8
2.9 7
2.8 6
2.7 5
2.5 4
2.6 3
2.4 3
3.1 3
2.3 3
3.2 3
2.2 2
2.0 1
3.3 1
3.4 1
Name: swidth, dtype: int64,
4.5 7
4.7 5
4.0 5
4.4 4
4.2 4
3.9 3
4.1 3
4.6 3
3.5 2
4.9 2
4.8 2
4.3 2
3.3 2
3.8 1
3.7 1
5.1 1
3.6 1
5.0 1
3.0 1
Name: plength, dtype: int64,
1.3 13
1.5 10
1.0 7
1.4 7
1.2 5
1.6 3
1.1 3
1.8 1
1.7 1
Name: pwidth, dtype: int64)
# Virginica:
(6.3 6
6.7 5
6.4 5
7.7 4
6.5 4
6.9 3
7.2 3
5.8 3
6.1 2
6.8 2
6.2 2
6.0 2
5.6 1
7.3 1
7.6 1
7.9 1
7.1 1
5.7 1
4.9 1
5.9 1
7.4 1
Name: slength, dtype: int64,
3.0 12
2.8 8
3.2 5
2.5 4
3.1 4
2.7 4
3.3 3
2.6 2
3.8 2
2.9 2
3.4 2
2.2 1
3.6 1
Name: swidth, dtype: int64,
5.1 7
5.6 6
5.8 3
4.9 3
6.1 3
5.7 3
5.0 3
5.5 3
4.8 2
5.4 2
5.3 2
6.0 2
5.9 2
5.2 2
6.7 2
6.6 1
6.4 1
4.5 1
6.3 1
6.9 1
Name: plength, dtype: int64,
1.8 11
2.3 8
2.0 6
2.1 6
1.9 5
2.4 3
2.5 3
2.2 3
1.5 2
1.6 1
1.7 1
1.4 1
Name: pwidth, dtype: int64)
求和:
# Column-wise sum of the four features, printed once per species
for label, frame in (('Setosa: ', Setosa),
                     ('Versicolour: ', Versicolour),
                     ('Virginica: ', Virginica)):
    print(label, np.array(frame).sum(axis=0))
Setosa: [250.3 170.9 73.2 12.2]
Versicolour: [296.8 138.5 213. 66.3]
Virginica: [329.4 148.7 277.6 101.3]
均值:
# Column-wise mean of the four features, printed once per species
for label, frame in (('Setosa: ', Setosa),
                     ('Versicolour: ', Versicolour),
                     ('Virginica: ', Virginica)):
    values = np.array(frame)
    print(label, values.mean(axis=0))
Setosa: [5.006 3.418 1.464 0.244]
Versicolour: [5.936 2.77 4.26 1.326]
Virginica: [6.588 2.974 5.552 2.026]
最大值:
# Column-wise maximum of the four features, printed once per species
for label, frame in (('Setosa: ', Setosa),
                     ('Versicolour: ', Versicolour),
                     ('Virginica: ', Virginica)):
    print(label, np.array(frame).max(axis=0))
Setosa: [5.8 4.4 1.9 0.6]
Versicolour: [7. 3.4 5.1 1.8]
Virginica: [7.9 3.8 6.9 2.5]
最小值:
# Column-wise minimum of the four features, printed once per species
for label, frame in (('Setosa: ', Setosa),
                     ('Versicolour: ', Versicolour),
                     ('Virginica: ', Virginica)):
    print(label, np.array(frame).min(axis=0))
Setosa: [4.3 2.3 1. 0.1]
Versicolour: [4.9 2. 3. 1. ]
Virginica: [4.9 2.2 4.5 1.4]
平方根:
# Element-wise square root of every measurement; each species prints its
# label on one line followed by the full 50x4 array.
for label, frame in (('Setosa: ', Setosa),
                     ('Versicolour: ', Versicolour),
                     ('Virginica: ', Virginica)):
    print(label)
    roots = np.sqrt(np.array(frame))
    print(roots)
Setosa:
[[2.25831796 1.87082869 1.18321596 0.4472136 ]
[2.21359436 1.73205081 1.18321596 0.4472136 ]
[2.16794834 1.78885438 1.14017543 0.4472136 ]
[2.14476106 1.76068169 1.22474487 0.4472136 ]
[2.23606798 1.8973666 1.18321596 0.4472136 ]
[2.32379001 1.97484177 1.30384048 0.63245553]
[2.14476106 1.84390889 1.18321596 0.54772256]
[2.23606798 1.84390889 1.22474487 0.4472136 ]
[2.0976177 1.70293864 1.18321596 0.4472136 ]
[2.21359436 1.76068169 1.22474487 0.31622777]
[2.32379001 1.92353841 1.22474487 0.4472136 ]
[2.19089023 1.84390889 1.26491106 0.4472136 ]
[2.19089023 1.73205081 1.18321596 0.31622777]
[2.07364414 1.73205081 1.04880885 0.31622777]
[2.40831892 2. 1.09544512 0.4472136 ]
[2.38746728 2.0976177 1.22474487 0.63245553]
[2.32379001 1.97484177 1.14017543 0.63245553]
[2.25831796 1.87082869 1.18321596 0.54772256]
[2.38746728 1.94935887 1.30384048 0.54772256]
[2.25831796 1.94935887 1.22474487 0.54772256]
[2.32379001 1.84390889 1.30384048 0.4472136 ]
[2.25831796 1.92353841 1.22474487 0.63245553]
[2.14476106 1.8973666 1. 0.4472136 ]
[2.25831796 1.81659021 1.30384048 0.70710678]
[2.19089023 1.84390889 1.37840488 0.4472136 ]
[2.23606798 1.73205081 1.26491106 0.4472136 ]
[2.23606798 1.84390889 1.26491106 0.63245553]
[2.28035085 1.87082869 1.22474487 0.4472136 ]
[2.28035085 1.84390889 1.18321596 0.4472136 ]
[2.16794834 1.78885438 1.26491106 0.4472136 ]
[2.19089023 1.76068169 1.26491106 0.4472136 ]
[2.32379001 1.84390889 1.22474487 0.63245553]
[2.28035085 2.02484567 1.22474487 0.31622777]
[2.34520788 2.04939015 1.18321596 0.4472136 ]
[2.21359436 1.76068169 1.22474487 0.31622777]
[2.23606798 1.78885438 1.09544512 0.4472136 ]
[2.34520788 1.87082869 1.14017543 0.4472136 ]
[2.21359436 1.76068169 1.22474487 0.31622777]
[2.0976177 1.73205081 1.14017543 0.4472136 ]
[2.25831796 1.84390889 1.22474487 0.4472136 ]
[2.23606798 1.87082869 1.14017543 0.54772256]
[2.12132034 1.51657509 1.14017543 0.54772256]
[2.0976177 1.78885438 1.14017543 0.4472136 ]
[2.23606798 1.87082869 1.26491106 0.77459667]
[2.25831796 1.94935887 1.37840488 0.63245553]
[2.19089023 1.73205081 1.18321596 0.54772256]
[2.25831796 1.94935887 1.26491106 0.4472136 ]
[2.14476106 1.78885438 1.18321596 0.4472136 ]
[2.30217289 1.92353841 1.22474487 0.4472136 ]
[2.23606798 1.81659021 1.18321596 0.4472136 ]]
Versicolour:
[[2.64575131 1.78885438 2.16794834 1.18321596]
[2.52982213 1.78885438 2.12132034 1.22474487]
[2.62678511 1.76068169 2.21359436 1.22474487]
[2.34520788 1.51657509 2. 1.14017543]
[2.54950976 1.67332005 2.14476106 1.22474487]
[2.38746728 1.67332005 2.12132034 1.14017543]
[2.50998008 1.81659021 2.16794834 1.26491106]
[2.21359436 1.54919334 1.81659021 1. ]
[2.56904652 1.70293864 2.14476106 1.14017543]
[2.28035085 1.64316767 1.97484177 1.18321596]
[2.23606798 1.41421356 1.87082869 1. ]
[2.42899156 1.73205081 2.04939015 1.22474487]
[2.44948974 1.4832397 2. 1. ]
[2.46981781 1.70293864 2.16794834 1.18321596]
[2.36643191 1.70293864 1.8973666 1.14017543]
[2.58843582 1.76068169 2.0976177 1.18321596]
[2.36643191 1.73205081 2.12132034 1.22474487]
[2.40831892 1.64316767 2.02484567 1. ]
[2.48997992 1.4832397 2.12132034 1.22474487]
[2.36643191 1.58113883 1.97484177 1.04880885]
[2.42899156 1.78885438 2.19089023 1.34164079]
[2.46981781 1.67332005 2. 1.14017543]
[2.50998008 1.58113883 2.21359436 1.22474487]
[2.46981781 1.67332005 2.16794834 1.09544512]
[2.52982213 1.70293864 2.07364414 1.14017543]
[2.56904652 1.73205081 2.0976177 1.18321596]
[2.60768096 1.67332005 2.19089023 1.18321596]
[2.58843582 1.73205081 2.23606798 1.30384048]
[2.44948974 1.70293864 2.12132034 1.22474487]
[2.38746728 1.61245155 1.87082869 1. ]
[2.34520788 1.54919334 1.94935887 1.04880885]
[2.34520788 1.54919334 1.92353841 1. ]
[2.40831892 1.64316767 1.97484177 1.09544512]
[2.44948974 1.64316767 2.25831796 1.26491106]
[2.32379001 1.73205081 2.12132034 1.22474487]
[2.44948974 1.84390889 2.12132034 1.26491106]
[2.58843582 1.76068169 2.16794834 1.22474487]
[2.50998008 1.51657509 2.0976177 1.14017543]
[2.36643191 1.73205081 2.02484567 1.14017543]
[2.34520788 1.58113883 2. 1.14017543]
[2.34520788 1.61245155 2.0976177 1.09544512]
[2.46981781 1.73205081 2.14476106 1.18321596]
[2.40831892 1.61245155 2. 1.09544512]
[2.23606798 1.51657509 1.81659021 1. ]
[2.36643191 1.64316767 2.04939015 1.14017543]
[2.38746728 1.73205081 2.04939015 1.09544512]
[2.38746728 1.70293864 2.04939015 1.14017543]
[2.48997992 1.70293864 2.07364414 1.14017543]
[2.25831796 1.58113883 1.73205081 1.04880885]
[2.38746728 1.67332005 2.02484567 1.14017543]]
Virginica:
[[2.50998008 1.81659021 2.44948974 1.58113883]
[2.40831892 1.64316767 2.25831796 1.37840488]
[2.66458252 1.73205081 2.42899156 1.44913767]
[2.50998008 1.70293864 2.36643191 1.34164079]
[2.54950976 1.73205081 2.40831892 1.4832397 ]
[2.75680975 1.73205081 2.56904652 1.44913767]
[2.21359436 1.58113883 2.12132034 1.30384048]
[2.70185122 1.70293864 2.50998008 1.34164079]
[2.58843582 1.58113883 2.40831892 1.34164079]
[2.68328157 1.8973666 2.46981781 1.58113883]
[2.54950976 1.78885438 2.25831796 1.41421356]
[2.52982213 1.64316767 2.30217289 1.37840488]
[2.60768096 1.73205081 2.34520788 1.44913767]
[2.38746728 1.58113883 2.23606798 1.41421356]
[2.40831892 1.67332005 2.25831796 1.54919334]
[2.52982213 1.78885438 2.30217289 1.51657509]
[2.54950976 1.73205081 2.34520788 1.34164079]
[2.77488739 1.94935887 2.58843582 1.4832397 ]
[2.77488739 1.61245155 2.62678511 1.51657509]
[2.44948974 1.4832397 2.23606798 1.22474487]
[2.62678511 1.78885438 2.38746728 1.51657509]
[2.36643191 1.67332005 2.21359436 1.41421356]
[2.77488739 1.67332005 2.58843582 1.41421356]
[2.50998008 1.64316767 2.21359436 1.34164079]
[2.58843582 1.81659021 2.38746728 1.44913767]
[2.68328157 1.78885438 2.44948974 1.34164079]
[2.48997992 1.67332005 2.19089023 1.34164079]
[2.46981781 1.73205081 2.21359436 1.34164079]
[2.52982213 1.67332005 2.36643191 1.44913767]
[2.68328157 1.73205081 2.40831892 1.26491106]
[2.7202941 1.67332005 2.46981781 1.37840488]
[2.81069386 1.94935887 2.52982213 1.41421356]
[2.52982213 1.67332005 2.36643191 1.4832397 ]
[2.50998008 1.67332005 2.25831796 1.22474487]
[2.46981781 1.61245155 2.36643191 1.18321596]
[2.77488739 1.73205081 2.46981781 1.51657509]
[2.50998008 1.84390889 2.36643191 1.54919334]
[2.52982213 1.76068169 2.34520788 1.34164079]
[2.44948974 1.73205081 2.19089023 1.34164079]
[2.62678511 1.76068169 2.32379001 1.44913767]
[2.58843582 1.76068169 2.36643191 1.54919334]
[2.62678511 1.76068169 2.25831796 1.51657509]
[2.40831892 1.64316767 2.25831796 1.37840488]
[2.60768096 1.78885438 2.42899156 1.51657509]
[2.58843582 1.81659021 2.38746728 1.58113883]
[2.58843582 1.73205081 2.28035085 1.51657509]
[2.50998008 1.58113883 2.23606798 1.37840488]
[2.54950976 1.73205081 2.28035085 1.41421356]
[2.48997992 1.84390889 2.32379001 1.51657509]
[2.42899156 1.73205081 2.25831796 1.34164079]]
标准差:
# Column-wise standard deviation (NumPy default: divide by N), printed
# once per species
for label, frame in (('Setosa: ', Setosa),
                     ('Versicolour: ', Versicolour),
                     ('Virginica: ', Virginica)):
    print(label, np.array(frame).std(axis=0))
Setosa: [0.34894699 0.37719491 0.17176728 0.10613199]
Versicolour: [0.51098337 0.31064449 0.46518813 0.19576517]
Virginica: [0.62948868 0.31925538 0.54634787 0.27188968]
方差:
# Column-wise variance (NumPy default: divide by N), printed once per
# species
for label, frame in (('Setosa: ', Setosa),
                     ('Versicolour: ', Versicolour),
                     ('Virginica: ', Virginica)):
    print(label, np.array(frame).var(axis=0))
Setosa: [0.121764 0.142276 0.029504 0.011264]
Versicolour: [0.261104 0.0965 0.2164 0.038324]
Virginica: [0.396256 0.101924 0.298496 0.073924]
协方差:
# Covariance matrix of the four features within each species.
# rowvar=False makes np.cov treat columns as variables and rows as
# observations, giving a 4x4 matrix per species. Passing the vector of
# the four column means instead would only yield a single scalar (the
# variance of those four means), which is not a covariance between
# features. np.cov normalises by N-1 by default.
print('Setosa: ')
print(np.cov(np.array(Setosa), rowvar=False))
print('Versicolour: ')
print(np.cov(np.array(Versicolour), rowvar=False))
print('Virginica: ')
print(np.cov(np.array(Virginica), rowvar=False))
Setosa: 4.427078666666667
Versicolour: 3.916518666666666
Virginica: 4.576966666666664
# Scatter plots: for each species, sepal ('Calyx') length vs. width in
# blue and petal length vs. width in orange, both on the same axes.
for frame, title in ((Setosa, '$Setosa$'),
                     (Versicolour, '$Versicolour$'),
                     (Virginica, '$Virginica$')):
    # The first call creates the axes; the second draws onto them.
    ax = frame.plot.scatter(x='slength', y='swidth', color='tab:blue', label='Calyx')
    frame.plot.scatter(x='plength', y='pwidth', color='tab:orange', label='Petal', ax=ax)
    plt.xlabel('$Length$')
    plt.ylabel('$Width$')
    plt.title(title)
# Bar chart of the per-species feature means: one group of four bars per
# species, placed at x = 1-4, 8-11 and 15-18.
for start, frame, label in ((1, Setosa, 'Setosa'),
                            (8, Versicolour, 'Versicolour'),
                            (15, Virginica, 'Virginica')):
    positions = [start + k for k in range(4)]
    plt.bar(positions, np.mean(np.array(frame), axis=0), label=label)
plt.legend()
# Same four feature abbreviations repeated under every group
plt.xticks((1, 2, 3, 4, 8, 9, 10, 11, 15, 16, 17, 18), ('sl', 'sw', 'pl', 'pw') * 3)
plt.title('The different kinds of mean in three kinds of flowers')
# Box plots of the four features, one figure per species.
# One fill colour per box; four entries so that zip() below does not
# stop early and leave the last box with the default colour.
colors = ['pink', 'lightblue', 'lightgreen', 'lightyellow']
for frame, title in ((Setosa, 'Setosa'),
                     (Versicolour, 'Versicolour'),
                     (Virginica, 'Virginica')):
    # patch_artist=True makes the boxes fillable patches; the returned
    # dict exposes them under the 'boxes' key.
    bplt = plt.boxplot(np.array(frame), notch=False, sym='o', vert=True, patch_artist=True)
    for patch, color in zip(bplt['boxes'], colors):
        patch.set_facecolor(color)
    plt.xticks((1, 2, 3, 4), ('slength', 'swidth', 'plength', 'pwidth'))
    plt.title(title)
    plt.show()