Machine Learning with the sklearn Toolkit: KNN Classification (Part 2)

Contents

  • The KNN Algorithm
    • Classifying the Iris Dataset
    • Selecting Parameters via Cross-Validation
    • Predicting Cancer with KNN
    • Converting Strings to Numeric Values

The KNN Algorithm

Classifying the Iris Dataset

import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
# 4 features, so each sample is a point in 4-dimensional space
# 150 is the number of samples
print(X.shape)
# Reduce to 2 dimensions by slicing: keep only the first two features
X = X[:, :2]
print(X.shape)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
# Build test data covering the plot area:
# x-axis from 4 to 8, y-axis from 2 to 4.5
# meshgrid generates the grid of background points
x1 = np.linspace(4, 8, 100)
y1 = np.linspace(2, 4.5, 80)
X1, Y1 = np.meshgrid(x1, y1)
print(X1.shape, Y1.shape)
# Reshape the test data to shape (n, 2)
# X1 = X1.reshape(-1, 1)
# Y1 = Y1.reshape(-1, 1)
# X_test = np.concatenate([X1, Y1], axis=1)
# ravel flattens an array into 1-D
X_test = np.c_[X1.ravel(), Y1.ravel()]
print(X_test.shape)
y_ = knn.predict(X_test)
from matplotlib.colors import ListedColormap
lc = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
lc2 = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_, cmap=lc)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=lc2)
plt.show()
# Output
(150, 4)
(150, 2)
(80, 100) (80, 100)
(8000, 2)

[Figure 1: KNN decision regions (light background points) with the iris training samples overlaid]
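Scattering all 8,000 background points works, but Matplotlib can also fill the regions directly. Here is a minimal sketch of the same plot drawn with pcolormesh, reusing the variables (X1, Y1, y_, lc, lc2) defined in the code above:

# Fill the decision regions instead of scattering every grid point;
# y_ is reshaped back onto the (80, 100) grid produced by meshgrid
plt.pcolormesh(X1, Y1, y_.reshape(X1.shape), cmap=lc, shading='auto')
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=lc2)
plt.show()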

Selecting Parameters via Cross-Validation

import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
# model_selection: model selection utilities
# cross_val_score: cross-validation scoring
from sklearn.model_selection import cross_val_score

X, y = load_iris(return_X_y=True)
# A common rule of thumb: n_neighbors around the square root of the sample count
# This call just demonstrates how cross-validation is used
knn = KNeighborsClassifier()
score = cross_val_score(knn, X, y, scoring='accuracy', cv=6)

# Use cross-validation to select the best number of neighbors
errors = []
for k in range(1, 14):
    knn = KNeighborsClassifier(n_neighbors=k)
    score = cross_val_score(knn, X, y, scoring='accuracy', cv=6).mean()
    # The smaller the error, the better this choice of k
    errors.append(1 - score)
import matplotlib.pyplot as plt
# k=11 gives the smallest error, so k=11 classifies the iris data best
plt.plot(np.arange(1, 14), errors)
plt.show()
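Rather than reading the best k off the plot, it can also be picked programmatically. A small sketch using the errors list built above:

# argmin gives the index of the smallest error; k starts at 1, so add 1
best_k = np.argmin(errors) + 1
print(best_k)  # expected to print 11 for this data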

# Use cross-validation to select the best weighting scheme
weights = ['uniform', 'distance']
for w in weights:
    knn = KNeighborsClassifier(n_neighbors=11, weights=w)
    score = cross_val_score(knn, X, y, scoring='accuracy', cv=6).mean()
    print(w, score)


# Use cross-validation to search over several parameters at once
result = {}
for k in range(1, 14):
    for w in weights:
        knn = KNeighborsClassifier(n_neighbors=k, weights=w)
        sm = cross_val_score(knn, X, y, scoring='accuracy', cv=6).mean()
        result[w + str(k)] = sm
# argmax gives the position of the best score in the dict's insertion order
best = np.array(list(result.values())).argmax()
print(best)
# Output
uniform 0.98070987654321
distance 0.9799382716049383
20
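The printed 20 is only a position in the dictionary's insertion order. To see which parameter combination it refers to, look the key up by position, reusing result and best from above:

# Map the argmax position back to the parameter combination
print(list(result.keys())[best])  # 'uniform11' for the output above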

[Figure 2: cross-validation error (1 − accuracy) as a function of k; the minimum is at k=11]

Predicting Cancer with KNN

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# grid: parameter grid; search: exhaustive search; cv: cross-validation
# GridSearchCV searches for the parameters that suit the algorithm best
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

cancer = pd.read_csv('./cancer.csv', sep='\t')
cancer.drop('ID', axis=1, inplace=True)
X = cancer.iloc[:, 1:]
y = cancer['Diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Run the grid search
knn = KNeighborsClassifier()
params = {'n_neighbors': [i for i in range(1, 30)],
          'weights': ['uniform', 'distance'],
          'p': [1, 2]}  # p=1: Manhattan distance, p=2: Euclidean distance
gcv = GridSearchCV(knn, params, scoring='accuracy', cv=6)
gcv.fit(X_train, y_train)
# Take the best model found and use it to predict
knn_best = gcv.best_estimator_
y_ = knn_best.predict(X_test)
print(accuracy_score(y_test, y_))
# Inspect the confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_))

from sklearn.metrics import classification_report
print(classification_report(y_test, y_, target_names=['B', 'M']))
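KNN is distance-based, so features on large scales can dominate the neighbor search. A minimal sketch of adding standardization in front of the grid search, using sklearn's standard StandardScaler and Pipeline (the double-underscore prefix routes each grid parameter to the knn step):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Scale every feature to zero mean and unit variance before KNN
pipe = Pipeline([('scaler', StandardScaler()),
                 ('knn', KNeighborsClassifier())])
params = {'knn__n_neighbors': range(1, 30),
          'knn__weights': ['uniform', 'distance'],
          'knn__p': [1, 2]}
gcv = GridSearchCV(pipe, params, scoring='accuracy', cv=6)
gcv.fit(X_train, y_train)
print(gcv.best_score_, gcv.best_params_)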

Converting Strings to Numeric Values

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# cv=6 splits the data into 6 folds
from sklearn.model_selection import cross_val_score, GridSearchCV
# KFold and StratifiedKFold control how the data is split into folds
from sklearn.model_selection import KFold, StratifiedKFold

# data = np.random.randint(0, 10, size=(8, 2))
# target = np.array([0, 0, 1, 0, 1, 1, 1, 0])
# train_test_split(data, target)
# kFold = KFold(n_splits=4)
# # train and test are index arrays; with the indices we can fetch the data
# for train, test in kFold.split(data, target):
#     print(target[train], target[test])
#
# print('-----------------------------')
#
# # Split into 4 folds; each fold keeps the same class proportions as the full data
# sKFold = StratifiedKFold(n_splits=4)
# for train, test in sKFold.split(data, target):
#     print(target[train], target[test])
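A runnable condensation of the commented-out demo above (the toy labels are made up here purely for illustration), showing that StratifiedKFold keeps the class ratio in every test fold while plain KFold may not:

target = np.array([0, 0, 1, 0, 1, 1, 1, 0])  # four samples of each class
data = np.arange(16).reshape(8, 2)
for name, splitter in [('KFold', KFold(n_splits=4)),
                       ('StratifiedKFold', StratifiedKFold(n_splits=4))]:
    print(name)
    for train, test in splitter.split(data, target):
        # StratifiedKFold yields one sample of each class per test fold
        print('  test labels:', target[test])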

data = pd.read_csv('./salary.txt')
data.drop(labels=['final_weight', 'education', 'capital_gain', 'capital_loss'],
          axis=1, inplace=True)
X = data.iloc[:, 0:-1]
y = data['salary']
# Convert the string columns to int/float so the algorithm can compute distances
u = X['workclass'].unique()
def convert(x):
    # The position of x among the unique values becomes its integer code
    return np.argwhere(u == x)[0, 0]
X['workclass'] = X['workclass'].map(convert)

cols = ['marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
for col in cols:
    u = X[col].unique()
    def convert(x):
        return np.argwhere(u == x)[0, 0]
    X[col] = X[col].map(convert)
knn = KNeighborsClassifier()
kFold = KFold(10)
accuracy = 0
for train, test in kFold.split(X, y):
    # KFold yields positional indices, so index with iloc
    knn.fit(X.iloc[train], y.iloc[train])
    acc = knn.score(X.iloc[test], y.iloc[test])
    accuracy += acc / 10

print(accuracy)
# Output
0.7973345728987424
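The hand-rolled convert function can be replaced with pandas' built-in factorize, which assigns integer codes in order of first appearance (the same order unique() produces). A sketch of the alternative, covering the same columns:

# factorize returns (codes, uniques); only the integer codes are needed
for col in ['workclass', 'marital_status', 'occupation', 'relationship',
            'race', 'sex', 'native_country']:
    X[col] = pd.factorize(X[col])[0]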
sklearn's preprocessing module also ships ready-made encoders for the same conversion:

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

salary = pd.read_csv('./salary.txt')
salary.drop(labels=['final_weight', 'education_num', 'capital_gain', 'capital_loss'],
            axis=1, inplace=True)
# OrdinalEncoder turns string columns into int/float codes; it works on a whole
# 2-D table, and the result can be wrapped back into a DataFrame
ordinalEncoder = OrdinalEncoder()
data = ordinalEncoder.fit_transform(salary)
salary_ordinal = DataFrame(data, columns=salary.columns)
salary_ordinal.head()
# LabelEncoder turns a single column (a Series) of strings into integer codes
labelEncoder = LabelEncoder()
salary_label = labelEncoder.fit_transform(salary['salary'])
for col in salary.columns:
    salary[col] = labelEncoder.fit_transform(salary[col])

salary.head()

# OneHotEncoder encoding
edu = salary[['education']]
onehotEncoder = OneHotEncoder()
onehot = onehotEncoder.fit_transform(edu)  # expects 2-D input, returns a sparse matrix
onehot.toarray()[:10]
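To see which column of the one-hot matrix corresponds to which education code, the encoder can report its generated feature names. A sketch (get_feature_names_out is the method name in recent sklearn releases; older versions used get_feature_names):

# One output column per distinct education value
names = onehotEncoder.get_feature_names_out(['education'])
print(names)
edu_onehot = DataFrame(onehot.toarray(), columns=names)
edu_onehot.head()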
