sklearn中的model_selection

import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
# Generate 100 samples with make_moons (noise=0.3); y holds the 0/1 labels
# used for the per-class plots below.
X, y = datasets.make_moons(n_samples=100, noise=0.3)
# One scatter series per class: class 0 in red, class 1 in blue.
for label, colour in ((0, 'r'), (1, 'b')):
    members = X[y == label]
    plt.scatter(members[:, 0], members[:, 1], color=colour)
plt.show()

sklearn中的model_selection_第1张图片

数据集分割

from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as a test set; the rest is the training set.
# Arguments used in this call:
#   X, y          - the arrays to split (features and labels)
#   test_size     - fraction of the data placed in the test set (0.2 here)
#   random_state  - seed for the shuffle, fixed so the split is repeatable
#   shuffle       - shuffle the data before splitting
# train_size is not passed; NOTE(review): per the scikit-learn docs it then
# defaults to the complement of test_size - confirm for the version in use.
# Returns X_train, X_test, y_train, y_test, in that order.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666, shuffle=True
)

# Four scatter series, drawn in this order: train/class 0, train/class 1,
# test/class 0, test/class 1. No colours are given, so matplotlib picks them.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    for cls in (0, 1):
        selected = features[labels == cls]
        plt.scatter(selected[:, 0], selected[:, 1])
plt.show()

sklearn中的model_selection_第2张图片

交叉验证

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
# KNN classifier with 5 neighbours, cross-validated on the full (X, y) data.
knn = KNeighborsClassifier(n_neighbors=5)
# cv is the number of parts the data is split into; one score comes back per part.
cross_val_score(estimator=knn, X=X, y=y, cv=3)
array([0.88235294, 0.85294118, 0.875     ])

使用交叉验证获得最优参数

%%time 
# Exhaustive search over n_neighbors (k = 2..10) and p (1..5) for a
# distance-weighted KNN, keeping the pair with the highest mean
# cross-validation score on the training set.
best_k, best_p, best_score = 0, 0, 0
for k in range(2, 11):
    for p in range(1, 6):
        candidate = KNeighborsClassifier(n_neighbors=k, p=p, weights="distance")
        mean_score = cross_val_score(candidate, X_train, y_train).mean()
        # Strict ">" means the first combination reaching the top score is kept.
        if mean_score > best_score:
            best_k, best_p, best_score = k, p, mean_score

print("Best K =", best_k)
print("Best P =", best_p)
print("Best Score =", best_score)
Best K = 8
Best P = 3
Best Score = 0.8874643874643874
Wall time: 236 ms

Grid超参数搜索

# GridSearchCV performs a grid search over hyperparameters. The two arguments
# passed here are:
#   1. the estimator instance to tune - a KNN classifier in this example;
#   2. param_grid, which describes the values to try. Here it is a list of
#      dicts whose keys are hyperparameter names of the estimator; for
#      KNeighborsClassifier these can be weights, n_neighbors or p.
# Put the other way round: each dict defines one sub-grid that is searched in
# full, and the best combination across all sub-grids is the one selected.
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Both sub-grids try the same neighbour counts, 1 through 10.
neighbor_counts = list(range(1, 11))
param_grid = [
    # Uniform weighting: only n_neighbors is varied.
    {'weights': ['uniform'], 'n_neighbors': neighbor_counts},
    # Distance weighting: n_neighbors and p (1 through 5) are both varied.
    {
        'weights': ['distance'],
        'n_neighbors': neighbor_counts,
        'p': list(range(1, 6)),
    },
]

grid_search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid)
%%time
# Run the grid search on the training data (timed by the %%time magic above).
grid_search.fit(X_train, y_train)
Wall time: 375 ms





GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'weights': ['uniform'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
# Show the estimator configured with the best parameter combination found.
grid_search.best_estimator_
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=8, p=3,
           weights='distance')
grid_search.best_score_ # best mean cross-validation score found by grid_search
0.8875
%%time
# verbose sets how much progress logging is printed while fitting
# (0 prints nothing; larger values print more detail).
# n_jobs=-1 asks for the fits to run in parallel on all available processors.
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    n_jobs=-1,
    verbose=4,
)
grid_search.fit(X_train, y_train)
Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.9s


Wall time: 4.41 s


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    4.1s finished
grid_search.best_params_  # the best hyperparameter combination found by grid_search
{'n_neighbors': 8, 'p': 3, 'weights': 'distance'}
# best_estimator_ is the model built with the best hyperparameters. The search
# runs with refit=True (the default, visible in the GridSearchCV repr), so this
# estimator has already been fitted on the full (X_train, y_train) and is not
# fitted a second time here.
knn_clf = grid_search.best_estimator_
# Evaluate the selected model on the held-out test set.
knn_clf.score(X_test, y_test)
0.9

在网格搜索中指定交叉验证的折数(cv)

# Same search configuration, with cv=4 to request 4 cross-validation folds.
# The object is only constructed (and echoed) here; it is not fitted.
GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    n_jobs=-1,
    verbose=4,
    cv=4,
)
GridSearchCV(cv=4, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'weights': ['uniform'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=4)

你可能感兴趣的:(python,sklearn)