Cross-validation and train/test set splitting

Since the code is fairly simple, the easiest way to understand what each snippet does is to look directly at the output it produces, so no lengthy explanations have been added.
Reference: the official documentation at http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-evaluating-estimator-performance

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

Splitting into training and test sets (train_set, test_set)

iris=datasets.load_iris()
iris.data.shape,iris.target.shape
((150, 4), (150,))
X_train,X_test,y_train,y_test=train_test_split(iris.data,iris.target,test_size=0.4,random_state=0)
X_train.shape,y_train.shape
((90, 4), (90,))
X_test.shape,y_test.shape
((60, 4), (60,))
clf=svm.SVC(kernel='linear',C=1).fit(X_train,y_train)
clf.score(X_test,y_test)
0.96666666666666667

Computing cross-validated metrics

from sklearn.model_selection import cross_val_score
clf=svm.SVC(kernel='linear',C=1)
scores=cross_val_score(clf,iris.data,iris.target,cv=5)# 5-fold cross-validation
scores
array([ 0.96666667,  1.        ,  0.96666667,  0.96666667,  1.        ])
#the mean score and the 95% confidence interval of the score estimate can be computed:
print("Accuracy: %0.2f(+/-%0.2f)" % (scores.mean(),scores.std()*2))
Accuracy: 0.98(+/-0.03)
#by default, the score computed at each CV iteration is the estimator's own score method;
#this can be changed with the scoring parameter:
from sklearn import metrics
scores=cross_val_score(clf,iris.data,iris.target,cv=5,scoring='f1_macro')
scores
array([ 0.96658312,  1.        ,  0.96658312,  0.96658312,  1.        ])
#because the classes in the iris dataset are balanced, the accuracy and the F1 score are nearly identical
#other cross-validation strategies can be used by passing a cross-validation iterator as the cv argument, for example:
from sklearn.model_selection import ShuffleSplit
n_samples=iris.data.shape[0]
cv=ShuffleSplit(n_splits=3,test_size=0.3,random_state=0)
cross_val_score(clf,iris.data,iris.target,cv=cv)
array([ 0.97777778,  0.97777778,  1.        ])
#data transformation with held-out data
from sklearn import preprocessing
X_train,X_test,y_train,y_test=train_test_split(iris.data,iris.target,test_size=0.4,random_state=0)
scaler=preprocessing.StandardScaler().fit(X_train)
X_train_transformed=scaler.transform(X_train)
clf=svm.SVC(kernel='linear',C=1.0).fit(X_train_transformed,y_train)
X_test_transformed=scaler.transform(X_test)
clf.score(X_test_transformed,y_test)
0.93333333333333335
#a Pipeline makes it easier to compose estimators, giving a more convenient way to write this under cross-validation:
from sklearn.pipeline import make_pipeline
clf=make_pipeline(preprocessing.StandardScaler(),svm.SVC(C=1))
cross_val_score(clf,iris.data,iris.target,cv=cv)
array([ 0.97777778,  0.93333333,  0.95555556])
#obtaining predictions by cross-validation
#cross_val_predict is similar to cross_val_score, but it returns, for each element of the input, the prediction obtained for that element when it was in the test set
from sklearn.model_selection import cross_val_predict
predicted=cross_val_predict(clf,iris.data,iris.target,cv=10)
metrics.accuracy_score(iris.target,predicted)
0.96666666666666667
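#a quick illustration (a sketch, not in the original notes): since cross_val_predict
#returns one prediction per sample, the predictions can feed any metric,
#e.g. a confusion matrix over the whole dataset
metrics.confusion_matrix(iris.target,predicted)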

Cross-validation iterators

#cross-validation iterators for data assumed to be independent and identically distributed (i.i.d.)
#KFold splits the samples into k groups (folds); when k=n, this is equivalent to the Leave One Out strategy.
import numpy as np
from sklearn.model_selection import KFold
X=["a","b","c","d"]
kf=KFold(n_splits=2)
for train,test in kf.split(X):
    print("%s %s" %(train,test))#生成的折用下标表示原始数据位置
[2 3] [0 1]
[0 1] [2 3]
#the index arrays can be used directly to slice the data:
X=np.array([[0.,0.],[1.,1.],[-1.,-1.],[2.,2.]])
y=np.array([0,1,0,1])
X_train,X_test,y_train,y_test=X[train],X[test],y[train],y[test]
# Leave One Out (LOO)
from sklearn.model_selection import LeaveOneOut
X=[1,2,3,4]
loo=LeaveOneOut()
for train,test in loo.split(X):
    print("%s %s"%(train,test))
[1 2 3] [0]
[0 2 3] [1]
[0 1 3] [2]
[0 1 2] [3]
#Leave P Out (LPO) is similar to LeaveOneOut: it builds all possible train/test sets by removing p of the n samples, i.e. all C(n,p) combinations
from sklearn.model_selection import LeavePOut
X=np.ones(4)
lpo=LeavePOut(p=2)
for train,test in lpo.split(X):
    print("%s %s" %(train,test))
[2 3] [0 1]
[1 3] [0 2]
[1 2] [0 3]
[0 3] [1 2]
[0 2] [1 3]
[0 1] [2 3]
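#a quick check (a sketch, not in the original notes): the number of LPO splits equals C(n,p)
lpo.get_n_splits(X)# C(4,2) = 6 train/test pairs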

Random permutations cross-validation a.k.a. Shuffle & Split

#ShuffleSplit lets you specify the number of split iterations and the proportion of the train/test split
from sklearn.model_selection import ShuffleSplit
X=np.arange(5)
ss=ShuffleSplit(n_splits=3,test_size=0.25,random_state=0)# random_state makes the shuffling reproducible
for train_index,test_index in ss.split(X):
    print("%s %s" %(train_index,test_index))
[1 3 4] [2 0]
[1 4 3] [0 2]
[4 0 2] [1 3]

Stratified cross-validation iterators based on class labels

#in supervised classification, when the class counts are imbalanced, a stratified split of the training and test sets should be used so that each set preserves the class proportions
#StratifiedKFold and StratifiedShuffleSplit

#stratified k-fold
from sklearn.model_selection import StratifiedKFold
X=np.ones(10)
y=[0,0,0,0,1,1,1,1,1,1]
skf=StratifiedKFold(n_splits=3)
for train,test in skf.split(X,y):
     print("%s %s"%(train,test))
[2 3 6 7 8 9] [0 1 4 5]
[0 1 3 4 5 8 9] [2 6 7]
[0 1 2 4 5 6 7] [3 8 9]
#StratifiedShuffleSplit is a variant of ShuffleSplit that returns stratified splits, i.e. the training and test sets preserve the same class proportions as the full dataset; a sketch follows
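#a minimal sketch (not in the original notes), reusing the X and y from the StratifiedKFold example above
from sklearn.model_selection import StratifiedShuffleSplit
sss=StratifiedShuffleSplit(n_splits=3,test_size=0.5,random_state=0)
for train,test in sss.split(X,y):
    print("%s %s"%(train,test))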

Cross-validation iterators for grouped data

#Group k-fold ensures that samples from the same group appear only in the training set or only in the test set.
from sklearn.model_selection import GroupKFold
X=[0.1,0.2,2.2,2.4,2.3,4.55,5.8,8.8,9,10]
y=["a","b","b","b","c","c","c","d","d","d"]
groups=[1,1,1,2,2,2,3,3,3,3]
gkf=GroupKFold(n_splits=3)
for train,test in gkf.split(X,y,groups=groups):
    print("%s %s" %(train,test))
[0 1 2 3 4 5] [6 7 8 9]
[0 1 2 6 7 8 9] [3 4 5]
[3 4 5 6 7 8 9] [0 1 2]
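#a sketch (not in the original notes): group-aware iterators also work with cross_val_score
#by passing the groups argument; X is reshaped to 2-D since SVC expects a feature matrix
cross_val_score(svm.SVC(kernel='linear'),np.array(X).reshape(-1,1),y,groups=groups,cv=gkf)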
#Leave One Group Out
from sklearn.model_selection import LeaveOneGroupOut
X=[1,5,10,50,60,70,80]
y=[0,1,1,2,2,2,2]
groups=[1,1,2,2,3,3,3]
logo=LeaveOneGroupOut()
for train,test in logo.split(X,y,groups=groups):
    print("%s %s"%(train,test))
[2 3 4 5 6] [0 1]
[0 1 4 5 6] [2 3]
[0 1 2 3] [4 5 6]
# Leave P Groups Out
from sklearn.model_selection import LeavePGroupsOut

X = np.arange(6)
y = [1, 1, 1, 2, 2, 2]
groups = [1, 1, 2, 2, 3, 3]
lpgo = LeavePGroupsOut(n_groups=2)
for train, test in lpgo.split(X, y, groups=groups):
    print("%s %s" % (train, test))
[4 5] [0 1 2 3]
[2 3] [0 1 4 5]
[0 1] [2 3 4 5]
#Group Shuffle Split
from sklearn.model_selection import GroupShuffleSplit

X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001]
y = ["a", "b", "b", "b", "c", "c", "c", "a"]
groups = [1, 1, 2, 2, 3, 3, 4, 4]
gss = GroupShuffleSplit(n_splits=4, test_size=0.5, random_state=0)
for train, test in gss.split(X, y, groups=groups):
    print("%s %s" % (train, test))
[0 1 2 3] [4 5 6 7]
[2 3 6 7] [0 1 4 5]
[2 3 4 5] [0 1 6 7]
[4 5 6 7] [0 1 2 3]

Cross validation of time series data

#Time Series Split
from sklearn.model_selection import TimeSeriesSplit
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4, 5, 6])
tscv=TimeSeriesSplit(n_splits=3)
print(tscv)
for train,test in tscv.split(X):
    print("%s %s"%(train,test))
TimeSeriesSplit(n_splits=3)
[0 1 2] [3]
[0 1 2 3] [4]
[0 1 2 3 4] [5]
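#a minimal sketch (not in the original notes): a TimeSeriesSplit instance can be passed as the cv argument,
#e.g. to cross-validate a regressor on the toy data above; mean squared error is used here because
#the default R^2 score is not defined on the single-sample test sets produced above
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
cross_val_score(LinearRegression(),X,y,cv=tscv,scoring='neg_mean_squared_error')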
