1、train_test_split(数据分割):
import numpy as np
from sklearn import datasets, svm
from sklearn.model_selection import train_test_split
# Toy dataset: 10 samples with 4 integer features each, labels drawn from {0, 1, 2}.
X = np.random.randint(0, 100, (10, 4))
y = np.sort(np.random.randint(0, 3, 10))  # labels in ascending order
print('样本:', X, sep='\n')
print('标签:', y, sep='\n')

# Hold out one third of the samples as the test set.
# random_state fixes the shuffle, so the same rows are selected on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=7
)
print('训练集:', X_train, y_train, sep='\n')
print('测试集:', X_test, y_test, sep='\n')
2、preprocessing(标准化,常称归一化):
from sklearn import preprocessing
# Three single-column features whose value ranges differ by orders of magnitude.
x1 = np.random.randint(1, 100, (5, 1))
x2 = np.random.randint(1, 10, (5, 1))
x3 = np.random.randint(1, 100000, (5, 1))
# Place the columns side by side to form a 5x3 sample matrix.
X = np.hstack((x1, x2, x3))
print(X)
# Show the same matrix after column-wise standardization.
print(preprocessing.scale(X))
# 生成分类数据进行验证scale的必要性
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
%matplotlib inline
# Synthetic classification data with two informative features and no redundant
# ones; generated with scale=100 so the need for scaling can be checked below.
X, y = make_classification(
    n_samples=300,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    scale=100,
    random_state=25,
)
# Scatter the first feature against the second, coloured by class label.
plt.scatter(*X.T, c=y)
plt.show()
# Compare SVM test accuracy before and after standardizing the features.
# A single split with a fixed random_state is shared by both runs, so any
# difference in score comes from the scaling and not from a different shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=7
)

# Baseline: train on the raw (unscaled) features.
svm_classifier = svm.SVC()
svm_classifier.fit(X_train, y_train)
print('标准化前准确率:', svm_classifier.score(X_test, y_test))

# Fit the scaler on the training split only, then apply it to both splits,
# so no statistics from the test set leak into training.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

svm_classifier = svm.SVC()
svm_classifier.fit(X_train, y_train)
print('标准化后准确率:', svm_classifier.score(X_test, y_test))
3、cross_val_score(交叉验证):
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
# Load the iris dataset and hold out one third of it for the final test.
iris = datasets.load_iris()
X = iris.data
y = iris.target
# test_size=1/3 matches the other splits in this file; the previous
# train_size=1/3 left only a third of the data for training and cross-validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=7
)

# Evaluate K = 1..30 with 10-fold cross-validation on the training split only.
k_range = range(1, 31)
cv_scores = []
for n in k_range:
    knn = KNeighborsClassifier(n_neighbors=n)
    # 'accuracy' is the scoring used for classification; for a regression
    # model use scoring='neg_mean_squared_error' instead.
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

# Mean cross-validated accuracy as a function of K.
plt.plot(k_range, cv_scores)
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.show()
# Select the K with the highest mean cross-validated accuracy rather than a
# hard-coded value, so the choice follows the scores computed above.
# max() returns the first maximal pair, so the smallest K wins ties.
best_k, _ = max(zip(k_range, cv_scores), key=lambda pair: pair[1])
best_knn = KNeighborsClassifier(n_neighbors=best_k)
best_knn.fit(X_train, y_train)
# Report accuracy and the predicted labels on the held-out test split.
print(best_knn.score(X_test, y_test))
print(best_knn.predict(X_test))