莫烦python bilibili视频
视频时长一个小时。下面是按模块整理的笔记,需要哪一块直接复制即可使用。
环境:Python 3.6。
先安装 numpy 和 scipy,再安装 scikit-learn。
import numpy as np # 下文用到 np.logspace / np.mean,原笔记缺少此导入
import matplotlib.pyplot as plt # 下文用到 plt.plot,原笔记缺少此导入

from sklearn import datasets # sklearn自带数据集
from sklearn.model_selection import learning_curve # 验证过拟合learning curve
from sklearn.model_selection import validation_curve # 选择超参数 validation curve
from sklearn.model_selection import cross_val_score # 交叉验证 cross val score
from sklearn.model_selection import train_test_split # 分训练集和测试集
from sklearn import preprocessing # normalization 归一化
from sklearn.neighbors import KNeighborsClassifier # KNN模型
from sklearn.linear_model import LinearRegression # 线性回归模型(拟合预测)
from sklearn.linear_model import LogisticRegression # 逻辑回归(分类)
from sklearn.ensemble import RandomForestClassifier # 随机森林(分类)
from sklearn.svm import SVC # SVC支持向量机模型(分类)
下面的示例基本都使用这个数据集。
#鸢尾花数据
iris = datasets.load_iris()
下面是自己生成回归训练数据的代码:
# 自己生成数据 ,一百个样本,n_features:1个属性
x,y = datasets.make_regression(n_samples=100, n_features=1, n_targets=1,noise=1)
# 训练集和测试集
iris_x = iris.data # 取数据 n行4列
iris_y = iris.target # 取标签 n行
x_train,x_test,y_train,y_test = train_test_split(iris_x,iris_y,test_size=0.3)
# 归一化
x = preprocessing.scale(x)
# 训练KNN模型
knn =KNeighborsClassifier(n_neighbors=5) # 定义KNN模型
knn.fit(x_train,y_train) # 训练
'''回归模型'''
model = LinearRegression()
model.fit(data_x,data_y) # 训练
'''CSV支持向量机模型'''
df = SVC()
df.fit(x_train,y_train)
在测试集上检验预测准确度:
print(knn.predict(x_test)) # 预测test
print(y_test) #真实test
'''交叉验证,取平均'''
# 可判断多个网络(如knn)的识别准确度然后选择网络
scores =cross_val_score(knn,iris_x,iris_y,cv=5,scoring="accuracy")
print(scores.mean())
'''对网络参数进行选取'''
# 找到最合适的n_neighbors
k_range=range(1,31)
k_scores = []
for k in k_range:
knn = KNeighborsClassifier(n_neighbors=k) #网络
# classification用:accuracy
scores = cross_val_score(knn, iris_x, iris_y, cv=10, scoring="accuracy")
# regression用 加负号,neg_mean_squared_error
# loss = -cross_val_score(knn, iris_x, iris_y, cv=10, scoring="neg_mean_squared_error")
k_scores.append(scores.mean())
plt.plot(k_range, k_scores)
'''选择最合适的超参数'''
param_range = np.logspace(-6,-2.3,5) #a的选择范围
train_loss1,test_loss1 = validation_curve(
SVC(),x,y,param_name="gamma",param_range=param_range,
cv=10,scoring="neg_mean_squared_error",)
train_loss_mean1 = -np.mean(train_loss1, axis=1)
test_loss_mean1 = -np.mean(test_loss1, axis=1)
plt.plot(param_range, train_loss_mean1, 'o-', color="r",
label="Training")
plt.plot(param_range, test_loss_mean1, 'o-', color="g",
label="Cross-validation")
'''过拟合曲线'''
train_sizes,train_loss,test_loss = learning_curve(
SVC(gamma=0.001),x,y,cv=10,scoring="neg_mean_squared_error",
train_sizes=[0.1,0.25,0.75,1]) # 在这四个位置记录loss
train_loss_mean = -np.mean(train_loss,axis=1)
test_loss_mean = -np.mean(test_loss, axis=1)
plt.plot(train_sizes,train_loss_mean,color="orange", linestyle="--")
plt.plot(train_sizes,test_loss_mean,color="cyan",linestyle="-")
'''储存'''
# 第一种方法
import pickle
with open("save/clf.pickle","wb") as f: # weite bite
pickle.dump(knn, f) #dump:保存模型
with open("save/clf.pickle","rb") as f:
knn2 = pickle.load(f) # load 读取文件
print(knn2.predict(x_test[0:1]),y_test[0:1])
#第二种方法
import joblib
joblib.dump(knn, "save/clf.pkl")
knn3 = joblib.load("save/clf.pkl")