任务:
拟合反应速率(rate)与温度(temperature)数据,预测85度时的反应速率
欠拟合
过拟合
例2:
欠拟合
过拟合
欠拟合与过拟合
模型不合适,导致其无法对数据实现有效预测
模型对数据的预测情况
模型 | 训练数据 | 预测数据 |
---|---|---|
欠拟合 | 不准确 | 不准确 |
过拟合 | 准确 | 不准确 |
好模型 | 准确 | 准确 |
欠拟合可以通过观察训练数据及时发现,通过优化模型结果解决
如何解决过拟合问题
原因:
解决办法:
建立模型的意义,不在于对训练数据做出准确预测,更在于对新数据的准确预测
对全数据进行数据分离,部分用于训练,部分用于新数据的结果预测
分离训练数据与测试数据
混淆矩阵(Confusion Matrix)
分类任务中,计算测试数据集预测准确率(accuracy)以评估模型表现
局限性:无法真实反映模型针对各个分类的预测准确度
准确率可以方便地用于衡量模型的整体预测效果,但无法反映细节信息,具体表现在:
混淆矩阵,又称为误差矩阵,用于衡量分类算法的准确程度
通过混淆矩阵,计算更丰富的模型评估指标
特点:
衡量指标的选择取决于应用场景
数据质量决定模型表现的上限
数据操作:
目标:
在确定模型类别后,如何让模型表现更好?
三方面:数据、模型核心参数、正则化
尝试以下方法:
选择使用KNN模型,尝试不同n_neighbors值对结果的影响
训练数据集准确率 随着模型复杂而提高
测试数据集准确率 在模型过于简单或过于复杂的情况时下降
# Column of 300 evenly spaced inputs between 40 and 90, used to draw the
# model's prediction curve.
X_range = np.linspace(40, 90, num=300)[:, np.newaxis]
# lr1 is the regressor fitted elsewhere in these notes.
y_range_predict = lr1.predict(X_range)
生成多项式(二次)数据:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion: the transformer is fitted on the training
# inputs and the same mapping is then applied to both input sets.
poly2 = PolynomialFeatures(degree=2)
poly2.fit(X_train)
X_2_train = poly2.transform(X_train)
X_2_test = poly2.transform(X_test)
from sklearn.model_selection import train_test_split

# Hold out 40% of the samples as test data. random_state is fixed so the
# split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=4, test_size=0.4
)
# Regular grid with 0.05 spacing on both axes, flattened into one
# two-column array of coordinates.
axis_ticks = np.arange(0, 10, 0.05)
xx, yy = np.meshgrid(axis_ticks, axis_ticks)
x_range = np.c_[xx.ravel(), yy.ravel()]
# knn is the classifier fitted elsewhere in these notes.
y_range_predict = knn.predict(x_range)
可视化决策区域
# One scatter series per predicted class, so the grid points show the
# classifier's decision regions.
bad_knn = plt.scatter(
    x_range[y_range_predict == 0, 0],
    x_range[y_range_predict == 0, 1],
)
good_knn = plt.scatter(
    x_range[y_range_predict == 1, 0],
    x_range[y_range_predict == 1, 1],
)
计算混淆矩阵:
from sklearn.metrics import confusion_matrix

# scikit-learn puts true labels on the rows and predicted labels on the
# columns, so with 0/1 labels cm[i, j] counts class-i samples predicted as j.
cm = confusion_matrix(y_test, y_test_predict)
TP = cm[1, 1]  # true positives
TN = cm[0, 0]  # true negatives
FP = cm[0, 1]  # false positives
FN = cm[1, 0]  # false negatives
导包加载数据
import pandas as pd
import numpy as np

# Training set for the temperature / reaction-rate regression task.
data_train = pd.read_csv('T-R-train.csv')
赋值
# Column 'T' is the model input and column 'rate' is the target.
X_train = data_train['T']
y_train = data_train['rate']
可视化
import matplotlib.pyplot as plt

# Scatter plot of the raw training samples.
fig1 = plt.figure(figsize=(5, 5))
plt.scatter(x=X_train, y=y_train)
plt.title('raw data')
plt.xlabel('temperature')
plt.ylabel('rate')
plt.show()
线性回归
from sklearn.linear_model import LinearRegression

# The estimator is given a 2-D input, so the single feature is reshaped
# into one column.
X_train = np.array(X_train).reshape(-1, 1)
# fit() returns the estimator itself, so creation and training can be chained.
lr1 = LinearRegression().fit(X_train, y_train)
加载测试数据
# Held-out samples with the same 'T' / 'rate' columns as the training file.
data_test = pd.read_csv('T-R-test.csv')
X_test = data_test['T']
y_test = data_test['rate']
计算r2分数
from sklearn.metrics import r2_score

# Reshape the test feature into a single column, as was done for training.
X_test = np.array(X_test).reshape(-1, 1)

# Score the linear model on both sets to compare fit and generalisation.
y_train_predict = lr1.predict(X_train)
y_test_predict = lr1.predict(X_test)
r2_train = r2_score(y_true=y_train, y_pred=y_train_predict)
r2_test = r2_score(y_true=y_test, y_pred=y_test_predict)
print('training r2:', r2_train)
print('test r2', r2_test)
# Sample the fitted line on a dense grid of 300 inputs between 40 and 90.
X_range = np.linspace(40, 90, 300).reshape(-1, 1)
y_range_predict = lr1.predict(X_range)

# Draw the prediction curve over the training samples.
fig2 = plt.figure(figsize=(5, 5))
plt.plot(X_range, y_range_predict)
plt.scatter(X_train, y_train)
plt.title('prediction data')
# The x axis carries temperature, as in the raw-data figure.
plt.xlabel('temperature')
plt.ylabel('rate')
plt.show()
from sklearn.preprocessing import PolynomialFeatures

# Build degree-2 polynomial features. The transformer is fitted on the
# training inputs only and then reused for the test inputs.
poly2 = PolynomialFeatures(degree=2)
poly2.fit(X_train)
X_2_train = poly2.transform(X_train)
X_2_test = poly2.transform(X_test)
r2分数
# Linear regression on the degree-2 features, scored on both sets.
lr2 = LinearRegression().fit(X_2_train, y_train)
y_2_train_predict = lr2.predict(X_2_train)
y_2_test_predict = lr2.predict(X_2_test)
r2_2_train = r2_score(y_true=y_train, y_pred=y_2_train_predict)
r2_2_test = r2_score(y_true=y_test, y_pred=y_2_test_predict)
print('training r2_2:', r2_2_train)
print('test r2_2', r2_2_test)
# Dense input grid, expanded with the same degree-2 transformer that was
# fitted on the training inputs.
X_2_range = np.linspace(40, 90, 300).reshape(-1, 1)
X_2_range = poly2.transform(X_2_range)
y_2_range_predict = lr2.predict(X_2_range)

# Plot against the untransformed grid X_range (created for the linear-model
# figure), because X_2_range now holds the polynomial features.
fi3 = plt.figure(figsize=(5, 5))
plt.plot(X_range, y_2_range_predict)
plt.scatter(X_train, y_train)
plt.scatter(X_test, y_test)
plt.title('polynomial prediction result(2)')
plt.xlabel('temperature')
plt.ylabel('rate')
plt.show()
from sklearn.preprocessing import PolynomialFeatures

# Repeat the polynomial experiment with degree 5.
poly5 = PolynomialFeatures(degree=5)
X_5_train = poly5.fit_transform(X_train)
X_5_test = poly5.transform(X_test)

lr5 = LinearRegression()
lr5.fit(X_5_train, y_train)

# Score the degree-5 model on both sets.
y_5_train_predict = lr5.predict(X_5_train)
y_5_test_predict = lr5.predict(X_5_test)
r2_5_train = r2_score(y_true=y_train, y_pred=y_5_train_predict)
r2_5_test = r2_score(y_true=y_test, y_pred=y_5_test_predict)
print('training r2_5:', r2_5_train)
print('test r2_5', r2_5_test)
# Dense input grid, expanded with the fitted degree-5 transformer.
X_5_range = np.linspace(40, 90, 300).reshape(-1, 1)
X_5_range = poly5.transform(X_5_range)
y_5_range_predict = lr5.predict(X_5_range)

# Plot against the untransformed grid X_range (created for the linear-model
# figure), because X_5_range now holds the polynomial features.
fi4 = plt.figure(figsize=(5, 5))
plt.plot(X_range, y_5_range_predict)
plt.scatter(X_train, y_train)
plt.scatter(X_test, y_test)
plt.title('polynomial prediction result(5)')
plt.xlabel('temperature')
plt.ylabel('rate')
plt.show()
导包 加载数据
import pandas as pd
import numpy as np

# Raw classification data set, before any cleaning.
data = pd.read_csv(filepath_or_buffer='data_class_raw.csv')
赋值
# Every column except 'y' is a feature; 'y' is the class label.
X = data.drop(columns=['y'])
y = data['y']
可视化
import matplotlib.pyplot as plt

# Scatter the two classes separately so each gets its own legend entry.
fig1 = plt.figure(figsize=(5, 5))
bad = plt.scatter(X.loc[y == 0, 'x1'], X.loc[y == 0, 'x2'])
good = plt.scatter(X.loc[y == 1, 'x1'], X.loc[y == 1, 'x2'])
plt.legend((good, bad), ('good', 'bad'))
plt.title('raw data')
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()
from sklearn.covariance import EllipticEnvelope

# Anomaly detector fitted on the class-0 samples only, with a 2%
# contamination setting.
ad_model = EllipticEnvelope(contamination=0.02)
ad_model.fit(X.loc[y == 0])
# Entries equal to -1 are the samples flagged as anomalies.
y_predict_bad = ad_model.predict(X.loc[y == 0])
可视化
# Same class scatter as before, with the detected class-0 anomalies marked.
fig2 = plt.figure(figsize=(5, 5))
bad = plt.scatter(X.loc[y == 0, 'x1'], X.loc[y == 0, 'x2'])
good = plt.scatter(X.loc[y == 1, 'x1'], X.loc[y == 1, 'x2'])
# Large 'x' markers on the class-0 samples the detector labelled -1.
plt.scatter(
    X.loc[y == 0, 'x1'][y_predict_bad == -1],
    X.loc[y == 0, 'x2'][y_predict_bad == -1],
    marker='x',
    s=150,
)
plt.legend((good, bad), ('good', 'bad'))
plt.title('raw data')
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()
# Switch to the processed version of the classification data.
data = pd.read_csv('data_class_processed.csv')
data.head()

# Every column except 'y' is a feature; 'y' is the class label.
X = data.drop(columns=['y'])
y = data['y']
PCA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardise the features, then project them onto two principal components.
X_norm = StandardScaler().fit_transform(X)
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X_norm)

# Print and plot the explained-variance ratio of each component.
var_ratio = pca.explained_variance_ratio_
print(var_ratio)
fig4 = plt.figure(figsize=(5, 5))
plt.bar(x=[1, 2], height=var_ratio)
plt.show()
from sklearn.model_selection import train_test_split

# 60/40 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=4
)
print(X_train.shape, X_test.shape, X.shape)
# KNN model with 10 neighbours, trained on the training split.
from sklearn.neighbors import KNeighborsClassifier
knn_10 = KNeighborsClassifier(n_neighbors=10)
knn_10.fit(X_train, y_train)
y_train_predict = knn_10.predict(X_train)
y_test_predict = knn_10.predict(X_test)

# Accuracy on both splits, to compare fit against generalisation.
from sklearn.metrics import accuracy_score
accuracy_train = accuracy_score(y_train, y_train_predict)
accuracy_test = accuracy_score(y_test, y_test_predict)
print('training accuracy:', accuracy_train)
print('testing accuracy:', accuracy_test)
# Regular grid with 0.05 spacing on both axes, flattened into an
# (n_points, 2) array of (x1, x2) coordinates.
axis_ticks = np.arange(0, 10, 0.05)
xx, yy = np.meshgrid(axis_ticks, axis_ticks)
x_range = np.column_stack((xx.ravel(), yy.ravel()))
# Classify every grid point with the 10-neighbour model to show its
# decision regions.
y_range_predict = knn_10.predict(x_range)

fig4 = plt.figure(figsize=(10, 10))
# Grid points are drawn first, so the data samples end up on top of them.
knn_bad = plt.scatter(
    x_range[y_range_predict == 0, 0],
    x_range[y_range_predict == 0, 1],
)
knn_good = plt.scatter(
    x_range[y_range_predict == 1, 0],
    x_range[y_range_predict == 1, 1],
)
bad = plt.scatter(X.loc[y == 0, 'x1'], X.loc[y == 0, 'x2'])
good = plt.scatter(X.loc[y == 1, 'x1'], X.loc[y == 1, 'x2'])
plt.legend(
    (good, bad, knn_good, knn_bad),
    ('good', 'bad', 'knn_good', 'knn_bad'),
)
plt.title('prediction result')
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()
计算测试数据集对应的混淆矩阵,计算准确率、召回率、特异度、精确率、F1分数
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test-set predictions.
cm = confusion_matrix(y_test, y_test_predict)
print(cm)

# Name the four cells, treating label 1 as the positive class.
TP = cm[1][1]
TN = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
print(TP, TN, FP, FN)

# Share of all samples that were predicted correctly.
accuracy = (TP + TN) / (TP + TN + FP + FN)
print(accuracy)

# Share of actual positives that were predicted positive.
recall = TP / (TP + FN)
print(recall)

# Share of actual negatives that were predicted negative.
specificity = TN / (TN + FP)
print(specificity)

# Share of predicted positives that are actually positive.
precision = TP / (TP + FP)
print(precision)

# Harmonic mean of precision and recall.
f1 = 2 * precision * recall / (precision + recall)
print(f1)
尝试不同的n_neighbors(1-20),计算其在训练数据集、测试数据集上的准确率并作图
# Candidate n_neighbors values 1..20, plus one accuracy list per data split
# to be filled in the same order.
n = list(range(1, 21))
accuracy_train = []
accuracy_test = []
# Train one KNN classifier per candidate n_neighbors value and record its
# accuracy on the training and test splits.
for i in n:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    y_train_predict = knn.predict(X_train)
    y_test_predict = knn.predict(X_test)
    accuracy_train_i = accuracy_score(y_train, y_train_predict)
    accuracy_test_i = accuracy_score(y_test, y_test_predict)
    accuracy_train.append(accuracy_train_i)
    accuracy_test.append(accuracy_test_i)
# Print the complete accuracy lists once, after the sweep finishes.
print(accuracy_train, accuracy_test)
可视化
# Side-by-side accuracy curves: training split on the left, test split on
# the right.
fig5 = plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(n, accuracy_train, marker='o')
plt.title('training accuracy vs n_neighbors')
plt.xlabel('n_neighbors')
plt.ylabel('accuracy')

plt.subplot(1, 2, 2)
plt.plot(n, accuracy_test, marker='o')
plt.title('testing accuracy vs n_neighbors')
plt.xlabel('n_neighbors')
plt.ylabel('accuracy')

plt.show()