分类准确度就是将算法分类正确的样本数除以全量样本数得出的结果。通常情况下,准确度越高,分类模型越好。
不过有时候准确度高并不代表算法一定好,比如对某一地区某天的地震预测,假设有一堆特征作为地震分类的依据属性,分类的标签是发生地震和不发生地震。一个不加思考的分类器将每个评估样本都划分为不发生地震,不过它的分类准确度却高达99%。为什么会这样?因为数据分布太不均匀了:发生地震的样本太少,不发生地震的样本太多。这样训练出来的模型就不能仅仅根据准确度去评判了。
因此在面对分类标签不均匀的情况下,尽量避免使用准确度去评估模型。
①导入数据
# Load the Pima Indians diabetes dataset from CSV.
import pandas as pd

names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = pd.read_csv('pima_data.csv', names=names)

# X: feature matrix (the first eight columns).
X = df.iloc[:, :8].values
# Y: label vector (the last column).
Y = df.iloc[:, 8].values

# Inspect the first 5 rows.
df.head()
# Estimate model accuracy with 10-fold cross-validation.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

num_folds = 10
seed = 4
# FIX: KFold only honors random_state when shuffle=True; passing a
# random_state with the default shuffle=False raises a ValueError in
# scikit-learn >= 0.24.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
# Raise the iteration cap so the default lbfgs solver converges on this
# unscaled dataset instead of emitting a ConvergenceWarning.
lr = LogisticRegression(max_iter=1000)
result = cross_val_score(lr, X, Y, cv=kfold)
print('通过10折交叉验证评估模型的准确率')
print('算法评估结果准确度:%.3f%%(%.3f%%)' % (result.mean() * 100, result.std() * 100))
# Estimate model accuracy on a held-out evaluation set (2/3 train, 1/3 test).
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=1 / 3, random_state=4)
lr = LogisticRegression()
lr.fit(X_train, Y_train)
# score() returns mean accuracy on the evaluation set.
result = lr.score(X_test, Y_test)
print('通过分离训练数据集和评估数据集评估模型的准确率')
print('算法评估结果准确度:%.3f%%' % (result * 100))
①导入数据
# Reload the Pima Indians diabetes dataset.
import pandas as pd

names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = pd.read_csv('pima_data.csv', names=names)

X = df.iloc[:, 0:8].values  # feature matrix: first eight columns
Y = df.iloc[:, 8].values    # label vector: last column

df.head()  # inspect the first 5 rows
# Evaluate the model's log loss with 10-fold cross-validation.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# FIX: the fold count was assigned to a misspelled name ('num_flods')
# while KFold read 'num_folds' — a NameError when this cell runs on its
# own, and the intended seed/fold settings were silently ignored.
num_folds = 10
seed = 4
# FIX: random_state requires shuffle=True (ValueError in sklearn >= 0.24).
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
# 'neg_log_loss' is negated so that larger is better, per sklearn's
# scoring convention.
scoring = 'neg_log_loss'
lr = LogisticRegression()
result = cross_val_score(lr, X, Y, cv=kfold, scoring=scoring)
print(result)
print('算法评估结果对数损失函数:%.3f%% (%.3f%%)' % (result.mean() * 100, result.std() * 100))
# Evaluate the model's log loss on a held-out evaluation set.
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1/3, random_state=4)
lr = LogisticRegression()
lr.fit(X_train, Y_train)
# FIX: log_loss expects class probabilities, not hard 0/1 predictions.
# Feeding predict() output clips every probability to 0 or 1 and grossly
# inflates the loss; use predict_proba() instead.
y_prob = lr.predict_proba(X_test)
result = log_loss(Y_test, y_prob)
print('算法评估结果对数损失函数:', result)
①导入数据
# Load the dataset.
import pandas as pd
names=['preg','plas','pres','skin','test','mass','pedi','age','class']
df=pd.read_csv('pima_data.csv',names=names)
# X is the feature matrix (the first eight columns).
X=df.iloc[:,0:8].values
# Y is the label vector (the last column).
Y=df.iloc[:,8].values
# Inspect the first 5 rows.
df.head()
# Evaluate the model's ROC AUC with 10-fold cross-validation.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

num_folds = 10  # fixed the 'num_flods' misspelling
seed = 4
# FIX: random_state requires shuffle=True (ValueError in sklearn >= 0.24).
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
lr = LogisticRegression()
scoring = 'roc_auc'
result = cross_val_score(lr, X, Y, cv=kfold, scoring=scoring)
print('算法评估结果AUC:%.3f%%(%.3f%%)' % (result.mean() * 100, result.std() * 100))
③通过分离训练数据集和评估数据集评估模型的AUC
# Evaluate the model's ROC AUC on a held-out evaluation set.
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1/3, random_state=4)
lr = LogisticRegression()
lr.fit(X_train, Y_train)
# FIX: ROC AUC should be computed from continuous scores, not hard 0/1
# predictions — scoring hard labels collapses the ROC curve to a single
# point and understates the AUC. Use the positive-class probability
# column from predict_proba().
y_score = lr.predict_proba(X_test)[:, 1]
result = roc_auc_score(Y_test, y_score)
print('算法评估结果AUC:%.3f%%' % (result * 100))
①导入数据
# Load the dataset.
import pandas as pd
names=['preg','plas','pres','skin','test','mass','pedi','age','class']
df=pd.read_csv('pima_data.csv',names=names)
# X is the feature matrix (the first eight columns).
X=df.iloc[:,0:8].values
# Y is the label vector (the last column).
Y=df.iloc[:,8].values
# Inspect the first 5 rows.
df.head()
# Build the model's confusion matrix on a held-out evaluation set.
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=1 / 3, random_state=4)
lr = LogisticRegression()
lr.fit(X_train, Y_train)
y_pred = lr.predict(X_test)
result = confusion_matrix(Y_test, y_pred)
# Wrap the raw matrix in a DataFrame so rows/columns carry class labels.
classes = ['0', '1']
df_cm = pd.DataFrame(result, index=classes, columns=classes)
print('算法评估结果混淆矩阵:\n', df_cm)
①导入数据
# Load the dataset.
import pandas as pd
names=['preg','plas','pres','skin','test','mass','pedi','age','class']
df=pd.read_csv('pima_data.csv',names=names)
# X is the feature matrix (the first eight columns).
X=df.iloc[:,0:8].values
# Y is the label vector (the last column).
Y=df.iloc[:,8].values
# Inspect the first 5 rows.
df.head()
# Generate a classification report (precision / recall / F1 per class) on
# a held-out evaluation set. (The original comment said "confusion
# matrix" — a copy-paste error; this cell produces a classification report.)
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=1/3,random_state=4)
lr=LogisticRegression()
lr.fit(X_train,Y_train)
y_pred=lr.predict(X_test)
result=classification_report(Y_test,y_pred)
print('算法评估结果分类报告:\n',result)