真实\预测 | 预测为负 B(False,0) | 预测为正 M(True,1)
---|---|---
真实为负 B(False,0) | TN | FP
真实为正 M(True,1) | FN | TP
② 准确率
准确率是最常见的评价指标,预测正确的样本数占所有的样本数的比例;通常来说,准确率越高分类器越好。
③ 敏感度(召回率)
灵敏度表示的是样本中所有正例中被识别的比例,衡量了分类器对正例的识别能力。
④ 特效性检验(特效度)
特效度表示的是样本中所有负例中被识别的比例,衡量了分类器对负例的识别能力。
# Load the dataset (a breast-tumor classification worksheet) from Excel.
data = pd.read_excel('分类作业数据集.xlsx')
# Quick exploratory look: schema/dtypes, column names, first rows, summary stats.
print(data.info())
print(data.columns)
print(data.head(5))
print(data.describe())
# --- Data cleaning ---
# The 'ID' column carries no predictive information; drop it in place.
data.drop('ID',axis = 1,inplace=True)
# Encode the label column ('类型' = type): M (malignant) -> 1, B (benign) -> 0.
# NOTE(review): map() leaves any value other than 'M'/'B' as NaN — assumes the
# column contains only those two codes; verify against the source file.
data['类型'] = data['类型'].map({'M':1,'B':0})
# Collect the candidate feature columns (columns 1..11, i.e. everything after
# the label) for the correlation analysis below.
features_mean= list(data.columns[1:12])
# Visualize the class balance of the two tumor types.
sns.countplot(x="类型",data=data)
plt.show()
# Heatmap of pairwise correlations between the candidate features,
# used to pick a reduced, less-redundant feature subset.
corr = data[features_mean].corr()
plt.figure(figsize=(14,14))
# annot=True prints the correlation value inside each cell.
sns.heatmap(corr, annot=True)
plt.show()
# The 6 features kept after inspecting the correlation heatmap.
# FIX: the original list contained 'smoothnesscompactness', which merges two
# column names into one; the intended feature next to 'compactness' is
# 'smoothness'. The trailing spaces in 'symmetry ' and 'fractal dimension '
# are kept as-is — presumably they match the raw Excel headers; TODO confirm.
features_remain = ['radius', 'texture', 'smoothness', 'compactness', 'symmetry ', 'fractal dimension ']
# Hold out 40% of the rows as the test set; train on the remaining 60%.
train, test = train_test_split(data, test_size=0.4)
# Slice out the selected feature columns and the encoded label ('类型').
train_X = train[features_remain]
train_y = train['类型']
test_X = test[features_remain]
test_y = test['类型']
# Z-score standardization (mean 0, variance 1 per feature).
# Fit the scaler on the training split only, then apply the SAME transform
# to the test split — avoids leaking test-set statistics into training.
ss = StandardScaler()
train_X = ss.fit_transform(train_X)
test_X = ss.transform(test_X)
print("%%%%%%%准确度%%%%%%%")
print("%%%%%%%敏感度%%%%%%%")
print("%%%%%%%特效度%%%%%%%")
print("%%%%%%%F1_score%%%%%%%")
kernelList = ['linear','rbf','sigmoid']
for kernel in kernelList:
svc = SVC(kernel=kernel).fit(train_X,train_y)
y_pred = svc.predict(test_X)
# 计算准确度
score_svc = metrics.accuracy_score(test_y,y_pred)
print(kernel+":")
print(score_svc)
# 计算召回率(敏感度)
print(recall_score(test_y, y_pred))
# 混淆矩阵
C = confusion_matrix(test_y, y_pred)
TN=C[0][0]
FP=C[0][1]
FN=C[1][0]
TP=C[1][1]
# 计算特效度
specificity=TN/(TN+FP)
print(specificity)
# 计算f1_score
# print(f1_score(test_y, y_pred))
# print(classification_report(test_y, y_pred))
以下分别是三个核函数模型的准确度,敏感度,特效度结果:
为了更直观看到SVM三个核函数建立模型的性能度量值,将所得结果绘制成表格如下(保留小数点后五位):
由表格可见,对于该数据集,核函数rbf建立的模型在准确度和敏感度上均高于其余两种核函数,特效度略低于linear核函数模型,从整体来看,核函数rbf建立的分类模型应用于该数据集性能更好。
import matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# Render the minus sign correctly on axis ticks when a CJK font is active.
plt.rcParams['axes.unicode_minus'] = False
# Use a Chinese font so CJK axis labels display instead of empty boxes.
# NOTE(review): matplotlib's canonical name is 'SimHei'; 'Simhei' relies on
# the font matcher resolving it — confirm it works on the target system.
plt.rcParams['font.sans-serif'] = ['Simhei']
# Show every column when printing DataFrames (disable column truncation).
pd.set_option('display.max_columns', None)
# Load the dataset (a breast-tumor classification worksheet) from Excel.
data = pd.read_excel('分类作业数据集.xlsx')
# Quick exploratory look: schema/dtypes, column names, first rows, summary stats.
print(data.info())
print(data.columns)
print(data.head(5))
print(data.describe())
# --- Data cleaning ---
# The 'ID' column carries no predictive information; drop it in place.
data.drop('ID',axis = 1,inplace=True)
# Encode the label column ('类型' = type): M (malignant) -> 1, B (benign) -> 0.
# NOTE(review): map() leaves any value other than 'M'/'B' as NaN — assumes the
# column contains only those two codes; verify against the source file.
data['类型'] = data['类型'].map({'M':1,'B':0})
# Collect the candidate feature columns (columns 1..11, i.e. everything after
# the label) for the correlation analysis below.
features_mean= list(data.columns[1:12])
# Visualize the class balance of the two tumor types.
sns.countplot(x="类型",data=data)
plt.show()
# Heatmap of pairwise correlations between the candidate features,
# used to pick a reduced, less-redundant feature subset.
corr = data[features_mean].corr()
plt.figure(figsize=(14,14))
# annot=True prints the correlation value inside each cell.
sns.heatmap(corr, annot=True)
plt.show()
# The 6 features kept after inspecting the correlation heatmap.
# FIX: the original list contained 'smoothnesscompactness', which merges two
# column names into one; the intended feature next to 'compactness' is
# 'smoothness'. The trailing spaces in 'symmetry ' and 'fractal dimension '
# are kept as-is — presumably they match the raw Excel headers; TODO confirm.
features_remain = ['radius', 'texture', 'smoothness', 'compactness', 'symmetry ', 'fractal dimension ']
# Hold out 40% of the rows as the test set; train on the remaining 60%.
train, test = train_test_split(data, test_size=0.4)
# Slice out the selected feature columns and the encoded label ('类型').
train_X = train[features_remain]
train_y = train['类型']
test_X = test[features_remain]
test_y = test['类型']
# Z-score standardization (mean 0, variance 1 per feature).
# Fit the scaler on the training split only, then apply the SAME transform
# to the test split — avoids leaking test-set statistics into training.
ss = StandardScaler()
train_X = ss.fit_transform(train_X)
test_X = ss.transform(test_X)
print("%%%%%%%准确度%%%%%%%")
print("%%%%%%%敏感度%%%%%%%")
print("%%%%%%%特效度%%%%%%%")
print("%%%%%%%F1_score%%%%%%%")
kernelList = ['linear','rbf','sigmoid']
for kernel in kernelList:
svc = SVC(kernel=kernel).fit(train_X,train_y)
y_pred = svc.predict(test_X)
# 计算准确度
score_svc = metrics.accuracy_score(test_y,y_pred)
print(kernel+":")
print(score_svc)
# 计算召回率(敏感度)
print(recall_score(test_y, y_pred))
# 混淆矩阵
C = confusion_matrix(test_y, y_pred)
TN=C[0][0]
FP=C[0][1]
FN=C[1][0]
TP=C[1][1]
# 计算特效度
specificity=TN/(TN+FP)
print(specificity)
# 计算f1_score
# print(f1_score(test_y, y_pred))
# print(classification_report(test_y, y_pred))