train.csv 可在这里获取,
也可以使用其他相似的数据集作为练习。
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
# Load the whole dataset. Everything below treats the LAST column as the
# label and every other column as a feature.
df = pd.read_csv('train.csv')
# df = df.drop(['ID','V_Time'],axis=1) # this line is optional

# Training split: a reproducible random 80% sample of the rows.
train_df = df.sample(frac=0.8, random_state=42)
train_matrix = train_df.to_numpy()
train_data, train_label = train_matrix[:, :-1], train_matrix[:, -1]

# Validation split: the rows that were not drawn into the training sample.
valid_df = df.drop(index=train_df.index)
valid_matrix = valid_df.to_numpy()
valid_data, valid_label = valid_matrix[:, :-1], valid_matrix[:, -1]

# Report the feature-matrix shapes as a quick sanity check on the split.
print(f"train_data:{train_data.shape}")
print(f"valid_data:{valid_data.shape}")
# Build a logistic-regression classifier with library defaults and fit it on
# the training split (fit() returns the estimator itself, so it can be chained).
# NOTE(review): the features are not scaled anywhere in this script - confirm
# the default solver settings converge on this data.
classifier = LogisticRegression().fit(X=train_data, y=train_label)

# Hard class predictions for the held-out validation split.
y_pred = classifier.predict(X=valid_data)

# Fraction of validation rows whose predicted class matches the true label.
accuracy = accuracy_score(y_true=valid_label, y_pred=y_pred)
print("准确率:", accuracy)
# Confusion matrix of the hard predictions on the validation split.
# The label order is pinned to the classes the classifier was trained on, so
# the matrix keeps its full shape even when the validation labels and the
# predictions happen to contain only one class (otherwise cm would shrink to
# 1x1 and the cm[1, ...] lookups below would raise IndexError).
cm = confusion_matrix(valid_label, y_pred, labels=classifier.classes_)
# Row 1 holds the samples whose true class is classes_[1] (treated as the
# positive class): column 1 = true positives, column 0 = false negatives.
TP = cm[1, 1]
FN = cm[1, 0]
# Recall = TP / (TP + FN). It is undefined when the validation split has no
# positive samples; report NaN in that case instead of dividing 0 by 0.
actual_positives = TP + FN
recall = TP / actual_positives if actual_positives else float("nan")
print(f"召回率:{recall:.4f}")
# AUROC and the P-R curve are ranking metrics: they need a continuous score
# per sample. Feeding them the hard 0/1 output of predict() collapses each
# curve to a single operating point, so score with the predicted probability
# of the positive class instead. Column 1 of predict_proba corresponds to
# classifier.classes_[1] (the larger label); this assumes a binary target.
y_score = classifier.predict_proba(valid_data)[:, 1]

auroc = roc_auc_score(valid_label, y_score)
print("AUROC: %.4f" %auroc)

# valid_label holds the true labels, y_score the positive-class probabilities.
# NOTE: this rebinds `recall` from the scalar computed earlier to the array of
# recall values along the curve (same names as before, kept for compatibility).
precision, recall, thresholds = precision_recall_curve(valid_label, y_score)

# Plot the P-R curve.
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('P-R Curve')
plt.show()
运行结果如下:
train_data:(80000, 30)
valid_data:(20000, 30)
准确率: 0.99875
召回率:0.6364
AUROC: 0.8181