一个最简单的数据分析实例(含计算AUROC,绘制P-R曲线)

train.csv 可在这里获取,也可以换用其他类似的二分类数据集来练习(下面的代码默认最后一列是标签,其余各列都是数值特征)。

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve

# Load the whole dataset; the code below treats the last column as the label.
df = pd.read_csv('train.csv')
# Optional and not required here: df = df.drop(['ID', 'V_Time'], axis=1)

# 80/20 split: sample the training rows with a fixed seed, then use every
# row whose index label was not sampled as the validation set.
train_df = df.sample(frac=0.8, random_state=42)
valid_df = df.drop(train_df.index)

# Convert each frame to a NumPy array once, then slice it into
# features (all columns but the last) and label (the last column).
train_matrix = train_df.to_numpy()
valid_matrix = valid_df.to_numpy()
train_data, train_label = train_matrix[:, :-1], train_matrix[:, -1]
valid_data, valid_label = valid_matrix[:, :-1], valid_matrix[:, -1]

# Report the feature-matrix shape of each split.
for split_name, features in (
    ("train_data", train_data),
    ("valid_data", valid_data),
):
    print(f"{split_name}:{features.shape}")

# Build a logistic-regression classifier with scikit-learn's default settings
# and fit it on the training split (fit() returns the estimator itself).
classifier = LogisticRegression().fit(train_data, train_label)

# Hard class predictions for the validation split.
y_pred = classifier.predict(valid_data)

# Fraction of validation rows whose predicted class equals the true class.
accuracy = accuracy_score(valid_label, y_pred)
print("准确率:", accuracy)

# Confusion matrix of true classes versus predicted classes.
cm = confusion_matrix(valid_label, y_pred)

# Row 1 holds the samples whose true class is the second (positive) label:
# column 1 counts the ones predicted positive, column 0 the ones missed.
positive_row = cm[1]
TP = positive_row[1]
FN = positive_row[0]

# Recall = TP / (TP + FN): the share of true positives the model recovered.
recall = TP / (TP + FN)
print(f"召回率:{recall:.4f}")

# AUROC and the P-R curve are ranking metrics: they sweep a decision threshold
# over continuous scores. Feeding them the hard 0/1 output of predict() leaves
# only a single operating point, so the "AUROC" degenerates to balanced
# accuracy and the "curve" to one straight segment. Use the predicted
# probability of the positive class instead; column 1 of predict_proba()
# corresponds to classifier.classes_[1].
y_score = classifier.predict_proba(valid_data)[:, 1]

auroc = roc_auc_score(valid_label, y_score)
print(f"AUROC: {auroc:.4f}")

# valid_label holds the true labels and y_score the positive-class
# probabilities. NOTE: from here on `recall` is the array of recall values
# along the curve, not the scalar recall computed from the confusion matrix.
precision, recall, thresholds = precision_recall_curve(valid_label, y_score)

# Plot the P-R curve.
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('P-R Curve')
plt.show()

运行结果如下(注意:这里的 AUROC 是用 predict 输出的 0/1 类别标签计算得到的;若改用 predict_proba 给出的预测概率来计算,数值通常会不同):

train_data:(80000, 30)
valid_data:(20000, 30)
准确率: 0.99875
召回率:0.6364
AUROC: 0.8181

一个最简单的数据分析实例(含计算AUROC,绘制P-R曲线)_第1张图片

你可能感兴趣的:(数据分析,python)