import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,roc_auc_score
# Column names applied to the raw file via `names=`: a sample ID, nine cell
# measurements, and the 'Class' label column.
feature_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

# Location of the Wisconsin breast-cancer data file on the UCI repository.
DATA_URL = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "breast-cancer-wisconsin/breast-cancer-wisconsin.data"
)

# Download the file and label its columns with the names above.
data = pd.read_csv(DATA_URL, names=feature_names)
data.head()
Sample code number | Clump Thickness | Uniformity of Cell Size | Uniformity of Cell Shape | Marginal Adhesion | Single Epithelial Cell Size | Bare Nuclei | Bland Chromatin | Normal Nucleoli | Mitoses | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1000025 | 5 | 1 | 1 | 1 | 2 | 1 | 3 | 1 | 1 | 2 |
1 | 1002945 | 5 | 4 | 4 | 5 | 7 | 10 | 3 | 2 | 1 | 2 |
2 | 1015425 | 3 | 1 | 1 | 1 | 2 | 2 | 3 | 1 | 1 | 2 |
3 | 1016277 | 6 | 8 | 8 | 1 | 3 | 4 | 3 | 7 | 1 | 2 |
4 | 1017023 | 4 | 1 | 1 | 3 | 2 | 1 | 3 | 1 | 1 | 2 |
在实际工作中，要弄清楚每一个肿瘤特征代表什么含义，这样才能做好异常值和缺失值的处理。
# Count how many samples carry each value of the 'Class' label.
data["Class"].value_counts()
2 458
4 241
Name: Class, dtype: int64
# Missing values appear as '?' in the raw data: mark them as NaN, drop the
# affected rows, then cast 'Bare Nuclei' to integers (presumably it was parsed
# as text because of the '?' placeholders).
data = (
    data.replace(to_replace='?', value=np.nan)
    .dropna()
    .astype({'Bare Nuclei': 'int64'})
)

# Features are every column except the first (the sample ID, which carries no
# predictive meaning) and the last (the label).
X = data[feature_names[1:-1]]  # ==> X = data.iloc[:, 1:-1]
y = data["Class"]

# Hold out 20% for testing. A fixed seed makes the split reproducible and
# stratifying on y keeps the class ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the scaling statistics from the training split only, then apply those
# same statistics to the test split. Re-fitting on the test data would scale it
# differently from the data the model was trained on.
transform = StandardScaler()
X_train = transform.fit_transform(X_train)
X_test = transform.transform(X_test)

lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
LogisticRegression()
# Mean accuracy of the fitted model on the training split.
lr_model.score(X=X_train, y=y_train)
0.9688644688644689
# Mean accuracy of the fitted model on the held-out test split.
lr_model.score(X=X_test, y=y_test)
0.9781021897810219
# Predict labels for the test split and print per-class precision/recall/F1.
y_pred = lr_model.predict(X_test)

# Maps each 'Class' code to the display name used in the report
# ("良性" = benign, "恶性" = malignant).
class_display_names = {2: "良性", 4: "恶性"}
report = classification_report(
    y_test,
    y_pred,
    labels=tuple(class_display_names),
    target_names=tuple(class_display_names.values()),
)
print(report)
precision recall f1-score support
良性 0.99 0.98 0.98 96
恶性 0.95 0.98 0.96 41
accuracy 0.98 137
macro avg 0.97 0.98 0.97 137
weighted avg 0.98 0.98 0.98 137