逻辑回归(Logistic Regression)是机器学习中的一种分类模型。虽然名字中带有“回归”,但它实际上是一种分类算法,只是与回归之间有一定的联系。由于算法简单、高效,在实际中应用非常广泛。
$$g(\theta^T x)=\frac{1}{1+e^{-\theta^T x}}$$
sklearn.linear_model.LogisticRegression(solver='liblinear', penalty='l2', C=1.0)
import pandas as pd
import numpy as np
# 1. Load the data
# The file at this URL is read without a header row, so the column
# names are supplied explicitly through `names=`.
path = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
column_name = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]
data = pd.read_csv(path, names=column_name)
Sample code number | Clump Thickness | Uniformity of Cell Size | Uniformity of Cell Shape | Marginal Adhesion | Single Epithelial Cell Size | Bare Nuclei | Bland Chromatin | Normal Nucleoli | Mitoses | Class | |
0 | 1000025 | 5 | 1 | 1 | 1 | 2 | 1 | 3 | 1 | 1 | 2 |
1 | 1002945 | 5 | 4 | 4 | 5 | 7 | 10 | 3 | 2 | 1 | 2 |
2 | 1015425 | 3 | 1 | 1 | 1 | 2 | 2 | 3 | 1 | 1 | 2 |
3 | 1016277 | 6 | 8 | 8 | 1 | 3 | 4 | 3 | 7 | 1 | 2 |
4 | 1017023 | 4 | 1 | 1 | 3 | 2 | 1 | 3 | 1 | 1 | 2 |
# 2. Handle missing values
# 1) Missing entries are encoded as the string "?"; turn them into np.nan
#    so that pandas recognises them as missing.
data = data.replace(to_replace="?", value=np.nan)
# 2) Drop every sample (row) that contains a missing value. Without this
#    step the NaN rows would be passed on to the scaler and the estimator.
data = data.dropna()
Sample code number False
Clump Thickness False
Uniformity of Cell Size False
Uniformity of Cell Shape False
Marginal Adhesion False
Single Epithelial Cell Size False
Bare Nuclei False
Bland Chromatin False
Normal Nucleoli False
Mitoses False
Class False
dtype: bool
# 3. Split the dataset
from sklearn.model_selection import train_test_split
# Pick the target and the features:
#   y -> the "Class" column
#   x -> every column except the first and the last one
y = data["Class"]
x = data.iloc[:, 1:-1]
Clump Thickness | Uniformity of Cell Size | Uniformity of Cell Shape | Marginal Adhesion | Single Epithelial Cell Size | Bare Nuclei | Bland Chromatin | Normal Nucleoli | Mitoses | |
0 | 5 | 1 | 1 | 1 | 2 | 1 | 3 | 1 | 1 |
1 | 5 | 4 | 4 | 5 | 7 | 10 | 3 | 2 | 1 |
2 | 3 | 1 | 1 | 1 | 2 | 2 | 3 | 1 | 1 |
3 | 6 | 8 | 8 | 1 | 3 | 4 | 3 | 7 | 1 |
4 | 4 | 1 | 1 | 3 | 2 | 1 | 3 | 1 | 1 |
# Split features/target into train and test parts (default split settings).
x_train, x_test, y_train, y_test = train_test_split(x, y)
# 4. Feature engineering: standardization
from sklearn.preprocessing import StandardScaler
transfer = StandardScaler()
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both the training and the test split.
transfer.fit(x_train)
x_train = transfer.transform(x_train)
x_test = transfer.transform(x_test)
array([[-0.86871961, -0.70102688, -0.74295671, ..., -0.17939874,
-0.61860282, -0.36126301],
[-0.86871961, 0.63949885, -0.05321267, ..., 1.46817358,
0.68747819, -0.36126301],
[-0.51599602, 1.30976171, 1.3262754 , ..., 0.23249434,
1.66703895, -0.36126301],
[-0.16327244, -0.70102688, -0.74295671, ..., -1.00318491,
-0.61860282, -0.36126301],
[ 0.18945114, -0.70102688, -0.05321267, ..., -1.00318491,
-0.61860282, -0.36126301],
[-0.16327244, -0.70102688, -0.74295671, ..., -1.00318491,
-0.61860282, -0.36126301]])
# 5. Logistic regression estimator
from sklearn.linear_model import LogisticRegression
estimator = LogisticRegression()
# Train the model on the standardized training split. The estimator must be
# fitted before its coefficients can be read or predict()/score() called.
estimator.fit(x_train, y_train)
e:\python37\lib\site-packages\sklearn\linear_model\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='warn',
n_jobs=None, penalty='l2', random_state=None, solver='warn',
tol=0.0001, verbose=0, warm_start=False)
# 逻辑回归模型参数:回归系数和偏置
array([[1.27684228, 0.39555928, 0.71283597, 0.74425675, 0.14041766,
1.40653816, 1.04294598, 0.68242732, 0.85655016]])
# 6. Model evaluation
y_predict = estimator.predict(x_test)
# Method 1: compare the true labels with the predictions element-wise
# (True where the prediction matches the true label).
print(y_predict)
print(y_test == y_predict)
# Method 2: compute the accuracy on the test split
score = estimator.score(x_test, y_test)
print(score)
[4 4 2 4 2 4 4 2 2 2 4 2 4 2 2 2 2 2 4 4 2 4 2 4 2 2 2 2 2 2 4 4 2 2 2 2 2
4 4 2 4 2 2 2 4 2 2 2 4 2 2 2 2 4 4 2 2 4 2 2 2 2 2 2 2 2 2 4 2 4 2 2 2 4
2 2 2 4 4 4 2 4 4 2 2 4 2 2 2 2 4 2 2 4 4 2 2 2 4 2 4 4 4 2 2 2 4 2 2 4 2
4 4 4 2 2 2 2 4 2 2 2 2 2 4 2 2 2 2 4 2 2 4 2 2 2 2 4 2 2 2 2 4 2 2 4 2 2
2 4 2 2 2 4 4 2 2 2 2 4 2 2 2 4 2 4 2 4 2 4 2]
414 True
15 True
501 True
191 True
676 True
215 True
293 True
508 True
443 True
489 False
333 True
380 True
361 True
355 True
524 True
129 True
291 True
375 True
436 True
482 True
634 True
435 True
137 True
214 True
389 True
446 True
72 True
532 True
601 True
16 True
347 True
295 True
671 True
486 True
434 False
181 True
66 True
348 False
570 True
392 True
492 True
502 True
262 True
173 True
462 True
307 True
572 True
395 True
230 True
341 True
37 True
557 True
479 True
624 True
132 True
265 True
681 True
463 True
142 True
209 True
Name: Class, Length: 171, dtype: bool