逻辑回归——kaggle糖尿病预测实战
【实验所需数据私聊可发】
1、糖尿病是一组以高血糖为特征的代谢性疾病,高血糖是由胰岛素分泌缺陷或其生物作用受损引起的。长期存在的高血糖,会导致身体各种组织,特别是眼、肾、心脏、血管、神经的慢性损害和功能障碍。
2、通过2小时血浆葡萄糖浓度、2小时血清胰岛素、身体质量指数等特征来预测某个人是否罹患糖尿病,在众多的因素中,找到最能导致该病的关键特征。
1.1#数据导入
import warnings

import numpy as np
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Read the comma-separated file, skipping its first row, as floating-point values.
# The builtin `float` replaces the `np.float` alias, which NumPy 1.24 removed.
data = np.loadtxt(
    r"D:\pima-indians-diabetes.data.csv",
    delimiter=",",
    skiprows=1,
    dtype=float,
)
data
运行结果:
array([[ 6. , 148. , 72. , ..., 0.627, 50. , 1. ],
[ 1. , 85. , 66. , ..., 0.351, 31. , 0. ],
[ 8. , 183. , 64. , ..., 0.672, 32. , 1. ],
...,
[ 5. , 121. , 72. , ..., 0.245, 30. , 0. ],
[ 1. , 126. , 60. , ..., 0.349, 47. , 1. ],
[ 1. , 93. , 70. , ..., 0.315, 23. , 0. ]])
1.2#分离特征变量和分类变量
# Every column except the last is a feature; the last column is the class label.
X, y = data[:, :-1], data[:, -1]
1.3#特征标准化
# Z-score each feature column: subtract the column mean, divide by the column
# standard deviation.
mu, std = X.mean(axis=0), X.std(axis=0)
X = (X - mu) / std
1.4#添加全1列
# Append a column of ones on the right so the last entry of theta acts as the
# intercept term.
x_ones = np.ones((len(X), 1))
X = np.concatenate((X, x_ones), axis=1)
1.5#拆分数据
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=8,
)
1.6#将因变量转为列向量
# Turn both label vectors into single-column 2-D arrays.
y_train = np.reshape(y_train, (-1, 1))
y_test = np.reshape(y_test, (-1, 1))
print(y_train.shape, y_test.shape)
结果:
(537, 1) (231, 1)
1.7#初始化theta值
# One weight per column of X_train (the appended ones column included),
# every weight starting at 1.
theta = np.ones((X_train.shape[1], 1))
theta
结果:
array([[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.]])
1.8#设置步长值
alpha = 1e-3  # step size applied to every gradient update
1.9#定义sigmoid函数
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)) of *z*.

    *z* may be a scalar or a NumPy array; arrays are transformed element-wise
    and the result has the same shape as the input.
    """
    s = 1.0 / (1 + np.exp(-z))
    return s
num_iters = 10000  # number of gradient-descent updates to run
# NOTE(review): the gradient is divided by a fixed 200 rather than by the number
# of training rows (the shape printed earlier shows 537), so this constant
# acts as an extra scale on the step size. It is left unchanged so the theta
# printed after the loop stays reproducible -- confirm whether
# X_train.shape[0] was intended.
m = 200
for i in range(num_iters):
    # Predicted probabilities under the current parameters.
    h = sigmoid(np.dot(X_train, theta))
    # Move theta against X^T (h - y), scaled by alpha / m.
    theta = theta - alpha * np.dot(X_train.T, (h - y_train)) / m
# Printed once, after the final update.
print(theta)
结果为:
[[ 0.39210287]
[ 1.10657783]
[-0.24092243]
[ 0.0223229 ]
[-0.17137676]
[ 0.61819121]
[ 0.45880179]
[ 0.12971106]
[-0.84498429]]
1.10#预测
# Predicted probability for each test row under the trained theta.
pred_y = sigmoid(X_test @ theta)
1.11#预测结果二值化
# Threshold the probabilities in place: above 0.5 becomes 1, otherwise 0.
# Both masks are taken before any value is overwritten.
above = pred_y > 0.5
at_or_below = pred_y <= 0.5
pred_y[above] = 1
pred_y[at_or_below] = 0
print(pred_y.reshape(1, -1))
结果为:
[[0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1.
0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0.
0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 1.
0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.
1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0.
0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0.
0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0.]]
# Show the true test labels as one row, for comparison with the predictions.
print(np.reshape(y_test, (1, -1)))
结果为:
[[0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.
0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0.
0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1.
0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.
1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1.
0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0.
1. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1.
1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0.
0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1.]]
1.12#预测准确率:
# Accuracy: number of predictions equal to the true label, over the test-set size.
n_correct = np.sum(pred_y == y_test)
print("预测准确率为:", n_correct / len(y_test))
结果为:
预测准确率为: 0.7878787878787878
【sklearn 实现逻辑回归】:
2.1#导入数据
# Read the comma-separated file, skipping its first row, as floating-point values.
# The keyword is `delimiter` (the original misspelling raised a TypeError), and
# the builtin `float` replaces the `np.float` alias, which NumPy 1.24 removed.
data = np.loadtxt(
    r"D:\pima-indians-diabetes.data.csv",
    delimiter=",",
    skiprows=1,
    dtype=float,
)
2.2#分离特征变量和分类变量
# The last column is the class label; everything before it is a feature.
y = data[:, -1]
X = data[:, :-1]
2.3#特征标准化
# Column-wise z-score: centre each feature on its mean and scale by its
# standard deviation.
mu = np.mean(X, axis=0)
std = np.std(X, axis=0)
X = (X - mu) / std
2.4#拆分训练集和测试集
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
# The call's closing parenthesis was missing in the original.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=8,
)
from sklearn.linear_model import LogisticRegression

# Create the classifier with its default settings and fit it on the training
# split; fit() returns the estimator itself, so the two steps chain.
logist = LogisticRegression().fit(X_train, y_train)
# Predict a class label for every row of the test split.
y_predict = logist.predict(X_test)
print(y_predict)
结果为:
[0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1.
0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0.
0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 1.
0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0.
0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0.
0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0.]
# Accuracy: share of test labels the model predicted correctly.
hits = np.sum(y_predict == y_test)
print("准确率:", hits / len(y_test))
结果为:
准确率: 0.7792207792207793