import numpy as np
from sklearn import svm
from sklearn.linear_model import LogisticRegression

my_matrix = np.loadtxt("E:\\pima-indians-diabetes.txt", delimiter=",", skiprows=0)
length_x = len(my_matrix[0])
data_y = my_matrix[:, length_x - 1]
data_x = my_matrix[:, 0:length_x - 1]
print(data_x[0:2], len(data_x[0]), len(data_x))
data_shape = data_x.shape
data_rows = data_shape[0]
data_cols = data_shape[1]
data_col_max = data_x.max(axis=0)  # column-wise maximum of the 2-D array
data_col_min = data_x.min(axis=0)  # column-wise minimum of the 2-D array
for i in range(0, data_rows, 1):  # min-max normalize the input features
    for j in range(0, data_cols, 1):
        data_x[i][j] = \
            (data_x[i][j] - data_col_min[j]) / \
            (data_col_max[j] - data_col_min[j])
print(data_x[0:2])
[[  6.    148.     72.     35.      0.     33.6     0.627  50.   ]
 [  1.     85.     66.     29.      0.     26.6     0.351  31.   ]] 8 768
[[0.35294118 0.74371859 0.59016393 0.35353535 0.         0.50074516
  0.23441503 0.48333333]
 [0.05882353 0.42713568 0.54098361 0.29292929 0.         0.39642325
  0.11656704 0.16666667]]
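The element-wise loop above can also be written as a single vectorized expression, or delegated to scikit-learn's MinMaxScaler, which applies the same (x - min) / (max - min) scaling column by column. A minimal sketch of both equivalents (reusing the variable names from the code above):

# Vectorized equivalent of the normalization loop
data_x = (data_x - data_col_min) / (data_col_max - data_col_min)

# Or with scikit-learn's MinMaxScaler, which computes the same per-column scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data_x = scaler.fit_transform(data_x)

Either form replaces the double loop; they should not be run in addition to it.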
n_train = int(len(data_y) * 0.7)  # first 70% for training, last 30% for testing (see the split note below)
X_train = data_x[:n_train]
y_train = data_y[:n_train]
X_test = data_x[n_train:]
y_test = data_y[n_train:]
clf1 = svm.SVC()  # model 1: the classic SVM classifier
clf1.fit(X_train, y_train)
clf2 = LogisticRegression()  # model 2: logistic regression
clf2.fit(X_train, y_train)
y_predictions1 = clf1.predict(X_test)
y_predictions2 = clf2.predict(X_test)
print(y_predictions1)
print(y_predictions2)
[ 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1.
0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0.
0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.
1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0.
1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0.
0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1.
0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0.
0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1.
1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0.
1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0.
0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0.
1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0.]
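Both prediction vectors above come from a split that simply takes the first 70% of rows in file order, which assumes the rows are not sorted by label. A shuffled, stratified split is the safer default; a minimal sketch with scikit-learn's train_test_split (random_state is an arbitrary choice for reproducibility):

from sklearn.model_selection import train_test_split

# Shuffled 70/30 split; stratify keeps the class ratio equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    data_x, data_y, test_size=0.3, random_state=0, stratify=data_y)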
k, h = 0, 0
for i in range(len(y_test)):  # count correct predictions for each model
    if y_predictions1[i] == y_test[i]:
        k += 1
for i in range(len(y_test)):
    if y_predictions2[i] == y_test[i]:
        h += 1
print(k, h)
177 181
accuracy_svm = float(k) / float(len(y_test))
accuracy_LogR = float(h) / float(len(y_test))
print("The accuracy of SVM is %f, and the accuracy of LogisticRegression is %f" % (accuracy_svm, accuracy_LogR))
The accuracy of SVM is 0.766234, and the accuracy of LogisticRegression is 0.783550
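The manual counting loop computes the same number as scikit-learn's built-in accuracy_score metric, which reduces it to a one-liner:

from sklearn.metrics import accuracy_score

# Fraction of test samples where the prediction equals the true label
print(accuracy_score(y_test, y_predictions1))  # SVM
print(accuracy_score(y_test, y_predictions2))  # logistic regression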
Judging by the actual prediction results, normalizing the data improved the SVM model's accuracy to roughly the level of the logistic regression model; the next step will be to improve the predictions further through hyperparameter selection for the SVM model.
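As a preview of that hyperparameter search, here is a minimal sketch with scikit-learn's GridSearchCV (the C and gamma grids below are illustrative assumptions, not values tuned for this dataset):

from sklearn.model_selection import GridSearchCV

# Illustrative search grid over the two main SVC hyperparameters
param_grid = {"C": [0.1, 1, 10, 100], "gamma": [0.001, 0.01, 0.1, 1]}
grid = GridSearchCV(svm.SVC(), param_grid, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)

The best estimator found by the search can then be evaluated on the held-out test set in the same way as clf1 above.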