Build a logistic regression model to predict whether a student will be admitted to a university.
1. Logistic Regression
# Predict each applicant's chance of admission from the scores of two exams
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
path = 'ex2data1.txt'
data = pd.read_csv(path,header=None,names=['Exam 1','Exam 2','Admitted'])
data.head()
# print(data.head())  # uncomment to inspect the first five rows
# *********************** Scatter plot to inspect the data distribution ***********************
positive = data[data['Admitted'].isin([1])]  # isin([1]) keeps the rows whose Admitted value is 1
negative = data[data['Admitted'].isin([0])]  # isin([0]) keeps the rows whose Admitted value is 0
fig, ax = plt.subplots(figsize=(12,8))  # figsize sets the figure size as (width, height) in inches
ax.scatter(positive['Exam 1'], positive['Exam 2'], s=50, c='b', marker='o', label='Admitted')  # s is the marker area, marker is the point style
ax.scatter(negative['Exam 1'], negative['Exam 2'], s=50, c='r', marker='x', label='Not Admitted')
ax.legend()  # add a legend distinguishing the two scatter series
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
plt.show()
# ************************* Define g(z) *******************************************
# g(z) squashes the hypothesis output into the interval (0, 1)
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
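For reference, the sigmoid and the resulting hypothesis are:

$$g(z) = \frac{1}{1 + e^{-z}}, \qquad h_\theta(x) = g(\theta^T x) = \frac{1}{1 + e^{-\theta^T x}}$$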
# ************************ Define the cost function; minimizing it yields θ and hence the decision boundary **********************
def cost(theta, X, y):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T)))
    return np.sum(first - second) / (len(X))
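This implements the cross-entropy cost over the m training examples:

$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\Big[y^{(i)}\log h_\theta(x^{(i)}) + \big(1-y^{(i)}\big)\log\big(1-h_\theta(x^{(i)})\big)\Big]$$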
# ********************* Define the gradient (a single gradient evaluation; the optimization itself is delegated to SciPy below) ***************************
def gradient(theta, X, y):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    parameters = int(theta.ravel().shape[1])
    grad = np.zeros(parameters)
    error = sigmoid(X * theta.T) - y
    for i in range(parameters):
        term = np.multiply(error, X[:, i])
        grad[i] = np.sum(term) / len(X)
    return grad
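Each component computes $\frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)\,x_j^{(i)}$. The loop can also be collapsed into one matrix product; the sketch below is an equivalent alternative (gradient_vec is a name introduced here, not part of the original code):

```python
# A minimal vectorized sketch of the same gradient (assumes plain NumPy arrays in, array out)
def gradient_vec(theta, X, y):
    theta = np.asarray(theta).reshape(-1)   # (n,)
    X = np.asarray(X)                       # (m, n)
    y = np.asarray(y).reshape(-1)           # (m,)
    error = sigmoid(X @ theta) - y          # (m,) residuals
    return X.T @ error / len(X)             # (n,) gradient
```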
# *************************** Data preprocessing ****************
data.insert(0, 'Ones', 1)  # add an intercept column of ones
# Initialize X, y, θ
cols = data.shape[1]  # shape[1] is the number of columns
X = data.iloc[:,:-1]  # every column except the last: the intercept plus the two exam scores
y = data.iloc[:,cols-1:cols]  # the last column: the Admitted labels
theta = np.zeros(3)
# Convert X and y to NumPy arrays
X = np.array(X.values)
y = np.array(y.values)
print(X.shape, theta.shape, y.shape)  # check that the shapes are compatible
print(cost(theta, X, y))  # cost at the initial θ = 0
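A quick sanity check on that printed value: with $\theta = 0$ every prediction is $g(0) = 0.5$, so the initial cost should be

$$J(0) = -\log(0.5) = \log 2 \approx 0.693$$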
# ****************** Minimize the cost with a library optimizer ********************
# Given the cost function and its gradient, SciPy searches for the minimizing parameters
import scipy.optimize as opt
result = opt.fmin_tnc(func=cost, x0=theta, fprime=gradient, args=(X, y))
print(result)
# (array([-25.16131872, 0.20623159, 0.20147149]), 36, 0)
print(cost(result[0], X, y))  # cost at the fitted θ
# result[0] is the array of fitted θ values that defines the decision boundary
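The decision boundary is the set of points where $h_\theta(x) = 0.5$, i.e. where $\theta^T x = 0$:

$$\theta_0 + \theta_1 x_1 + \theta_2 x_2 = 0 \quad\Longleftrightarrow\quad x_2 = -\frac{\theta_0 + \theta_1 x_1}{\theta_2}$$

which is exactly the line plotted below.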
# *************************** Plot the decision boundary (the classifier's separating line) ***************************
plotting_x1 = np.linspace(30, 100, 100)  # evenly spaced x1 values over the score range
plotting_h1 = (-result[0][0] - result[0][1] * plotting_x1) / result[0][2]
# solves θ0 + θ1·x1 + θ2·x2 = 0 for x2
fig,ax= plt.subplots(figsize=(12,8))
ax.plot(plotting_x1,plotting_h1,'y',label='prediction')
ax.scatter(positive['Exam 1'], positive['Exam 2'], s=50, c='b', marker='o', label='Admitted')
ax.scatter(negative['Exam 1'], negative['Exam 2'], s=50, c='r', marker='x', label='Not Admitted')
ax.legend()
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
plt.show()
# ****************** Compute the accuracy ******************
def hfunc1(theta, X):
    # hypothesis value for a single bias-augmented example
    return sigmoid(np.sum(X * theta))
print(hfunc1(result[0], [1, 45, 85]))  # predicted admission probability for exam scores 45 and 85
def predict(theta, X):
    probability = sigmoid(X * theta.T)
    return [1 if x >= 0.5 else 0 for x in probability]
theta_min = np.matrix(result[0])  # result[0] holds the θ values obtained from training
predictions = predict(theta_min, X)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0
           for (a, b) in zip(predictions, y)]
# print(sum(map(int, correct)))
# print(len(correct))
acc = sum(map(int, correct)) / len(correct) * 100
print('acc={0}%'.format(acc))
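As an optional cross-check (not part of the original exercise, and assuming scikit-learn is installed), the same accuracy can be approximated with scikit-learn's LogisticRegression; C is set large to weaken its built-in regularization:

```python
# Hypothetical cross-check with scikit-learn (an assumption, not the exercise's method)
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1e6)        # large C ≈ nearly unregularized logistic regression
clf.fit(X[:, 1:], y.ravel())           # drop the manual intercept column; sklearn adds its own
print(clf.score(X[:, 1:], y.ravel()))  # mean training accuracy, should be close to acc above
```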
2. Regularized Logistic Regression
Implement regularized logistic regression to predict whether microchips from a fabrication plant pass quality assurance.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from 逻辑回归 import predict  # reuse predict() from the Part 1 script (the module file named 逻辑回归.py)
# ********** Inspect the data distribution **********
path = 'ex2data2.txt'
data_init=pd.read_csv(path,header=None,names=['Test 1', 'Test 2', 'Accepted'])
data_init.head()
# print(data_init.head())
positive2 = data_init[data_init['Accepted'].isin([1])]
negative2 = data_init[data_init['Accepted'].isin([0])]
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive2['Test 1'], positive2['Test 2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative2['Test 1'], negative2['Test 2'], s=50, c='r', marker='x', label='Not Accepted')
ax.legend()
ax.set_xlabel('Test 1')
ax.set_ylabel('Test 2')
plt.show()
# ************** Create additional polynomial features **************
degree = 6
data2 = data_init.copy()  # copy so the original frame is left untouched
x1 = data2['Test 1']
x2 = data2['Test 2']
data2.insert(3, 'Ones', 1)  # intercept column
# add every monomial x1^(i-j) * x2^j up to total degree 6
for i in range(1, degree + 1):
    for j in range(0, i + 1):
        data2['F' + str(i - j) + str(j)] = np.power(x1, i - j) * np.power(x2, j)
data2.drop('Test 1', axis=1, inplace=True)
data2.drop('Test 2', axis=1, inplace=True)
data2.head()
# print(data2.head())
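The loop realizes the feature map

$$\text{mapFeature}(x_1, x_2) = \big[\,1,\; x_1,\; x_2,\; x_1^2,\; x_1 x_2,\; x_2^2,\; \dots,\; x_1 x_2^5,\; x_2^6\,\big]$$

which turns the two raw scores into 28 columns (the intercept plus 27 monomials), enough to fit a nonlinear decision boundary.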
# ************************* Define g(z) again for this script *******************************************
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
# *************** Define the regularized cost function and its gradient *************************
def costReg(theta, X, y, learningRate):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T)))
    # despite its name, learningRate is the regularization strength λ; θ0 is excluded from the penalty
    reg = (learningRate / (2 * len(X))) * np.sum(np.power(theta[:, 1:], 2))
    return np.sum(first - second) / len(X) + reg
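This is the regularized cost, with λ (the learningRate argument) penalizing all parameters except the intercept:

$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\Big[y^{(i)}\log h_\theta(x^{(i)}) + \big(1-y^{(i)}\big)\log\big(1-h_\theta(x^{(i)})\big)\Big] + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$$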
def gradientReg(theta, X, y, learningRate):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    parameters = int(theta.ravel().shape[1])
    grad = np.zeros(parameters)
    error = sigmoid(X * theta.T) - y
    for i in range(parameters):
        term = np.multiply(error, X[:, i])
        if i == 0:
            # the intercept θ0 is not regularized
            grad[i] = np.sum(term) / len(X)
        else:
            grad[i] = (np.sum(term) / len(X)) + ((learningRate / len(X)) * theta[0, i])  # theta[0, i] extracts a scalar
    return grad
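The corresponding gradient, with the intercept left unpenalized:

$$\frac{\partial J}{\partial \theta_0} = \frac{1}{m}\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)\,x_0^{(i)}, \qquad \frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)\,x_j^{(i)} + \frac{\lambda}{m}\theta_j \quad (j \ge 1)$$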
# *************************** Prepare the data arrays **********************************
cols = data2.shape[1]
X2 = data2.iloc[:,1:cols]  # all feature columns: the intercept plus the 27 monomials
y2 = data2.iloc[:,0:1]     # column 0 holds the Accepted labels
theta2 = np.zeros(cols - 1)
X2 = np.array(X2.values)
y2 = np.array(y2.values)
learningRate = 3  # regularization strength λ; different values yield different decision boundaries
# ************************** Fit with the library optimizer **************************
print(costReg(theta2, X2, y2, learningRate))  # cost at the initial θ = 0
# Use SciPy's truncated Newton (TNC) routine to find the optimal parameters
import scipy.optimize as opt
result2 = opt.fmin_tnc(func=costReg, x0=theta2, fprime=gradientReg, args=(X2, y2, learningRate))
print(result2)
theta_min = np.matrix(result2[0])
predictions = predict(theta_min, X2)
correct = [1 if a == b else 0 for (a, b) in zip(predictions, y2)]
accuracy = sum(map(int, correct)) / len(correct) * 100
print(sum(map(int, correct)))
print('accuracy = {0}%'.format(accuracy))
# ********************** Decision curve **********************
def hfunc2(theta, x1, x2):
    # evaluate θᵀ·mapFeature(x1, x2) term by term, in the same order the features were created
    temp = theta[0][0]
    place = 0
    for i in range(1, degree + 1):
        for j in range(0, i + 1):
            temp += np.power(x1, i - j) * np.power(x2, j) * theta[0][place + 1]
            place += 1
    return temp
def find_decision_boundary(theta):
    # scan a dense grid and keep the points where the hypothesis is close to 0.5
    t1 = np.linspace(-1, 1.5, 1000)
    t2 = np.linspace(-1, 1.5, 1000)
    coordinates = [(x, y) for x in t1 for y in t2]
    x_cord, y_cord = zip(*coordinates)
    h_val = pd.DataFrame({'x1': x_cord, 'x2': y_cord})
    h_val['hval'] = hfunc2(theta, h_val['x1'], h_val['x2'])
    # print(h_val)
    decision = h_val[np.abs(sigmoid(h_val['hval']) - 0.5) < 0.01]
    return decision.x1, decision.x2
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive2['Test 1'], positive2['Test 2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative2['Test 1'], negative2['Test 2'], s=50, c='r', marker='x', label='Rejected')
ax.set_xlabel('Test 1 Score')
ax.set_ylabel('Test 2 Score')
x, y = find_decision_boundary(result2)
plt.scatter(x, y, c='y', s=10, label='Prediction')
ax.legend()
plt.show()
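The grid scan above evaluates 1000 × 1000 points and keeps those near h = 0.5; an alternative sketch (an assumption, not the original post's method) draws the same boundary as the 0.5 level set of the hypothesis with matplotlib's contour:

```python
# Hedged alternative: plot the boundary as a contour line of the hypothesis
u = np.linspace(-1, 1.5, 250)
v = np.linspace(-1, 1.5, 250)
U, V = np.meshgrid(u, v)                       # 2-D evaluation grid
H = sigmoid(hfunc2(result2, U, V))             # hypothesis value at every grid point
fig, ax = plt.subplots(figsize=(12, 8))
ax.contour(U, V, H, levels=[0.5], colors='y')  # the h = 0.5 level set is the boundary
ax.scatter(positive2['Test 1'], positive2['Test 2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative2['Test 1'], negative2['Test 2'], s=50, c='r', marker='x', label='Rejected')
ax.legend()
plt.show()
```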
The course assignment documents are linked in the first blog post.
I wrote these posts to keep this material for future reference. While writing them I drew on many excellent blogs; many thanks to their authors! If anything here infringes, it will be removed immediately!
https://blog.csdn.net/qq_26402041/article/details/109156816