提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档
此篇博客主要记录使用python来完成机器学习的某些算法,例如线性回归,逻辑回归,k-mean等,本文参考了吴恩达老师的机器学习课程,详细资料可见Coursera网站机器学习课程。
提示:以下是本篇文章正文内容,下面案例可供参考
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
## 计算代价
def computeCost(X,y,theta):
m=len(X)
J=np.sum(np.power(X*theta-y,2))/(2*m)
return J
## 梯度下降
def gradientDescent(X,y,theta,alpha,iters):
m=len(X)
cost=np.zeros(iters)
for i in range(iters):
temp=((X * theta - y).T * X * alpha / m).T
theta=np.subtract(theta,temp)
cost[i]=computeCost(X,y,theta)
return theta,cost
path=r'E:\python\MyMachineLearning\exp1\ex1data1.txt'
data=pd.read_csv(path,header=None,names=['Population','Profit'])
## 插入x0
data.insert(0,'ones',1)
cols=data.shape[1]
X=data.iloc[:,:-1]
y=data.iloc[:,cols-1:cols]
## 转为矩阵便于后面的矩阵计算
X=np.matrix(X.values)
y=np.matrix(y.values)
## 初始化相应的参数
theta=np.matrix(np.array([0,0])).T
alpha=0.01
max_iters=1500
theta,cost=gradientDescent(X,y,theta,alpha,max_iters)
## 绘制曲线
x=np.linspace(data.Population.min(),data.Population.max())
y=theta[0,0]+theta[1,0]*x
figure,axis=plt.subplots()
axis.plot(x,y,'r',label='Prediction')
axis.scatter(x=data['Population'],y=data['Profit'],label='Training Data')
axis.set_xlabel('Population')
axis.set_ylabel('Profit')
plt.show()
非常类似于上部分的代码,只是样本的特征数量增加了。
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
def computeCost(X,y,theta):
m=len(X)
J=np.sum(np.power(X*theta-y,2))/(2*m)
return J
def gradientDescent(X,y,theta,alpha,iters):
m=len(X)
cost=np.zeros(iters)
for i in range(iters):
temp=(X*theta-y).T*X*alpha/m
theta=np.subtract(theta,temp.T)
cost[i]=computeCost(X,y,theta)
return theta,cost
file=r'E:\python\MyMachineLearning\exp1\ex1data2.txt'
data=pd.read_csv(file,names=["size",'bedroom','price'],header=None)
# 归一化
data=(data-data.mean())/data.std()
data.insert(0,'ones',1)
cols=data.shape[1]
X=data.iloc[:,:-1]
y=data.iloc[:,cols-1:cols]
X=np.matrix(X.values)
y=np.matrix(y.values)
theta=np.matrix(np.array([0,0,0])).T
alpha=0.01
iters=1500
theta,cost=gradientDescent(X,y,theta,alpha,iters)
figure,axis=plt.subplots()
x=range(1,1501)
y=cost
plt.plot(x,y,color='red')
plt.show()
(X.T*X)-1 X.Ty
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
def normalEquation(X,y):
##计算逆矩阵
temp=np.linalg.inv(X.T*X)
theta=temp*X.T*y
return theta
def computeCost(X,y,theta):
m=len(X)
J=np.sum(np.power(X*theta-y,2))/(2*m)
return J
file=r'E:\python\MyMachineLearning\exp1\ex1data2.txt'
data=pd.read_csv(file,names=['Size','Bedroom','Price'])
data=(data-data.mean())/data.std()
data.insert(0,'ones',1)
cols=data.shape[1]
X=data.iloc[:,:-1]
y=data.iloc[:,cols-1:cols]
X=np.matrix(X.values)
y=np.matrix(y.values)
theta=normalEquation(X,y)
cost=computeCost(X,y,theta)
print(cost)
import pandas as pd
import matplotlib.pyplot as plt
file=r'E:\python\MyMachineLearning\exp2\ex2data1.txt'
data=pd.read_csv(file,names=['exam1','exam2','Accepted'])
positive=data[data['Accepted'].isin([1])]
negative=data[data['Accepted'].isin([0])]
fig,ax=plt.subplots()
ax.scatter(positive['exam1'],positive['exam2'],c='b',marker='o',label='Accepted')
ax.scatter(negative['exam1'],negative['exam2'],c='r',marker='x',label='unAccepted')
ax.set_xlabel('exam1Score')
ax.set_ylabel('exam2Score')
plt.show()
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.io import loadmat
def sigmoid(z):
return 1/(1+np.exp(-z))
def computeCost(theta,X,y,lambda_i):
X=np.matrix(X)
y=np.matrix(y)
theta=np.matrix(theta)
m=len(X)
error=(-y.T)*np.log(sigmoid(X*theta.T))-(1-y.T)*np.log(1-sigmoid(X*theta.T))
reg=np.sum(np.power(theta[:,1:theta.shape[1]],2))*lambda_i/(2*m)
return error/m+reg
def gradient(theta,X,y,lambda_i):
X=np.matrix(X)
y=np.matrix(y)
theta=np.matrix(theta)
m=len(X)
error=sigmoid(X*theta.T)-y
grad=((X.T*error)/m).T+(lambda_i*theta)/m
grad[0,0]=np.sum(error.T*X[:,0])/m
return np.array(grad).ravel()
# def gradient(theta, X, y, lambda_i):
# theta = np.matrix(theta)
# X = np.matrix(X)
# y = np.matrix(y)
#
# parameters = int(theta.ravel().shape[1])
# error = sigmoid(X * theta.T) - y
#
# grad = ((X.T * error) / len(X)).T + ((lambda_i / len(X)) * theta)
#
# # intercept gradient is not regularized
# grad[0, 0] = np.sum(np.multiply(error, X[:, 0])) / len(X)
#
# return np.array(grad).ravel()
def one_vs_all(X,y,num_labels,lambda_i):
rows=X.shape[0]
params=X.shape[1]
all_theta=np.zeros((num_labels,params+1))
X=np.insert(X,0,values=np.ones(rows),axis=1)
for i in range(1,num_labels+1):
theta=np.zeros(params+1)
y_i=np.array([1 if label==i else 0 for label in y])
y_i=np.reshape(y_i,(rows,1))
fmin=minimize(fun=computeCost,x0=theta,args=(X,y_i,lambda_i),method='TNC',jac=gradient)
all_theta[i-1,:]=fmin.x
return all_theta
file=r'E:\python\MyMachineLearning\exp3\ex3data1.mat'
data=loadmat(file)
X=data['X']
y=data['y']
all_theta=one_vs_all(data['X'],data['y'],10,1)
print(all_theta)
import numpy as np
from scipy.io import loadmat
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
file=r'E:\python\MyMachineLearning\exp4\ex4data1.mat'
data=loadmat(file)
X=data['X']
y=data['y']
sample_idx=np.random.choice(np.arange(X.shape[0]),100)
sample_img=data['X'][sample_idx,:]
fig,ax=plt.subplots(nrows=10,ncols=10,sharey=True,sharex=True)
for r in range(10):
for c in range(10):
ax[r,c].matshow(np.array(sample_img[10*r+c].reshape((20,20))).T,cmap=matplotlib.cm.binary) ## 显示灰度图像
## 自动调整x,y坐标刻度
plt.xticks(np.array([]))
plt.yticks(np.array([]))
plt.show()
这里的神经网络的参数为theta 包含了w和b
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from scipy.io import loadmat
from sklearn.preprocessing import OneHotEncoder
from scipy.optimize import minimize
file=r'E:\python\MyMachineLearning\exp4\ex4data1.mat'
data=loadmat(file)
X=data['X']
y=data['y']
weight = loadmat("E:\python\MyMachineLearning\exp4\ex4weights.mat")
theta1, theta2 = weight['Theta1'], weight['Theta2']
def sigmoid(z):
return 1/(1+np.exp(-z))
##前向传播
def forward_propagate(X,theta1,theta2):
m=X.shape[0]
a1=np.insert(X,0,values=np.ones(m),axis=1)
z2=a1*theta1.T
a2=np.insert(sigmoid(z2),0,np.ones(m),axis=1)
z3=a2*theta2.T
h=sigmoid(z3)
return a1,z2,a2,z3,h
def computeCost(theta1,theta2,input_size,hidden_size,num_labels,X,y,lambda_i):
m=X.shape[0]
X=np.matrix(X)
y=np.matrix(y)
## 正向传播
a1,z2,a2,z3,h=forward_propagate(X,theta1,theta2)
## 对应的神经元相乘
error=np.multiply(-y,np.log(h))-np.multiply(1-y,np.log(1-h))
reg=(lambda_i*1.0/(2*m))*(np.sum(np.power(theta1[:,1:],2))+np.sum(np.power(theta2[:,1:],2)))
return np.sum(error)/m+reg
def sigmoid_gradient(z):
return np.multiply(sigmoid(z),(1-sigmoid(z)))
## 反向传播
def backprop(params,input_size,hidden_size,num_labels,X,y):
m=X.shape[0]
X=np.matrix(X)
y=np.matrix(y)
## 参数reshape
theta1 = np.matrix(np.reshape(params[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
theta2 = np.matrix(np.reshape(params[hidden_size * (input_size + 1):],(num_labels,(hidden_size+1))))
a1,z2,a2,z3,h=forward_propagate(X,theta1,theta2)
J=0
delta1=np.zeros(theta1.shape)
delta2=np.zeros(theta2.shape)
error=np.multiply((-y),np.log(h))-np.multiply((1-y),np.log(1-h))
J=np.sum(error)/m
## 反向传播
delta3=h-y
delta2=delta3*theta2
delta2=delta2[:,1:]
delta2=np.multiply(delta2,sigmoid_gradient(z2))
Delta1 = np.zeros(theta1.shape)
Delta2 = np.zeros(theta2.shape)
Delta2+=delta3.T*a2
Delta1+=delta2.T*a1
Delta2=Delta2/m
Delta1=Delta1/m
return J,Delta1,Delta2
def backpropReg(params,input_size,hidden_size,num_labels,X,y,lambda_i):
m=X.shape[0]
X=np.matrix(X)
y=np.matrix(y)
theta1=np.matrix(np.reshape(params[:(hidden_size)*(input_size+1)],(hidden_size,(input_size+1))))
theta2=np.matrix(np.reshape(params[(hidden_size)*(input_size+1):],(num_labels,(hidden_size+1))))
a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2)
J = 0
delta1 = np.zeros(theta1.shape)
delta2 = np.zeros(theta2.shape)
error = np.multiply((-y), np.log(h)) - np.multiply((1 - y), np.log(1 - h))
reg = (lambda_i * 1.0 / (2 * m)) * (np.sum(np.power(theta1[:, 1:], 2)) + np.sum(np.power(theta2[:, 1:], 2)))
J = np.sum(error) / m+reg
delta3=h-y
delta2=delta3*theta2
delta2=delta2[:,1:]
delta2=np.multiply(delta2,sigmoid_gradient(z2))
Delta1 = np.zeros(theta1.shape)
Delta2 = np.zeros(theta2.shape)
Delta2 += delta3.T * a2
Delta1 += delta2.T * a1
Delta2 = Delta2 / m
Delta1 = Delta1 / m
Delta1[:, 1:] = Delta1[:, 1:] + (theta1[:, 1:] * lambda_i) / m
Delta2[:, 1:] = Delta2[:, 1:] + (theta2[:, 1:] * lambda_i) / m
## 将俩个矩阵降维一维
grad = np.concatenate((np.ravel(Delta1), np.ravel(Delta2)))
return J,grad
## 自动转成矩阵形式
encoder=OneHotEncoder(sparse=False)
y_onehot=encoder.fit_transform(y)
input_size = 400
hidden_size = 25
num_labels = 10
lambda_i=1
## 随机初始theta 减去0.5保证均值为0
params = (np.random.random(size=hidden_size * (input_size + 1) + num_labels * (hidden_size + 1)) - 0.5) * 0.24
## 或者对于theta1 theta2 分别进行初始化
# theta1=np.random.random(size=hidden_size * (input_size + 1))*(2*0.12)-0.12
fmin=minimize(fun=backpropReg,x0=(params),args=(input_size,hidden_size,num_labels,X,y_onehot,lambda_i),method='TNC',jac=True,options={'maxiter':250})
print(fmin)
本代码中使用的算法为线性回归
file=r'E:\python\MyMachineLearning\exp5\ex5data1.mat'
data=sio.loadmat(file)
X=data['X']
y=data['y']
X, y, Xval, yval, Xtest, ytest = map(np.ravel,[data['X'], data['y'], data['Xval'], data['yval'], data['Xtest'], data['ytest']])
X, Xval, Xtest = [np.insert(x.reshape(x.shape[0], 1), 0, np.ones(x.shape[0]), axis=1) for x in (X, Xval, Xtest)]
theta=np.ones(X.shape[1])
lambda_i=1
training_cost, cv_cost = [], []
## 样本总数
m=X.shape[0]
## 考虑随着样本数的增加,对应的误差(训练误差和交叉验证误差)
for i in range(1,m+1):
res = linear_regression(X[:i, :], y[:i], 0)
tc = computeCostReg(res.x, X[:i, :], y[:i], 0)
cv = computeCostReg(res.x, Xval, yval, 0)
training_cost.append(tc)
cv_cost.append(cv)
## 寻找最佳的lambda_i
## 一般按照0.003增长
l_candidate=[0,0.001,0.003,0.01,0.03,0.1,0.3,1.3,10]
for l in l_candidate:
res = linear_regression(X, y, l)
tc = computeCost(res.x, X, y)
cv = computeCost(res.x, Xval, yval)
training_cost.append(tc)
cv_cost.append(cv)
## 绘制对应的图像
fig, ax = plt.subplots()
ax.plot(l_candidate, training_cost, label='training')
ax.plot(l_candidate, cv_cost, label='cross validation')
## 添加对应的标签
plt.legend()
plt.xlabel('lambda')
plt.ylabel('cost')
plt.show()
## 在测试集上机选误差,来进行评估
for l in l_candidate:
theta = linear_regression(X, y, l).x
print('test cost(l={}) = {}'.format(l, computeCost(theta, Xtest, ytest)))
这里用的是sklearn 包中的svm。
import numpy as np
import pandas as pd
import scipy.io as sio
from sklearn.metrics import classification_report
from sklearn import svm
import matplotlib.pyplot as plt
def decision_boundary(svc,X1min,X1max,X2min,X2max,diff):
x1=np.linspace(X1min,X1max,1000)
x2=np.linspace(X2min,X2max,1000)
cordinates=[(x,y) for x in x1 for y in x2]
x_cord,y_cord=zip(*cordinates)
c_val=pd.DataFrame({'x1':x_cord,'x2':y_cord})
## 计算与决策边界的距离
c_val['cval']=svc.decision_function(c_val[['x1','x2']])
##取出位于样本分界线的点
decision=c_val[np.abs(c_val['cval'])<diff]
return decision.x1,decision.x2
file=r'E:\python\MyMachineLearning\exp6\ex6data1.mat'
init_data=sio.loadmat(file)
X=init_data['X']
data=pd.DataFrame(X,columns=['X1','X2'])
data['y']=init_data['y']
## 参数c = 1/lambda
## 线性支持向量机 loss hinge,squared_hinge ,对于不同的参数c会有不同的决策边界
svc=svm.LinearSVC(C=1,loss='hinge',max_iter=1000)
svc.fit(data[['X1','X2']],data['y'])
cost=svc.score(data[['X1','X2']],data['y'])
x1, x2 = decision_boundary(svc, 0, 4, 1.5, 5, 2 * 10**-3)
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(x1, x2, s=10, c='r',label='Boundary')
positive=data[data['y'].isin([1])]
negative=data[data['y'].isin([0])]
ax.scatter(positive['X1'],positive['X2'],c='b',marker='o')
ax.scatter(negative['X1'],negative['X2'],c='r',marker='x')
ax.set_title('SVM (C=1) Decision Boundary')
ax.legend()
plt.show()
这里用的是高斯核函数
高斯核函数如下
## 高斯核函数
def gaussian_kernel(x1,x2,sigma):
return np.exp(-np.sum((x1-x2)**2)/(2*(sigma**2)))
训练代码如下
file=r'E:\python\MyMachineLearning\exp6\ex6data2.mat'
init_data=sio.loadmat(file)
X=init_data['X']
data=pd.DataFrame(X,columns=['X1','X2'])
data['y']=init_data['y']
## 使用内置的高斯内核进行数据拟合
svc=svm.SVC(C=100,gamma=10,probability=True)
svc.fit(data[['X1','X2']],data['y'])
res=svc.score(data[['X1','X2']],data['y'])
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import scipy.io as sio
## 计算每个点最近的中心
def find_closet_centroids(X,centroids):
m=X.shape[0]
k=centroids.shape[0]
idx=np.zeros(m)
for i in range(m):
min_dist=1000000
for j in range(k):
## 计算点到所有中心的距离
dist=np.sum((X[i,:]-centroids[j,:])**2)
if(dist < min_dist):
min_dist=dist
idx[i]=j
return idx
#计算聚类中心
def compute_centroids(X,idx,k):
m,n=X.shape
centroids=np.zeros((k,n))
for i in range(k):
## 获取对应中心的所有样本
indices=np.where(idx==i)
centroids[i,:]=(np.sum(X[indices,:],axis=1)/len(indices[0])).ravel()
return centroids
## K_mean 算法
def run_k_means(X,initial_centroids,max_iters):
m,n=X.shape
k=initial_centroids.shape[0]
idx=np.zeros(m)
centroid=initial_centroids
for i in range(max_iters):
idx=find_closet_centroids(X,centroid)
centroid=compute_centroids(X,idx,k)
return idx,centroid
## 随机初始化k个聚类中心
def init_centroids(X,k):
m,n=X.shape
centroids=np.zeros((k,n))
## 随机初始化 0-m 的k个数字
idx=np.random.randint(0,m,k)
for i in range(k):
centroids[i,:]=X[idx[i],:]
return centroids
file=r'E:\python\MyMachineLearning\exp7\ex7data2.mat'
init_data=sio.loadmat(file)
X=init_data['X']
## 随机初始化聚类中心
initial_centroids = init_centroids(X,3)
idx, centroids = run_k_means(X, initial_centroids, 10)
## 位于不同中心的点
cluster1=X[np.where(idx==0)[0],:]
cluster2=X[np.where(idx==1)[0],:]
cluster3=X[np.where(idx==2)[0],:]
fig,ax=plt.subplots()
ax.scatter(cluster1[:,0],cluster1[:,1],c='b',label='Cluster 1')
ax.scatter(cluster2[:,0],cluster2[:,1],c='r',label='Cluster 2')
ax.scatter(cluster3[:,0],cluster3[:,1],c='g',label='Cluster 3')
ax.legend()
plt.show()
同样,我们也可以使用sklearn自带的KMeans来完成训练
## n_init 表示随机初始化聚类中心的次数 n_job表示并行的cpu数,-1表示使用所有cpu
model=KMeans(n_clusters=16,n_init=100,n_jobs=-1)
model.fit(data)
#获取聚簇中心
centroids = model.cluster_centers_
## #获取每个数据点的对应聚簇中心的索引
C = model.predict(data)
import numpy as np
import pandas as pd
import scipy.io as sio
from IPython.display import Image
import matplotlib.pyplot as plt
def computeMinDist(X,centroids):
m = X.shape[0]
k = centroids.shape[0]
idx = np.zeros(m)
for i in range(m):
min_dist = 1000000
for j in range(k):
## 计算点到所有中心的距离
dist = np.sum((X[i, :] - centroids[j, :]) ** 2)
if (dist < min_dist):
min_dist = dist
idx[i] = j
return idx
def compute_centroids(X,idx,k):
m, n = X.shape
centroids = np.zeros((k, n))
for i in range(k):
## 获取对应中心的所有样本
indices = np.where(idx == i)
centroids[i, :] = (np.sum(X[indices, :], axis=1) / len(indices[0])).ravel()
return centroids
def init_centroids(X,k):
m,n=X.shape
idx=np.random.randint(0,m,k)
centroids=np.zeros((k,n))
for i in range(k):
centroids[i,:]=X[idx[i],:]
return centroids
#k_mean 算法
def k_mean(X,initial_centroids,max_iters):
m, n = X.shape
k = initial_centroids.shape[0]
idx = np.zeros(m)
centroid = initial_centroids
for i in range(max_iters):
idx = computeMinDist(X, centroid)
centroid = compute_centroids(X, idx, k)
return idx, centroid
file=r'E:\python\MyMachineLearning\exp7\bird_small.png'
imgFile=r'E:\python\MyMachineLearning\exp7\bird_small.mat'
image_data=sio.loadmat(imgFile)
A=image_data['A']
## normalize
A=A/255
X=np.reshape(A,(A.shape[0]*A.shape[1],A.shape[2]))
initial_centroids=init_centroids(X,16)
idx, centroids = k_mean(X, initial_centroids, 10)
idx = computeMinDist(X, centroids)
## 将每个像素映射到聚类中心的值
X_recovered = centroids[idx.astype(int),:]
## 重新调整维度
X_recovered = np.reshape(X_recovered, (A.shape[0], A.shape[1], A.shape[2]))
plt.imshow(X_recovered)
plt.show()
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
def pca(X):
## standardization
m = X.shape[0]
X = (X - X.mean(axis=0)) / X.std(axis=0)
X=np.matrix(X)
## 计算协方差矩阵
cov=X.T*X/m
## 奇异值分解
U,S,V=np.linalg.svd(cov)
return U,S,V
## 选择k个降维后的数据
def project_data(X,U,k):
U_reduced=U[:,:k]
return X*U_reduced
## 恢复降维的数据
def recover_data(Z,U,k):
U_reduced = U[:, :k]
return Z*U_reduced.T
## 选择合适的k值
def choose_K(X,lambda_i):
U,S,V=pca(X)
res=1000
all_sum=sum(S)
for i in range(1,100):
temp=sum(S[:i])
if(temp/all_sum>=lambda_i):
res=i
break
return res
## pca 可用于数据的降维
file=r'E:\python\MyMachineLearning\exp7\ex7data1.mat'
data=sio.loadmat(file)
X=data['X']
## 数据可视化
# fig,ax=plt.subplots()
# ax.scatter(X[:,0],X[:,1],c='r')
# plt.show()
U, S, V = pca(X)
Z = project_data(X, U, 1)
w=recover_data(Z,U,1)
##第一主成分的投影轴基本上是数据集中的对角线。 当我们将数据减少到一个维度时,
# 我们失去了该对角线周围的变化,所以在我们的再现中,一切都沿着该对角线。
import numpy as np
import pandas as pd
import scipy.io as sio
import matplotlib.pyplot as plt
def pca(X):
##normalize
X=(X-X.mean())/X.std()
X=np.matrix(X)
m=X.shape[0]
cov=X.T*X/m
U,S,V=np.linalg.svd(cov)
return U,S,V
## 数据压缩
def project_data(X,U,k):
U_reduced=U[:,:k]
return X*U_reduced.T
## 数据恢复
def recover_data(X,U,k):
U_rediced=U[:,:k]
return z*U_rediced.T
## 显示前n张图像
def plod_n_image(X,n):
pic_size=int(np.sqrt(X.shape[1]))
grid_size=int(np.sqrt(n))
face_image=X[:n,:]
fig,ax=plt.subplots(nrows=grid_size,ncols=grid_size,sharex=True,sharey=True)
for r in range(grid_size):
for c in range(grid_size):
ax[r,c].imshow(face_image[grid_size*r+c].reshape((pic_size,pic_size)))
plt.xticks(np.array([]))
plt.yticks(np.array([]))
plt.show()
file=r'E:\python\MyMachineLearning\exp7\ex7faces.mat'
face_data=sio.loadmat(file)
X=face_data['X']
U,S,V=pca(X)
z=project_data(X,U,100)
这里使用scipy库中的内置函数norm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.io as sio
from scipy import stats
import sklearn.preprocessing
## 计算每个特征的方差和均值
def compute_gaussian(X):
## axis=0 表示对列进行平均
mu=X.mean(axis=0)
sigma=X.var(axis=0)
return mu,sigma
## 寻找好的阈值
def select_threshold(pval,yval):
best_epsilon=0
best_f1=0
f1=0
## 步长
step=(pval.max()-pval.min())/1000
for epsilon in np.arange(pval.min(),pval.max(),step):
preds=pval<epsilon
tp=np.sum(np.logical_and(preds==1,yval==1))
fp = np.sum(np.logical_and(preds == 1, yval == 0)).astype(float)
fn = np.sum(np.logical_and(preds == 0, yval == 1)).astype(float)
## 计算准确率和召回率 以及 F1
precision=tp/(tp+fp)
recall=tp/(tp+fn)
f1=(2*precision*recall)/(precision+recall)
if(f1>best_f1):
best_f1=f1
best_epsilon=epsilon
return best_epsilon,best_f1
file=r'E:\python\MyMachineLearning\exp8\ex8data1.mat'
data=sio.loadmat(file)
X=data['X']
# 数据可视化
# fig,ax=plt.subplots()
# ax.scatter(X[:,0],X[:,1])
# plt.show()
mu,sigma=compute_gaussian(X)
## 绘制高斯分布的图形
# fig,ax=plt.subplots()
# xplot=np.linspace(0,25,100)
# yplot=np.linspace(0.25,100)
# ## 生成网格,返回x坐标和y坐标
# Xplot,Yplot=np.meshgrid(xplot,yplot)
# #计算 对应的值
# Z = np.exp((-0.5)*((Xplot-mu[0])**2/sigma[0]+(Yplot-mu[1])**2/sigma[1]))
# ## 绘制等值图
# contour = plt.contour(Xplot, Yplot, Z,[10**-11, 10**-7, 10**-5, 10**-3, 0.1],colors='k')
# ax.scatter(X[:,0], X[:,1])
# plt.show()
## 通过验证集来确定 阈值(用于判断异常点)
Xval=data['Xval']
yval=data['yval']
## scipy 内置函数,用于给出对应点正态分布概率
# dist=stats.norm(mu[0],sigma[0])
# dist.pdf(X[:,0])
p=np.zeros((Xval.shape[0],Xval.shape[1]))
p[:,0] = stats.norm(mu[0], sigma[0]).pdf(X[:,0])
p[:,1] = stats.norm(mu[1], sigma[1]).pdf(X[:,1])
pval = np.zeros((Xval.shape[0], Xval.shape[1]))
pval[:,0] = stats.norm(mu[0], sigma[0]).pdf(Xval[:,0])
pval[:,1] = stats.norm(mu[1], sigma[1]).pdf(Xval[:,1])
##可视化分类结果
# epsilon, f1 = select_threshold(pval, yval)
# outliers = np.where(p < epsilon)
# fig,ax=plt.subplots()
# ax.scatter(X[:,0],X[:,1])
# ax.scatter(outliers[:,0],outliers[:,1],c='r',marker='x')
# plt.show()
对于多元高斯分布模型的创建如下
## 多元高斯分布
mu=X.mean(axis=0)
cov=np.cov(X.T)
## 创建多元高斯分布模型
multi_normal = stats.multivariate_normal(mu, cov)