在本练习中,我们将使用高斯模型实现异常检查算法,并将其以用于检查网络上的故障服务器。我们还将看到如何协作过滤构建推荐系统,并将其应用于电影推荐数据集。
我们的第一个任务是使用高斯模型来检测数据集中未标记的示例是否应被歧视为异常。我们有一个简单的二维数据集开始,以帮助可视化该算法正在做什么。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.io import loadmat
data = loadmat('ex8data1.mat')
X = data['X']
X.shape
(307, 2)
fig, ax = plt.subplots(figsize = (12, 8))
ax.scatter(X[:,0], X[:,1])
plt.show()
这是一个非常紧密的聚类,几个值远离了聚类。在这个简单的例子中,这些可以被认为是异常的。为了弄清楚,我们正在为数据中的每个特征估计高斯发布。为此,我们将创建一个返回每个要素的均值和方差的函数。
def estimate_gaussian(X):
# INPUT:数据X
# OUTPUT:数据的均值和方差
# TODO:根据数据计算均值和方差
# STEP1:根据数据计算均值和方差
mu = X.mean(axis = 0)
sigma = X.var(axis = 0)
return mu, sigma
mu, sigma = estimate_gaussian(X)
mu, sigma
(array([14.11222578, 14.99771051]), array([1.83263141, 1.70974533]))
现在我们有了我们的模型参数,我们需要确定概率阈值,这表明一个样本应该被认为是一个异常。为此,我们需要使用一组标记的验证数据(其中真实异常样本已被标记),并在给出不同阈值的情况下,对模型的性能进行鉴定。
Xval = data['Xval']
yval = data['yval']
Xval.shape, yval.shape
((307, 2), (307, 1))
我们还需要一种计算数据点属于正态分布的概率的方法。幸运的是SciPy有这个内置的方法。
from scipy import stats
dist = stats.norm(mu[0], sigma[0])
dist.pdf(15)
0.1935875044615038
我们还可以将数据传递给概率密度函数,并获得数据集中每个点的概率密度。
dist.pdf(X[:,0])[0:50]
array([0.183842 , 0.20221694, 0.21746136, 0.19778763, 0.20858956,
0.21652359, 0.16991291, 0.15123542, 0.1163989 , 0.1594734 ,
0.21716057, 0.21760472, 0.20141857, 0.20157497, 0.21711385,
0.21758775, 0.21695576, 0.2138258 , 0.21057069, 0.1173018 ,
0.20765108, 0.21717452, 0.19510663, 0.21702152, 0.17429399,
0.15413455, 0.21000109, 0.20223586, 0.21031898, 0.21313426,
0.16158946, 0.2170794 , 0.17825767, 0.17414633, 0.1264951 ,
0.19723662, 0.14538809, 0.21766361, 0.21191386, 0.21729442,
0.21238912, 0.18799417, 0.21259798, 0.21752767, 0.20616968,
0.21520366, 0.1280081 , 0.21768113, 0.21539967, 0.16913173])
我们计算并保存给定上述的高斯模型参数的数据集中每个值的概率密度。
p = np.zeros((X.shape[0], X.shape[1]))
p[:, 0] = stats.norm(mu[0], sigma[0]).pdf(X[:, 0])
p[:, 1] = stats.norm(mu[1], sigma[1]).pdf(X[:, 1])
p.shape
(307, 2)
pval = np.zeros((Xval.shape[0], Xval.shape[1]))
pval[:, 0] = stats.norm(mu[0], sigma[0]).pdf(Xval[:, 0])
pval[:, 1] = stats.norm(mu[1], sigma[1]).pdf(Xval[:, 1])
pval.shape
(307, 2)
接下来,我们需要一个函数,找到给定概率密度值和真实标签的最佳阈值。为了做到这一点,我们将为不同的epsilon值计算F1分数。F1是真阳性,假阳性和假阴性的数量的函数。方程式在练习文本中。
def select_threshold(pval, yval):
# INPUT:交叉验证集数据pval,标签yval
# OUTPUT:最好的参数epsilon,最高的得分F1
# TODO:寻找最好的参数epsilon,最高的得分F1
# STEP1:初始化变量
best_epsilon = 0
best_f1 = 0
f1 = 0
step = (pval.max() - pval.min()) / 1000
# STEP2:计算F1分数
for epsilon in np.arange(pval.min(), pval.max(), step):
preds = pval < epsilon
tp = np.sum(np.logical_and(preds == 1, yval == 1)).astype(float)
fp = np.sum(np.logical_and(preds == 1, yval == 0)).astype(float)
fn = np.sum(np.logical_and(preds == 0, yval == 1)).astype(float)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = (2 * precision * recall) / (precision + recall)
if f1 > best_f1:
best_f1 = f1
best_epsilon = epsilon
return best_epsilon, best_f1
epsilon, f1 = select_threshold(pval, yval)
epsilon, f1
(0.009566706005956842, 0.7142857142857143)
outliers = np.where(p < epsilon)
outliers
(array([300, 301, 301, 303, 303, 304, 306, 306], dtype=int64),
array([1, 0, 1, 0, 1, 0, 0, 1], dtype=int64))
fig, ax = plt.subplots(figsize = (12, 8))
ax.scatter(X[:,0], X[:, 1])
ax.scatter(X[outliers[0], 0], X[outliers[0],1], s = 50, color = 'r', marker = 'o')
plt.show()
红点是标记为异常值的点。这些看起来很合理。有一些分离(但没有被标记)的右上角也可能是一个异常值,但是相当接近。
n m n_m nm:电影的数量
n u n_u nu:参加评价用户的数量
y ( i , j ) y^{(i,j)} y(i,j):用户j对电影i的评分(第i行,第j列) Y ( n m , n u ) Y(n_m,n_u) Y(nm,nu)
r ( i , j ) r^{(i,j)} r(i,j):用户j是否对电影i的做了评分(是为1,否为0) R ( n m , n u ) R(n_m,n_u) R(nm,nu)
X ( n m , n f ) X(n_m,n_f) X(nm,nf):电影的特征 ( x ( 1 ) , x ( 2 ) , . . . , x ( n m ) ) (x^{(1)},x^{(2)},...,x^{(n_m)}) (x(1),x(2),...,x(nm))
Θ ( n u , n f ) \Theta(n_u,n_f) Θ(nu,nf):用户的特征 ( Θ ( 1 ) , Θ ( 2 ) , . . . , Θ ( n u ) ) (\Theta^{(1)},\Theta^{(2)},...,\Theta^{(n_u)}) (Θ(1),Θ(2),...,Θ(nu))
Y = X @ T h e t a . T [email protected] Y=X@Theta.T
mat = loadmat('ex8_movies.mat')
mat.keys()
dict_keys(['__header__', '__version__', '__globals__', 'Y', 'R'])
Y, R = mat['Y'], mat['R']
Y.shape, R.shape
((1682, 943), (1682, 943))
param_mat = loadmat('ex8_movieParams.mat')
param_mat.keys()
dict_keys(['__header__', '__version__', '__globals__', 'X', 'Theta', 'num_users', 'num_movies', 'num_features'])
X = param_mat['X']
Theta = param_mat['Theta']
nu = param_mat['num_users']
nm = param_mat['num_movies']
nf = param_mat['num_features']
X.shape, Theta.shape, nu, nm, nf
((1682, 10),
(943, 10),
array([[943]], dtype=uint16),
array([[1682]], dtype=uint16),
array([[10]], dtype=uint8))
nu = int(nu)
nm = int(nm)
nf = int(nf)
nu, nm, nf
(943, 1682, 10)
def serialize(X, Theta):
return np.append(X.flatten(), Theta.flatten())
def deserialize(params, nm, nu, nf):
X = params[: nm * nf].reshape(nm, nf)
Theta = params[nm * nf:].reshape(nu, nf)
return X, Theta
def costFunction(params, Y, R, nm, nu, nf, lamda):
X, Theta = deserialize(params, nm, nu, nf)
error = 0.5 * np.square((X @ Theta.T - Y) * R).sum()
reg1 = 0.5 * lamda * np.square(X).sum()
reg2 = 2.5 * lamda * np.square(Theta).sum()
return error + reg1 + reg2
users = 4
movies = 5
features = 3
X_sub = X[:movies, :features]
Theta_sub = Theta[:users, :features]
Y_sub = Y[:movies, :users]
R_sub = R[:movies, :users]
cost1 = costFunction(serialize(X_sub, Theta_sub), Y_sub, R_sub, movies, users, features, lamda = 0)
cost1
22.224603725685675
cost2 = costFunction(serialize(X_sub, Theta_sub), Y_sub, R_sub, movies, users, features, lamda = 1.5)
cost2
47.52672734347449
def costGradient(params, Y, R, nm, nu, nf, lamda):
X, Theta = deserialize(params, nm, nu, nf)
X_grad = ((X @ Theta.T - Y) * R) @ Theta + lamda * X
Theta_grad = ((X @ Theta.T - Y) * R).T @ X + lamda * Theta
return serialize(X_grad, Theta_grad)
my_ratings = np.zeros((nm, 1))
my_ratings[9] = 5
my_ratings[66] = 5
my_ratings[96] = 5
my_ratings[121] = 4
my_ratings[148] = 4
my_ratings[285] = 3
my_ratings[490] = 4
my_ratings[599] = 4
my_ratings[643] = 4
my_ratings[958] = 5
my_ratings[1117] = 3
Y = np.c_[Y, my_ratings]
R = np.c_[R, my_ratings != 0]
Y.shape
(1682, 946)
nm, nu = Y.shape
def normalizeRatings(Y, R):
Y_mean = (Y.sum(axis = 1) / R.sum(axis = 1)).reshape(-1, 1)
Y_norm = (Y - Y_mean) * R
return Y_norm, Y_mean
Y_norm, Y_mean = normalizeRatings(Y, R)
X = np.random.random((nm, nf))
Theta = np.random.random((nu, nf))
params = serialize(X, Theta)
lamda = 5
from scipy.optimize import minimize
res = minimize(fun = costFunction,
x0 = params,
args = (Y_norm, R, nm, nu, nf, lamda),
method = 'TNC',
jac = costGradient,
options = {'maxiter':100})
params_fit = res.x
fit_X, fit_Theta = deserialize(params_fit, nm, nu, nf)
Y_pred = fit_X @ fit_Theta.T
y_pred = Y_pred[:, -1] + Y_mean.flatten()
index = np.argsort(-y_pred)
index[:10]
array([1535, 1652, 813, 1499, 1188, 1466, 1121, 1598, 1200, 1292],
dtype=int64)
movies = []
with open('movie_ids.txt', 'r', encoding='latin 1') as f:
for line in f:
tokens = line.strip().split(' ')
movies.append(''.join(tokens[1:]))
len(movies)
1682
for i in range(10):
print(index[i], movies[index[i]], y_pred[index[i]])
1535 Aiqingwansui(1994) 5.1402235650034
1652 EntertainingAngels:TheDorothyDayStory(1996) 5.083489691661643
813 GreatDayinHarlem,A(1994) 5.073979829650033
1499 SantawithMuscles(1996) 5.0557879904183665
1188 Prefontaine(1997) 5.054936550086743
1466 SaintofFortWashington,The(1993) 5.051469403295101
1121 TheyMadeMeaCriminal(1939) 5.0423769037101716
1598 SomeoneElse'sAmerica(1995) 5.03778342785061
1200 MarleneDietrich:ShadowandLight(1996) 4.981158654781782
1292 StarKid(1997) 4.925613565282081
])
1535 Aiqingwansui(1994) 5.1402235650034
1652 EntertainingAngels:TheDorothyDayStory(1996) 5.083489691661643
813 GreatDayinHarlem,A(1994) 5.073979829650033
1499 SantawithMuscles(1996) 5.0557879904183665
1188 Prefontaine(1997) 5.054936550086743
1466 SaintofFortWashington,The(1993) 5.051469403295101
1121 TheyMadeMeaCriminal(1939) 5.0423769037101716
1598 SomeoneElse'sAmerica(1995) 5.03778342785061
1200 MarleneDietrich:ShadowandLight(1996) 4.981158654781782
1292 StarKid(1997) 4.925613565282081