In [1]:
import numpy as np
In [2]:
def F1(predictions, y):
    """Compute the F1 score of binary predictions against true labels.

    Parameters
    ----------
    predictions : array-like of 0/1 or bool
        Predicted labels (1 / True marks a positive).
    y : array-like of 0/1
        Ground-truth labels, one per prediction.

    Returns
    -------
    float
        2 * precision * recall / (precision + recall), or 0.0 when that
        ratio is undefined (no true positives).

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    # Flatten both operands first: comparing an (m, 1) column against an
    # (m,) vector would otherwise broadcast to an (m, m) grid and silently
    # inflate every count below.
    predicted = np.asarray(predictions).ravel()
    actual = np.asarray(y).ravel()
    if predicted.size != actual.size:
        raise ValueError(
            "predictions and y must have the same number of elements "
            "(got %d and %d)" % (predicted.size, actual.size))

    TP = np.sum((predicted == 1) & (actual == 1))
    FP = np.sum((predicted == 1) & (actual == 0))
    FN = np.sum((predicted == 0) & (actual == 1))

    # Guard each ratio against an empty denominator.
    precision = float(TP) / (TP + FP) if TP + FP != 0 else 0.0
    recall = float(TP) / (TP + FN) if TP + FN != 0 else 0.0

    if precision + recall == 0:
        return 0.0
    # The larger the F1 score, the better the model.
    return (2.0 * precision * recall) / (precision + recall)
In [3]:
def gaussianModel(X):
    """Fit an independent (per-feature) Gaussian model and return its density.

    Parameters
    ----------
    X : array-like, shape (m, n)
        m samples with n features each. Both ``np.ndarray`` and
        ``np.matrix`` are accepted.

    Returns
    -------
    callable
        ``p(x)`` returning the product of the n univariate Gaussian
        densities evaluated at one sample ``x`` (a scalar).

    Notes
    -----
    The variance is the population variance (``np.var`` default). A feature
    with zero variance makes the density non-finite, because the formula
    divides by the variance.
    """
    samples = np.asarray(X, dtype=float)
    if samples.ndim != 2:
        raise ValueError("X must be 2-D with shape (m, n)")

    # Parameter estimation: one mean and one variance per feature.
    mu = samples.mean(axis=0)
    sigma2 = samples.var(axis=0)

    def p(x):
        """Return the density of one sample given as (n,), (n, 1) or (1, n)."""
        # Flatten so the orientation / matrix-ness of x does not matter.
        features = np.asarray(x, dtype=float).ravel()
        if features.size != mu.size:
            raise ValueError(
                "x must have %d features (got %d)" % (mu.size, features.size))
        # Univariate Gaussian density of every feature, multiplied together.
        densities = (np.exp(-np.power(features - mu, 2) / (2 * sigma2))
                     / np.sqrt(2 * np.pi * sigma2))
        return np.prod(densities)

    return p
In [4]:
def multivariateGaussianModel(X):
    """Fit a multivariate Gaussian model and return its density function.

    Parameters
    ----------
    X : array-like, shape (m, n)
        m samples with n features each. Both ``np.ndarray`` and
        ``np.matrix`` are accepted.

    Returns
    -------
    callable
        ``p(x)`` returning the multivariate Gaussian density at one sample
        ``x`` as a 1-element 1-D array (the shape the original
        implementation produced).

    Notes
    -----
    The covariance comes from ``np.cov`` (sample covariance). The inverse is
    a pseudo-inverse while the normalisation uses the determinant, so a
    singular covariance matrix yields a non-finite density.
    """
    samples = np.asarray(X, dtype=float)
    if samples.ndim != 2:
        raise ValueError("X must be 2-D with shape (m, n)")
    n = samples.shape[1]

    # Parameter estimation.
    mu = samples.mean(axis=0)
    # atleast_2d: np.cov returns a 0-d array when there is a single feature.
    Sigma = np.atleast_2d(np.cov(samples, rowvar=False))

    # Everything below is independent of x, so compute it once here instead
    # of on every call to p.
    SigmaInv = np.linalg.pinv(Sigma)
    detSigma = np.linalg.det(Sigma)
    normaliser = (2 * np.pi) ** (-n / 2.0) * detSigma ** (-0.5)

    def p(x):
        """Return the density of one sample given as (n,), (n, 1) or (1, n)."""
        # Flatten so x - mu is always an (n,) vector and never broadcasts
        # an (n, 1) column against an (n,) mean into an (n, n) grid.
        delta = np.asarray(x, dtype=float).ravel() - mu
        exponent = -delta.dot(SigmaInv).dot(delta) / 2
        return np.atleast_1d(np.exp(exponent) * normaliser)

    return p
In [5]:
def train(X, model=gaussianModel):
    """Fit ``model`` on the training samples ``X``.

    ``model`` is any callable that takes the sample matrix and returns a
    probability function; the default is ``gaussianModel``. The fitted
    probability function is returned unchanged.
    """
    fitted = model(X)
    return fitted
In [6]:
def selectEpsilon(XVal, yVal, p):
    """Pick the anomaly threshold epsilon that maximises F1 on a validation set.

    Parameters
    ----------
    XVal : 2-D array-like or np.matrix
        Cross-validation samples, one per row.
    yVal : array-like
        Ground-truth labels for XVal (1 = anomaly, 0 = normal).
    p : callable
        Probability model; called as ``p(row.T)`` for every row of XVal.

    Returns
    -------
    (bestEpsilon, bestF1)
        The threshold with the highest F1 score and that score. Both are 0
        when no candidate threshold scores above zero, or when the
        probabilities span no usable range.

    Raises
    ------
    ValueError
        If XVal contains no samples.
    """
    # Probability of every validation sample. np.ravel(...)[0] accepts models
    # that return either a scalar or a 1-element array.
    pVal = np.array([np.ravel(p(x.T))[0] for x in XVal], dtype=float)
    if pVal.size == 0:
        raise ValueError("XVal must contain at least one sample")
    # Flatten the labels so they line up element-wise with pVal.
    labels = np.asarray(yVal).ravel()

    lowest = pVal.min()
    highest = pVal.max()
    # Scan 1000 evenly spaced candidate thresholds between min and max.
    step = (highest - lowest) / 1000.0

    bestEpsilon = 0
    bestF1 = 0
    # All probabilities equal (step == 0) or non-finite values: np.arange
    # cannot build a range, and no threshold can separate the samples.
    if not np.isfinite(step) or step <= 0:
        return bestEpsilon, bestF1

    for epsilon in np.arange(lowest, highest, step):
        # Samples less likely than epsilon are flagged as anomalies.
        predictions = pVal < epsilon
        f1 = F1(predictions, labels)
        if f1 > bestF1:
            bestF1 = f1
            bestEpsilon = epsilon

    return bestEpsilon, bestF1
In [7]:
%matplotlib inline
In [8]:
from scipy.io import loadmat import matplotlib.pyplot as plt
In [9]:
# Plot configuration: use the SimHei font so the Chinese axis labels render,
# and turn off the Unicode minus glyph for negative tick labels.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
In [10]:
# Low-dimensional data test.
data = loadmat('data/ex8data1.mat')
# np.asmatrix is the same conversion the cell did with np.mat, under the name
# that still exists in NumPy 2.0; the samples stay 2-D matrices as before.
X = np.asmatrix(data['X'])        # training set is unlabeled, there is no y
XVal = np.asmatrix(data['Xval'])
yVal = np.asmatrix(data['yval'])  # validation labels mark the anomalies

p = train(X)                                    # independent Gaussian model
#p = train(X, model=multivariateGaussianModel)  # multivariate alternative

# Probability of every training sample as an (m, 1) column.
pTest = np.array([p(x.T) for x in X], dtype=float).reshape(-1, 1)

# Plot the data points (plain-array view keeps indexing straightforward).
XPlot = np.asarray(X)
plt.xlabel(u'延迟 (ms)')
plt.ylabel(u'吞吐 (mb/s)')
plt.plot(XPlot[:, 0], XPlot[:, 1], 'bx')

epsilon, f1 = selectEpsilon(XVal, yVal, p)

print(u'基于交叉验证集最佳ε: %e\n'%epsilon)
print(u'基于交叉验证集最佳F1: %f\n'%f1)
print(u'找到 %d 个异常点' % np.sum(pTest < epsilon))

# Circle the training samples flagged as anomalies.
outliers = (pTest < epsilon).ravel()
plt.plot(XPlot[outliers, 0], XPlot[outliers, 1], 'ro', lw=2, markersize=10,
         fillstyle='none', markeredgewidth=1)

# Evaluate the model on a regular grid to draw density contours.
axisTicks = np.linspace(0, 35, 100)
X1 = np.meshgrid(axisTicks, axisTicks)
# Flatten both coordinate grids in the same row-major order that reshape
# uses below, so pFit[i, j] is the density at (X1[0][i, j], X1[1][i, j]).
# Flattening the transposed grids instead would transpose the contours.
XFit = np.asmatrix(np.column_stack((X1[0].ravel(), X1[1].ravel())))
pFit = np.array([p(x.T) for x in XFit], dtype=float).reshape(X1[0].shape)

# Skip the contours if any grid density overflowed to infinity.
if not np.isinf(np.sum(pFit)):
    plt.contour(X1[0], X1[1], pFit, 10.0**np.arange(-20, 0, 3))
plt.show()
基于交叉验证集最佳ε: 8.990853e-05 基于交叉验证集最佳F1: 0.875000 找到 6 个异常点
In [11]:
# High-dimensional data.
data = loadmat('data/ex8data2.mat')
# np.asmatrix is the same conversion the cell did with np.mat, under the name
# that still exists in NumPy 2.0; the samples stay 2-D matrices as before.
X = np.asmatrix(data['X'])
XVal = np.asmatrix(data['Xval'])
yVal = np.asmatrix(data['yval'])

p = train(X)                                    # independent Gaussian model
#p = train(X, model=multivariateGaussianModel)  # multivariate alternative

# Probability of every training sample as an (m, 1) column.
pTest = np.array([p(x.T) for x in X], dtype=float).reshape(-1, 1)

epsilon, f1 = selectEpsilon(XVal, yVal, p)

print('Best epsilon found using cross-validation: %e\n'%epsilon)
print('Best F1 on Cross Validation Set: %f\n'%f1)
print('# Outliers found: %d' % np.sum(pTest < epsilon))
Best epsilon found using cross-validation: 1.377229e-18 Best F1 on Cross Validation Set: 0.615385 # Outliers found: 117