# Fraction of samples treated as outliers; passed as `contamination` to the
# detectors below and reused for the percentile threshold when plotting.
outliers_fraction = 0.01

# Ten LOF detectors with neighbourhood sizes 5, 10, ..., 50; this list is
# handed to LSCP in the `classifiers` mapping.
detector_list = [LOF(n_neighbors=k) for k in range(5, 55, 5)]
# Display name -> detector instance. Insertion order matters: it drives both
# the printed model numbering and the subplot position of each detector.
# Every detector receives the same contamination; those that accept a
# random_state are seeded with 0.
classifiers = {
    # Angle- and cluster-based detectors
    'Angle-based Outlier Detector (ABOD)': ABOD(
        contamination=outliers_fraction,
    ),
    'Cluster-based Local Outlier Factor (CBLOF)': CBLOF(
        contamination=outliers_fraction,
        check_estimator=False,
        random_state=0,
    ),
    # Ensemble built on a single LOF base estimator
    'Feature Bagging': FeatureBagging(
        LOF(n_neighbors=35),
        contamination=outliers_fraction,
        random_state=0,
    ),
    'Histogram-base Outlier Detection (HBOS)': HBOS(
        contamination=outliers_fraction,
    ),
    'Isolation Forest': IForest(
        contamination=outliers_fraction,
        random_state=0,
    ),
    # Neighbour-based detectors
    'K Nearest Neighbors (KNN)': KNN(
        contamination=outliers_fraction,
    ),
    'Average KNN': KNN(
        method='mean',
        contamination=outliers_fraction,
    ),
    'Local Outlier Factor (LOF)': LOF(
        n_neighbors=35,
        contamination=outliers_fraction,
    ),
    'Minimum Covariance Determinant (MCD)': MCD(
        contamination=outliers_fraction,
        random_state=0,
    ),
    'One-class SVM (OCSVM)': OCSVM(
        contamination=outliers_fraction,
    ),
    'Principal Component Analysis (PCA)': PCA(
        contamination=outliers_fraction,
        random_state=0,
    ),
    # Combination over the LOF detectors collected in detector_list
    'Locally Selective Combination (LSCP)': LSCP(
        detector_list,
        contamination=outliers_fraction,
        random_state=0,
    ),
}
# List the configured detectors, numbered from 1 in insertion order.
for model_no, model_name in enumerate(classifiers, start=1):
    print('Model', model_no, model_name)
# Build the two-column feature matrix: column 0 is num_people, column 1 is
# num_order, each first turned into an (n, 1) column vector.
X1 = df['num_people'].values.reshape(-1, 1)
X2 = df['num_order'].values.reshape(-1, 1)
X = np.hstack((X1, X2))

# 100 x 100 evaluation grid over the unit square, used to draw the decision
# surfaces. NOTE(review): this presumes both features were scaled to [0, 1]
# earlier in the file -- confirm.
grid_axis = np.linspace(0, 1, 100)
xx, yy = np.meshgrid(grid_axis, grid_axis)
# Fit every detector on X and draw its decision surface, together with the
# points it labels as inliers/outliers, in one cell of a 3x4 subplot grid
# (the grid has room for the 12 entries of `classifiers`).
plt.figure(figsize=(20, 15))

# NOTE: df1 is an alias of df, not a copy. The 'outlier' column written in
# the loop therefore lands on df itself and is overwritten by each detector;
# after the loop it holds the labels of the last detector only.
df1 = df

for i, (clf_name, clf) in enumerate(classifiers.items()):
    clf.fit(X)

    # Negated scores for the training points and the 0/1 labels.
    # NOTE(review): presumably larger raw scores mean "more anomalous"
    # (PyOD convention), so the negation flips that ordering -- confirm.
    scores_pred = clf.decision_function(X) * -1
    y_pred = clf.predict(X)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)

    # Store the labels on the frame and split both features by label.
    df1['outlier'] = y_pred.tolist()
    inlier_mask = df1['outlier'] == 0
    outlier_mask = df1['outlier'] == 1
    inliers_people = df1.loc[inlier_mask, 'num_people'].values.reshape(-1, 1)
    inliers_order = df1.loc[inlier_mask, 'num_order'].values.reshape(-1, 1)
    outliers_people = df1.loc[outlier_mask, 'num_people'].values.reshape(-1, 1)
    outliers_order = df1.loc[outlier_mask, 'num_order'].values.reshape(-1, 1)

    # Cut-off on the negated scores: the `outliers_fraction` percentile of
    # the training scores.
    threshold = np.percentile(scores_pred, 100 * outliers_fraction)

    # Negated decision function evaluated on the grid, reshaped for contouring.
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
    Z = Z.reshape(xx.shape)

    plt.subplot(3, 4, i + 1)
    # Blue gradient from the grid minimum up to the threshold, red line at
    # the threshold itself, flat orange fill from the threshold to the maximum.
    plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                 cmap=plt.cm.Blues_r)
    a = plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
    plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
    b = plt.scatter(x=inliers_people, y=inliers_order, c='white', s=20,
                    edgecolor='k')
    c = plt.scatter(x=outliers_people, y=outliers_order, c='black', s=20,
                    edgecolor='k')
    plt.axis('tight')

    # ContourSet.collections was deprecated in Matplotlib 3.8 and later
    # removed; legend_elements() yields an equivalent legend handle on both
    # old and new releases.
    boundary_handle = a.legend_elements()[0][0]
    plt.legend([boundary_handle, b, c], ['决策函数', '正常值', '异常值'],
               prop=matplotlib.font_manager.FontProperties(size=12),
               loc='upper right')

    # Applied after axis('tight') so the view is pinned to the unit square.
    plt.xlim((0, 1))
    plt.ylim((0, 1))

    ss = '异常值数量: ' + str(n_outliers) + ' 正常值数量: ' + str(n_inliers)
    plt.title(clf_name)
    plt.xlabel(ss)

plt.show()