第一步是载入数据和设定参数。
载入数据:如果dataset设为random,则直接随机生成数据,不需要外部数据集;否则从磁盘读取对应的数据集文件。
以下是载入数据的代码:
% Generate or load an evaluation set (train + base + query + ground truth).
%
% Reads from the workspace:
%   dataset  - one of 'random', 'siftsmall', 'sift', 'gist'
% Defines in the workspace:
%   vtrain, vbase, vquery  - matrices holding one vector per column
%   ntrain, nbase, nquery  - number of columns of the matrices above
%   d                      - vector dimension (number of rows)
%   ids_gnd                - nearest-neighbor index in vbase for each query
if strcmp (dataset, 'random')
  % synthetic dataset
  d = 16;
  ntrain = 10000;
  nbase = 1000000;
  nquery = 1000;

  % Draw d-by-n matrices with rand and store them as single precision.
  % Note: the vectors are NOT normalized to unit norm.
  vtrain = single (rand (d, ntrain));
  vbase = single (rand (d, nbase));
  vquery = single (rand (d, nquery));

  % Compute the ground-truth (1 nearest neighbor of each query in vbase)
  % and record the CPU time it took in tgnd.
  t0 = cputime;
  [ids_gnd, dis_gnd] = yael_nn (vbase, vquery, 1);
  tgnd = cputime - t0;
else
  switch dataset
    case 'siftsmall'
      basedir = './siftsmall/' ; % modify this directory to fit your configuration
      fbase = [basedir 'siftsmall_base.fvecs'];
      fquery = [basedir 'siftsmall_query.fvecs'];
      ftrain = [basedir 'siftsmall_learn.fvecs'];
      fgroundtruth = [basedir 'siftsmall_groundtruth.ivecs'];
    case 'sift'
      basedir = './sift/' ; % modify this directory to fit your configuration
      fbase = [basedir 'sift_base.fvecs'];
      fquery = [basedir 'sift_query.fvecs'];
      ftrain = [basedir 'sift_learn.fvecs'];
      fgroundtruth = [basedir 'sift_groundtruth.ivecs'];
    case 'gist'
      basedir = './gist/' ; % modify this directory to fit your configuration
      fbase = [basedir 'gist_base.fvecs'];
      fquery = [basedir 'gist_query.fvecs'];
      ftrain = [basedir 'gist_learn.fvecs'];
      fgroundtruth = [basedir 'gist_groundtruth.ivecs'];
    otherwise
      % Fail early with a clear message instead of an "undefined variable
      % ftrain" error a few lines below.
      error ('pq_test:unknownDataset', ...
             'Unknown dataset ''%s'' (expected random, siftsmall, sift or gist)', ...
             dataset);
  end

  % Read the vectors
  vtrain = fvecs_read (ftrain);
  vbase = fvecs_read (fbase);
  vquery = fvecs_read (fquery);
  ntrain = size (vtrain, 2);
  nbase = size (vbase, 2);
  nquery = size (vquery, 2);
  d = size (vtrain, 1);

  % Load the groundtruth; only the first row (the nearest neighbor) is kept
  ids = ivecs_read (fgroundtruth);
  ids_gnd = ids (1, :) + 1; % matlab indices start at 1
end
在随机生成数据的代码中,调用yael_nn函数来生成ground-truth。
之后设定搜索过程中的参数:
k = 100; % number of results returned per query (top-k)
nsq = 8; % number of subquantizers (m in the paper); pq_new divides the vector dimension by this value
k是搜索时每个查询返回的结果个数(top-k);nsq即论文中的m,表示把向量切分成多少个子向量,也就是子量化器(subquantizer)的个数。
接下来是训练的过程,利用数据里面的训练集,预先利用kmeans算法训练出每个subquantizer的中心点,记录在结构体pq中:
% Learn the PQ code structure from the training vectors and record the
% CPU time spent in tpqlearn.
t0 = cputime;
pq = pq_new (nsq, vtrain) % no trailing semicolon: the resulting struct is echoed to the console
tpqlearn = cputime - t0;
训练过程如下:
function pq = pq_new (nsq, v)
% PQ_NEW  Learn a product quantizer from a set of training vectors.
%
%   pq = pq_new (nsq, v)
%
%   nsq  number of subquantizers; must be a positive integer dividing the
%        vector dimension
%   v    training vectors, one per column (d rows)
%
%   The returned struct has the fields
%     nsq        number of subquantizers
%     ks         number of centroids per subquantizer (2^8 = 256)
%     ds         dimension of each subvector (d / nsq)
%     centroids  nsq-by-1 cell array; centroids{q} is the yael_kmeans
%                result for rows (q-1)*ds+1 : q*ds of v

d = size (v, 1);              % vector dimension

% Rows are sliced with (q-1)*ds+1 : q*ds below, which only makes sense
% when ds is a whole number; reject other configurations up front.
if nsq < 1 || nsq ~= round (nsq) || mod (d, nsq) ~= 0
  error ('pq_new:badNsq', ...
         'nsq (%g) must be a positive integer that divides the vector dimension (%d)', ...
         nsq, d);
end

ds = d / nsq;                 % dimension of the subvectors to quantize
nsqbits = 8;                  % bits per subquantizer index (fixed to 8 in this implementation)
ks = 2^nsqbits;               % number of centroids per subquantizer

pq.nsq = nsq;
pq.ks = ks;
pq.ds = ds;
pq.centroids = cell (nsq, 1);

% Run an independent k-means on each slice of ds consecutive rows.
for q = 1:nsq
  vs = v((q-1) * ds + 1 : q * ds, :);
  pq.centroids{q} = yael_kmeans (vs, ks, 'niter', 100, 'verbose', 0);
end
分别对nsq个subquantizer进行kmeans训练,调用了yael_kmeans库函数,针对当前选取的子空间vs,做ks个中心点的kmeans。
训练完kmeans之后,将得到的中心点和数据集做编码:
% Encode the database vectors with the learned product quantizer and
% record the CPU time spent in tpqencode.
t0 = cputime;
cbase = pq_assign (pq, vbase);
tpqencode = cputime - t0;
编码过程如下:
function c = pq_assign (pq, v)
% PQ_ASSIGN  Encode vectors with a product quantizer.
%
%   c = pq_assign (pq, v)
%
%   pq  product quantizer struct (fields nsq, ds, centroids are read)
%   v   vectors to encode, one per column
%
%   c is a pq.nsq-by-n uint8 matrix; row q holds, for every column of v,
%   the index returned by yael_nn for the q-th subvector, minus one.

nvec = size (v, 2);
c = zeros (pq.nsq, nvec, 'uint8');

% Each subquantizer is handled independently of the others.
for q = 1:pq.nsq
  first_row = (q-1)*pq.ds+1;
  last_row = q*pq.ds;
  subvectors = v(first_row:last_row, :);

  % nearest centroid of the q-th codebook for every subvector
  [nearest, nearest_dis] = yael_nn (pq.centroids{q}, subvectors, 1, 2);

  % shift by one before storing in the uint8 code matrix
  c(q, :) = nearest - 1;
end
编码过程很简单:把数据集中的每个向量按子量化器切分成子向量,每个子向量在对应子量化器的中心点里找最近邻,用这个最近中心点的index(减1后存成uint8)作为该子向量的编码。
接下来就是搜索的过程,linear ADC,线性扫描,找近似最近邻。
%---[ perform the search and compare with the ground-truth ]---
% Search the k best candidates for every query over the encoded database
% and record the CPU time spent in tpq.
t0 = cputime;
[ids_pqc, dis_pqc] = pq_search (pq, cbase, vquery, k);
tpq = cputime - t0;
线性扫描的过程是:对每个查询向量,先计算它的第q个子向量到第q个子量化器全部中心点的平方距离,得到一张距离表(distab);然后根据数据库中每个向量的PQ编码去查表,把各个子量化器对应的距离累加起来,作为查询向量与该数据库向量的近似距离;最后取距离最小的k个作为结果。
function [ids, dis] = pq_search (pq, cbase, vquery, k)
% PQ_SEARCH  Exhaustive search over PQ codes using tabulated distances.
%
%   [ids, dis] = pq_search (pq, cbase, vquery, k)
%
%   pq      product quantizer struct (fields ks, nsq, ds, centroids are read)
%   cbase   codes of the database vectors, one column per vector
%   vquery  query vectors, one per column
%   k       number of results requested per query
%
%   ids and dis are nq-by-k single matrices. Row i holds the indices and
%   estimated distances returned by yael_kmin for query i. When the
%   database holds fewer than k vectors, the unused trailing entries are
%   left at -1 (ids) and inf (dis), as ivfpq_search does.

n = size (cbase, 2);
nq = size (vquery, 2);

distab = zeros (pq.ks, pq.nsq, 'single');
dis = zeros (nq, k, 'single'); dis(:) = inf;
ids = zeros (nq, k, 'single'); ids(:) = -1;

% Never ask yael_kmin for more elements than there are database vectors.
kret = min (k, n);
if kret == 0
  return;
end

for query = 1:nq

  % pre-compute the table of squared distance to centroids
  for q = 1:pq.nsq
    vsub = vquery ((q-1)*pq.ds+1:q*pq.ds, query);
    distab (:,q) = yael_L2sqr (vsub, pq.centroids{q})';
  end

  % add the tabulated distances to construct the distance estimators
  disquerybase = sumidxtab (distab, cbase, 0);

  % keep the kret smallest estimated distances
  [dis1, ids1] = yael_kmin (disquerybase, kret);

  dis(query, 1:kret) = dis1;
  ids(query, 1:kret) = ids1;
end
最后计算完毕后,对得到的数据进行分析。代码中进行了一个处理,如果当前找到了nn,就用find到的位置作为标记;如果没有找到,就用k+1来标记。这样方便之后统计recall。
% Rank at which the ground-truth nearest neighbor shows up in each result
% list; k+1 stands for "not found exactly once among the k results".
nn_ranks_pqc = zeros (nquery, 1);
hist_pqc = zeros (k+1, 1);

for i = 1:nquery
  gnd_ids = ids_gnd(i);
  nn_pos = find (ids_pqc(i, :) == gnd_ids);

  % default to the "missing" marker, overwrite on a unique hit
  nn_ranks_pqc (i) = k + 1;
  if numel (nn_pos) == 1
    nn_ranks_pqc (i) = nn_pos;
  end
end

nn_ranks_pqc = sort (nn_ranks_pqc);

% recall@i: percentage of queries whose true neighbor is ranked within
% the first i results, reported only for cut-offs not exceeding k
for i = [1 2 5 10 20 50 100 200 500 1000 2000 5000 10000]
  if i > k
    continue;
  end
  % since i <= k here, ranks <= i are automatically <= k
  r_at_i = nnz (nn_ranks_pqc <= i) / nquery * 100;
  fprintf ('r@%3d = %.3f\n', i, r_at_i);
end
载入数据部分和PQ是一致的。
参数设置如下:
k = 100; % number of results returned per query (top-k)
nsq = 8; % number of subquantizers (m in the paper)
coarsek = 256; % number of centroids for the coarse quantizer (one inverted-file cell per centroid)
w = 4; % number of coarse cells visited per query
其中w为查询时访问的倒排单元(cell)个数,即每个查询只在离它最近的w个coarse centroid对应的倒排列表中搜索。
IVFPQ的训练过程:
% Learn the IVFPQ structure (coarse quantizer + product quantizer) from
% the training vectors and record the CPU time spent in tpqlearn.
t0 = cputime;
ivfpq = ivfpq_new (coarsek, nsq, vtrain) % no trailing semicolon: the resulting struct is echoed to the console
tpqlearn = cputime - t0;
IVFPQ的训练过程:首先在训练集v上用kmeans算法计算出粗量化器的中心点(coarse centroids,即代码中的coa_centroids);之后利用yael_nn函数找到每个训练向量最近的粗中心点,并计算训练集的残差;最后对残差进行PQ训练。代码如下:
function ivfpq = ivfpq_new (coarsek, nsq, v)
% IVFPQ_NEW  Learn a coarse quantizer and a product quantizer on residuals.
%
%   ivfpq = ivfpq_new (coarsek, nsq, v)
%
%   coarsek  number of centroids of the coarse quantizer
%   nsq      number of subquantizers handed to pq_new
%   v        training vectors, one per column
%
%   The returned struct has the fields coarsek, coa_centroids (result of
%   yael_kmeans on v) and pq (result of pq_new on the residual vectors).

kmeans_iters = 50;

% Step 1: coarse quantizer learned directly on the training vectors
coarse = yael_kmeans (v, coarsek, 'niter', kmeans_iters, 'verbose', 0);

% Step 2: residual of each training vector w.r.t. the centroid that
% yael_nn assigns to it
[nearest, nearest_dis] = yael_nn (coarse, v);
residuals = v - coarse(:, nearest);

% Step 3: assemble the structure; the product quantizer is trained on the
% residuals rather than on the raw vectors
ivfpq.coarsek = coarsek;
ivfpq.coa_centroids = coarse;
ivfpq.pq = pq_new (nsq, residuals);
接下来是编码的过程:
% Encode the database vectors. ivf is the inverted file built by
% ivfpq_assign: two cell arrays (ids and codes) with one cell per coarse
% centroid, plus the per-cell population cellpop.
% Cell i holds the identifiers / PQ codes of the vectors assigned to coarse centroid i.
% The CPU time spent is recorded in tpqencode.
t0 = cputime;
ivf = ivfpq_assign (ivfpq, vbase);
tpqencode = cputime - t0;
编码的过程,首先利用yael_nn找到数据集v对应的coarse_centroids;接着计算相应的残差,接着对残差进行编码。
接下来用hist统计每个coarse centroid(倒排单元)下的向量个数,再按coarse centroid的编号对向量排序,然后依次把向量的编号和PQ编码切分到各自的倒排单元中。代码如下:
function ivf = ivfpq_assign (ivfpq, v)
% IVFPQ_ASSIGN  Build the inverted file for a set of vectors.
%
%   ivf = ivfpq_assign (ivfpq, v)
%
%   ivfpq  structure produced by ivfpq_new (fields coarsek, coa_centroids
%          and pq are read)
%   v      vectors to index, one per column
%
%   The returned struct has the fields
%     cellpop  number of vectors assigned to each coarse centroid
%     ids      coarsek-by-1 cell array; ids{i} lists the column indices
%              of the vectors that fall in cell i
%     codes    coarsek-by-1 cell array; codes{i} holds the PQ codes of
%              those vectors, one column per vector

% coarse centroid assigned to every vector
[cell_of, cell_dis] = yael_nn (ivfpq.coa_centroids, v);

% the product quantizer encodes the residual, not the raw vector
residuals = v - ivfpq.coa_centroids(:, cell_of);
codes_all = pq_assign (ivfpq.pq, residuals);

% count occurrences of each coarse centroid
ivf.cellpop = hist (double(cell_of), 1:ivfpq.coarsek);

% group the vectors by cell: after sorting, each cell occupies one
% contiguous run of columns
[sorted_cells, order] = sort (cell_of);
codes_all = codes_all(:, order);

ivf.ids = cell (ivfpq.coarsek, 1);   % vector identifiers
ivf.codes = cell (ivfpq.coarsek, 1);

% first/last column of each run, derived from the cumulative population
run_last = cumsum (ivf.cellpop);
run_first = run_last - ivf.cellpop + 1;

for i = 1:ivfpq.coarsek
  span = run_first(i):run_last(i);   % empty when the cell has no vector
  ivf.ids{i} = order (span);
  ivf.codes{i} = codes_all (:, span);
end
搜索过程:
%---[ perform the search and compare with the ground-truth ]---
% Search the k best candidates for every query, visiting w coarse cells
% per query, and record the CPU time spent in tpqsearch.
t0 = cputime;
[ids_pqc, dis_pqc] = ivfpq_search (ivfpq, ivf, vquery, k, w);
tpqsearch = cputime - t0;
搜索倒排表的过程:
function [ids, dis] = ivfpq_search (ivfpq, ivf, vquery, k, w)
% IVFPQ_SEARCH  Search an inverted file of PQ codes.
%
%   [ids, dis] = ivfpq_search (ivfpq, ivf, vquery, k, w)
%
%   ivfpq   structure produced by ivfpq_new (coa_centroids and pq are read)
%   ivf     inverted file produced by ivfpq_assign (ids and codes are read)
%   vquery  query vectors, one per column
%   k       number of results requested per query
%   w       number of coarse cells visited per query
%
%   ids and dis are nq-by-k single matrices. Row i holds the identifiers
%   and estimated distances of the best candidates found for query i.
%   When fewer than k candidates are found (possibly none), the unused
%   trailing entries stay at -1 (ids) and inf (dis).
%
%   NOTE(review): ids is stored as single, which cannot represent every
%   integer above 2^24 exactly - confirm the database is smaller than that.

nq = size (vquery, 2);

ds = ivfpq.pq.ds;
ks = ivfpq.pq.ks;
nsq = ivfpq.pq.nsq;

distab = zeros (ks, nsq, 'single');
dis = zeros (nq, k, 'single'); dis(:) = inf;
ids = zeros (nq, k, 'single'); ids(:) = -1;

% find the w nearest neighbors with respect to the coarse quantizer
coaidx = yael_nn (ivfpq.coa_centroids, vquery, w);

for query = 1:nq

  % the w coarse cells selected for this query
  qcoaidx = coaidx (:, query);

  % compute the w residual vectors (one column per visited cell)
  v = bsxfun (@minus, vquery (:, query), ivfpq.coa_centroids(:,qcoaidx));

  % indices and distances of the database vectors associated with the current query
  qidx = [];
  qdis = [];

  for j = 1:w
    % pre-compute the table of squared distance to centroids
    for q = 1:nsq
      vsub = v ((q-1)*ds+1:q*ds, j);
      distab (:,q) = yael_L2sqr (vsub, ivfpq.pq.centroids{q})';
    end

    % add the tabulated distances to construct the distance estimators
    qdis = [qdis ; sumidxtab(distab, ivf.codes{qcoaidx(j)}, 0)];
    qidx = [qidx ivf.ids{qcoaidx(j)}];
  end

  % All visited cells may be empty: leave the inf / -1 placeholders in
  % place rather than calling yael_kmin on an empty vector with k = 0.
  ktmp = min (k, numel (qdis));
  if ktmp == 0
    continue;
  end

  [dis1, ids1] = yael_kmin (qdis, ktmp);

  dis(query, 1:ktmp) = dis1;
  ids(query, 1:ktmp) = qidx(ids1);   % map candidate positions back to vector identifiers
end
最后再进行数据分析,就可以了。分析过程和PQ的一样。