/**
 * Job asMatrix.
 * Output: one similarity vector per item (IntWritable item key, VectorWritable value).
 * What this phase does:
 * 1. Finds the top-N most similar items for every item.
 * 2. Builds the lower triangle of the similarity matrix (derived from the
 *    upper triangle that has already been computed).
 */
if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job asMatrix = prepareJob(
            pairwiseSimilarityPath,               // input path
            getOutputPath(),                      // output path
            UnsymmetrifyMapper.class,             // mapper class
            IntWritable.class,                    // mapper output key
            VectorWritable.class,                 // mapper output value
            MergeToTopKSimilaritiesReducer.class, // reducer class
            IntWritable.class,                    // reducer output key
            VectorWritable.class);                // reducer output value

    // Both the mapper and the reducer read this limit from the job configuration in setup().
    asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
    // The reducer class doubles as the combiner for this job.
    asMatrix.setCombinerClass(MergeToTopKSimilaritiesReducer.class);

    // Run the job and report failure to the caller with -1.
    if (!asMatrix.waitForCompletion(true)) {
        return -1;
    }
}
// (1) UnsymmetrifyMapper
/**
 * Mapper that, for each input row of similarities, emits
 * (a) one single-entry "transposed" vector per non-zero similarity, keyed by the
 *     column index of that similarity, and
 * (b) one vector holding the row's own top-K similarities, keyed by the row itself.
 *
 * <p>The generic parameters are spelled out so that {@code map} really overrides
 * the framework method; with a raw {@code Mapper} the {@code @Override} on
 * {@code map(IntWritable, VectorWritable, Context)} does not match the erased
 * {@code map(Object, Object, Context)} signature.
 */
public static class UnsymmetrifyMapper
        extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {

    /** Maximum number of similar items kept per item (row). */
    private int maxSimilaritiesPerRow;

    /**
     * Reads {@code MAX_SIMILARITIES_PER_ROW} from the job configuration.
     *
     * @throws IllegalArgumentException if the configured value is missing or not positive
     */
    @Override
    protected void setup(Context ctx) throws IOException, InterruptedException {
        maxSimilaritiesPerRow = ctx.getConfiguration().getInt(MAX_SIMILARITIES_PER_ROW, 0);
        Preconditions.checkArgument(maxSimilaritiesPerRow > 0, "Maximum number of similarities per row must be greater then 0!");
    }

    /**
     * Processes the similarity vector of one item.
     *
     * @param row                   index of the item this vector belongs to
     * @param similaritiesWritable  similarities of {@code row} to other items
     * @param ctx                   context used to emit the output records
     */
    @Override
    protected void map(IntWritable row, VectorWritable similaritiesWritable, Context ctx)
            throws IOException, InterruptedException {
        Vector similarities = similaritiesWritable.get();
        // Scratch vector for the transposed entries; created once and reused for every element.
        Vector transposedPartial = similarities.like();
        // Keeps at most maxSimilaritiesPerRow candidates for this row.
        TopElementsQueue topKQueue = new TopElementsQueue(maxSimilaritiesPerRow);

        for (Element nonZeroElement : similarities.nonZeroes()) {
            // Replace the queue's current top when the candidate is larger.
            // NOTE(review): presumably top() is the smallest retained element — confirm against TopElementsQueue.
            MutableElement top = topKQueue.top();
            double candidateValue = nonZeroElement.get();
            if (candidateValue > top.get()) {
                top.setIndex(nonZeroElement.index());
                top.set(candidateValue);
                topKQueue.updateTop();
            }

            // Emit the transposed entry: key = column index, value = vector with only
            // position row set. The entry is reset to 0.0 afterwards so the scratch
            // vector can be reused for the next element.
            transposedPartial.setQuick(row.get(), candidateValue);
            ctx.write(new IntWritable(nonZeroElement.index()), new VectorWritable(transposedPartial));
            transposedPartial.setQuick(row.get(), 0.0);
        }

        // Emit the top-K similarities collected for this row, keyed by the row itself.
        Vector topKSimilarities = new RandomAccessSparseVector(similarities.size(), maxSimilaritiesPerRow);
        for (Vector.Element topKSimilarity : topKQueue.getTopElements()) {
            topKSimilarities.setQuick(topKSimilarity.index(), topKSimilarity.get());
        }
        ctx.write(row, new VectorWritable(topKSimilarities));
    }
}
/**
 * Reducer that merges all partial similarity vectors received for one item and
 * keeps only the top-K entries of the merged vector.
 *
 * <p>Input and output types are identical (IntWritable key, VectorWritable value).
 * The generic parameters are spelled out so that {@code reduce} really overrides
 * the framework method; with a raw {@code Reducer} the {@code @Override} on
 * {@code reduce(IntWritable, Iterable, Context)} does not match the erased
 * {@code reduce(Object, Iterable, Context)} signature.
 */
public static class MergeToTopKSimilaritiesReducer
        extends Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable> {

    /** Maximum number of similar items kept per item (row). */
    private int maxSimilaritiesPerRow;

    /**
     * Reads {@code MAX_SIMILARITIES_PER_ROW} from the job configuration.
     *
     * @throws IllegalArgumentException if the configured value is missing or not positive
     */
    @Override
    protected void setup(Context ctx) throws IOException, InterruptedException {
        maxSimilaritiesPerRow = ctx.getConfiguration().getInt(MAX_SIMILARITIES_PER_ROW, 0);
        Preconditions.checkArgument(maxSimilaritiesPerRow > 0,"Maximum number of similarities per row must be greater then 0!");
    }

    /**
     * Combines the partial vectors of one item into a single top-K vector.
     *
     * @param row       index of the item the partial vectors belong to
     * @param partials  partial similarity vectors emitted for {@code row}
     * @param ctx       context used to emit the resulting record
     */
    @Override
    protected void reduce(IntWritable row, Iterable<VectorWritable> partials, Context ctx)
            throws IOException, InterruptedException {
        // Merge every partial vector for this item (the transposed single-entry
        // vectors and the per-row top-K vectors) into one vector.
        Vector allSimilarities = Vectors.merge(partials);
        // Take the top-K again over the merged vector.
        Vector topKSimilarities = Vectors.topKElements(maxSimilaritiesPerRow, allSimilarities);
        // Emit the final similarity vector for this item.
        ctx.write(row, new VectorWritable(topKSimilarities));
    }
}