Mahout基于item的协同过滤之asMatrix

		// Phase "asMatrix".
		// Input:  pairwiseSimilarityPath; output: getOutputPath().
		// Both the map output and the final output are (IntWritable, VectorWritable) records,
		// i.e. one similarity vector keyed by an item index.
		// According to the original author's note this phase
		//   1. keeps only the top-N most similar items for every item, and
		//   2. derives the lower triangle of the similarity matrix from the already
		//      computed upper triangle.
		if (shouldRunNextPhase(parsedArgs, currentPhase)) {
			Job asMatrix = prepareJob(
					pairwiseSimilarityPath,                // input path
					getOutputPath(),                       // output path
					UnsymmetrifyMapper.class,              // mapper
					IntWritable.class,                     // mapper output key
					VectorWritable.class,                  // mapper output value
					MergeToTopKSimilaritiesReducer.class,  // reducer
					IntWritable.class,                     // reducer output key
					VectorWritable.class);                 // reducer output value

			// The per-row limit is handed to mapper/reducer through the job configuration.
			asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
			// The reducer class doubles as the combiner.
			asMatrix.setCombinerClass(MergeToTopKSimilaritiesReducer.class);

			// Block until the job finishes; a failed job aborts with -1.
			if (!asMatrix.waitForCompletion(true)) {
				return -1;
			}
		}

(1)UnsymmetrifyMapper

	/**
	 * Mapper of the asMatrix job.
	 *
	 * <p>For every input row {@code (row, similarities)} it emits
	 * <ul>
	 *   <li>one single-entry vector per non-zero element, keyed by that element's index and
	 *       holding the value at position {@code row} (the transposed entry), and</li>
	 *   <li>one vector keyed by {@code row} holding the elements collected in the top-K queue.</li>
	 * </ul>
	 *
	 * <p>The type parameters are required: with a raw {@code Mapper} the typed
	 * {@code map(IntWritable, VectorWritable, Context)} below would not override
	 * {@code Mapper.map} and the {@code @Override} annotation would be rejected.
	 */
	public static class UnsymmetrifyMapper
			extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {

		/** Maximum number of similar items kept per row; read from the job configuration. */
		private int maxSimilaritiesPerRow;

		/**
		 * Reads {@code MAX_SIMILARITIES_PER_ROW} from the configuration.
		 *
		 * @throws IllegalArgumentException (via {@code Preconditions.checkArgument}, presumably Guava)
		 *         if the value is missing or not positive
		 */
		@Override
		protected void setup(Context ctx) throws IOException, InterruptedException {
			maxSimilaritiesPerRow = ctx.getConfiguration().getInt(MAX_SIMILARITIES_PER_ROW, 0);
			Preconditions.checkArgument(maxSimilaritiesPerRow > 0, "Maximum number of similarities per row must be greater then 0!");
		}

		/**
		 * Emits the transposed entries of one similarity row plus the row's top-K entries.
		 *
		 * @param row                   index of the item this row belongs to
		 * @param similaritiesWritable  similarity values of {@code row} against other items
		 * @param ctx                   Hadoop context the output records are written to
		 */
		@Override
		protected void map(IntWritable row, VectorWritable similaritiesWritable, Context ctx) throws IOException, InterruptedException {

			Vector similarities = similaritiesWritable.get();
			// Scratch vector reused for every transposed entry; it is reset after each write.
			Vector transposedPartial = similarities.like();
			// Collects at most maxSimilaritiesPerRow candidates for this row.
			// NOTE(review): top() is presumably the smallest element currently retained — confirm
			// against TopElementsQueue.
			TopElementsQueue topKQueue = new TopElementsQueue(maxSimilaritiesPerRow);
			for (Element nonZeroElement : similarities.nonZeroes()) {
				MutableElement top = topKQueue.top();
				double candidateValue = nonZeroElement.get();
				// Replace the current top in place when the candidate is larger, then re-order the queue.
				if (candidateValue > top.get()) {
					top.setIndex(nonZeroElement.index());
					top.set(candidateValue);
					topKQueue.updateTop();
				}

				// Transposed entry: key = column index, vector holds the value at position `row`.
				transposedPartial.setQuick(row.get(), candidateValue);
				ctx.write(new IntWritable(nonZeroElement.index()), new VectorWritable(transposedPartial));
				// Clear the entry so the scratch vector can be reused in the next iteration.
				transposedPartial.setQuick(row.get(), 0.0);
			}

			// Emit the collected top-K entries of this row under the row's own key.
			Vector topKSimilarities = new RandomAccessSparseVector(similarities.size(), maxSimilaritiesPerRow);
			for (Vector.Element topKSimilarity : topKQueue.getTopElements()) {
				topKSimilarities.setQuick(topKSimilarity.index(), topKSimilarity.get());
			}
			ctx.write(row, new VectorWritable(topKSimilarities));
		}
	}


(2)MergeToTopKSimilaritiesReducer

	/**
	 * Reducer (also registered as combiner) of the asMatrix job.
	 *
	 * <p>Merges all partial vectors received for one row and writes the result of
	 * {@code Vectors.topKElements} for that row.
	 *
	 * <p>The type parameters are required: with a raw {@code Reducer} and a raw
	 * {@code Iterable} the typed {@code reduce} below would not override
	 * {@code Reducer.reduce} and the {@code @Override} annotation would be rejected.
	 * Input and output types are identical, which is what allows the class to be used
	 * as a combiner as well.
	 */
	public static class MergeToTopKSimilaritiesReducer
			extends Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable> {

		/** Maximum number of similar items kept per row; read from the job configuration. */
		private int maxSimilaritiesPerRow;

		/**
		 * Reads {@code MAX_SIMILARITIES_PER_ROW} from the configuration.
		 *
		 * @throws IllegalArgumentException (via {@code Preconditions.checkArgument}, presumably Guava)
		 *         if the value is missing or not positive
		 */
		@Override
		protected void setup(Context ctx) throws IOException, InterruptedException {
			maxSimilaritiesPerRow = ctx.getConfiguration().getInt(MAX_SIMILARITIES_PER_ROW, 0);
			Preconditions.checkArgument(maxSimilaritiesPerRow > 0,"Maximum number of similarities per row must be greater then 0!");
		}

		/**
		 * Combines the partial vectors of one row and emits its top-K entries.
		 *
		 * @param row       index of the item the partial vectors belong to
		 * @param partials  vectors emitted for {@code row} by the mapper (transposed single-entry
		 *                  vectors and top-K vectors) or by a previous combiner pass
		 * @param ctx       Hadoop context the output record is written to
		 */
		@Override
		protected void reduce(IntWritable row, Iterable<VectorWritable> partials, Context ctx) throws IOException, InterruptedException {
			// Fold all partial vectors for this row into a single vector.
			// NOTE(review): exact merge semantics live in Vectors.merge — confirm there.
			Vector allSimilarities = Vectors.merge(partials);
			// Re-apply the per-row limit on the merged vector.
			Vector topKSimilarities = Vectors.topKElements(maxSimilaritiesPerRow, allSimilarities);
			// One output record per row.
			ctx.write(row, new VectorWritable(topKSimilarities));
		}
	}



你可能感兴趣的:(Mahout,Mahout)