Mahout中基于Item的协同过滤之pairwiseSimilarity

		/**
		 * pairwiseSimilarity phase: computes the similarity between items.
		 * Input records:  (userID, VectorWritable).
		 * Output records: (itemM, VectorWritable). The description of the vector contents
		 * was lost in the original note; per that note, the entries for item M only refer
		 * to items at or after M (M+1, M+2, ...), i.e. mainly the upper triangular matrix
		 * is computed.
		 * The map phase performs the "aggregate" step; the reduce phase computes the
		 * actual similarity values.
		 */
		if (shouldRunNextPhase(parsedArgs, currentPhase)) {
			Job similarityJob = prepareJob(
					weightsPath,                  // input path, records are (userID, VectorWritable)
					pairwiseSimilarityPath,       // output path for the computed similarities
					CooccurrencesMapper.class,    // mapper: does the aggregate work per input vector
					IntWritable.class,            // mapper output key
					VectorWritable.class,         // mapper output value
					SimilarityReducer.class,      // reducer: turns aggregates into similarities
					IntWritable.class,            // reducer output key
					VectorWritable.class);        // reducer output value
			// Partial vectors for the same key are summed before they reach the reducer.
			similarityJob.setCombinerClass(VectorSumReducer.class);

			// Hand every setting the mapper and reducer read in setup() to the job.
			Configuration jobConf = similarityJob.getConfiguration();
			jobConf.set(THRESHOLD, String.valueOf(threshold));
			jobConf.set(NORMS_PATH, normsPath.toString());
			jobConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
			jobConf.set(MAXVALUES_PATH, maxValuesPath.toString());
			jobConf.set(SIMILARITY_CLASSNAME, similarityClassname);
			jobConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
			jobConf.setBoolean(EXCLUDE_SELF_SIMILARITY, excludeSelfSimilarity);
			// Optional child-JVM tuning, intentionally left disabled:
			//   mapred.child.java.opts =
			//   "-Xmx2048m -Xms2048m -Xmn512m -XX:SurvivorRatio=5 -XX:MaxPermSize=128m"

			// Block until the job finishes; a failed job aborts the whole run with -1.
			if (!similarityJob.waitForCompletion(true)) {
				return -1;
			}
		}

(1) CooccurrencesMapper

域:

		// Similarity measure implementation; instantiated in setup() from the class name
		// stored under SIMILARITY_CLASSNAME. map() calls its aggregate() method.
		private VectorSimilarityMeasure similarity;
		// Int-to-int map loaded in setup() from NUM_NON_ZERO_ENTRIES_PATH.
		// Original note: records each user's non-zero rated items.
		// NOTE(review): what the map is keyed by is not visible here — confirm.
		private OpenIntIntHashMap numNonZeroEntries;
		// Vector loaded in setup() from MAXVALUES_PATH.
		// Original note: the maximum value in each user's rating history.
		// NOTE(review): the meaning of the vector index is not visible here — confirm.
		private Vector maxValues;
		// Threshold parsed from the THRESHOLD setting. In map(), when it equals NO_THRESHOLD
		// every pair is kept; otherwise consider() decides whether a pair is valid.
		private double threshold;

		/**
		 * Orders vector elements by ascending {@code index()}; map() uses it to sort the
		 * elements of an input vector before pairing them up.
		 *
		 * The comparator is parameterized with {@code Vector.Element}: with a raw
		 * {@code Comparator} the {@code @Override} on
		 * {@code compare(Vector.Element, Vector.Element)} does not match
		 * {@code compare(Object, Object)} and the class fails to compile.
		 */
		private static final Comparator<Vector.Element> BY_INDEX = new Comparator<Vector.Element>() {
			@Override
			public int compare(Vector.Element one, Vector.Element two) {
				return Ints.compare(one.index(), two.index());
			}
		};


方法:

1.1 setup()函数的实现

		/**
		 * Initializes the mapper from the job configuration: the similarity measure,
		 * the non-zero-entry counts, the max-values vector and the threshold.
		 *
		 * @param ctx task context that supplies the job configuration
		 * @throws IOException declared by the signature; presumably raised when a side file cannot be read
		 * @throws InterruptedException declared by the signature
		 */
		protected void setup(Context ctx) throws IOException, InterruptedException {
			// Similarity measure, created from its configured class name.
			String measureClassName = ctx.getConfiguration().get(SIMILARITY_CLASSNAME);
			similarity = ClassUtils.instantiateAs(measureClassName, VectorSimilarityMeasure.class);

			// Non-zero entry counts, read as an int map from the configured path.
			Path nonZeroCountsFile = new Path(ctx.getConfiguration().get(NUM_NON_ZERO_ENTRIES_PATH));
			numNonZeroEntries = Vectors.readAsIntMap(nonZeroCountsFile, ctx.getConfiguration());

			// Maximum values, read as a vector from the configured path.
			Path maxValuesFile = new Path(ctx.getConfiguration().get(MAXVALUES_PATH));
			maxValues = Vectors.read(maxValuesFile, ctx.getConfiguration());

			// Threshold used in map() to decide whether a pair is worth keeping.
			String thresholdText = ctx.getConfiguration().get(THRESHOLD);
			threshold = Double.parseDouble(thresholdText);
		}



1.2 map()函数的实现

		/**
		 * Emits, for every element of the input vector, one sparse vector of aggregate
		 * values against the elements at the same or a later position (after sorting by
		 * index), so only the upper triangle of the pair matrix is produced.
		 *
		 * @param column           key of the input record
		 * @param occurrenceVector vector of the input record (per the original note: one user's ratings)
		 * @param ctx              task context used for output and counters
		 * @throws IOException declared by the signature (ctx.write)
		 * @throws InterruptedException declared by the signature (ctx.write)
		 */
		protected void map(IntWritable column, VectorWritable occurrenceVector, Context ctx) throws IOException, InterruptedException {
			// Work on the elements in ascending index order.
			Vector.Element[] sorted = Vectors.toArray(occurrenceVector);
			Arrays.sort(sorted, BY_INDEX);
			final int count = sorted.length;

			int kept = 0;   // pairs that were aggregated
			int pruned = 0; // pairs rejected by the threshold check

			for (int row = 0; row < count; row++) {
				Vector.Element first = sorted[row];
				Vector partials = new RandomAccessSparseVector(Integer.MAX_VALUE, count);

				// Start at 'row' itself: the pair (first, first) is included.
				for (int col = row; col < count; col++) {
					Vector.Element second = sorted[col];

					// With a real threshold configured, skip pairs that consider() rejects.
					if (threshold != NO_THRESHOLD && !consider(first, second)) {
						pruned++;
						continue;
					}

					double aggregated = similarity.aggregate(first.get(), second.get());
					partials.setQuick(second.index(), aggregated);
					kept++;
				}

				// One output record per element, keyed by that element's index.
				ctx.write(new IntWritable(first.index()), new VectorWritable(partials));
			}

			ctx.getCounter(Counters.COOCCURRENCES).increment(kept);
			ctx.getCounter(Counters.PRUNED_COOCCURRENCES).increment(pruned);
		}
	}



(2) SimilarityReducer

域:

		// Similarity measure implementation; instantiated in setup() from SIMILARITY_CLASSNAME.
		private VectorSimilarityMeasure similarity;
		// Read from NUMBER_OF_COLUMNS in setup() (must be > 0) and passed to similarity.similarity().
		// Original note: not actually used by the cosine measure, only kept because the
		// parameter is inherited from the parent class — not verifiable from this file.
		private int numberOfColumns;
		// When true, reduce() zeroes the entry for the row itself, i.e. simi(x, x) is not kept.
		private boolean excludeSelfSimilarity;
		// Norm values loaded in setup() from NORMS_PATH; reduce() looks them up by item index.
		private Vector norms;
		// Threshold (field name spelled "treshold" in the code): reduce() only stores
		// similarity values that are >= this value, everything else is dropped.
		private double treshold;

方法:


2.1 setup()函数的实现

/**
 * Initializes the reducer from the job configuration: the similarity measure,
 * the column count, the self-similarity flag, the norms vector and the threshold.
 *
 * @param ctx task context that supplies the job configuration
 * @throws IOException declared by the signature; presumably raised when the norms file cannot be read
 * @throws InterruptedException declared by the signature
 */
protected void setup(Context ctx) throws IOException, InterruptedException {
	// Similarity measure, created from its configured class name.
	String measureClassName = ctx.getConfiguration().get(SIMILARITY_CLASSNAME);
	similarity = ClassUtils.instantiateAs(measureClassName, VectorSimilarityMeasure.class);

	// The column count defaults to -1 when missing, which the check below rejects.
	numberOfColumns = ctx.getConfiguration().getInt(NUMBER_OF_COLUMNS, -1);
	Preconditions.checkArgument(numberOfColumns > 0, "Number of columns must be greater then 0! But numberOfColumns = " + numberOfColumns);

	// Whether simi(x, x) is removed from the result; defaults to keeping it.
	excludeSelfSimilarity = ctx.getConfiguration().getBoolean(EXCLUDE_SELF_SIMILARITY, false);

	// Norms vector, read from the configured path.
	Path normsFile = new Path(ctx.getConfiguration().get(NORMS_PATH));
	norms = Vectors.read(normsFile, ctx.getConfiguration());

	// Minimum similarity value that reduce() keeps.
	String thresholdText = ctx.getConfiguration().get(THRESHOLD);
	treshold = Double.parseDouble(thresholdText);
}



2.2 reduce() 函数的实现

	/**
	 * Sums the partial aggregate vectors emitted for one item and converts the summed
	 * aggregates into similarity values.
	 *
	 * The value types are declared as {@code Iterable<VectorWritable>} and
	 * {@code Iterator<VectorWritable>}: with raw types, {@code next()} returns
	 * {@code Object} and the {@code .get()} calls below do not compile.
	 *
	 * @param row         index of the item this group of partial vectors belongs to
	 * @param partialDots partial aggregate vectors produced by the map/combine phase for {@code row}
	 * @param ctx         task context used to write the result
	 * @throws IOException declared by the signature (ctx.write)
	 * @throws InterruptedException declared by the signature (ctx.write)
	 */
	protected void reduce(IntWritable row, Iterable<VectorWritable> partialDots, Context ctx) throws IOException, InterruptedException {
		// Accumulate all partial vectors into the first one, entry by entry.
		Iterator<VectorWritable> partialDotsIterator = partialDots.iterator();
		Vector dots = partialDotsIterator.next().get();
		while (partialDotsIterator.hasNext()) {
			Vector toAdd = partialDotsIterator.next().get();
			for (Element nonZeroElement : toAdd.nonZeroes()) {
				dots.setQuick(nonZeroElement.index(), dots.getQuick(nonZeroElement.index()) + nonZeroElement.get());
			}
		}

		// Result vector created via like() from the accumulated vector.
		Vector similarities = dots.like();

		// Norm of the current item; the norms vector was loaded in setup().
		double normA = norms.getQuick(row.get());

		// Derive each similarity from the aggregate value and the two items' norms,
		// keeping only the values that reach the threshold.
		for (Element b : dots.nonZeroes()) {
			double similarityValue = similarity.similarity(b.get(), normA, norms.getQuick(b.index()), numberOfColumns);
			if (similarityValue >= treshold) {
				similarities.set(b.index(), similarityValue);
			}
		}

		// Optionally drop the similarity of the item to itself, simi(row, row).
		if (excludeSelfSimilarity) {
			similarities.setQuick(row.get(), 0);
		}

		ctx.write(row, new VectorWritable(similarities));
	}


















你可能感兴趣的:(Mahout,Mahout)