/**
 * Pairwise-similarity phase: computes the similarity between items.
 *
 * Input records are (userID, VectorWritable). Output records are keyed by an item
 * index and carry a VectorWritable of similarities to other items. The mapper walks
 * the index-sorted entries of each input vector from the current position onward, so
 * every key is only paired with indices that are not smaller than its own, i.e. only
 * the upper triangle of the similarity matrix is produced.
 *
 * The map side performs the aggregate step; the reduce side turns the aggregated
 * values into similarity values.
 */
if (shouldRunNextPhase(parsedArgs, currentPhase)) {
  Job similarityJob = prepareJob(
      weightsPath,                // input path, records are (userID, VectorWritable)
      pairwiseSimilarityPath,     // output path for the computed similarities
      CooccurrencesMapper.class,  // mapper: does the aggregate work on each input vector
      IntWritable.class,          // mapper output key
      VectorWritable.class,       // mapper output value
      SimilarityReducer.class,    // reducer: computes the similarity values
      IntWritable.class,          // reducer output key
      VectorWritable.class);      // reducer output value
  similarityJob.setCombinerClass(VectorSumReducer.class);

  Configuration jobConf = similarityJob.getConfiguration();

  // Locations of the side data read by the mapper and the reducer in setup().
  jobConf.set(NORMS_PATH, normsPath.toString());
  jobConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
  jobConf.set(MAXVALUES_PATH, maxValuesPath.toString());

  // Scalar settings.
  jobConf.set(THRESHOLD, String.valueOf(threshold));
  jobConf.set(SIMILARITY_CLASSNAME, similarityClassname);
  jobConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
  jobConf.setBoolean(EXCLUDE_SELF_SIMILARITY, excludeSelfSimilarity);

  // Optional child-JVM tuning, intentionally left disabled:
  // jobConf.set("mapred.child.java.opts",
  //     "-Xmx2048m -Xms2048m -Xmn512m -XX:SurvivorRatio=5 -XX:MaxPermSize=128m");

  // Run the job and abort the whole pipeline if it fails.
  if (!similarityJob.waitForCompletion(true)) {
    return -1;
  }
}
(1)CooccurrencesMapper
域:
/** Similarity measure; instantiated in setup() from the class name stored under SIMILARITY_CLASSNAME. */
private VectorSimilarityMeasure similarity;

/**
 * Int-to-int map loaded in setup() from NUM_NON_ZERO_ENTRIES_PATH.
 * NOTE(review): presumably holds non-zero entry counts consulted when pruning pairs;
 * the consumer is not visible here, so confirm what the keys index.
 */
private OpenIntIntHashMap numNonZeroEntries;

/**
 * Vector loaded in setup() from MAXVALUES_PATH.
 * NOTE(review): presumably holds maximum values consulted when pruning pairs;
 * the consumer is not visible here, so confirm what the indices refer to.
 */
private Vector maxValues;

/** Pruning threshold; when it equals NO_THRESHOLD, map() accepts every pair without calling consider(). */
private double threshold;

/**
 * Orders vector elements by ascending index.
 *
 * The type argument is required: with a raw Comparator the method below would take
 * (Object, Object), so a compare(Vector.Element, Vector.Element) marked @Override
 * would not compile.
 */
private static final Comparator<Vector.Element> BY_INDEX = new Comparator<Vector.Element>() {
  @Override
  public int compare(Vector.Element one, Vector.Element two) {
    return Ints.compare(one.index(), two.index());
  }
};
方法:
1.1 setup()函数的实现
/**
 * Initializes the mapper from the job configuration: instantiates the similarity
 * measure and loads the side data stored under NUM_NON_ZERO_ENTRIES_PATH and
 * MAXVALUES_PATH, then parses the pruning threshold.
 *
 * @param ctx task context giving access to the job configuration
 * @throws IOException declared by the method signature
 * @throws InterruptedException declared by the method signature
 */
protected void setup(Context ctx) throws IOException, InterruptedException {
  // Similarity measure implementation, chosen by class name.
  String measureClassName = ctx.getConfiguration().get(SIMILARITY_CLASSNAME);
  similarity = ClassUtils.instantiateAs(measureClassName, VectorSimilarityMeasure.class);

  // Int map stored at the configured non-zero-entries path.
  Path nonZeroEntriesFile = new Path(ctx.getConfiguration().get(NUM_NON_ZERO_ENTRIES_PATH));
  numNonZeroEntries = Vectors.readAsIntMap(nonZeroEntriesFile, ctx.getConfiguration());

  // Vector stored at the configured max-values path.
  Path maxValuesFile = new Path(ctx.getConfiguration().get(MAXVALUES_PATH));
  maxValues = Vectors.read(maxValuesFile, ctx.getConfiguration());

  // Threshold used by map() to decide whether a pair has to pass consider().
  String thresholdText = ctx.getConfiguration().get(THRESHOLD);
  threshold = Double.parseDouble(thresholdText);
}
1.2 map()函数的实现
/**
 * Emits, for every entry A of the input vector, a sparse vector holding the aggregate
 * value of A with every entry B whose index is not smaller than A's.
 *
 * The entries are first sorted by index, and the inner loop starts at the outer
 * position, so each unordered pair is produced once (upper triangle, diagonal
 * included). A pair is kept when the threshold is NO_THRESHOLD or when consider()
 * accepts it; otherwise it is counted as pruned.
 *
 * @param column key of the input record (the original notes describe it as the user ID)
 * @param occurrenceVector the entries belonging to that key
 * @param ctx task context used for output and counters
 * @throws IOException if writing the output fails
 * @throws InterruptedException if the task is interrupted while writing
 */
protected void map(IntWritable column, VectorWritable occurrenceVector, Context ctx) throws IOException, InterruptedException {
  Vector.Element[] occurrences = Vectors.toArray(occurrenceVector);
  // Sorting by index guarantees that B's index is never smaller than A's below.
  Arrays.sort(occurrences, BY_INDEX);

  // long, not int: a vector with k entries yields k * (k + 1) / 2 pairs, which
  // overflows int from k = 65536. Counter.increment() takes a long anyway.
  long cooccurrences = 0;       // pairs that were kept
  long prunedCooccurrences = 0; // pairs rejected by consider()

  for (int n = 0; n < occurrences.length; n++) {
    Vector.Element occurrenceA = occurrences[n];
    Vector dots = new RandomAccessSparseVector(Integer.MAX_VALUE, occurrences.length);
    // Start at n so that only the upper triangle (including the diagonal) is computed.
    for (int m = n; m < occurrences.length; m++) {
      Vector.Element occurrenceB = occurrences[m];
      if (threshold == NO_THRESHOLD || consider(occurrenceA, occurrenceB)) {
        // Both entries come from the same input vector: store their aggregate under B's index.
        dots.setQuick(occurrenceB.index(), similarity.aggregate(occurrenceA.get(), occurrenceB.get()));
        cooccurrences++;
      } else {
        prunedCooccurrences++;
      }
    }
    // One output record per entry A: (index of A, aggregates with the following entries).
    ctx.write(new IntWritable(occurrenceA.index()), new VectorWritable(dots));
  }

  ctx.getCounter(Counters.COOCCURRENCES).increment(cooccurrences);
  ctx.getCounter(Counters.PRUNED_COOCCURRENCES).increment(prunedCooccurrences);
}
}
(2) SimilarityReducer
域:
// Similarity measure; instantiated in setup() and used in reduce() to turn aggregated values into similarities.
private VectorSimilarityMeasure similarity;
// Read from NUMBER_OF_COLUMNS in setup() (must be > 0) and passed to similarity.similarity().
// NOTE(review): the original notes say the cosine measure ignores this value - confirm against the measure in use.
private int numberOfColumns;
// When true, reduce() sets the entry at the row's own index to 0 before writing, dropping sim(x, x).
private boolean excludeSelfSimilarity;
// Vector loaded from NORMS_PATH in setup(); reduce() looks it up by row index and by element index.
private Vector norms;
// Minimum similarity: reduce() keeps a value only when it is >= this threshold. (Field name spelling is as in the code.)
private double treshold;
方法:
2.1 setup()函数的实现
/**
 * Initializes the reducer from the job configuration: instantiates the similarity
 * measure, reads and validates the number of columns, reads the self-similarity
 * flag, loads the norms vector and parses the output threshold.
 *
 * @param ctx task context giving access to the job configuration
 * @throws IOException declared by the method signature
 * @throws InterruptedException declared by the method signature
 * @throws IllegalArgumentException if the configured number of columns is not positive
 */
protected void setup(Context ctx) throws IOException, InterruptedException {
  // Similarity measure implementation, chosen by class name.
  String measureClassName = ctx.getConfiguration().get(SIMILARITY_CLASSNAME);
  similarity = ClassUtils.instantiateAs(measureClassName, VectorSimilarityMeasure.class);

  // -1 is the "not configured" default and is rejected by the check below.
  numberOfColumns = ctx.getConfiguration().getInt(NUMBER_OF_COLUMNS, -1);
  Preconditions.checkArgument(numberOfColumns > 0, "Number of columns must be greater then 0! But numberOfColumns = " + numberOfColumns);

  // Whether reduce() zeroes the similarity of a row with itself.
  excludeSelfSimilarity = ctx.getConfiguration().getBoolean(EXCLUDE_SELF_SIMILARITY, false);

  // Norms vector stored at the configured path.
  Path normsFile = new Path(ctx.getConfiguration().get(NORMS_PATH));
  norms = Vectors.read(normsFile, ctx.getConfiguration());

  String thresholdText = ctx.getConfiguration().get(THRESHOLD);
  treshold = Double.parseDouble(thresholdText);
}
2.2 reduce() 函数的实现
/**
 * Sums the partial aggregate vectors received for one row and converts every summed
 * entry into a similarity value.
 *
 * A similarity is kept only when it is at least the configured threshold. When
 * self-similarity is excluded, the entry at the row's own index is set to 0 before
 * the result is written.
 *
 * The Iterable/Iterator type arguments are required: with raw types next() returns
 * Object and the call to get() does not compile.
 *
 * @param row index of the row (item) being processed
 * @param partialDots partial aggregate vectors emitted for this row; the first element
 *        is read unconditionally, so at least one value is expected
 * @param ctx task context used for output
 * @throws IOException if writing the output fails
 * @throws InterruptedException if the task is interrupted while writing
 */
protected void reduce(IntWritable row, Iterable<VectorWritable> partialDots, Context ctx) throws IOException, InterruptedException {
  // Accumulate all partial vectors into the first one.
  Iterator<VectorWritable> partialDotsIterator = partialDots.iterator();
  Vector dots = partialDotsIterator.next().get();
  while (partialDotsIterator.hasNext()) {
    Vector toAdd = partialDotsIterator.next().get();
    for (Element nonZeroElement : toAdd.nonZeroes()) {
      dots.setQuick(nonZeroElement.index(), dots.getQuick(nonZeroElement.index()) + nonZeroElement.get());
    }
  }

  // Result vector of the same kind as the accumulated one.
  Vector similarities = dots.like();
  // Norm of the current row, taken from the vector loaded in setup().
  double normA = norms.getQuick(row.get());

  // Each similarity is derived from the summed aggregate and the norms of both rows.
  for (Element b : dots.nonZeroes()) {
    double similarityValue = similarity.similarity(b.get(), normA, norms.getQuick(b.index()), numberOfColumns);
    if (similarityValue >= treshold) {
      similarities.set(b.index(), similarityValue);
    }
  }

  // Optionally drop the similarity of the row with itself.
  if (excludeSelfSimilarity) {
    similarities.setQuick(row.get(), 0);
  }
  ctx.write(row, new VectorWritable(similarities));
}