(二) 基于物品的CF
了解了 User CF,Mahout Item CF 的实现与 User CF 类似,是基于 ItemSimilarity,下面我们看实现的代码例子,它比 User CF 更简单,因为 Item CF 中并不需要引入邻居的概念:
// Load the preference data from a local file.
File preferencesFile = new File("preferences.dat");
DataModel model = new FileDataModel(preferencesFile);

// Item-to-item similarity, computed here with the Pearson correlation implementation.
ItemSimilarity similarity = new PearsonCorrelationSimilarity(model);

// The item-based recommender is built from the model and the similarity only;
// no neighborhood object is passed in.
Recommender recommender = new GenericItemBasedRecommender(model, similarity);
首先来分析一下 GenericItemBasedRecommender 这个类,它的核心推荐方法 recommend 如下:
@Override public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException { Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1"); log.debug("Recommending items for user ID '{}'", userID); PreferenceArray preferencesFromUser = getDataModel().getPreferencesFromUser(userID); if (preferencesFromUser.length() == 0) { return Collections.emptyList(); } //得到其他所有可能的item FastIDSet possibleItemIDs = getAllOtherItems(userID, preferencesFromUser); //创建评估器 TopItems.Estimator<Long> estimator = new Estimator(userID, preferencesFromUser); //获取评测分数最高的howMany个item返回 List<RecommendedItem> topItems = TopItems.getTopItems(howMany, possibleItemIDs.iterator(), rescorer, estimator); log.debug("Recommendations are: {}", topItems); return topItems; }
1、获取其他可能的items
首先取出该 userID 用户评价过的所有 item,然后得到评价过其中每个 item 的 user 列表,再将这些 user 评价过的 item 全部添加到一个集合之中,最后从该集合中去除该 userID 用户自己已经评价过的 item,得到我们想要的候选 item 集合。
protected FastIDSet getAllOtherItems(long userID, PreferenceArray preferencesFromUser) throws TasteException { return candidateItemsStrategy.getCandidateItems(userID, preferencesFromUser, dataModel); } //这里的candidateItemsStrategy的出处 public GenericItemBasedRecommender(DataModel dataModel, ItemSimilarity similarity) { this(dataModel, similarity, AbstractRecommender.getDefaultCandidateItemsStrategy(), getDefaultMostSimilarItemsCandidateItemsStrategy()); } protected static CandidateItemsStrategy getDefaultCandidateItemsStrategy() { return new PreferredItemsNeighborhoodCandidateItemsStrategy(); } public final class PreferredItemsNeighborhoodCandidateItemsStrategy extends AbstractCandidateItemsStrategy { /* * 首先根据该user所评论过的所有item,然后得到每个item对应进行评价过的user列表, * 然后将对应的每个user所评价过的item都添加到一个集合之中 * 最后将该集合中该user评价过的item去除掉,得到我们想要的集合 * */ @Override protected FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel) throws TasteException { FastIDSet possibleItemsIDs = new FastIDSet(); for (long itemID : preferredItemIDs) { PreferenceArray itemPreferences = dataModel.getPreferencesForItem(itemID); int numUsersPreferringItem = itemPreferences.length(); for (int index = 0; index < numUsersPreferringItem; index++) { possibleItemsIDs.addAll(dataModel.getItemIDsFromUser(itemPreferences.getUserID(index))); } } possibleItemsIDs.removeAll(preferredItemIDs); return possibleItemsIDs; } }
2、创建评估器
对上一步得到的候选 item 逐个进行评估:先计算候选 item 与该 userID 用户评价过的每一个 item 之间的相似度,再以相似度为权重,对该用户给这些 item 的评分做加权平均(即 Σ(相似度×评分) / Σ相似度),得到的值就是对这个候选 item 的评估值;如果有效(非 NaN)的相似度不超过 1 个,则返回 NaN,表示无法给出评估值。
private final class Estimator implements TopItems.Estimator<Long> { @Override public double estimate(Long itemID) throws TasteException { return doEstimatePreference(userID, preferencesFromUser, itemID); } } protected float doEstimatePreference(long userID, PreferenceArray preferencesFromUser, long itemID) throws TasteException { double preference = 0.0; double totalSimilarity = 0.0; int count = 0; //用userid用户所有评价过的item与itemid的物品进行相似度计算 double[] similarities = similarity.itemSimilarities(itemID, preferencesFromUser.getIDs()); for (int i = 0; i < similarities.length; i++) { double theSimilarity = similarities[i]; if (!Double.isNaN(theSimilarity)) { // Weights can be negative! preference += theSimilarity * preferencesFromUser.getValue(i); totalSimilarity += theSimilarity; count++; } } // Throw out the estimate if it was based on no data points, of course, but also if based on // just one. This is a bit of a band-aid on the 'stock' item-based algorithm for the moment. // The reason is that in this case the estimate is, simply, the user's rating for one item // that happened to have a defined similarity. The similarity score doesn't matter, and that // seems like a bad situation. if (count <= 1) { return Float.NaN; } float estimate = (float) (preference / totalSimilarity); if (capper != null) { estimate = capper.capEstimate(estimate); } return estimate; }
3、获取评测分数最高的howMany个item返回
将上边可能item列表中的item的得分,都插入到一个优先队列中,保留评估值最高的howMany个item,作为最后的推荐结果返回
public static List<RecommendedItem> getTopItems(int howMany, LongPrimitiveIterator possibleItemIDs, IDRescorer rescorer, Estimator<Long> estimator) throws TasteException { Preconditions.checkArgument(possibleItemIDs != null, "argument is null"); Preconditions.checkArgument(estimator != null, "argument is null"); Queue<RecommendedItem> topItems = new PriorityQueue<RecommendedItem>(howMany + 1, Collections.reverseOrder(ByValueRecommendedItemComparator.getInstance())); boolean full = false; double lowestTopValue = Double.NEGATIVE_INFINITY; while (possibleItemIDs.hasNext()) { long itemID = possibleItemIDs.next(); if (rescorer == null || !rescorer.isFiltered(itemID)) { double preference; try { //得到该item的平均得分作为user的预测评分 preference = estimator.estimate(itemID); } catch (NoSuchItemException nsie) { continue; } double rescoredPref = rescorer == null ? preference : rescorer.rescore(itemID, preference); if (!Double.isNaN(rescoredPref) && (!full || rescoredPref > lowestTopValue)) { topItems.add(new GenericRecommendedItem(itemID, (float) rescoredPref)); if (full) { topItems.poll(); } else if (topItems.size() > howMany) { full = true; topItems.poll(); } lowestTopValue = topItems.peek().getValue(); } } } int size = topItems.size(); if (size == 0) { return Collections.emptyList(); } List<RecommendedItem> result = Lists.newArrayListWithCapacity(size); result.addAll(topItems); Collections.sort(result, ByValueRecommendedItemComparator.getInstance()); return result; }