前面都是在一个非常小的测试数据集上进行推荐评估,现在我们下载一个真实的数据集来进行评估。
在 http://www.grouplens.org/node/73 上找到并下载 ml-100k.zip,解压后在其中找到 ua.base 文件,作为我们的评估数据集。这是一个制表符分隔的文件,FileDataModel 既可以装载我们前面用到的逗号分隔的文件,也同样可以装载这种制表符分隔的文件。这个数据集中有100000个偏好值,相比前面的小数据集,执行时间可能会比较长。下面是修改后的代码:
package com.besttone.mahout.demo.recommender;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
import org.apache.mahout.cf.taste.eval.RecommenderEvaluator;
import org.apache.mahout.cf.taste.impl.eval.AverageAbsoluteDifferenceRecommenderEvaluator;
import org.apache.mahout.cf.taste.impl.eval.RMSRecommenderEvaluator;
import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood;
import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender;
import org.apache.mahout.cf.taste.impl.recommender.slopeone.SlopeOneRecommender;
import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
import org.apache.mahout.cf.taste.recommender.Recommender;
import org.apache.mahout.cf.taste.similarity.UserSimilarity;
import org.apache.mahout.common.RandomUtils;

/**
 * Evaluates a user-based recommender (Pearson correlation similarity with a
 * nearest-N user neighborhood) against the {@code ua.base} data set and prints
 * the resulting score.
 */
public class TestRecommenderEvaluator {

    /** Classpath resource, resolved relative to {@code MyFirstRecommender}, holding the preferences. */
    private static final String DATA_RESOURCE = "ua.base";

    /** Number of nearest users that form each user's neighborhood. */
    private static final int NEIGHBORHOOD_SIZE = 2;

    /** Share of the evaluated data that is used for training. */
    private static final double TRAINING_PERCENTAGE = 0.7;

    /** Share of the whole data set that takes part in the evaluation (1.0 = 100%). */
    private static final double EVALUATION_PERCENTAGE = 1.0;

    /**
     * Loads the data set, evaluates the user-based recommender and prints the score.
     *
     * @param args unused
     * @throws IOException if the data resource is missing or cannot be read
     * @throws TasteException if building or evaluating the recommender fails
     */
    public static void main(String[] args) throws IOException, TasteException {
        // Fix the random seed so the training/test split is the same on every run.
        RandomUtils.useTestSeed();

        // The data model stores the preferences and supplies the user IDs,
        // item IDs and preference values needed by the computation.
        DataModel dataModel = loadDataModel(DATA_RESOURCE);

        // Scores by average absolute difference; RMSRecommenderEvaluator can be
        // substituted here to score by root-mean-square instead.
        RecommenderEvaluator evaluator = new AverageAbsoluteDifferenceRecommenderEvaluator();

        RecommenderBuilder builder = new RecommenderBuilder() {
            @Override
            public Recommender buildRecommender(DataModel model) throws TasteException {
                UserSimilarity similarity = new PearsonCorrelationSimilarity(model);
                UserNeighborhood neighborhood =
                        new NearestNUserNeighborhood(NEIGHBORHOOD_SIZE, similarity, model);
                // Alternative engine: return new SlopeOneRecommender(model);
                return new GenericUserBasedRecommender(model, neighborhood, similarity);
            }
        };

        double score = evaluator.evaluate(
                builder, null, dataModel, TRAINING_PERCENTAGE, EVALUATION_PERCENTAGE);
        // The accompanying article reports 0.8761682242990649 for this run; on a
        // 1-to-5 preference scale that is not bad, but not particularly good either.
        System.out.println(score);
    }

    /**
     * Resolves a classpath resource next to {@code MyFirstRecommender} and opens it
     * as a file-backed data model.
     *
     * @param resourceName name of the resource to load
     * @return a data model backed by the resolved file
     * @throws IOException if the resource is absent, is not a plain file on disk,
     *         or cannot be read
     */
    private static DataModel loadDataModel(String resourceName) throws IOException {
        URL location = MyFirstRecommender.class.getResource(resourceName);
        if (location == null) {
            // Without this check a missing file surfaces as a bare NullPointerException.
            throw new FileNotFoundException(
                    "Resource '" + resourceName + "' not found next to MyFirstRecommender");
        }
        if (!"file".equals(location.getProtocol())) {
            throw new IOException(
                    "Resource '" + resourceName + "' is not a plain file: " + location);
        }
        try {
            // toURI() decodes percent-escapes (e.g. %20), which URL.getPath() leaves in place.
            return new FileDataModel(new File(location.toURI()));
        } catch (URISyntaxException e) {
            throw new IOException("Cannot convert resource location to a file: " + location, e);
        }
    }
}
不同的推荐程序得出的评估值各不相同。一般需要对多个推荐程序分别进行评估,从中找出最适合你业务场景的那一个,即评估值最小(误差最小)的推荐程序。
下面用slope-one推荐程序来进行评估:
package com.besttone.mahout.demo.recommender;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
import org.apache.mahout.cf.taste.eval.RecommenderEvaluator;
import org.apache.mahout.cf.taste.impl.eval.AverageAbsoluteDifferenceRecommenderEvaluator;
import org.apache.mahout.cf.taste.impl.eval.RMSRecommenderEvaluator;
import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood;
import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender;
import org.apache.mahout.cf.taste.impl.recommender.slopeone.SlopeOneRecommender;
import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
import org.apache.mahout.cf.taste.recommender.Recommender;
import org.apache.mahout.cf.taste.similarity.UserSimilarity;
import org.apache.mahout.common.RandomUtils;

/**
 * Evaluates a slope-one recommender against the {@code ua.base} data set and
 * prints the resulting score.
 */
public class TestRecommenderEvaluator {

    /** Classpath resource, resolved relative to {@code MyFirstRecommender}, holding the preferences. */
    private static final String DATA_RESOURCE = "ua.base";

    /** Share of the evaluated data that is used for training. */
    private static final double TRAINING_PERCENTAGE = 0.7;

    /** Share of the whole data set that takes part in the evaluation (1.0 = 100%). */
    private static final double EVALUATION_PERCENTAGE = 1.0;

    /**
     * Loads the data set, evaluates the slope-one recommender and prints the score.
     *
     * @param args unused
     * @throws IOException if the data resource is missing or cannot be read
     * @throws TasteException if building or evaluating the recommender fails
     */
    public static void main(String[] args) throws IOException, TasteException {
        // Fix the random seed so the training/test split is the same on every run.
        RandomUtils.useTestSeed();

        // The data model stores the preferences and supplies the user IDs,
        // item IDs and preference values needed by the computation.
        DataModel dataModel = loadDataModel(DATA_RESOURCE);

        // Scores by average absolute difference; RMSRecommenderEvaluator can be
        // substituted here to score by root-mean-square instead.
        RecommenderEvaluator evaluator = new AverageAbsoluteDifferenceRecommenderEvaluator();

        RecommenderBuilder builder = new RecommenderBuilder() {
            @Override
            public Recommender buildRecommender(DataModel model) throws TasteException {
                // Slope-one engine; the user-based alternative would combine
                // PearsonCorrelationSimilarity, NearestNUserNeighborhood and
                // GenericUserBasedRecommender instead.
                return new SlopeOneRecommender(model);
            }
        };

        double score = evaluator.evaluate(
                builder, null, dataModel, TRAINING_PERCENTAGE, EVALUATION_PERCENTAGE);
        System.out.println(score);
    }

    /**
     * Resolves a classpath resource next to {@code MyFirstRecommender} and opens it
     * as a file-backed data model.
     *
     * @param resourceName name of the resource to load
     * @return a data model backed by the resolved file
     * @throws IOException if the resource is absent, is not a plain file on disk,
     *         or cannot be read
     */
    private static DataModel loadDataModel(String resourceName) throws IOException {
        URL location = MyFirstRecommender.class.getResource(resourceName);
        if (location == null) {
            // Without this check a missing file surfaces as a bare NullPointerException.
            throw new FileNotFoundException(
                    "Resource '" + resourceName + "' not found next to MyFirstRecommender");
        }
        if (!"file".equals(location.getProtocol())) {
            throw new IOException(
                    "Resource '" + resourceName + "' is not a plain file: " + location);
        }
        try {
            // toURI() decodes percent-escapes (e.g. %20), which URL.getPath() leaves in place.
            return new FileDataModel(new File(location.toURI()));
        } catch (URISyntaxException e) {
            throw new IOException("Cannot convert resource location to a file: " + location, e);
        }
    }
}