Each output record of the fourth MR job holds the item ID, that item's co-occurrence column, the user IDs who rated the item, and those users' preference values:

101  {107:1.0,106:2.0,105:2.0,104:4.0,103:4.0,102:3.0,101:5.0}  [5 1 4 2 3]  [4.0 5.0 5.0 2.0 2.5]
...

WiKiDriver4.java:
package org.fansy.date1012.mahoutinaction.chapter6.sourcecode;

import static org.fansy.date1012.mahoutinaction.chapter6.sourcecode.WiKiUtils.PATH;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.mahout.cf.taste.hadoop.item.VectorAndPrefsWritable;
import org.apache.mahout.cf.taste.hadoop.item.VectorOrPrefWritable;

public class WiKiDriver4 {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf1 = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf1, args).getRemainingArgs();
        if (otherArgs.length != 3) {
            System.err.println("Usage: WiKiDriver4 <in1> <in2> <out>");
            System.exit(2);
        }
        Job job1 = new Job(conf1, "wiki job four");
        job1.setNumReduceTasks(1);
        job1.setJarByClass(WiKiDriver4.class);
        job1.setInputFormatClass(SequenceFileInputFormat.class);
        job1.setMapperClass(WikiMapper4.class);
        job1.setMapOutputKeyClass(IntWritable.class);
        job1.setMapOutputValueClass(VectorOrPrefWritable.class);
        job1.setReducerClass(WiKiReducer4.class);
        job1.setOutputKeyClass(IntWritable.class);
        job1.setOutputValueClass(VectorAndPrefsWritable.class);
        job1.setOutputFormatClass(SequenceFileOutputFormat.class);
        // two inputs: the co-occurrence vectors and the user preference vectors
        SequenceFileInputFormat.addInputPath(job1, new Path(PATH + otherArgs[0]));
        SequenceFileInputFormat.addInputPath(job1, new Path(PATH + otherArgs[1]));
        SequenceFileOutputFormat.setOutputPath(job1, new Path(PATH + otherArgs[2]));
        if (!job1.waitForCompletion(true)) {
            System.exit(1); // run error then exit
        }
    }
}

WiKiMapper4.java:
package org.fansy.date1012.mahoutinaction.chapter6.sourcecode;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.cf.taste.hadoop.item.VectorOrPrefWritable;

// Identity mapper: both inputs are already keyed by item ID, so the map phase
// just forwards each record and lets the shuffle group them per item.
public class WikiMapper4 extends Mapper<IntWritable, VectorOrPrefWritable, IntWritable, VectorOrPrefWritable> {

    public void map(IntWritable key, VectorOrPrefWritable value, Context context)
            throws IOException, InterruptedException {
        context.write(key, value);
    }
}

WiKiReducer4.java:
package org.fansy.date1012.mahoutinaction.chapter6.sourcecode;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.mahout.cf.taste.hadoop.item.VectorAndPrefsWritable;
import org.apache.mahout.cf.taste.hadoop.item.VectorOrPrefWritable;
import org.apache.mahout.math.Vector;

public class WiKiReducer4 extends Reducer<IntWritable, VectorOrPrefWritable, IntWritable, VectorAndPrefsWritable> {

    public void reduce(IntWritable key, Iterable<VectorOrPrefWritable> values, Context context)
            throws IOException, InterruptedException {
        List<Long> userfs = new ArrayList<Long>();
        List<Float> prefs = new ArrayList<Float>();
        Vector v = null;
        // each item's group holds exactly one co-occurrence vector plus one
        // (userID, pref) pair for every user who rated the item
        for (VectorOrPrefWritable value : values) {
            if (value.getVector() != null) {
                v = value.getVector();
            } else {
                userfs.add(value.getUserID());
                prefs.add(value.getValue());
            }
        }
        context.write(key, new VectorAndPrefsWritable(v, userfs, prefs));
        // System.out.println("key, itemid:" + key.toString() + ", information:" + v + "," + userfs + "," + prefs);
    }
}

The fifth MR job:
map: for every user in each record of MR4's output, multiply that user's preference value by the item's co-occurrence vector. For example, for user 3 in the first record above: Vectorforuser3 = [1.0 2.0 2.0 4.0 4.0 3.0 5.0] * 2.5, and the map emits key: 3, value: Vectorforuser3. The map output should look like this:
alluserids:[5, 1, 4, 2, 3]
,userid:5,vector:{107:4.0,106:8.0,105:8.0,104:16.0,103:16.0,102:12.0,101:20.0}
,userid:1,vector:{107:5.0,106:10.0,105:10.0,104:20.0,103:20.0,102:15.0,101:25.0}
,userid:4,vector:{107:5.0,106:10.0,105:10.0,104:20.0,103:20.0,102:15.0,101:25.0}
,userid:2,vector:{107:2.0,106:4.0,105:4.0,104:8.0,103:8.0,102:6.0,101:10.0}
,userid:3,vector:{107:2.5,106:5.0,105:5.0,104:10.0,103:10.0,102:7.5,101:12.5}
...
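As a sanity check on the map arithmetic, here is a minimal standalone sketch (MapStepCheck is a throwaway name, not part of the jobs) that rebuilds item 101's record from above and reproduces user 3's partial vector:

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;

public class MapStepCheck {
    public static void main(String[] args) {
        // item 101's co-occurrence column, copied from the MR4 output above
        Vector coo = new RandomAccessSparseVector(Integer.MAX_VALUE);
        coo.set(101, 5.0);
        coo.set(102, 3.0);
        coo.set(103, 4.0);
        coo.set(104, 4.0);
        coo.set(105, 2.0);
        coo.set(106, 2.0);
        coo.set(107, 1.0);
        // user 3's preference for item 101 is 2.5 (the last entry of [4.0 5.0 5.0 2.0 2.5])
        Vector partial = coo.times(2.5);
        // prints a vector equal to {107:2.5,106:5.0,105:5.0,104:10.0,103:10.0,102:7.5,101:12.5}
        System.out.println(partial);
    }
}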
Combine: for the map output, add together the vectors that share the same key (the userID); the resulting sum is that user's predicted score vector over all items. A combiner is safe here because vector addition is associative and commutative, so partial sums done on the map side cannot change the final result.
The combine output should look like this:
userid:1,vector:{107:5.0,106:18.0,105:15.5,104:33.5,103:39.0,102:31.5,101:44.0}
userid:2,vector:{107:4.0,106:20.5,105:15.5,104:36.0,103:41.5,102:32.5,101:45.5}
...
Reduce: for the combine output, filter out the items the user has already rated, then sort the rest by predicted score in descending order and emit them; these are the items recommended to the user.
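The "keep only the best-scoring items" part is done with a bounded min-heap, a common top-N idiom that WiKiReducer5 below also uses. A minimal standalone sketch of just that idiom, with plain floats standing in for RecommendedItem (TopNDemo is a made-up name):

import java.util.PriorityQueue;
import java.util.Queue;

public class TopNDemo {
    public static void main(String[] args) {
        int n = 3; // keep the 3 highest scores
        float[] scores = {26.5f, 20.0f, 17.5f, 12.5f, 7.5f};
        // min-heap of at most n elements: the head is always the weakest retained score
        Queue<Float> top = new PriorityQueue<Float>(n + 1);
        for (float score : scores) {
            if (top.size() < n) {
                top.add(score);
            } else if (score > top.peek()) {
                top.add(score);   // admit the better score...
                top.poll();       // ...and evict the current minimum
            }
        }
        System.out.println(top); // the three largest values, in heap order
    }
}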
The final output is:
1  [104:33.5,106:18.0,105:15.5,107:5.0]
2  [106:20.5,105:15.5,107:4.0]
3  [103:26.5,102:20.0,106:17.5]
4  [102:37.0,105:26.0,107:9.5]
5  [107:11.5]

WiKiDriver5.java:
package org.fansy.date1012.mahoutinaction.chapter6.sourcecode;

import static org.fansy.date1012.mahoutinaction.chapter6.sourcecode.WiKiUtils.PATH;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable;
import org.apache.mahout.math.VarLongWritable;
import org.apache.mahout.math.VectorWritable;

public class WiKiDriver5 {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf1 = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf1, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: WiKiDriver5 <in> <out>");
            System.exit(2);
        }
        Job job1 = new Job(conf1, "wiki job five");
        job1.setNumReduceTasks(1);
        job1.setJarByClass(WiKiDriver5.class);
        job1.setInputFormatClass(SequenceFileInputFormat.class);
        job1.setMapperClass(WikiMapper5.class);
        job1.setMapOutputKeyClass(VarLongWritable.class);
        job1.setMapOutputValueClass(VectorWritable.class);
        job1.setCombinerClass(WiKiCombiner5.class);
        job1.setReducerClass(WiKiReducer5.class);
        job1.setOutputKeyClass(VarLongWritable.class);
        job1.setOutputValueClass(RecommendedItemsWritable.class);
        // plain text output this time, so the final recommendations are human-readable
        // job1.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileInputFormat.addInputPath(job1, new Path(PATH + otherArgs[0]));
        FileOutputFormat.setOutputPath(job1, new Path(PATH + otherArgs[1]));
        if (!job1.waitForCompletion(true)) {
            System.exit(1); // run error then exit
        }
    }
}

WiKiMapper5.java:
package org.fansy.date1012.mahoutinaction.chapter6.sourcecode;

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.cf.taste.hadoop.item.VectorAndPrefsWritable;
import org.apache.mahout.math.VarLongWritable;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

public class WikiMapper5 extends Mapper<IntWritable, VectorAndPrefsWritable, VarLongWritable, VectorWritable> {

    public void map(IntWritable key, VectorAndPrefsWritable vectorAndPref, Context context)
            throws IOException, InterruptedException {
        Vector coo = vectorAndPref.getVector();
        List<Long> userIds = vectorAndPref.getUserIDs();
        List<Float> prefValues = vectorAndPref.getValues();
        // System.out.println("alluserids:" + userIds);
        for (int i = 0; i < userIds.size(); i++) {
            long userID = userIds.get(i);
            float prefValue = prefValues.get(i);
            // scale this item's co-occurrence column by the user's preference value
            Vector par = coo.times(prefValue);
            context.write(new VarLongWritable(userID), new VectorWritable(par));
            // System.out.println(",userid:" + userID + ",vector:" + par);
            // if user 3's partial vector matches the hand calculation above, the logic is right
        }
    }
}

WiKiCombiner5.java:
package org.fansy.date1012.mahoutinaction.chapter6.sourcecode;

import java.io.IOException;

import org.apache.hadoop.mapreduce.Reducer;
import org.apache.mahout.math.VarLongWritable;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

public class WiKiCombiner5 extends Reducer<VarLongWritable, VectorWritable, VarLongWritable, VectorWritable> {

    public void reduce(VarLongWritable key, Iterable<VectorWritable> values, Context context)
            throws IOException, InterruptedException {
        // sum this user's partial vectors
        Vector partial = null;
        for (VectorWritable v : values) {
            partial = partial == null ? v.get() : partial.plus(v.get());
        }
        context.write(key, new VectorWritable(partial));
        System.out.println("userid:" + key.toString() + ",vector:" + partial); // should match the hand calculation above
    }
}

WiKiReducer5.java:
package org.fansy.date1012.mahoutinaction.chapter6.sourcecode;

import static org.fansy.date1012.mahoutinaction.chapter6.sourcecode.WiKiUtils.*;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Queue;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable;
import org.apache.mahout.cf.taste.impl.common.FastMap;
import org.apache.mahout.cf.taste.impl.recommender.ByValueRecommendedItemComparator;
import org.apache.mahout.cf.taste.impl.recommender.GenericRecommendedItem;
import org.apache.mahout.cf.taste.recommender.RecommendedItem;
import org.apache.mahout.math.VarLongWritable;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

public class WiKiReducer5 extends Reducer<VarLongWritable, VectorWritable, VarLongWritable, RecommendedItemsWritable> {

    private int recommendationsPerUser = RECOMMENDATIONSPERUSER;
    private String path = JOB1OUTPATH;
    private static FastMap<Integer, String> map = new FastMap<Integer, String>();

    // load job 1's output (userID -> the items that user already rated) so that
    // already-rated items can be filtered out of the recommendations
    public void setup(Context context) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(path), conf);
        Path tempPath = new Path(path);
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(fs, tempPath, conf);
            Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
            Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
            while (reader.next(key, value)) {
                map.put(Integer.parseInt(key.toString()), value.toString());
                // System.out.println(key.toString() + "," + value.toString());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public void reduce(VarLongWritable key, Iterable<VectorWritable> values, Context context)
            throws IOException, InterruptedException {
        int userID = (int) key.get();
        // finish the summation the combiner started
        Vector rev = null;
        for (VectorWritable vec : values) {
            rev = rev == null ? vec.get() : rev.plus(vec.get());
        }
        // bounded min-heap that keeps only the top recommendationsPerUser items
        Queue<RecommendedItem> topItems = new PriorityQueue<RecommendedItem>(
                recommendationsPerUser + 1,
                Collections.reverseOrder(ByValueRecommendedItemComparator.getInstance()));
        Iterator<Vector.Element> recommendationVectorIterator = rev.iterateNonZero();
        while (recommendationVectorIterator.hasNext()) {
            Vector.Element e = recommendationVectorIterator.next();
            int index = e.index();
            System.out.println("Vector.Element.index:" + index); // tested: the index is the item id
            if (!hasItem(userID, String.valueOf(index))) {
                float value = (float) e.get();
                if (topItems.size() < recommendationsPerUser) {
                    topItems.add(new GenericRecommendedItem(index, value));
                } else if (value > topItems.peek().getValue()) {
                    topItems.add(new GenericRecommendedItem(index, value));
                    topItems.poll();
                }
            }
        }
        List<RecommendedItem> recom = new ArrayList<RecommendedItem>(topItems.size());
        recom.addAll(topItems);
        Collections.sort(recom, ByValueRecommendedItemComparator.getInstance());
        context.write(key, new RecommendedItemsWritable(recom));
    }

    // check whether the user has already rated the item; note that a plain substring
    // test can give false positives (e.g. item "10" matches "101"), which is tolerable
    // for this small example but should be a real parse in production
    public static boolean hasItem(int user, String item) {
        boolean flag = false;
        String items = map.get(user);
        if (items != null && items.contains(item)) {
            flag = true;
        }
        return flag;
    }
}
While writing this code I had to look up a lot of the Mahout API, since many of these classes come from Mahout and you need to understand how they are used. In the last Reducer I also used a FastMap, another Mahout class; presumably using the classes Mahout provides makes things run faster.
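For reference, a minimal sketch of FastMap usage (FastMapDemo and the value string are made up for illustration). FastMap implements java.util.Map, so it drops in wherever a HashMap would go:

import java.util.Map;

import org.apache.mahout.cf.taste.impl.common.FastMap;

public class FastMapDemo {
    public static void main(String[] args) {
        // a drop-in Map implementation from Mahout, tuned for lower per-entry overhead
        Map<Integer, String> ratedItems = new FastMap<Integer, String>();
        ratedItems.put(3, "101:2.5,104:4.0,105:4.5,107:5.0"); // hypothetical value string
        System.out.println(ratedItems.get(3));
    }
}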
Finally, a few words about the algorithm:
In "Mahout in Action" the algorithm is first presented like this:
Multiply the co-occurrence (similarity) matrix by the user's preference vector to get the user's predicted scores for all items (U3's rating for item 101 there should be 2.5; I suspect a misprint in the book).
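To make that concrete: taking item 103's co-occurrence column from the book's sample data, [4 3 4 3 1 2 0] over items 101 to 107 (an assumption here, since only item 101's column appears above), and user 3's preferences 101:2.5, 104:4.0, 105:4.5, 107:5.0, the predicted score for item 103 is 4 x 2.5 + 3 x 4.0 + 1 x 4.5 + 0 x 5.0 = 26.5, which matches the 103:26.5 entry in the final output above.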
But for the code implementation the book suggests a different arrangement: instead of a full matrix-times-vector product per user, multiply each item's co-occurrence column by that user's preference for the item and sum the partial products, which is exactly what the map and combine stages above do. Doing it this way is more efficient.
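A toy sketch of why the two arrangements agree (ColumnWiseDemo is a throwaway name; the 2x2 values are the 101/102 corner of the co-occurrence matrix, with 102's diagonal taken from the book's data):

public class ColumnWiseDemo {
    public static void main(String[] args) {
        // rows/columns are items 101 and 102; user 3 rated only item 101 (2.5)
        double[][] cooc = { {5.0, 3.0}, {3.0, 3.0} };
        double[] prefs = { 2.5, 0.0 };
        double[] scores = new double[2];
        // column-by-column accumulation: scale one column by one preference and
        // add it in, exactly what map (times) and combine (plus) do above
        for (int item = 0; item < 2; item++) {
            if (prefs[item] == 0.0) {
                continue; // unrated items' columns contribute nothing
            }
            for (int row = 0; row < 2; row++) {
                scores[row] += cooc[row][item] * prefs[item];
            }
        }
        // same result as the full matrix-vector product: 12.5, 7.5,
        // matching user 3's partial vector entries for 101 and 102 above
        System.out.println(scores[0] + ", " + scores[1]);
    }
}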
Share, enjoy, grow.