// Hadoop MapReduce simulation of the PageRank algorithm

package package1.pagerank;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class MyPageRank {

    public static class PageRankMap extends  Mapper {
        
        private IntWritable id;
        private String pr;
        private int count;
        private float average_pr;
        
        public void map(Object key, Text value, Context context)
        {
            StringTokenizer str = new StringTokenizer(value.toString());
            if(str.hasMoreTokens())
            {
                id = new IntWritable(Integer.parseInt(str.nextToken()));
            }else{
                return;
            }
            pr = str.nextToken();
            count = str.countTokens();
            average_pr = Float.parseFloat(pr)/count;
            while(str.hasMoreTokens())
            {
                try{
                    String nextId = str.nextToken();
                    IntWritable linid = new IntWritable(Integer.parseInt(nextId));
                    //将网页向外链接的ID以“pr+得到贡献值”格式输出  
                    Text avpr = new Text("pr" + average_pr);  
                    context.write(linid, avpr);  
                    // 将网页ID和PR值输出  
                    Text ids = new Text("id" + nextId);  
                    context.write(id, ids);
                }catch(IOException e)
                {
                    e.printStackTrace();
                }catch (InterruptedException e) {  
                    e.printStackTrace();  
                }
            }
        }    
    }
    
    public static class PageRankReducer extends Reducer{
        public void reduce(IntWritable key, Iterable values,  
                Context context) {  
 
            // 定义一个存储网页链接ID的队列  
            ArrayList ids = new ArrayList();  
            // 将所有的链接ID以String格式保存  
            String strid = " ";  
            // 定义一个保存网页PR值的变量  
            float pr = 0;  
            //遍历
            System.out.println(key.get());
            for(Text txt : values) {  
                  String str = txt.toString();  
                //判断value是贡献值还是向外部的链接  
                  if (str.startsWith("pr")) {  
                    // 贡献值  
                    pr += Float.parseFloat(str.substring(2));
                    System.out.println(pr);
                } else if (str.startsWith("id")) {  
                    // 链接id  
                    String id = str.substring(2);  
                    ids.add(id);  
                }  
            }  
            
            // 得到所有链接ID的String形式  
            for (int i = 0; i < ids.size(); i++) {  
                strid += ids.get(i) + "  ";  
            }  
            // 组合pr+lianjie成原文件的格式类型
            String strpr = String.format("%.5f", pr);
            String result = strpr + strid;  
            try {  
                context.write(key, new Text(result));
            } catch (IOException e) {  
                e.printStackTrace();  
            } catch (InterruptedException e) {  
                e.printStackTrace();  
            }  
        }  
    }
    
    public static void main(String[] args) throws IOException,  
        InterruptedException, ClassNotFoundException {  
    
        Configuration conf = new Configuration();  
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        String paths= otherArgs[0];
        String path1= paths;
        String path2="";
        for (int i = 1; i <= 10; i++) {  
            Job job = new Job(conf, "MapReduce pagerank");  
            path2 = paths + i;  
            job.setJarByClass(MyPageRank.class);  
            job.setMapperClass(PageRankMap.class);  
            job.setReducerClass(PageRankReducer.class);  
            job.setOutputKeyClass(IntWritable.class);  
            job.setOutputValueClass(Text.class);  
            FileInputFormat.addInputPath(job, new Path(path1));  
            FileOutputFormat.setOutputPath(job, new Path(path2));  
            path1 = path2;  
            job.waitForCompletion(true);  
        
        }  
    
    }

}

// You might also be interested in: (hadoop)