Hadoop Programming Study 5 -- Implementing the PageRank Algorithm

PageRank, the web-page ranking algorithm, was once the treasure that made Google's fortune. It measures how important a particular page is relative to the other pages in a search engine's index.
The design of this PageRank implementation is as follows (a worked example on a tiny graph follows the list):
  1. Initialization: append an initial PageRank value of 1.0 to the end of every page's line in the raw input.
  2. Iteration: each line has the format page_name \t outlink_list (comma-separated) \t pagerank. Map distributes each page's rank evenly over its out-links; Reduce then sums, for every page, all the contributions emitted in Map and applies the simplified formula PR(p) = (1 - d) + d * Σ PR(q)/L(q), where d is the damping factor and L(q) is the number of out-links of page q.
  3. Final sorting: the previous stages still produce files in the page_name \t outlink_list (comma-separated) \t pagerank format. The outlink_list in the middle is no longer needed, so this stage drops it and sorts the pages by PageRank value in descending order.
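To make the three stages concrete, here is a tiny made-up graph of three pages A, B and C (the pages and numbers are purely illustrative, computed with d = 0.85):

    Raw input:              A \t B,C
                            B \t C
                            C \t A
    After initialization:   A \t B,C \t 1.0     (likewise for B and C)
    After one iteration:    A \t B,C \t 1.0     (= 0.85*1.0 + 0.15, C's full rank)
                            B \t C \t 0.575     (= 0.85*0.5 + 0.15, half of A's rank)
                            C \t A \t 1.425     (= 0.85*1.5 + 0.15, from A and B)
    Viewer output (if we stopped after this single iteration):
                            (C,1.4250000000)
                            (A,1.0000000000)
                            (B,0.5750000000)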
The full source code follows (four classes in total):
1. PageRank_Initialization: the initialization job
package org.apache.hadoop.examples;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PageRank_Initialization {

    public static class Map extends Mapper<Object, Text, Text, Text>
    {
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException 
        {
            String pr="1.0";  //initial PageRank value appended to every line
            context.write(value, new Text(pr));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        //check the input/output directory arguments
        if (args.length != 2) 
        {
            System.err.println("Usage: PageRank_Initialization <input> <output>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://10.102.0.197:9000");

        final String OUTPUT_PATH = args[1];  
        Path path = new Path(OUTPUT_PATH);  

        //get the FileSystem handle for the output path
        FileSystem fileSystem = path.getFileSystem(conf);

        //delete the output directory if it already exists
        if (fileSystem.exists(new Path(OUTPUT_PATH))) 
        {  
           fileSystem.delete(new Path(OUTPUT_PATH),true);  
        }  

        //job setup
        Job job = Job.getInstance(conf,"PageRank_Initialization");
        job.setJarByClass(PageRank_Initialization.class);
        job.setMapperClass(Map.class);  //use the custom Map class above
        job.setOutputKeyClass(Text.class);  //output key type (Text is Hadoop's String-like writable)
        job.setOutputValueClass(Text.class);  //output value type

        FileInputFormat.addInputPath(job, new Path(args[0]));  //FileInputFormat divides the input into block-sized splits; each split is processed by one map task
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);

    }

}
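A note on this first job: no reducer class is set, so Hadoop falls back to the default identity reducer, which simply passes the map output (the original line with "\t1.0" appended) through to HDFS. If you would rather skip the shuffle and reduce phase entirely, one optional line (not in the original code) achieves the same output:

    job.setNumReduceTasks(0);  //map-only job: map output is written directly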
2. PageRankIter: one iteration of the PageRank computation (run repeatedly by the driver)
package org.apache.hadoop.examples;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PageRankIter {

    private static final double d = 0.85;  //damping factor

    public static class Map extends Mapper<Object,Text,Text,Text>
    {
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException 
        {
            //value format: page_name \t outlink_list (comma-separated) \t pagerank
            String page[]=value.toString().split("\t"); 

            //this page's name
            String page_name=page[0];

            Text prValue = new Text();

            //if this page has out-links
            if(page.length>2)
            {
                //page_list holds all the out-links
                String page_list[]=page[1].split(",");

                double pr = Double.parseDouble(page[2]);

                //emit one record per out-link: out-link page_name \t contributed rank
                for(String list:page_list)
                {
                    if (list.isEmpty()) {
                        continue;
                    }

                    //each out-link receives an equal share of this page's rank
                    prValue.set(String.valueOf(pr / page_list.length));

                    context.write(new Text(list),prValue);
                }

                //also re-emit this page's adjacency list, marked with a leading "|"
                context.write(new Text(page_name), new Text("|"+page[1]));
            }
        }
    }

    public static class Reduce extends Reducer<Text,Text,Text,Text>
    {
        public void reduce(Text key, Iterable<Text> values, Context context)throws IOException, InterruptedException 
        {
             String list="";  //list_page_name
             double pr=0;  //pagerank

             //for every value grouped under the same key
             for(Text val:values)
             {
                 //a value starting with "|" carries this page's adjacency list
                 if(val.toString().startsWith("|"))
                     list+=val.toString().substring(1);

                 //otherwise the value is a rank contribution from a page that links here
                 else
                 {
                     //sum up the contributions from all linking pages
                     pr+=Double.parseDouble(val.toString());
                 }
             }

             //apply the simplified formula and write the result back out in the same
             //format: page_name \t outlink_list (comma-separated) \t pagerank
             pr=pr*d+(1-d);
             String v=String.valueOf(pr);

             context.write(key, new Text(list+"\t"+v));
        }
    }


    //main method
    public static void main(String[] args) throws Exception {

        //check the input/output directory arguments
        if (args.length != 2) 
        {
            System.err.println("Usage: PageRankIter <input> <output>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://10.102.0.197:9000");

        final String OUTPUT_PATH = args[1];  
        Path path = new Path(OUTPUT_PATH);  

        //get the FileSystem handle for the output path
        FileSystem fileSystem = path.getFileSystem(conf);

        //delete the output directory if it already exists
        if (fileSystem.exists(new Path(OUTPUT_PATH))) 
        {  
           fileSystem.delete(new Path(OUTPUT_PATH),true);  
        }  

        //job setup
        Job job = Job.getInstance(conf,"PageRank_Iter");
        job.setJarByClass(PageRankIter.class);
        job.setMapperClass(Map.class);  //use the custom Map class above
        job.setReducerClass(Reduce.class);  //use the custom Reduce class above
        job.setOutputKeyClass(Text.class);  //output key type
        job.setOutputValueClass(Text.class);  //output value type

        FileInputFormat.addInputPath(job, new Path(args[0]));  //FileInputFormat divides the input into block-sized splits; each split is processed by one map task
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);

    }

}
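One caveat about the Map above: a page with no out-links at all shows up after initialization as a two-field line (page_name \t 1.0), so the page.length > 2 test fails and the page silently disappears from every later iteration. If you want such dangling pages to stay in the final ranking, a minimal sketch of an extra branch (hypothetical, not part of the original code) would be:

    //hypothetical else-branch for the "if (page.length > 2)" block in Map:
    //re-emit dangling pages with an empty adjacency list so they survive;
    //their own rank mass is simply not passed on to any other page
    else if (page.length == 2)
    {
        context.write(new Text(page_name), new Text("|"));
    }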
3. PageRankViewer: sorts the pages by PageRank value in descending order
package org.apache.hadoop.examples;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PageRankViewer {

    public static class Map extends Mapper<Object,Text,DoubleWritable,Text>
    {
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException 
        {
            String line[] =value.toString().split("\t");

            DoubleWritable pr= new DoubleWritable();
            pr.set(Double.parseDouble(line[2]));

            //emit pagerank as the key and page_name as the value
            context.write(pr, new Text(line[0]));
        }
    }

    //override compare to negate the default ordering, sorting keys in descending order
    public static class DescFloatComparator extends DoubleWritable.Comparator {

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            return -super.compare(a, b);
        }

        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            return -super.compare(b1, s1, l1, b2, s2, l2);
        }
    }

    public static class Reduce extends Reducer<DoubleWritable,Text,Text,Text>
    {
        public void reduce(DoubleWritable key, Iterable<Text> values, Context context)throws IOException, InterruptedException 
        {
            //key is the pagerank value, values are the page names that share it

            //emit one "(page_name,pagerank)" line per page; writing inside the
            //loop keeps pages that happen to share a rank on separate lines
            for(Text val:values)
            {
                String out_key="("+val.toString();
                String out_val=String.format("%.10f", key.get())+")";

                context.write(new Text(out_key),new Text(out_val));
            }
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        //check the input/output directory arguments
        if (args.length != 2) 
        {
            System.err.println("Usage: PageRankViewer <input> <output>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://10.102.0.197:9000");
        conf.set("mapred.textoutputformat.ignoreseparator", "true");  
        conf.set("mapred.textoutputformat.separator", ",");  

        final String OUTPUT_PATH = args[1];  
        Path path = new Path(OUTPUT_PATH);  
        //get the FileSystem handle for the output path
        FileSystem fileSystem = path.getFileSystem(conf);

        //delete the output directory if it already exists
        if (fileSystem.exists(new Path(OUTPUT_PATH))) 
        {  
           fileSystem.delete(new Path(OUTPUT_PATH),true);  
        }  

        //job setup
        Job job = Job.getInstance(conf,"PageRankViewer");
        job.setJarByClass(PageRankViewer.class);
        job.setMapperClass(Map.class);  //use the custom Map class above
        job.setReducerClass(Reduce.class);  //use the custom Reduce class above
        job.setSortComparatorClass(DescFloatComparator.class);

        job.setMapOutputKeyClass(DoubleWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);  //output key type (Text is Hadoop's String-like writable)
        job.setOutputValueClass(Text.class);  //output value type

        FileInputFormat.addInputPath(job, new Path(args[0]));  //FileInputFormat divides the input into block-sized splits; each split is processed by one map task
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);

    }

}
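The descending order works because setSortComparatorClass swaps in our comparator at the point where map output keys are sorted during the shuffle. Keep in mind this only yields one globally sorted file when a single reduce task runs (the MapReduce default, though clusters often configure more); to be explicit you could pin it with one optional line (not in the original code):

    job.setNumReduceTasks(1);  //one reducer => one globally sorted output file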
4. PageRankDriver: the driver class that invokes the main methods of the three PageRank stages in order
package org.apache.hadoop.examples;

public class PageRankDriver {

    //main method
    public static void main(String[] args) throws Exception 
    {

        //input and output directories (hard-coded here; adapt them to your HDFS layout)
        String[] otherArgs = new String[]{"/Experiment_3","Experiment_3_Hadoop"};
        if (otherArgs.length != 2) 
        {
            System.err.println("wrong number of paths");
            System.exit(2);
        }

        //PageRank_Initialization: append the initial rank of 1.0
        String temp="temp";
        String[] PR_Ini = { otherArgs[0], temp+"0"};
        PageRank_Initialization.main(PR_Ini);

        //PageRankIter: chain the iteration job, feeding each round's output into the next round
        String[] temp_PRIter_args = { "", "" };
        int times = 10;  //number of iterations

        for (int i = 0; i < times; i++) 
        {
            temp_PRIter_args[0] = temp + i;
            temp_PRIter_args[1] = temp + (i + 1);
            PageRankIter.main(temp_PRIter_args);
        }       

        //PageRankViewer: strip the adjacency lists and sort the final ranks
        String[] final_PR = { temp + times, otherArgs[1] };
        PageRankViewer.main(final_PR);



    }

}
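To run the whole chain, package the four classes into a jar (the jar name below is made up) and launch the driver; the intermediate directories temp0 through temp10 are created on HDFS along the way:

    hadoop jar pagerank.jar org.apache.hadoop.examples.PageRankDriver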
