Initialization: append the initial PageRank value 1.0 to the end of every page line in the original document, so that every page starts with a rank of 1.
Iterative computation: each iteration reads and writes lines in the format page_name \t list_page_name (comma-separated) \t pagerank. In the Map, each page's current rank is divided evenly among its outlinks and emitted as a contribution to each of them; in the Reduce, all contributions received by the same page are summed and damped to give that page's new PageRank.
Final sorting and output: after the two steps above, the file is still in the page_name \t list_page_name \t pagerank format. The intermediate list_page_name is no longer needed, so this last step drops it and sorts the pages by PageRank in descending order.
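As a concrete illustration of the formats involved (the page names below are made up for the example, not taken from the actual data set), a line of the original adjacency file, the same line after initialization, and a line of the final sorted output look roughly like:
A \t B,C
A \t B,C \t 1.0
(A,0.xxxxxxxxxx)
where \t marks the tab separators between page_name, list_page_name, and pagerank, and the x's stand for the ten decimal places of the computed rank.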
1.PageRank_Initialzation: the initialization job; it appends the initial PageRank value 1.0 to every line of the original page file.
package org.apache.hadoop.examples;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PageRank_Initialzation {

    public static class Map extends Mapper<Object, Text, Text, Text> {
        // Emit the whole input line as the key and "1.0" as the value, so each
        // output line becomes: page_name \t list_page_name \t 1.0
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String pr = "1.0";
            context.write(value, new Text(pr));
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        // check the input and output directories
        if (args.length != 2) {
            System.err.println("Usage: <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://10.102.0.197:9000");
        final String OUTPUT_PATH = args[1];
        Path path = new Path(OUTPUT_PATH);
        // obtain the file system from the configuration
        FileSystem fileSystem = path.getFileSystem(conf);
        // delete the output directory if it already exists
        if (fileSystem.exists(new Path(OUTPUT_PATH))) {
            fileSystem.delete(new Path(OUTPUT_PATH), true);
        }
        // job setup
        Job job = Job.getInstance(conf, "PageRank_Initialzation");
        job.setJarByClass(PageRank_Initialzation.class);
        job.setMapperClass(Map.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
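Assuming the classes are packaged into a single jar (the name pagerank.jar below is only a placeholder), the initialization job can also be submitted on its own, reading the raw adjacency file and writing the initialized graph to temp0:
hadoop jar pagerank.jar org.apache.hadoop.examples.PageRank_Initialzation /Experiment_3 temp0
The same pattern works for PageRankIter and PageRankViewer, since each class has its own main method.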
2.PageRankIter: the iteration job; the Map distributes each page's current rank evenly over its outlinks, and the Reduce sums the contributions received by each page and applies the damping factor to obtain the new PageRank.
package org.apache.hadoop.examples;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PageRankIter {
    // damping factor
    private static double d = 0.85;

    public static class Map extends Mapper<Object, Text, Text, Text> {
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // input line: page_name \t list_page_name(comma-separated) \t pagerank
            String[] page = value.toString().split("\t");
            String page_name = page[0];
            Text prValue = new Text();
            if (page.length > 2) {
                String[] page_list = page[1].split(",");
                double pr = Double.parseDouble(page[2]);
                // distribute the current rank evenly over all outlinks
                for (String list : page_list) {
                    if (list.isEmpty()) {
                        continue;
                    }
                    prValue.set(String.valueOf(pr / page_list.length));
                    context.write(new Text(list), prValue);
                }
                // pass the adjacency list through, marked with a leading "|"
                context.write(new Text(page_name), new Text("|" + page[1]));
            }
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String list = "";
            double pr = 0;
            for (Text val : values) {
                if (val.toString().startsWith("|")) {
                    // recover the adjacency list
                    list += val.toString().substring(1);
                } else {
                    // sum the rank contributions from in-links
                    pr += Double.parseDouble(val.toString());
                }
            }
            // damping: PR = d * sum + (1 - d)
            pr = pr * d + (1 - d);
            String v = String.valueOf(pr);
            context.write(key, new Text(list + "\t" + v));
        }
    }

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://10.102.0.197:9000");
        final String OUTPUT_PATH = args[1];
        Path path = new Path(OUTPUT_PATH);
        FileSystem fileSystem = path.getFileSystem(conf);
        if (fileSystem.exists(new Path(OUTPUT_PATH))) {
            fileSystem.delete(new Path(OUTPUT_PATH), true);
        }
        Job job = Job.getInstance(conf, "PageRank_Iter");
        job.setJarByClass(PageRankIter.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
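A quick worked example of the reducer's update (the numbers are hypothetical): if page B receives contributions 0.5 and 0.25 from its in-links in one iteration, its new rank is 0.85 × (0.5 + 0.25) + (1 − 0.85) = 0.7875. Note that this code uses the simplified update PR = d · sum + (1 − d), i.e. the (1 − d) term is not divided by the total number of pages.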
3.PageRankViewer: the result job; it drops the adjacency list and outputs the pages sorted by PageRank in descending order as (page_name,rank).
package org.apache.hadoop.examples;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PageRankViewer {

    public static class Map extends Mapper<Object, Text, DoubleWritable, Text> {
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // input line: page_name \t list_page_name \t pagerank; keep only the name and
            // the rank, using the rank as the map output key so the framework sorts by it
            String[] line = value.toString().split("\t");
            DoubleWritable pr = new DoubleWritable();
            pr.set(Double.parseDouble(line[2]));
            context.write(pr, new Text(line[0]));
        }
    }

    // reverses the default ascending order of DoubleWritable keys so that
    // pages with the highest rank come first
    public static class DescFloatComparator extends DoubleWritable.Comparator {
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            return -super.compare(a, b);
        }

        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            return -super.compare(b1, s1, l1, b2, s2, l2);
        }
    }

    public static class Reduce extends Reducer<DoubleWritable, Text, Text, Text> {
        public void reduce(DoubleWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // emit one "(page_name,rank)" line per page; the "," comes from the
            // output separator configured in main()
            for (Text val : values) {
                context.write(new Text("(" + val.toString()),
                        new Text(String.format("%.10f", key.get()) + ")"));
            }
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length != 2) {
            System.err.println("Usage: <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://10.102.0.197:9000");
        // use "," as the separator between the output key and value
        conf.set("mapred.textoutputformat.ignoreseparator", "true");
        conf.set("mapred.textoutputformat.separator", ",");
        final String OUTPUT_PATH = args[1];
        Path path = new Path(OUTPUT_PATH);
        FileSystem fileSystem = path.getFileSystem(conf);
        if (fileSystem.exists(new Path(OUTPUT_PATH))) {
            fileSystem.delete(new Path(OUTPUT_PATH), true);
        }
        Job job = Job.getInstance(conf, "PageRankViewer");
        job.setJarByClass(PageRankViewer.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setSortComparatorClass(DescFloatComparator.class);
        job.setMapOutputKeyClass(DoubleWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
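Two details are worth noting, both following from the code above rather than any extra configuration: the map output key is the DoubleWritable rank and DescFloatComparator negates the default comparison, so with the single default reducer the output file is globally ordered from highest to lowest rank; and with the output separator set to "," in main(), each result line takes the form (page_name,0.xxxxxxxxxx).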
4.PageRankDriver: the driver class; its main method runs the main methods of the three PageRank steps in sequence.
package org.apache.hadoop.examples;

public class PageRankDriver {
    public static void main(String[] args) throws Exception {
        // HDFS input file and final output directory
        String[] otherArgs = new String[] { "/Experiment_3", "Experiment_3_Hadoop" };
        if (otherArgs.length != 2) {
            System.err.println("Usage: <input path> <output path>");
            System.exit(2);
        }
        String temp = "temp";
        // step 1: initialization, writes the initialized graph to temp0
        String[] PR_Ini = { otherArgs[0], temp + "0" };
        PageRank_Initialzation.main(PR_Ini);
        // step 2: run 10 iterations, reading temp<i> and writing temp<i+1>
        String[] temp_PRIter_args = { "", "" };
        int times = 10;
        for (int i = 0; i < times; i++) {
            temp_PRIter_args[0] = temp + i;
            temp_PRIter_args[1] = temp + (i + 1);
            PageRankIter.main(temp_PRIter_args);
        }
        // step 3: sort the last iteration's output (temp10) into the final result
        String[] final_PR = { "temp10", otherArgs[1] };
        PageRankViewer.main(final_PR);
    }
}
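Run as written, the driver produces the following chain of HDFS directories (the names are the hard-coded values above): /Experiment_3 → temp0 (initialization) → temp1 … temp10 (ten iterations) → Experiment_3_Hadoop (sorted result). The intermediate temp* directories are not cleaned up by the driver, so they remain on HDFS after the run.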