pagerank-mapreduce

The source of the original program code has been lost.

The PageRank program under MapReduce consists of four parts: a mapper, a combiner, a reducer, and a main function that drives the iterative computation.

Lines 15-60 (the PageRankMapper class; line numbers refer to the original source file): the mapper's input key and value are the position of the line in the raw data (a LongWritable offset) and the content of that line (Text); the data format is the same as the PageRank input data used by BCBSP. The processing flow, with a sample input line and the corresponding mapper output shown after this list, is:
    a) Lines 22-34: parse the input value; the vertex id goes into id, the current PageRank value into pr, and the out-edge list into the array border[];
    b) Line 35 computes the per-edge average PageRank value average_pr, to be sent along every out-edge;
    c) The for loop starting at line 36 handles one edge per pass: context.write(borderId, bdContribution) (line 46) "sends" the "@"-prefixed average_pr value to borderId, which amounts to sending a message; context.write(id, bdid) (line 47) emits the "&"-prefixed out-edge back under id, carrying the graph topology forward so the next iteration can proceed.
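
To make the format concrete, here is a hypothetical input line in the "vertexId:pr<TAB>edge-list" layout inferred from the parsing code (the value after each edge's ':' is carried along but not otherwise used), followed by the key-value pairs the mapper would emit for it:

	1:1.0	2:0 3:0

	// mapper output, with average_pr = 1.0 / 2 = 0.5:
	// ("2", "@0.5")   rank contribution sent to vertex 2
	// ("1", "&2:0")   topology kept with vertex 1
	// ("3", "@0.5")   rank contribution sent to vertex 3
	// ("1", "&3:0")   topology kept with vertex 1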


The Combiner starting at line 62 (PageRankCombiner) merges the values destined for the same borderId: it sums the "@"-prefixed messages into a single partial contribution, gathers the "&"-prefixed out-edge ids into one list, and re-emits both. A worked example follows.
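
For example, assuming one map task locally emitted ("2", "@0.5"), ("2", "@0.25") and ("2", "&4:0"), the combiner invocation for key "2" would produce:

	// ("2", "@0.75")  partial sum of the contributions
	// ("2", "&4:0")   re-assembled out-edge list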


The reducer starting at line 95 (PageRankReducer) receives <id, List<value>>. It sums the "@"-prefixed messages in List<value> to compute the new PageRank value, which is joined with id to form the reducer's outKey; the "&"-prefixed edge data is reassembled into the reducer's outValue. The output therefore has the same format as the input and can feed the next iteration. The damping arithmetic is sketched below.
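
A minimal sketch of the reducer's damping step, assuming damping factor d = 0.85 and a graph of N = 10000 vertices (1 / N = 0.0001, which is hard-coded in the reducer; sum = 0.75 is an arbitrary example value):

	float d = 0.85f;
	float n = 10000f;
	float sum = 0.75f;                   // sum of the "@" contributions
	float newPr = (1 - d) / n + d * sum; // == 0.0001f * 0.15f + sum * 0.85f
	                                     // == 0.637515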


Besides setting the job configuration parameters, the main function starting at line 125 drives a for loop that runs the map-reduce job for N iterations, each round's output becoming the next round's input. A sample invocation is shown below.
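
A hypothetical invocation (jar name and paths are placeholders), matching the argument order main checks, <iterations> <in> <out> <reducers>:

	hadoop jar pagerank.jar PageRankHadoop 10 /user/demo/pr-in /user/demo/pr-out 4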


import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class PageRankHadoop {
	public static class PageRankMapper extends Mapper<LongWritable, Text, Text, Text> {
		static enum ErrorCounters { ERROR };

		public void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// Each input line: "<vertexId>:<pageRank>\t<edge> <edge> ...".
			String data[] = value.toString().split("\t");
			if (data.length != 2) {
				context.getCounter(ErrorCounters.ERROR).increment(1);
				return;
			}

			String vertex[] = data[0].split(":");
			if (vertex.length != 2) {
				context.getCounter(ErrorCounters.ERROR).increment(1);
				return;
			}
			Text id = new Text(vertex[0]);
			String pr = vertex[1];
			String border[] = data[1].split(" ");
			// Split the current rank evenly over all out-edges.
			float average_pr = Float.parseFloat(pr) / border.length;
			for (String bd : border) {
				String tmp_border[] = bd.split(":");
				if (tmp_border.length != 2) {
					context.getCounter(ErrorCounters.ERROR).increment(1);
					return;
				}
				// "@"-prefixed value: the rank contribution sent to the
				// target vertex (the "message" of this superstep).
				Text bdContribution = new Text("@" + String.valueOf(average_pr));
				Text borderId = new Text(tmp_border[0]);
				context.write(borderId, bdContribution);
				// "&"-prefixed value: the out-edge kept with the source
				// vertex, so the topology survives into the next iteration.
				Text bdid = new Text("&" + bd);
				context.write(id, bdid);
			}
		}
	}

	public static class PageRankCombiner extends
			Reducer<Text, Text, Text, Text> {
		public void reduce(Text key, Iterable<Text> value, Context context)
				throws IOException, InterruptedException {
			String border = "";
			float pr = 0;
			for (Text id : value) {
				String idtmp = id.toString();
				if (idtmp.startsWith("@")) {
					// Partially sum the rank contributions for this key.
					pr += Float.parseFloat(idtmp.substring(1));
				} else if (idtmp.startsWith("&")) {
					// Collect out-edges into one space-separated list.
					border += idtmp.substring(1) + " ";
				}
			}
			if (border.length() > 0)
				border = border.substring(0, border.length() - 1);
			context.write(key, new Text("@" + pr));
			if (border.length() != 0)
				context.write(key, new Text("&" + border));
		}

	}

	public static class PageRankReducer extends Reducer<Text, Text, Text, Text> {
		public void reduce(Text key, Iterable<Text> value, Context context)
				throws IOException, InterruptedException {
			String border = "";
			float pr = 0;
			for (Text id : value) {
				String idtmp = id.toString();
				if (idtmp.startsWith("@")) {
					pr += Float.parseFloat(idtmp.substring(1));
				} else if (idtmp.startsWith("&")) {
					border += idtmp.substring(1) + " ";
				}
			}
			// Damping step: pr = (1 - d) / N + d * sum, with d = 0.85 and
			// the vertex count N hard-coded to 10000 (1 / N = 0.0001).
			pr = 0.0001f * 0.15f + pr * (1 - 0.15f);
			// Emit "<id>:<pr>\t<edge list>", the same format the mapper reads.
			Text outKey = new Text(key.toString() + ":" + pr);
			if (border.length() > 0)
				border = border.substring(0, border.length() - 1);
			context.write(outKey, new Text(border));
		}
	}

	public static void main(String args[]) throws IOException {
		long start = System.currentTimeMillis();
		Configuration conf = new Configuration();
		String otherArgs[] = new GenericOptionsParser(conf, args)
				.getRemainingArgs();
		if (otherArgs.length != 4) {
			System.out.println("Usage: PageRankHadoop <number of iterations> <in> <out> <number of reducers>");
			System.exit(1);
		}

		String input = otherArgs[1];
		String output = otherArgs[2];
		int reducerNum = Integer.parseInt(otherArgs[3]);
		FileSystem fs;
		System.out.println(otherArgs[0] + " " + input + " " + output + " " + reducerNum);
		try {
			fs = FileSystem.get(conf);
			int superstep = Integer.parseInt(otherArgs[0]);
			for (int i = 0; i < superstep; i++) {
				fs.delete(new Path(output), true);
				System.out.println("superstep "+i+" "+input +" "+ output+" "+reducerNum);
				
				Job job = Job.getInstance(conf, "PageRankHadoop");
				job.setJarByClass(PageRankHadoop.class);

				job.setMapperClass(PageRankMapper.class);
				job.setMapOutputKeyClass(Text.class);
				job.setMapOutputValueClass(Text.class);

				job.setCombinerClass(PageRankCombiner.class);

				// The original job set a custom MD5Partitioner here, but that
				// class was lost with the rest of the original source; without
				// it, the default HashPartitioner applies.
				// job.setPartitionerClass(MD5Partitioner.class);
				
				job.setNumReduceTasks(reducerNum);
				job.setReducerClass(PageRankReducer.class);
				job.setOutputKeyClass(Text.class);
				job.setOutputValueClass(Text.class);

				FileInputFormat.addInputPath(job, new Path(input));
				FileOutputFormat.setOutputPath(job, new Path(output));
						
				job.waitForCompletion(true);
				// Drop the previous round's intermediate output (never the
				// original input, which is only read in round 0).
				if (i != 0)
					fs.delete(new Path(input), true);
				// Chain the iterations: this round's output becomes the next
				// round's input; append the round index for a fresh path.
				input = output;
				output += i;
			}
		} catch (IOException e) {
			e.printStackTrace();
		} catch (InterruptedException e) {
			e.printStackTrace();
		} catch (ClassNotFoundException e) {
			e.printStackTrace();
		}
		// Report the total elapsed time in seconds.
		System.out.println((float) (System.currentTimeMillis() - start) / 1000);

	}
}
