1 0 {2=1, 3=1}
2 1000 {4=1, 5=1}
3 1000 {10=1, 6=1, 7=1}
4 1000 {5=1, 6=1, 3=1}
5 1000 {8=1}
6 1000 {9=1, 8=1, 7=1}
7 1000 {8=1}
8 1000 {9=1}
9 1000 {5=1}
10 1000 {2=1, 9=1, 8=1}
1 0 {2=6, 3=3, 18=100, 11=1}
2 1000 {4=9, 5=3, 1=4}
3 1000 {10=3, 6=6, 7=1}
4 1000 {5=1, 6=4, 3=3}
5 1000 {8=3, 1=4}
6 1000 {9=4, 8=7, 7=4}
7 1000 {8=2, 6=2}
8 1000 {9=3, 1=3}
9 1000 {5=8}
10 1000 {2=4, 9=6, 8=4}
11 1000 {12=1}
12 1000 {13=1}
13 1000 {14=1}
14 1000 {15=1}
15 1000 {16=1}
16 1000 {17=1}
17 1000 {18=1}
18 1000 {18=0}
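Each record above is a node id, the node's current shortest distance from the source (1000 plays the role of "not yet reached"; it matches MAXDISTANCE in the code), and the adjacency list as neighbour=weight pairs. In the actual input file the three fields are tab-separated, since MyRecordReader splits each line on "\t"; they are shown here with spaces. The first dataset has unit edge weights; the second adds non-unit weights and a chain 11→12→…→18 that only the weighted variants of the driver handle correctly.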
package GraphAlgorithms.PBFS;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Metadata attached to an outgoing edge; currently just the edge weight.
 */
public class linkMeta implements Writable {

    private int linkWeight;

    public linkMeta() {
    }

    public linkMeta(int linkWeight) {
        this.linkWeight = linkWeight;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(linkWeight);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        linkWeight = in.readInt();
    }

    @Override
    public String toString() {
        return "" + linkWeight;
    }

    public int getLinkWeight() {
        return linkWeight;
    }

    public void setLinkWeight(int linkWeight) {
        this.linkWeight = linkWeight;
    }
}
package GraphAlgorithms.PBFS;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.HashMap;

/**
 * A record in the BFS computation. When {@code node} is true the record carries a full node
 * (adjacency list plus current distance); otherwise it only carries a candidate distance
 * emitted by the mapper.
 */
public class Node implements WritableComparable {

    private boolean node;
    private HashMap<Long, linkMeta> adjacencyList;
    private int currentDistance;

    public Node() {
    }

    @Override
    public boolean equals(Object obj) {
        if (obj instanceof Node) {
            Node that = (Node) obj;
            return this.isNode() == that.isNode()
                    && this.getAdjacencyList().equals(that.getAdjacencyList())
                    && this.getCurrentDistance() == that.getCurrentDistance();
        }
        return false;
    }

    @Override
    public int hashCode() {
        return adjacencyList.hashCode();
    }

    @Override
    public String toString() {
        if (node)
            return currentDistance + "\t" + adjacencyList.toString();
        else
            return "" + currentDistance;
    }

    @Override
    public int compareTo(Object o) {
        Node that = (Node) o;
        if (this.isNode() == that.isNode())
            return ((Integer) currentDistance).compareTo(that.getCurrentDistance());
        else if (this.isNode()) {
            return 1;
        } else {
            return -1;
        }
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeBoolean(isNode());
        // Only node records serialize the adjacency list.
        if (isNode()) {
            out.writeInt(adjacencyList.size());
            for (Long aLong : adjacencyList.keySet()) {
                out.writeLong(aLong);
                adjacencyList.get(aLong).write(out);
            }
        }
        out.writeInt(currentDistance);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        node = in.readBoolean();
        // Only node records deserialize the adjacency list.
        if (isNode()) {
            adjacencyList = new HashMap<Long, linkMeta>();
            int size = in.readInt();
            long key;
            for (int i = 0; i < size; i++) {
                linkMeta linkMeta = new linkMeta();
                key = in.readLong();
                linkMeta.readFields(in);
                adjacencyList.put(key, linkMeta);
            }
        }
        currentDistance = in.readInt();
    }

    public HashMap<Long, linkMeta> getAdjacencyList() {
        return adjacencyList;
    }

    public void setAdjacencyList(HashMap<Long, linkMeta> adjacencyList) {
        this.adjacencyList = adjacencyList;
    }

    public boolean isNode() {
        return node;
    }

    public void setNode(boolean node) {
        this.node = node;
    }

    public int getCurrentDistance() {
        return currentDistance;
    }

    public void setCurrentDistance(int currentDistance) {
        this.currentDistance = currentDistance;
    }

    public void set(Node value) {
        this.node = value.isNode();
        this.adjacencyList = value.getAdjacencyList();
        this.currentDistance = value.getCurrentDistance();
    }
}
package GraphAlgorithms.PBFS;

/**
 * Counters used to track, per iteration, how many nodes are still unreached, how many
 * have been reached, and how many had their current shortest distance changed.
 */
public enum Finished {
    MAXDISTANCE,
    CHANGED,
    REACHED
}
package GraphAlgorithms.PBFS;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.HashMap;

public class MyMapper extends Mapper<LongWritable, Node, LongWritable, Node> {

    private final static int MAXDISTANCE = 1000;
    private LongWritable outKey = new LongWritable();
    private Node outValue = new Node();

    @Override
    protected void map(LongWritable key, Node value, Context context) throws IOException, InterruptedException {
        int distance = value.getCurrentDistance();
        value.setNode(true);
        // Pass the node structure (adjacency list) through to the reducer.
        context.write(key, value);
        // System.out.println(key + "\t" + value);
        HashMap<Long, linkMeta> adjacencyList = value.getAdjacencyList();
        for (Long aLong : adjacencyList.keySet()) {
            // If this node has already been reached, emit a candidate path from it to each
            // neighbour (not necessarily the shortest one; the reducer picks the minimum).
            if (distance != MAXDISTANCE && aLong != key.get()) {
                outKey.set(aLong);
                outValue.setNode(false);
                int linkWeight = adjacencyList.get(aLong).getLinkWeight();
                outValue.setCurrentDistance(distance + linkWeight);
                context.write(outKey, outValue);
                // System.out.println("-----" + outKey + "\t" + outValue);
            }
        }
    }
}
package GraphAlgorithms.PBFS;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class MyReducer extends Reducer<LongWritable, Node, LongWritable, Node> {

    private final static int MAXDISTANCE = 1000;
    private Node outValue = new Node();
    private long sourceNode;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        sourceNode = Long.parseLong(context.getConfiguration().get("sourceNode"));
    }

    @Override
    protected void reduce(LongWritable key, Iterable<Node> values, Context context) throws IOException, InterruptedException {
        int minDistance = MAXDISTANCE;
        int preDistance = -1;
        for (Node value : values) {
            if (value.isNode()) {
                // outValue = value;
                // The framework reuses the value object, so it must be copied, not referenced directly.
                outValue.set(value);
                preDistance = value.getCurrentDistance();
            } else {
                int distance = value.getCurrentDistance();
                if (distance < minDistance) {
                    // Track the shortest candidate distance to this node seen so far.
                    minDistance = distance;
                }
            }
        }
        if (sourceNode != key.get())
            outValue.setCurrentDistance(minDistance);
        context.write(key, outValue);
        // If the node is still unreached, bump the unreached counter; otherwise the reached counter.
        if (outValue.getCurrentDistance() == MAXDISTANCE) {
            context.getCounter(Finished.MAXDISTANCE).increment(1);
        } else {
            context.getCounter(Finished.REACHED).increment(1);
        }
        // Count nodes whose current shortest distance changed in this round.
        if (preDistance != outValue.getCurrentDistance()) {
            context.getCounter(Finished.CHANGED).increment(1);
        }
        // System.out.println("ReduceOut:" + key + "\t" + outValue);
    }
}
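As a quick sanity check, worked through by hand on the first (unit-weight) dataset, the first iteration behaves as follows: only node 1 has a distance other than 1000, so the mapper re-emits every node record and additionally emits candidate distances 0+1 for node 1's neighbours 2 and 3. The reducer keeps the minimum candidate per node, so the round-one output (key, distance, adjacency list; map ordering may differ) is roughly:

1	0	{2=1, 3=1}
2	1	{4=1, 5=1}
3	1	{10=1, 6=1, 7=1}
(nodes 4 to 10 keep distance 1000 and their adjacency lists)

After this round Finished.REACHED is 3 and Finished.CHANGED is 2, and the output has the same layout as the input, so it can be fed straight into the next iteration.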
package GraphAlgorithms.PBFS;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

/**
 * Driver for the parallel BFS / single-source shortest path job chain.
 */
public class PBFSearch extends Configured implements Tool {

    private FileSystem fileSystem;

    @Override
    public int run(String[] args) throws Exception {
        // Pick one of the three iteration strategies:
        // linkWeightIsOne(args);
        // linkWeightIsNotOne(args);
        linkWeightIsNotOneRevised(args);
        return 0;
    }

    private void linkWeightIsNotOneRevised(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Path input = new Path(args[0]);
        Path output = null;
        long changeCounter = -1;
        // Iterate until no node's shortest distance changed in the last round.
        for (int i = 1; changeCounter != 0; i++) {
            output = new Path(args[1] + "/" + i);
            Job firstJob = oneStepRun(input, output);
            changeCounter = firstJob.getCounters().findCounter(Finished.CHANGED).getValue();
            // Delete the intermediate output of the previous round.
            deleteTempFile(input, i, firstJob);
            input = output;
            System.out.println(i);
        }
        // Rename the final iteration's output.
        fileSystem.rename(output, new Path(args[1] + "/out"));
    }

    private void linkWeightIsNotOne(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Path input = new Path(args[0]);
        Path output = null;
        long maxDistanceCounter = Long.MAX_VALUE;
        long reachedCounter = 0;
        // Iterate until the reached/unreached counters indicate no further progress.
        for (int i = 1; i - reachedCounter != maxDistanceCounter; i++) {
            output = new Path(args[1] + "/" + i);
            Job firstJob = oneStepRun(input, output);
            // Number of nodes reached / still unreached after this round.
            reachedCounter = firstJob.getCounters().findCounter(Finished.REACHED).getValue();
            maxDistanceCounter = firstJob.getCounters().findCounter(Finished.MAXDISTANCE).getValue();
            // Delete the intermediate output of the previous round.
            deleteTempFile(input, i, firstJob);
            input = output;
            System.out.println(i);
        }
        // Rename the final iteration's output.
        fileSystem.rename(output, new Path(args[1] + "/out"));
    }

    private void linkWeightIsOne(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Path input = new Path(args[0]);
        Path output = null;
        long preReached = -1;
        long reachedCounter = 0;
        // Iterate until the number of reached nodes stops growing.
        for (int i = 1; preReached != reachedCounter; i++) {
            preReached = reachedCounter;
            output = new Path(args[1] + "/" + i);
            Job firstJob = oneStepRun(input, output);
            // Number of nodes reached after this round.
            reachedCounter = firstJob.getCounters().findCounter(Finished.REACHED).getValue();
            // System.out.println("MaxDistanceCounter = " + maxDistanceCounter);
            // System.out.println("Reached = " + reachedCounter);
            // Delete the intermediate output of the previous round.
            deleteTempFile(input, i, firstJob);
            input = output;
            System.out.println(i);
        }
        // Rename the final iteration's output.
        fileSystem.rename(output, new Path(args[1] + "/out"));
    }

    private void deleteTempFile(Path input, int i, Job firstJob) throws IOException {
        if (i != 1) {
            fileSystem.delete(input, true);
        } else {
            fileSystem = FileSystem.get(firstJob.getConfiguration());
        }
    }

    private Job oneStepRun(Path input, Path output) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        conf.set("sourceNode", "1");
        Job job = new Job(conf, "PBFSearch");
        job.setJarByClass(PBFSearch.class);
        job.setInputFormatClass(MyInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Node.class);
        MyInputFormat.addInputPath(job, input);
        TextOutputFormat.setOutputPath(job, output);
        job.waitForCompletion(true);
        return job;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new PBFSearch(), args);
        System.exit(exitCode);
    }
}
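Since the driver goes through ToolRunner and takes the input and output directories as args[0] and args[1], it can be launched with something like the following; the jar name and the two paths are placeholders, not part of the original code. The final result ends up under <output>/out after the rename in the last step.

hadoop jar graph-algorithms.jar GraphAlgorithms.PBFS.PBFSearch /user/ubuntu/bfs/input /user/ubuntu/bfs/output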
package GraphAlgorithms.PBFS;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import java.io.IOException;

public class MyInputFormat extends FileInputFormat<LongWritable, Node> {

    @Override
    public RecordReader<LongWritable, Node> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        return new MyRecordReader();
    }
}
A custom RecordReader that turns each input line into a <LongWritable, Node> key-value pair:
package GraphAlgorithms.PBFS;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

import java.io.IOException;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class MyRecordReader extends RecordReader<LongWritable, Node> {

    private RecordReader<LongWritable, Text> recordReader;
    private LongWritable key;
    private Node value;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        // Delegate line reading to the standard LineRecordReader and parse each line ourselves.
        recordReader = new LineRecordReader();
        recordReader.initialize(split, context);
        key = new LongWritable();
        value = new Node();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        boolean hasNext = recordReader.nextKeyValue();
        if (hasNext) {
            // Each line is: nodeId <TAB> currentDistance <TAB> {neighbour=weight, ...}
            String valueText = recordReader.getCurrentValue().toString();
            String[] keyAndValues = valueText.split("\t");
            key.set(Long.parseLong(keyAndValues[0]));
            value.setCurrentDistance(Integer.parseInt(keyAndValues[1]));
            generateValue(keyAndValues[2]);
            return true;
        } else {
            return false;
        }
    }

    private void generateValue(String value) {
        // Parse the adjacency list, e.g. "{2=1, 3=1}", into a map of neighbour id -> edge weight.
        HashMap<Long, linkMeta> result = new HashMap<Long, linkMeta>();
        Pattern pattern = Pattern.compile("(\\d+)=(\\d+)");
        Matcher matcher = pattern.matcher(value);
        long keyLong = -1;
        int linkWeight = -1;
        while (matcher.find()) {
            for (int i = 1; i <= matcher.groupCount(); i++) {
                switch (i) {
                    case 1:
                        keyLong = Long.parseLong(matcher.group(i));
                        break;
                    case 2:
                        linkWeight = Integer.parseInt(matcher.group(i));
                        break;
                    default:
                        System.out.println("Error in parse Regex");
                        break;
                }
            }
            result.put(keyLong, new linkMeta(linkWeight));
        }
        this.value.setAdjacencyList(result);
    }

    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public Node getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return recordReader.getProgress();
    }

    @Override
    public void close() throws IOException {
        recordReader.close();
    }
}

Some issues that came up while writing this code:
Hadoop reuses the value object it hands to reduce(), which the following small test demonstrates. Test input (two lines):

aaaaaaaaaaaaaaaaaaaaaa
bbbbbbbbbbbbbbbbbbbbbb
package test;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import static java.lang.System.exit;

/**
 * Minimal job used to check whether the value objects seen in reduce() are distinct
 * instances or a single reused instance.
 */
public class HadoopTest extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new HadoopTest(), args);
        exit(exitCode);
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "HadoopTest");
        job.setJarByClass(HadoopTest.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(A.class);
        TextInputFormat.addInputPath(job, new Path(args[0]));
        TextOutputFormat.setOutputPath(job, new Path(args[1]));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static class A implements WritableComparable {

        Text text = new Text();

        @Override
        public int compareTo(Object o) {
            return text.compareTo(((A) o).text);
        }

        @Override
        public void write(DataOutput out) throws IOException {
            text.write(out);
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            text.readFields(in);
        }
    }

    public static class MyMapper extends Mapper<LongWritable, Text, LongWritable, A> {

        private A outValue = new A();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            outValue.text = value;
            // Every record goes to the same key so they all meet in a single reduce() call.
            context.write(new LongWritable(1), outValue);
        }
    }

    public static class MyReducer extends Reducer<LongWritable, A, LongWritable, A> {

        @Override
        protected void reduce(LongWritable key, Iterable<A> values, Context context) throws IOException, InterruptedException {
            // Print the content and the object identity of each value.
            for (A value : values) {
                System.out.println(value.text + " = " + value);
            }
        }
    }
}
The part printed inside reduce():

13/11/25 21:35:03 INFO mapred.LocalJobRunner:
13/11/25 21:35:03 INFO mapred.Task: Task:attempt_local297038115_0001_r_000000_0 is done. And is in the process of commiting
aaaaaaaaaaaaaaaaaaaaaa = test.HadoopTest$A@24be0018
bbbbbbbbbbbbbbbbbbbbbb = test.HadoopTest$A@24be0018
13/11/25 21:35:03 INFO mapred.LocalJobRunner:
package test;

import java.io.FileNotFoundException;
import java.util.ArrayList;

/**
 * Control experiment with a plain Java collection: each added element is a distinct object.
 */
public class Test {

    public static void main(String[] args) throws FileNotFoundException {
        ArrayList<A> as = new ArrayList<A>();
        as.add(new A());
        as.add(new A());
        as.add(new A());
        for (A a : as) {
            System.out.println(a);
        }
    }

    static class A {
    }
}
test.Test$A@121f34d0
test.Test$A@3ec44cfb
test.Test$A@499a12ee

Process finished with exit code 0
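The comparison makes the point: in the plain Java program the three printed identities differ, while in the reducer both records print the same identity (test.HadoopTest$A@24be0018), because Hadoop deserializes every record of the group into one reused value instance. Any value that must survive past the current loop iteration therefore has to be copied, which is exactly why MyReducer calls outValue.set(value) instead of keeping the reference. A minimal sketch of the safe pattern, reusing this project's Node type (WritableUtils.clone would be another way to take the copy); the local name "kept" is mine:

// Sketch only: copy the reused value before keeping it beyond the current iteration.
protected void reduce(LongWritable key, Iterable<Node> values, Context context)
        throws IOException, InterruptedException {
    Node kept = new Node();
    for (Node value : values) {
        if (value.isNode()) {
            kept.set(value);   // copy the fields; "kept = value" would alias the reused object
        }
    }
    // "kept" can safely be used here, after the iterator has moved on.
}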
1 0.1|2 3
2 0.1|4 5 1
3 0.1|10 6 7
4 0.1|5 6 3
5 0.1|8
6 0.1|9 8 7
7 0.1|8
8 0.1|9
9 0.1|
10 0.1|
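Each line is a node id followed by the node's current PageRank value and its outgoing links, joined by "|", with the links space-separated; id and value are separated by a tab, since the jobs read this with KeyValueTextInputFormat configured to split on "\t". All ten nodes start at 0.1, and nodes 9 and 10 have nothing after the "|", so they are the dangling nodes.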
package GraphAlgorithms.PageRank;

/**
 * Counters used by the PageRank jobs.
 */
public enum PageRankCounters {
    DANGLES, // despite the name, counts nodes whose PR value changed by at least VARIANCE this round (used as the convergence test)
    TOTAL    // total number of nodes
}
package GraphAlgorithms.PageRank;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

/**
 * Driver for the two-jobs-per-iteration PageRank computation.
 */
public class PageRank extends Configured implements Tool {

    public static final String damplingFile = "_DanglingSum";
    public static final double ALPHA = 0.05;
    public static final double VARIANCE = 0.00000001;
    public static final int MAXITER = 100;

    @Override
    public int run(String[] args) throws Exception {
        // i is the iteration number; input and output are the job directories.
        int i = 1;
        String input = args[0];
        String output = args[1];
        while (iterOnce(input, output, i) != 0 && i < MAXITER) {
            // The input of iteration i+1 is the output of iteration i.
            input = output;
            i++;
        }
        return 0;
    }

    /**
     * Runs one full iteration. Job one computes each node's incoming PR mass and the total
     * dangling mass; job two spreads the dangling mass and the random-jump term over all nodes
     * and, via a counter, reports how many nodes have not yet converged (the return value).
     */
    private long iterOnce(String input, String output, int i) throws IOException, InterruptedException, ClassNotFoundException {
        // Directory layout:
        // Iteration 1: calculatePageRank          input: InitData      output: OutData/1/A
        //              considerDamplingAndRandom  input: OutData/1/A   output: OutData/1/B
        // Iteration 2: calculatePageRank          input: OutData/1/B   output: OutData/2/A
        //              considerDamplingAndRandom  input: OutData/2/A   output: OutData/2/B
        long totalNode = calculatePageRank(input, output, i);
        // Despite the variable name, this is the number of nodes that have not yet converged.
        long dangling = considerDamplingAndRandom(output, input, i, totalNode);
        System.out.println("i = " + i + " , dangling = " + dangling);
        return dangling;
    }

    /**
     * Spreads the dangling mass and the random-jump term over all nodes and returns the number
     * of nodes whose PR value still changed by at least VARIANCE this round.
     */
    private long considerDamplingAndRandom(String inputPre, String inputLastIter, int i, long totalNode) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "PageRank2");
        job.setJarByClass(PageRank.class);
        MultipleInputs.addInputPath(job, new Path(inputPre + "/" + i + "/A"), KeyValueTextInputFormat.class, DamplingAndRandomMapper.class);
        if (i == 1) {
            MultipleInputs.addInputPath(job, new Path(inputLastIter), KeyValueTextInputFormat.class, LastIterMapper.class);
        } else {
            MultipleInputs.addInputPath(job, new Path(inputLastIter + "/" + (i - 1) + "/B"), KeyValueTextInputFormat.class, LastIterMapper.class);
        }
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setReducerClass(FinalReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        TextOutputFormat.setOutputPath(job, new Path(inputPre + "/" + i + "/B"));
        String danglingSum = getDanglingSum(inputPre + "/" + i + "/A", conf);
        double reg = (ALPHA + (1 - ALPHA) * (Double.parseDouble(danglingSum))) / totalNode;
        job.getConfiguration().set("totalNode", "" + totalNode);
        job.getConfiguration().set("reg", "" + reg);
        job.waitForCompletion(true);
        return job.getCounters().findCounter(PageRankCounters.DANGLES).getValue();
    }

    /**
     * Reads the dangling mass written by the first job of this iteration.
     */
    private String getDanglingSum(String input, Configuration conf) throws IOException {
        FileSystem fileSystem = FileSystem.get(conf);
        Path inputPath = new Path(input + "/_DanglingSum");
        FSDataInputStream inputStream = fileSystem.open(inputPath);
        byte[] buff = new byte[100];
        int k = inputStream.read(buff);
        IOUtils.closeStream(inputStream);
        return new String(buff, 0, k);
    }

    /**
     * Computes each node's incoming PR mass and the dangling mass.
     */
    private long calculatePageRank(String input, String output, int i) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", "\t");
        Job job = new Job(conf, "PageRank");
        job.setJarByClass(PageRank.class);
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        if (i == 1) {
            // For the first iteration the input path is the initial data directory.
            KeyValueTextInputFormat.addInputPath(job, new Path(input + "/"));
        } else {
            // After the first iteration, the input is the final output of the previous iteration.
            KeyValueTextInputFormat.addInputPath(job, new Path(input + "/" + (i - 1) + "/B"));
        }
        TextOutputFormat.setOutputPath(job, new Path(output + "/" + i + "/A"));
        job.waitForCompletion(true);
        return job.getCounters().findCounter(PageRankCounters.TOTAL).getValue();
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new PageRank(), args);
        System.exit(exitCode);
    }
}
package GraphAlgorithms.PageRank;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class MyMapper extends Mapper<Text, Text, LongWritable, Text> {

    private double pagerankValue;
    private LongWritable outKey = new LongWritable();
    private Text outValue = new Text();

    @Override
    protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
        String[] line = value.toString().trim().split("\\|");
        pagerankValue = Double.parseDouble(line[0]);
        // Forward the node's record to the reducer; '*' marks a value that carries the adjacency list.
        outKey.set(Long.parseLong(key.toString()));
        outValue.set("*" + value.toString());
        context.write(outKey, outValue);
        if (line.length == 1) {
            // Empty adjacency list: a dangling node. Emit its PR value under the special key -1.
            outKey.set(-1);
            outValue.set("" + pagerankValue);
            context.write(outKey, outValue);
        } else {
            // Non-empty adjacency list: split the current PR value evenly among the out-links;
            // '#' marks such a PR contribution.
            String[] nodes = line[1].trim().split(" ");
            double avgPR = pagerankValue / nodes.length;
            for (String node : nodes) {
                outKey.set(Long.parseLong(node));
                outValue.set("#" + avgPR);
                context.write(outKey, outValue);
            }
        }
    }
}
package GraphAlgorithms.PageRank;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class MyReducer extends Reducer<LongWritable, Text, LongWritable, Text> {

    private Text outKey = new Text();
    private Text outValue = new Text();
    private String danglingSum = null;

    @Override
    protected void reduce(LongWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        String sKey = key.toString();
        if (sKey.equals("-1")) {
            // Accumulate the dangling mass collected under the special key -1.
            double sum = 0;
            for (Text value : values) {
                sum += Double.parseDouble(value.toString());
            }
            danglingSum = "" + sum;
        } else {
            double currentPRValue = 0;
            String sValue;
            String adjacencyList = "";
            // Sum the incoming '#' contributions and carry the '*' adjacency list through,
            // producing the input of the second job.
            for (Text value : values) {
                sValue = value.toString();
                if (sValue.startsWith("*")) {
                    String[] line = value.toString().trim().split("\\|");
                    if (line.length == 2) {
                        adjacencyList = line[1];
                    }
                } else {
                    currentPRValue += Double.parseDouble(sValue.substring(1));
                }
            }
            context.getCounter(PageRankCounters.TOTAL).increment(1);
            outValue.set(currentPRValue + "|" + adjacencyList);
            context.write(key, outValue);
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        if (danglingSum != null) {
            // Write this round's dangling mass to a side file in the job output directory.
            // Note: this only works with a single reducer; with several reducers one would have to
            // use MultipleOutputs and emit it from reduce() instead (see the sketch below).
            Configuration conf = context.getConfiguration();
            FileSystem fileSystem = FileSystem.get(conf);
            Path outDanglingPath = new Path(conf.get("mapred.output.dir") + "/" + PageRank.damplingFile);
            FSDataOutputStream outputStream = fileSystem.create(outDanglingPath);
            outputStream.write(danglingSum.getBytes("utf-8"));
            IOUtils.closeStream(outputStream);
        }
    }
}
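For the multi-reducer case mentioned in the cleanup() comment, a rough sketch of the MultipleOutputs route could look like the following. The named output "dangling", the class name DanglingAwareReducer and the per-reducer partial sums are my own assumptions, not part of the original code; the driver would then have to add up the per-reducer "dangling-r-*" files instead of reading one _DanglingSum file.

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

import java.io.IOException;

// Sketch only: emit each reducer's partial dangling mass through a named output
// instead of writing a single side file from cleanup().
// In the driver (assumed setup):
//   MultipleOutputs.addNamedOutput(job, "dangling", TextOutputFormat.class,
//                                  NullWritable.class, Text.class);
public class DanglingAwareReducer extends Reducer<LongWritable, Text, LongWritable, Text> {

    private MultipleOutputs<LongWritable, Text> mos;
    private double danglingSum = 0;

    @Override
    protected void setup(Context context) {
        mos = new MultipleOutputs<LongWritable, Text>(context);
    }

    @Override
    protected void reduce(LongWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        if (key.get() == -1) {
            for (Text value : values) {
                danglingSum += Double.parseDouble(value.toString());
            }
        }
        // ... normal PR accumulation as in MyReducer ...
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // One partial sum per reducer; the driver sums the "dangling-r-*" files afterwards.
        mos.write("dangling", NullWritable.get(), new Text("" + danglingSum));
        mos.close();
    }
}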
package GraphAlgorithms.PageRank;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class LastIterMapper extends Mapper<Text, Text, LongWritable, Text> {

    private Text outValue = new Text();
    private LongWritable outKey = new LongWritable();

    @Override
    protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
        // Re-emit the previous iteration's PR value, tagged with '*', so FinalReducer can compare against it.
        String[] line = value.toString().trim().split("\\|");
        outValue.set("*" + Double.parseDouble(line[0]));
        outKey.set(Long.parseLong(key.toString()));
        context.write(outKey, outValue);
    }
}
package GraphAlgorithms.PageRank;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class DamplingAndRandomMapper extends Mapper<Text, Text, LongWritable, Text> {

    private double totalNode;
    private double reg;
    private LongWritable outKey = new LongWritable();
    private Text outValue = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        totalNode = Double.parseDouble(context.getConfiguration().get("totalNode"));
        reg = Double.parseDouble(context.getConfiguration().get("reg"));
    }

    @Override
    protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
        // Adjust the raw PR mass for the dangling nodes and the random jump (teleport) term.
        String[] line = value.toString().trim().split("\\|");
        double prePRValue = Double.parseDouble(line[0]);
        double currentPRValue = reg + (1 - PageRank.ALPHA) * prePRValue;
        outKey.set(Long.parseLong(key.toString()));
        outValue.set(currentPRValue + (line.length == 2 ? "|" + line[1] : "|"));
        context.write(outKey, outValue);
    }
}
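Put together, the two jobs implement the usual dangling-node-aware PageRank update. Writing α = ALPHA, N = totalNode, D = the dangling mass collected under key -1, and m(i) = the mass node i received from its in-links (the value produced by the first job), the driver precomputes reg = (α + (1 − α)·D) / N, and this mapper then sets PR(i) = reg + (1 − α)·m(i), which is the same as PR(i) = α/N + (1 − α)·(m(i) + D/N). Worked by hand for the first iteration on the sample data above: the dangling nodes 9 and 10 contribute D = 0.1 + 0.1 = 0.2, so with α = 0.05 and N = 10, reg = (0.05 + 0.95 × 0.2) / 10 = 0.024.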
package GraphAlgorithms.PageRank;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class FinalReducer extends Reducer<LongWritable, Text, LongWritable, Text> {

    @Override
    protected void reduce(LongWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Compare this round's PR value with the previous round's.
        double prePRV = 0, currPRV = 0;
        for (Text value : values) {
            String sValue = value.toString();
            if (sValue.startsWith("*")) {
                // '*' values come from LastIterMapper and hold the previous iteration's PR value.
                prePRV = Double.parseDouble(sValue.substring(1));
            } else {
                // Everything else is this round's "PR|adjacency list" record; write it out as next round's input.
                context.write(key, value);
                currPRV = Double.parseDouble(sValue.substring(0, sValue.indexOf('|')));
            }
        }
        // Count the nodes that have not yet converged; the driver keeps iterating while this is non-zero.
        if (Math.abs(currPRV - prePRV) >= PageRank.VARIANCE) {
            context.getCounter(PageRankCounters.DANGLES).increment(1);
        }
    }
}

Issues that came up while writing the code: