Hadoop tertiary sort
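
Secondary sort is the standard MapReduce trick of packing several fields into one composite key so that the shuffle sorts them all at once; here it is extended to three fields. Each input record is an IP pair plus a time, and the goal is to sort by serverIP, then clientIP, then time, and to number the records inside every (serverIP, clientIP) group.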

Input data

10.0.0.1,192.168.0.1,10000
10.0.0.1,192.168.0.1,10001
10.0.0.1,192.168.0.1,10002
10.0.0.1,192.168.0.2,10000
10.0.0.1,192.168.0.2,10001
10.0.0.1,192.168.0.3,09999
10.0.0.1,192.168.0.3,10000
10.0.0.2,192.168.0.1,10001
10.0.0.2,192.168.0.1,10002
10.0.0.2,192.168.0.2,10000
10.0.0.2,192.168.0.2,10003
10.0.0.3,192.168.0.1,10000
10.0.0.3,192.168.0.1,10001
10.0.0.3,192.168.0.1,10001
10.0.0.3,192.168.0.1,10002
10.0.0.3,192.168.0.1,10004
10.0.0.3,192.168.0.3,10005
10.0.0.3,192.168.0.3,10006
10.0.0.3,192.168.0.3,10007
10.0.0.3,192.168.0.3,10008


The same records, shuffled

10.0.0.3,192.168.0.3,10005
10.0.0.1,192.168.0.1,10000
10.0.0.3,192.168.0.1,10001
10.0.0.2,192.168.0.2,10000
10.0.0.3,192.168.0.1,10004
10.0.0.3,192.168.0.1,10002
10.0.0.2,192.168.0.1,10001
10.0.0.1,192.168.0.1,10002
10.0.0.3,192.168.0.1,10000
10.0.0.3,192.168.0.3,10006
10.0.0.1,192.168.0.3,09999
10.0.0.3,192.168.0.3,10008
10.0.0.1,192.168.0.3,10000
10.0.0.1,192.168.0.1,10001
10.0.0.3,192.168.0.3,10007
10.0.0.1,192.168.0.2,10000
10.0.0.2,192.168.0.1,10002
10.0.0.1,192.168.0.2,10001
10.0.0.3,192.168.0.1,10001
10.0.0.2,192.168.0.2,10003


Custom key structure
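
TriTuple is the WritableComparable composite key. compareTo() defines the full shuffle sort order (serverIP, clientIP, time); the nested FirstComparator is used later as the grouping comparator and compares only the first two fields; hashCode() hashes serverIP alone so the default HashPartitioner keeps every record for one server on one reducer. Note that time is stored as Text and therefore compares lexicographically — that is why the sample times are zero-padded ("09999" < "10000" as strings).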

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class TriTuple implements WritableComparable<TriTuple>
{
	private Text serverIP;
	private Text clientIP;
	private Text time;

	public TriTuple()
	{
		serverIP = new Text();
		clientIP = new Text();
		time = new Text();
	}

	public Text getServerIP()
	{
		return serverIP;
	}

	public void setServerIP(Text serverIP)
	{
		this.serverIP = serverIP;
	}

	public Text getClientIP()
	{
		return clientIP;
	}

	public void setClientIP(Text clientIP)
	{
		this.clientIP = clientIP;
	}

	public Text getTime()
	{
		return time;
	}

	public void setTime(Text time)
	{
		this.time = time;
	}

	public void readFields(DataInput in) throws IOException
	{
		serverIP.readFields(in);
		clientIP.readFields(in);
		time.readFields(in);
	}

	public void write(DataOutput out) throws IOException
	{
		serverIP.write(out);
		clientIP.write(out);
		time.write(out);
	}

	public int compareTo(TriTuple tt)
	{
		// Full sort order used by the shuffle: serverIP, then clientIP,
		// then time.
		int cmp = serverIP.compareTo(tt.serverIP);
		if(0 != cmp)
		{
			return cmp;
		}
		cmp = clientIP.compareTo(tt.clientIP);
		if(0 != cmp)
		{
			return cmp;
		}
		return time.compareTo(tt.time);
	}

	public int hashCode()
	{
		// Hash on serverIP only: the default HashPartitioner then sends
		// every record with the same serverIP to the same reducer, which
		// is all the (serverIP, clientIP) grouping needs.
		return serverIP.hashCode() * 31;
	}

	public boolean equals(Object o)
	{
		if(o instanceof TriTuple)
		{
			TriTuple tt = (TriTuple) o;
			return serverIP.equals(tt.serverIP) && clientIP.equals(tt.clientIP)
					&& time.equals(tt.time);
		}
		return false;
	}

	public String toString()
	{
		return serverIP + "," + clientIP + "," + time;
	}

	// Grouping comparator: compares only (serverIP, clientIP), so the
	// reducer sees one group per server/client pair while time is
	// ignored for grouping.
	public static class FirstComparator extends WritableComparator
	{
		protected FirstComparator()
		{
			super(TriTuple.class, true);
		}

		public int compare(WritableComparable w1, WritableComparable w2)
		{
			TriTuple t1 = (TriTuple) w1;
			TriTuple t2 = (TriTuple) w2;
			int cmp = t1.getServerIP().compareTo(t2.getServerIP());
			if(0 != cmp)
			{
				return cmp;
			}
			return t1.getClientIP().compareTo(t2.getClientIP());
		}
	}
}


MapReduce + driver
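
The mapper parses each clientIP,serverIP,time line into a TriTuple key with a NullWritable value. The generic parameters on Mapper and Reducer are required: with the raw types, the typed map()/reduce() methods would merely overload (not override) the defaults and never be called. The reducer gets one call per (serverIP, clientIP) group; as it iterates the values, the framework advances the key in place, so key.toString() reflects each record's own time and a simple counter yields the rank.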

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class MyJob
{
	public static class MapClass extends Mapper<LongWritable, Text, TriTuple, NullWritable>
	{
		TriTuple outKey = new TriTuple();
		
		public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
		{
			String inValue = value.toString();
			String[] field = inValue.split(",");
			if(3 == field.length)
			{
				// Input fields are clientIP,serverIP,time: field[0] becomes
				// clientIP and field[1] serverIP, so TriTuple (which sorts
				// by serverIP first) orders output by the second column.
				outKey.setClientIP(new Text(field[0]));
				outKey.setServerIP(new Text(field[1]));
				outKey.setTime(new Text(field[2]));

				context.write(outKey, NullWritable.get());
			}
		}
	}

	public static class Reduce extends Reducer<TriTuple, NullWritable, Text, NullWritable>
	{
		Text outKey = new Text();
		int counter = 0;
		public void reduce(TriTuple key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException
		{
			// One call per (serverIP, clientIP) group; values arrive sorted
			// by time, and the framework advances `key` in place as the
			// iterator moves, so key.toString() shows each record's time.
			counter = 0;
			for(NullWritable value : values)
			{
				++counter;
				outKey.set(key.toString() + ", " + counter);
				context.write(outKey, NullWritable.get());
			}
		}
	}

	public static void main(String[] args) throws Exception
	{
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "MyJob");

		job.setNumReduceTasks(10);
		job.setJarByClass(MyJob.class);
		
		job.setMapperClass(MapClass.class);
		job.setReducerClass(Reduce.class);
		// Group reducer input by (serverIP, clientIP) only; time remains
		// part of the sort, so values inside a group arrive time-ordered.
		job.setGroupingComparatorClass(TriTuple.FirstComparator.class);
		//job.setPartitionerClass(PhNumPartitioner.class);
		
		job.setMapOutputKeyClass(TriTuple.class);
		job.setMapOutputValueClass(NullWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		
		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);
		
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
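
Optional custom partitioner (sketch)

The driver leaves a custom partitioner commented out (PhNumPartitioner, not shown in the post) and instead relies on TriTuple.hashCode(), which already partitions by serverIP through the default HashPartitioner. A minimal illustrative stand-in for an explicit partitioner — the class name and body are my assumption, not the original PhNumPartitioner:

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical sketch: route all records with the same serverIP to the
// same reducer, which is what the (serverIP, clientIP) grouping requires.
public class ServerIPPartitioner extends Partitioner<TriTuple, NullWritable>
{
	public int getPartition(TriTuple key, NullWritable value, int numPartitions)
	{
		// Mask the sign bit so the modulo result is non-negative.
		return (key.getServerIP().hashCode() & Integer.MAX_VALUE) % numPartitions;
	}
}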


Output

192.168.0.1,10.0.0.1,10000, 1
192.168.0.1,10.0.0.1,10001, 2
192.168.0.1,10.0.0.1,10002, 3
192.168.0.1,10.0.0.2,10001, 1
192.168.0.1,10.0.0.2,10002, 2
192.168.0.1,10.0.0.3,10000, 1
192.168.0.1,10.0.0.3,10001, 2
192.168.0.1,10.0.0.3,10001, 3
192.168.0.1,10.0.0.3,10002, 4
192.168.0.1,10.0.0.3,10004, 5
192.168.0.2,10.0.0.1,10000, 1
192.168.0.2,10.0.0.1,10001, 2
192.168.0.2,10.0.0.2,10000, 1
192.168.0.2,10.0.0.2,10003, 2
192.168.0.3,10.0.0.1,09999, 1
192.168.0.3,10.0.0.1,10000, 2
192.168.0.3,10.0.0.3,10005, 1
192.168.0.3,10.0.0.3,10006, 2
192.168.0.3,10.0.0.3,10007, 3
192.168.0.3,10.0.0.3,10008, 4
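
Within each (serverIP, clientIP) group the counter restarts at 1 and the records come out time-ordered. The duplicated input record 10.0.0.3,192.168.0.1,10001 appears twice with ranks 2 and 3: the grouping comparator merges both occurrences into the same group, but each is still emitted.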


 
