MapReduce --- Join Operations --- Reduce-Side Join

Because a reduce-side join does not require the input datasets to have any particular structure, it is used more often than a map-side join. However, both datasets must pass through MapReduce's shuffle phase, so a reduce-side join is usually less efficient.

Basic idea: the mapper tags each record with its source dataset and uses the join key as the map output key, so that records sharing a join key are routed to the same reducer.
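
For illustration, assume two small input files whose layout matches how the mapper below parses each line (the file names and columns are hypothetical, not from the original post): customer lines are cid,name,age and order lines are oid,item,price,cid.

customers.txt:
1,tom,12
2,tomas,13

orders.txt:
100,no100,12.5,1
101,no101,3.2,1
102,no102,9.9,2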

1. Define the composite key CombKey

package hadoop.join.reduce;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;


public class CombKey implements WritableComparable<CombKey> {
	// record type: 0 = customer, 1 = order
	public int type = -1 ;
	// customer id
	public int cid = -1 ;
	// order id (only meaningful for order records)
	public int oid = -1;

	/**
	 * Sort order: a customer record sorts ahead of its own orders;
	 * otherwise keys are ordered by cid, then by oid.
	 */
	public int compareTo(CombKey o) {
		int otype = o.type ;
		int ocid = o.cid ;
		int ooid = o.oid ;
		// same type
		if(type == otype){
			if(type == 0){
				return Integer.compare(cid, ocid) ;
			}
			else{
				// orders of the same customer
				if(cid == ocid){
					return Integer.compare(oid, ooid) ;
				}
				// orders of different customers
				else{
					return Integer.compare(cid, ocid) ;
				}
			}
		}
		// different types
		else{
			if(type == 0){
				// a customer sorts ahead of its own orders
				if(cid == ocid){
					return -1 ;
				}
				else{
					return Integer.compare(cid, ocid) ;
				}
			}
			else{
				// an order sorts behind its own customer
				if(cid == ocid){
					return 1 ;
				}
				else{
					return Integer.compare(cid, ocid) ;
				}
			}
		}
	}

	public void write(DataOutput out) throws IOException {
		out.writeInt(type);
		out.writeInt(cid);
		out.writeInt(oid);
	}

	public void readFields(DataInput in) throws IOException {
		this.type = in.readInt() ;
		this.cid = in.readInt() ;
		this.oid = in.readInt() ;
	}
}
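
To see the intended sort order in isolation, a minimal standalone check (a hypothetical helper class, not part of the original post) can sort a few keys and print them; a customer should come out ahead of its own orders:

package hadoop.join.reduce;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class CombKeySortDemo {
	private static CombKey key(int type, int cid, int oid) {
		CombKey k = new CombKey();
		k.type = type;
		k.cid = cid;
		k.oid = oid;
		return k;
	}

	public static void main(String[] args) {
		List<CombKey> keys = Arrays.asList(
				key(1, 1, 101),  // order 101 of customer 1
				key(0, 2, -1),   // customer 2
				key(1, 1, 100),  // order 100 of customer 1
				key(0, 1, -1));  // customer 1
		Collections.sort(keys);
		// expected order: customer 1, order 100, order 101, customer 2
		for (CombKey k : keys) {
			System.out.println(k.type + "," + k.cid + "," + k.oid);
		}
	}
}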



2. JoinMapper

package hadoop.join.reduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * Mapper: tags each record with its source dataset and emits a CombKey
 * built from the join key (cid).
 */
public class JoinMapper extends Mapper<LongWritable, Text, CombKey, Text>{

	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		// one line of text
		String line = value.toString();
		String[] arr = line.split(",") ;

		// decide the record's source from the input file name
		FileSplit split = (FileSplit) context.getInputSplit();
		String path = split.getPath().getName() ;

		CombKey keyOut = new CombKey() ;
		// customer
		if(path.contains("customers")){
			keyOut.type = 0 ;
			keyOut.cid = Integer.parseInt(arr[0]) ;
		}
		// order
		else{
			keyOut.type = 1;
			keyOut.cid = Integer.parseInt(arr[3]);
			keyOut.oid = Integer.parseInt(arr[0]);
		}
		context.write(keyOut,value);
	}
}
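
With the hypothetical sample files above, the mapper would emit key/value pairs along these lines (keys shown as type,cid,oid):

(0,1,-1)  -> 1,tom,12
(0,2,-1)  -> 2,tomas,13
(1,1,100) -> 100,no100,12.5,1
(1,1,101) -> 101,no101,3.2,1
(1,2,102) -> 102,no102,9.9,2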

3. JoinReducer

package hadoop.join.reduce;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Iterator;

/**
 * Reducer: because keys are grouped by cid and a customer record sorts
 * ahead of its own orders, the first value of a group is the customer
 * (if one exists) and the remaining values are its orders.
 */
public class JoinReducer extends Reducer<CombKey, Text, Text, NullWritable>{

	protected void reduce(CombKey key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
		System.out.println("================================");
		Iterator<Text> it = values.iterator() ;
		// the group starts with a customer record
		if(key.type == 0){
			// read the customer info first
			String custInfo = it.next().toString() ;
			System.out.println(custInfo);
			while(it.hasNext()){
				String orderInfo = it.next().toString();
				System.out.println(custInfo + "," + orderInfo);
				context.write(new Text(custInfo + "," + orderInfo),NullWritable.get());
			}
		}
		// the group contains only orders (no matching customer)
		else{
			while (it.hasNext()) {
				String orderInfo = it.next().toString();
				System.out.println("NULL," + orderInfo);
				context.write(new Text("NULL," + orderInfo), NullWritable.get());
			}
		}
	}
}
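
For the hypothetical sample data, each customer line is repeated once per matching order, and an order whose cid matched no customer would be emitted with a NULL prefix:

1,tom,12,100,no100,12.5,1
1,tom,12,101,no101,3.2,1
2,tomas,13,102,no102,9.9,2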


4. Custom CIDPartitioner

package hadoop.join.reduce;

import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.io.Text;

/**
 * Partitions map output by cid, so a customer and all of its
 * orders land in the same reduce partition.
 */
public class CIDPartitioner extends Partitioner<CombKey, Text>{

	public int getPartition(CombKey key, Text text, int numPartitions) {
		return key.cid % numPartitions ;
	}
}
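
The cid values in this example are assumed to be non-negative. If cid could be negative, key.cid % numPartitions may return a negative partition number, which the framework rejects; a common defensive variant (a sketch, not in the original post) masks the sign bit:

	public int getPartition(CombKey key, Text text, int numPartitions) {
		// mask the sign bit so the result is always in [0, numPartitions)
		return (key.cid & Integer.MAX_VALUE) % numPartitions;
	}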


5. Custom CIDGroupComparator

package hadoop.join.reduce;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Grouping comparator: compares keys by cid only, so a customer record
 * and all of its order records are presented to a single reduce() call.
 */
public class CIDGroupComparator extends WritableComparator{
	protected CIDGroupComparator() {
		super(CombKey.class, true);
	}

	public int compare(WritableComparable k1, WritableComparable k2) {
		CombKey ck1 = (CombKey) k1;
		CombKey ck2 = (CombKey) k2;
		return Integer.compare(ck1.cid, ck2.cid) ;
	}
}


6. App

package hadoop.join.reduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * join: reduce-side join driver
 */
public class App {
	public static void main(String[] args) throws Exception {
		// local test paths; override for a real cluster
		args = new String[]{"d:/java/mr/join", "d:/java/mr/out"} ;
		Configuration conf = new Configuration();

		// delete the output directory if it already exists
		FileSystem fs = FileSystem.get(conf);
		if(fs.exists(new Path(args[1]))){
			fs.delete(new Path(args[1]),true);
		}

		Job job = Job.getInstance(conf);

		job.setJobName("join-reduce");
		job.setJarByClass(App.class);

		job.setMapperClass(JoinMapper.class);
		job.setReducerClass(JoinReducer.class);

		// add input path
		FileInputFormat.addInputPath(job,new Path(args[0]));
		// set output path
		FileOutputFormat.setOutputPath(job,new Path(args[1]));

		// map output types
		job.setMapOutputKeyClass(CombKey.class);
		job.setMapOutputValueClass(Text.class);

		// final output types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);

		// group by cid; the sort order comes from CombKey.compareTo (the default comparator)
		job.setGroupingComparatorClass(CIDGroupComparator.class);
		job.setPartitionerClass(CIDPartitioner.class);

		job.setNumReduceTasks(2);

		// run the job
		job.waitForCompletion(true) ;
	}
}
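
With setNumReduceTasks(2) and CIDPartitioner, output is split by cid % 2. For the hypothetical sample data above, the joined lines would be distributed like this (an illustration, assuming the sample inputs shown earlier):

part-r-00000 (even cid):
2,tomas,13,102,no102,9.9,2

part-r-00001 (odd cid):
1,tom,12,100,no100,12.5,1
1,tom,12,101,no101,3.2,1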



