SpringBoot集成Hadoop系列二 ---- MapReduce对表的join操作

代码:

package com.hadoop.reduce.model;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Combined order/product record exchanged between the join mapper and reducer.
 *
 * <p>A single instance carries the columns of both the order file and the
 * product file. Only one half is meaningful after a file line has been parsed;
 * the other half is filled in during the join. {@link #getFlag()} tells which
 * half is populated.
 *
 * @author linhaiy
 * @date 2019.05.18
 */
public class OrderInfo implements Writable, Cloneable {
	// Order number
	private Integer orderId;
	// Order date
	private String orderDate;
	// Product id (the join key)
	private String pid;
	// Ordered quantity
	private Integer amount;
	// Product name
	private String pname;
	// Product category
	private Integer categoryId;
	// Product price
	private Double price;
	/**
	 * Marks which half of the record is populated: "0" means the object holds
	 * order data, "1" means it holds product data.
	 */
	private String flag;

	public OrderInfo() {
	}

	/**
	 * Returns a field-by-field copy produced by {@link Object#clone()}. All
	 * fields are immutable value types, so the copy shares no mutable state
	 * with the original.
	 */
	@Override
	public Object clone() throws CloneNotSupportedException {
		return super.clone();
	}

	/**
	 * Serializes every field in a fixed order that {@link #readFields(DataInput)}
	 * mirrors. All fields must be non-null: the boxed numbers are unboxed and
	 * {@code writeUTF} rejects {@code null}.
	 *
	 * @param output sink to write the record to
	 * @throws IOException if the underlying stream fails
	 */
	@Override
	public void write(DataOutput output) throws IOException {
		output.writeInt(orderId);
		output.writeUTF(orderDate);
		output.writeUTF(pid);
		output.writeInt(amount);
		output.writeUTF(pname);
		output.writeInt(categoryId);
		output.writeDouble(price);
		output.writeUTF(flag);
	}

	/**
	 * Restores every field, reading them in the same order {@link #write(DataOutput)}
	 * emitted them.
	 *
	 * @param input source to read the record from
	 * @throws IOException if the underlying stream fails
	 */
	@Override
	public void readFields(DataInput input) throws IOException {
		orderId = input.readInt();
		orderDate = input.readUTF();
		pid = input.readUTF();
		amount = input.readInt();
		pname = input.readUTF();
		categoryId = input.readInt();
		price = input.readDouble();
		flag = input.readUTF();
	}

	/**
	 * Assigns all fields in one call.
	 */
	public void set(Integer orderId, String orderDate, String pid, Integer amount, String pname, Integer categoryId,
			Double price, String flag) {
		this.orderId = orderId;
		this.orderDate = orderDate;
		this.pid = pid;
		this.amount = amount;
		this.pname = pname;
		this.categoryId = categoryId;
		this.price = price;
		this.flag = flag;
	}

	public Integer getOrderId() {
		return orderId;
	}

	public void setOrderId(Integer orderId) {
		this.orderId = orderId;
	}

	public String getOrderDate() {
		return orderDate;
	}

	public void setOrderDate(String orderDate) {
		this.orderDate = orderDate;
	}

	public String getPid() {
		return pid;
	}

	public void setPid(String pid) {
		this.pid = pid;
	}

	public Integer getAmount() {
		return amount;
	}

	public void setAmount(Integer amount) {
		this.amount = amount;
	}

	public String getPname() {
		return pname;
	}

	public void setPname(String pname) {
		this.pname = pname;
	}

	public Integer getCategoryId() {
		return categoryId;
	}

	public void setCategoryId(Integer categoryId) {
		this.categoryId = categoryId;
	}

	public Double getPrice() {
		return price;
	}

	public void setPrice(Double price) {
		this.price = price;
	}

	public String getFlag() {
		return flag;
	}

	public void setFlag(String flag) {
		this.flag = flag;
	}

	/**
	 * Renders the record as a JSON-like object. String-typed fields are wrapped
	 * in double quotes; numeric fields are emitted bare.
	 */
	@Override
	public String toString() {
		final StringBuilder sb = new StringBuilder("{");
		sb.append("\"orderId\":").append(orderId);
		sb.append(",\"orderDate\":\"").append(orderDate).append('\"');
		// pid is a String, so it is quoted like the other text fields.
		sb.append(",\"pid\":\"").append(pid).append('\"');
		sb.append(",\"amount\":").append(amount);
		sb.append(",\"pname\":\"").append(pname).append('\"');
		sb.append(",\"categoryId\":").append(categoryId);
		sb.append(",\"price\":").append(price);
		sb.append(",\"flag\":\"").append(flag).append('\"');
		sb.append('}');
		return sb.toString();
	}
}
package com.hadoop.reduce.mapper;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import com.hadoop.reduce.model.OrderInfo;

import java.io.IOException;

/**
 * Map side of the order/product table join: tags every input line with its
 * origin and emits it keyed by product id.
 *
 * @author linhaiy
 * @date 2019.05.18
 */
public class JoinMapper extends Mapper<LongWritable, Text, Text, OrderInfo> {
	// Output key/value instances are reused across map() calls.
	private Text text = new Text();
	private OrderInfo orderInfo = new OrderInfo();
	private final static String ORDER_FILE_NAME = "order";
	// Currently unused: any file whose name does not start with "order" is treated as product data.
	private final static String PRODUCT_FILE_NAME = "product";
	private final static String ORDER_FLAG = "0";
	private final static String PRODUCT_FLAG = "1";

	/**
	 * Parses one comma-separated line and emits {@code (pid, OrderInfo)}.
	 *
	 * <p>Order lines look like {@code 1001,20170822,p1,3}; product lines look
	 * like {@code p1,防空火箭,1,20.2}. The kind of line is decided from the
	 * name of the file the current split belongs to.
	 *
	 * @param key     offset key supplied by the input format (unused)
	 * @param value   raw line bytes, decoded as GBK
	 * @param context task context used to find the input file and emit output
	 * @throws IOException          if decoding or writing the output fails
	 * @throws InterruptedException if the task is interrupted while writing
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		String line = new String(value.getBytes(), 0, value.getLength(), "GBK");
		// Skip header lines (marked with '#') and blank lines, which would
		// otherwise fail in the numeric parsing below.
		if (line.startsWith("#") || line.trim().isEmpty()) {
			return;
		}

		// The input split is cast to FileSplit to obtain the file name.
		InputSplit inputSplit = context.getInputSplit();
		FileSplit fileSplit = (FileSplit) inputSplit;
		String fileName = fileSplit.getPath().getName();

		String pid;
		String[] fields = line.split(",");
		if (fileName.startsWith(ORDER_FILE_NAME)) {
			// Order columns: order id, date, product id, amount.
			Integer orderId = Integer.parseInt(fields[0]);
			String orderDate = fields[1];
			pid = fields[2];
			Integer amount = Integer.parseInt(fields[3]);
			// Product-side fields get non-null placeholders so the record can be serialized.
			orderInfo.set(orderId, orderDate, pid, amount, "", 0, 0.0, ORDER_FLAG);
		} else {
			// Product columns: product id, name, category, price.
			pid = fields[0];
			String pname = fields[1];
			Integer categoryId = Integer.parseInt(fields[2]);
			Double price = Double.valueOf(fields[3]);
			// Order-side fields get non-null placeholders so the record can be serialized.
			orderInfo.set(0, "", pid, 0, pname, categoryId, price, PRODUCT_FLAG);
		}
		text.set(pid);
		context.write(text, orderInfo);
	}
}

package com.hadoop.reduce.reducer;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import com.hadoop.reduce.model.OrderInfo;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Reduce side of the order/product table join: for each product id, merges the
 * product columns into every order record that references it.
 *
 * @author linhaiy
 * @date 2019.05.18
 */
public class JoinReduce extends Reducer<Text, OrderInfo, OrderInfo, NullWritable> {
	private final static String ORDER_FLAG = "0";
	// Currently unused: every record whose flag is not ORDER_FLAG is treated as product data.
	private final static String PRODUCT_FLAG = "1";

	/**
	 * Joins all records that share one product id.
	 *
	 * @param key     product id emitted by the mapper
	 * @param values  order and product records for that id
	 * @param context task context used to emit the joined order records
	 * @throws IOException          if a record cannot be copied or written
	 * @throws InterruptedException if the task is interrupted while writing
	 */
	@Override
	protected void reduce(Text key, Iterable<OrderInfo> values, Context context)
			throws IOException, InterruptedException {
		// Holds the product record; if several arrive, the last one wins.
		OrderInfo product = new OrderInfo();
		// Holds every order record seen for this product id.
		List<OrderInfo> orders = new ArrayList<>();

		for (OrderInfo info : values) {
			// Each record is copied before being kept, because the framework
			// may reuse the value object between iterations.
			OrderInfo copy = copyOf(info);
			if (ORDER_FLAG.equals(info.getFlag())) {
				orders.add(copy);
			} else {
				product = copy;
			}
		}

		// Fill the product columns into each order and emit it.
		for (OrderInfo order : orders) {
			order.setPname(product.getPname());
			order.setCategoryId(product.getCategoryId());
			order.setPrice(product.getPrice());
			context.write(order, NullWritable.get());
		}
	}

	/**
	 * Clones a record, surfacing a clone failure as an {@link IOException}
	 * instead of silently continuing with an empty record.
	 */
	private static OrderInfo copyOf(OrderInfo info) throws IOException {
		try {
			return (OrderInfo) info.clone();
		} catch (CloneNotSupportedException e) {
			throw new IOException("Unable to copy join record: " + info, e);
		}
	}
}

package com.hadoop.reduce.service;

import java.io.IOException;

import javax.annotation.PostConstruct;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

import com.hadoop.reduce.bean.StaffProvincePartitioner;
import com.hadoop.reduce.bean.WeiboInputFormat;
import com.hadoop.reduce.mapper.CounterMapper;
import com.hadoop.reduce.mapper.FriendsMapper;
import com.hadoop.reduce.mapper.JoinMapper;
import com.hadoop.reduce.mapper.StaffMap;
import com.hadoop.reduce.mapper.WeatherMap;
import com.hadoop.reduce.mapper.WeiboMapper;
import com.hadoop.reduce.mapper.WordCount;
import com.hadoop.reduce.mapper.WordCountMap;
import com.hadoop.reduce.model.GroupSortModel;
import com.hadoop.reduce.model.OrderInfo;
import com.hadoop.reduce.model.StaffModel;
import com.hadoop.reduce.model.Weibo;
import com.hadoop.reduce.reducer.FriendsReduce;
import com.hadoop.reduce.reducer.JoinReduce;
import com.hadoop.reduce.reducer.StaffReduce;
import com.hadoop.reduce.reducer.WeatherReduce;
import com.hadoop.reduce.reducer.WeiboReduce;
import com.hadoop.reduce.reducer.WordCountReduce;
import com.hadoop.util.GroupSort;

/**
 * Map/Reduce helper that builds the Hadoop configuration and submits jobs.
 *
 * <p>The HDFS address is injected into the Spring-managed instance and copied
 * to a static field in {@link #getPath()}, so the static methods only see it
 * once Spring has initialised this component.
 *
 * @author linhaiy
 * @date 2019.05.18
 */
@Component
public class ReduceJobsUtils {

	@Value("${hdfs.path}")
	private String path;

	private static String hdfsPath;

	/**
	 * Builds a Hadoop configuration pointing at the configured HDFS address.
	 *
	 * @return a new configuration with {@code fs.defaultFS} and
	 *         {@code mapred.job.tracker} set to the HDFS path
	 */
	public static Configuration getConfiguration() {
		Configuration configuration = new Configuration();
		configuration.set("fs.defaultFS", hdfsPath);
		// NOTE(review): the job tracker is given the same value as the file system address — confirm this is intended.
		configuration.set("mapred.job.tracker", hdfsPath);
		// To run on a YARN cluster:
		// configuration.set("mapreduce.framework.name", "yarn");
		// To let the main method locate the MR environment of that host:
		// configuration.set("yarn.resourcemanager.hostname", "node1");
		return configuration;
	}

	/**
	 * Runs the order/product table-join job and waits for it to finish.
	 *
	 * @param jobName    name given to the Hadoop job
	 * @param inputPath  directory containing the order and product files
	 * @param outputPath directory the joined records are written to
	 * @throws IOException            if job setup or submission fails, or if the
	 *                                job finishes unsuccessfully
	 * @throws ClassNotFoundException if a job class cannot be loaded
	 * @throws InterruptedException   if the wait for completion is interrupted
	 */
	public static void join(String jobName, String inputPath, String outputPath)
			throws IOException, ClassNotFoundException, InterruptedException {
		Configuration config = getConfiguration();
		Job job = Job.getInstance(config, jobName);
		// Class used to locate the jar that contains the job classes.
		job.setJarByClass(OrderInfo.class);

		job.setMapperClass(JoinMapper.class);
		job.setReducerClass(JoinReduce.class);

		// Mapper output types.
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(OrderInfo.class);

		// Reducer output types.
		job.setOutputKeyClass(OrderInfo.class);
		job.setOutputValueClass(NullWritable.class);

		// Input and output locations.
		FileInputFormat.setInputPaths(job, new Path(inputPath));
		FileOutputFormat.setOutputPath(job, new Path(outputPath));

		// waitForCompletion reports failure through its return value; surface it
		// so callers do not report success for a failed job.
		if (!job.waitForCompletion(true)) {
			throw new IOException("MapReduce job '" + jobName + "' did not complete successfully");
		}
	}

	/**
	 * Copies the injected HDFS path into the static field after construction.
	 */
	@PostConstruct
	public void getPath() {
		hdfsPath = this.path;
	}

	/**
	 * @return the HDFS path copied from the injected property, or {@code null}
	 *         if {@link #getPath()} has not run yet
	 */
	public static String getHdfsPath() {
		return hdfsPath;
	}
}
package com.hadoop.reduce.service;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.springframework.stereotype.Service;
import com.hadoop.hdfs.service.HdfsService;

/**
 * Service layer that prepares the output directory and launches MapReduce jobs.
 *
 * @author linhaiy
 * @date 2019.05.18
 */
@Service
public class MapReduceService {

	// Root directory under which each job writes its results.
	private static final String OUTPUT_PATH = "/output";

	/**
	 * Runs the table-join job. Does nothing when either argument is empty.
	 * Results go to {@code /output/<jobName>}; an existing directory at that
	 * location is deleted first.
	 *
	 * @param jobName   name of the job, also used as the output sub-directory
	 * @param inputPath directory holding the files to join
	 * @throws Exception if the HDFS operations or the job itself fail
	 */
	public void join(String jobName, String inputPath) throws Exception {
		final boolean argumentMissing = StringUtils.isEmpty(jobName) || StringUtils.isEmpty(inputPath);
		if (!argumentMissing) {
			final String jobOutputDir = String.join("/", OUTPUT_PATH, jobName);
			// Clear any previous output under the same job name before running.
			final boolean staleOutputPresent = HdfsService.existFile(jobOutputDir);
			if (staleOutputPresent) {
				HdfsService.deleteFile(jobOutputDir);
			}
			ReduceJobsUtils.join(jobName, inputPath, jobOutputDir);
		}
	}
}
package com.hadoop.reduce.controller;

import org.apache.commons.lang.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.ResponseBody;
import org.springframework.web.bind.annotation.RestController;
import com.hadoop.reduce.service.MapReduceService;
import com.hadoop.util.Result;

/**
 * REST controller exposing MapReduce operations under {@code /hadoop/reduce}.
 *
 * @author linhaiy
 * @date 2019.05.18
 */
@RestController
@RequestMapping("/hadoop/reduce")
public class MapReduceAction {

	@Autowired
	MapReduceService mapReduceService;

	/**
	 * Triggers the table-join job.
	 *
	 * @param jobName   name of the job to run
	 * @param inputPath directory holding the files to join
	 * @return a failure result when either parameter is empty, otherwise a
	 *         success result once the service call returns
	 * @throws Exception if the service call fails
	 */
	@RequestMapping(value = "join", method = RequestMethod.POST)
	@ResponseBody
	public Result join(@RequestParam("jobName") String jobName, @RequestParam("inputPath") String inputPath)
			throws Exception {
		// Only run the job when both parameters carry a value.
		if (!StringUtils.isEmpty(jobName) && !StringUtils.isEmpty(inputPath)) {
			mapReduceService.join(jobName, inputPath);
			return new Result(Result.SUCCESS, "表join操作成功");
		}
		return new Result(Result.FAILURE, "请求参数为空");
	}
}

 

你可能感兴趣的:(大数据开发,大数据开发)