Hands-on group by and join in Hadoop MapReduce

Background

MapReduce supports table join operations, including map-side join (map join) and reduce-side join (reduce join).

group by

The Userinfos entity class

package com.njbdqn.cust;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class Userinfos implements WritableComparable<Userinfos> {
    private int userid;
    private String username;
    private String className;
    private int score;

    public int getUserid() {
        return userid;
    }

    public void setUserid(int userid) {
        this.userid = userid;
    }

    public String getUsername() {
        return username;
    }

    public void setUsername(String username) {
        this.username = username;
    }

    public String getClassName() {
        return className;
    }

    public void setClassName(String className) {
        this.className = className;
    }

    public int getScore() {
        return score;
    }

    public void setScore(int score) {
        this.score = score;
    }

    @Override
    public String toString() {
        return "Userinfos{" +
                "userid=" + userid +
                ", username='" + username + '\'' +
                ", className='" + className + '\'' +
                ", score=" + score +
                '}';
    }

    @Override
    public int compareTo(Userinfos o) {
        // sort ascending by score; never returns 0, so no two keys ever
        // compare as equal and every record stays a distinct key
        return score > o.getScore() ? 1 : -1;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(userid);
        dataOutput.writeUTF(username);
        dataOutput.writeUTF(className);
        dataOutput.writeInt(score);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.userid= dataInput.readInt();
        this.username = dataInput.readUTF();
        this.className = dataInput.readUTF();
        this.score = dataInput.readInt();
    }


}

UserMapper

package com.njbdqn.cust;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class UserMapper extends Mapper<LongWritable, Text,Userinfos, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String [] sps = value.toString().split(",");
        Userinfos us = new Userinfos();
        us.setUserid(Integer.parseInt(sps[0]));
        us.setUsername(sps[1]);
        us.setClassName(sps[2]);
        us.setScore(Integer.parseInt(sps[3]));

        context.write(us,NullWritable.get());
    }
}
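UserMapper expects comma-separated input lines in the order userid,username,className,score. For illustration only (the sample values below are assumed, not taken from the original data set), a line under f://source1 might look like:

1,tom,class1,87
2,jack,class2,92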

Driver

package com.njbdqn.cust;

import com.njbdqn.util.Tools;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driver {
    public static void main(String[] args) throws Exception {
        Tools.getInstance().checkpoint();
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(Driver.class);

        FileInputFormat.addInputPath(job,new Path("f://sourcel"));
        job.setMapperClass(UserMapper.class);

        job.setMapOutputKeyClass(Userinfos.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setOutputKeyClass(Userinfos.class);
        job.setOutputValueClass(NullWritable.class);

        FileOutputFormat.setOutputPath(job,new Path("f://ff"));
        job.waitForCompletion(true);
    }
}
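Note that the driver registers no Reducer, so the default identity Reducer runs and the output is simply the records ordered by Userinfos.compareTo, i.e. ascending score. If real group-by semantics are wanted (for example one reduce() call per className), a grouping comparator can be registered. The sketch below is not part of the original code; it is a minimal illustration that assumes compareTo is adjusted to order by className first, otherwise records of the same class will not arrive next to each other:

package com.njbdqn.cust;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// hypothetical grouping comparator: all Userinfos keys with the same className form one group
public class ClassNameGroupComparator extends WritableComparator {
    protected ClassNameGroupComparator() {
        super(Userinfos.class, true); // true: deserialize keys so compare() receives Userinfos objects
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        Userinfos u1 = (Userinfos) a;
        Userinfos u2 = (Userinfos) b;
        // keys compare equal (one group) whenever their className matches
        return u1.getClassName().compareTo(u2.getClassName());
    }
}

It would be wired in with job.setGroupingComparatorClass(ClassNameGroupComparator.class) together with a Reducer set via job.setReducerClass(...).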

Map join

Map-side join suits one large table and one small table: the small table (no more than about 10 MB) is shipped to every mapper via the distributed cache and loaded into memory, so the join is completed in the map phase.
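To make the code easier to follow, here is the data layout it assumes (the sample values are made up for illustration): the small file f://1.csv maps a category id to a category name, e.g.

1,fruit
2,drink

and every line of the large input under f://source2 carries that category id in its second column, e.g.

apple,1,5.5

In setup() the mapper loads the small file into an in-memory HashMap, and in map() it replaces the id in each record of the large file with the looked-up category name.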

package com.njbdqn.mapper;

import com.njbdqn.util.Tools;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

public class MyMapInner {
    private static class MyMapper extends Mapper<LongWritable, Text,Text, NullWritable>{
        private Map<String, String> myType = new HashMap<>(); //in-memory copy of the small table (category id -> category name)

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            //read the small table from the distributed cache
            String fileName = context.getCacheFiles()[0].getPath();
            final BufferedReader reader = new BufferedReader(new FileReader(fileName));
            String str = null;
            while ((str=reader.readLine())!=null){
                String [] sps = str.split(",");
                myType.put(sps[0],sps[1]);
            }
            reader.close();
        }


        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String [] goodInfos = value.toString().split(",");
            //look up the category name via the foreign key (second column)
            String type = myType.get(goodInfos[1]);
            //write the category name back into the array
            goodInfos[1] = type;
            //convert the array to a single output line
            Text text = new Text(Arrays.toString(goodInfos));
            context.write(text,NullWritable.get());
        }
    }
    //defined but never registered with the job (there is no setReducerClass call),
    //so the default identity Reducer is what actually runs
    public static class MyReduce extends Reducer<Text,NullWritable,Text,NullWritable>{

    }

    public static void main(String[] args) throws Exception {
        Tools.getInstance().checkpoint();
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(MyMapInner.class);
        FileInputFormat.addInputPath(job,new Path("f://source2"));
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
//        job.addCacheFile(new URI("hdfs://192.168.56.120:9000/1.csv"));
        //addFileToClassPath also places the file in the distributed cache,
        //so getCacheFiles() in setup() can locate it
        job.addFileToClassPath(new Path("f://1.csv"));
        FileOutputFormat.setOutputPath(job,new Path("f://ff"));

        job.waitForCompletion(true);
    }
}

Reduce join
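When both inputs are too large to cache in memory, the join moves to the reduce side: the mapper tags every record with its source file and emits the join key (the category id), and the reducer matches the differently tagged records that arrive in the same group. With the same illustrative data as above, the category file (the one whose name contains "1", e.g. 1.csv) would yield pairs like (1, "type:fruit"), while the large file would yield (1, "context:apple:1:5.5"); the reducer then swaps the id in the middle field for the category name.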

package com.njbdqn.reducejoin;

import com.njbdqn.util.Tools;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Reduce-side join: both inputs are large files
 */
public class MyReduceInner {

    public static class MyMapper extends Mapper<LongWritable, Text,Text,Text>{

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            //determine which input file the current line comes from
            String path = ((FileSplit)context.getInputSplit()).getPath().toString();
            String [] words = value.toString().split(",");
            //the join key sits in a different column depending on the file;
            //the category file is the one whose path contains "1" (e.g. 1.csv)
            if (path.contains("1")){
                context.write(new Text(words[0]),new Text("type:"+words[1]));
            }else {
                context.write(new Text(words[1]),new Text("context:"+words[0]+":"+words[1]+":"+words[2]));
            }
        }

    }

    public static class MyReduce extends Reducer<Text,Text,Text, NullWritable>{
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            //the records arriving here are already grouped by category id and mix two kinds of tagged values
            //first copy the iterator's contents into a list (Hadoop reuses the Text object between iterations)
            List<Text> lst = new ArrayList<>();
            for (Text tx : values) {
                String s = tx.toString();
                lst.add(new Text(s));
            }
            //find the value tagged "type:" to obtain the category name
            String typeInfo="";
            for (Text tx : lst) {
                String val = tx.toString();
                if (val.contains("type")){
                    typeInfo = val.substring(val.indexOf(":")+1);
                    //remove the type entry from the list
                    lst.remove(tx);
                    break;
                }
            }
            //for the remaining values, replace the category id with the category name
            for (Text tx : lst) {
                String [] infos = tx.toString().split(":");
                infos[2] = typeInfo;
                context.write(new Text(Arrays.toString(infos)),NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws Exception{
        Tools.getInstance().checkpoint();
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(MyReduceInner.class);
        FileInputFormat.addInputPath(job,new Path("f://source3"));
        job.setMapperClass(MyMapper.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        FileOutputFormat.setOutputPath(job,new Path("f://ff"));
        job.waitForCompletion(true);
    }
}
