06_mapreduce小demo_自定义分组聚合规则

mapreduce小demo

1.单词分类

a.txt    
hello tom   
hello jim   
hello kitty     
hello rose  

b.txt   
hello jerry     
hello jim   
hello kitty     
hello jack  

c.txt       
hello jerry     
hello java      
hello c++       
hello c++       

需要得到以下结果:   
hello  a.txt-->4  b.txt-->4  c.txt-->4      
java   c.txt-->1        
jerry  b.txt-->1  c.txt-->1 

....

1.1 思路

1 第一步:以“单词-文件名”为键,统计每个单词在每个文件中出现的次数
2 第二步:按单词分组,把同一个单词在各个文件中的统计结果合并为一条数据

1.2 代码

得到文件名
从输入切片信息中获取当前正在处理的一行数据所属的文件
InputSplit 的实现类是 FileSplit,强制转换后即可取得文件信息:FileSplit inputSplit = (FileSplit)context.getInputSplit();

1. 第一步

  1. IndexesMapper
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * Step-one mapper of the inverted-index demo.
 *
 * <p>For every word on an input line it emits the key {@code word-fileName} with the
 * count {@code 1}, where {@code fileName} is the name of the file the line came from.
 */
public class IndexesMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    /** The count attached to every single occurrence. */
    private static final IntWritable ONE = new IntWritable(1);

    /** Output key, reused across calls to avoid allocating a new object per record. */
    private final Text outKey = new Text();

    /**
     * Emits one {@code (word-fileName, 1)} pair per word of the line.
     *
     * @param key     input key supplied by the input format (not used here)
     * @param value   one line of the input file
     * @param context used to look up the current split and to write the output
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // The input split tells us which file the current line belongs to.
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        String fileName = inputSplit.getPath().getName();

        // Split on any run of whitespace so repeated spaces or tabs do not
        // produce empty "words".
        for (String word : value.toString().split("\\s+")) {
            if (word.isEmpty()) {
                // A line that starts with whitespace yields one leading empty token.
                continue;
            }
            outKey.set(word + "-" + fileName);
            context.write(outKey, ONE);
        }
    }
}


  2. IndexesReduce
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Step-one reducer of the inverted-index demo: adds up the counts received for one
 * {@code word-fileName} key and writes the total.
 */
public class IndexesReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    /** Output value, reused across calls. */
    private final IntWritable total = new IntWritable();

    /**
     * Sums all counts of one key.
     *
     * @param key     the {@code word-fileName} key
     * @param values  the counts emitted for that key
     * @param context used to write {@code (key, total)}
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            // Add the value itself rather than counting iterations, so the result
            // stays correct when a value is already a partial sum greater than 1.
            sum += value.get();
        }
        total.set(sum);
        context.write(key, total);
    }
}

  3. IndexesSub

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver for step one of the inverted-index demo: wires {@link IndexesMapper} and
 * {@link IndexesReduce} into a job and runs it.
 */
public class IndexesSub {
    /** Input directory used when no command-line argument is given. */
    private static final String DEFAULT_INPUT = "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\索引数据";

    /** Output directory used when fewer than two command-line arguments are given. */
    private static final String DEFAULT_OUTPUT = "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\output";

    /**
     * Configures and runs the job, then exits with 0 on success and 1 on failure.
     *
     * @param args optional: {@code args[0]} input path, {@code args[1]} output path;
     *             the defaults above are used for anything not supplied
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        job.setJarByClass(IndexesSub.class);

        job.setMapperClass(IndexesMapper.class);
        job.setReducerClass(IndexesReduce.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // Report the job outcome to the caller instead of discarding it.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

2. 第二步

1 IndexSecMapper

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Step-two mapper of the inverted-index demo.
 *
 * <p>Each input line is split at its first {@code '-'}: the part before it becomes the
 * output key and everything after it becomes the output value.
 */
public class IndexSecMapper extends Mapper<LongWritable, Text, Text, Text> {
    /** Output key, reused across calls. */
    private final Text word = new Text();

    /** Output value, reused across calls. */
    private final Text location = new Text();

    /**
     * Emits {@code (textBeforeFirstHyphen, textAfterFirstHyphen)} for one line.
     *
     * @param key     input key supplied by the input format (not used here)
     * @param value   one input line
     * @param context used to write the output pair
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();

        // Split only at the first '-', so a hyphen later in the line (for example
        // inside a file name) does not cut off the rest of the value.
        // NOTE(review): a word that itself contains '-' is still split at the wrong place.
        int separator = line.indexOf('-');
        if (separator < 0) {
            // Blank or malformed line: nothing to index.
            return;
        }

        word.set(line.substring(0, separator));
        location.set(line.substring(separator + 1));
        context.write(word, location);
    }
}

2 IndexSecReduce

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Step-two reducer of the inverted-index demo: concatenates all values received for
 * one key into a single output line.
 */
public class IndexSecReduce extends Reducer<Text, Text, Text, Text> {
    /**
     * Joins every value of the key, each prefixed with {@code "  >"}, and writes the
     * result as one record.
     *
     * @param key     the grouping key
     * @param values  the values collected for that key
     * @param context used to write {@code (key, joinedValues)}
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        StringBuilder joined = new StringBuilder();
        for (Text value : values) {
            joined.append("  >").append(value.toString());
        }
        context.write(key, new Text(joined.toString()));
    }
}

3 IndexSecSub

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver for step two of the inverted-index demo: wires {@link IndexSecMapper} and
 * {@link IndexSecReduce} into a job and runs it.
 */
public class IndexSecSub {
    /** Input directory used when no command-line argument is given. */
    private static final String DEFAULT_INPUT = "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\output";

    /** Output directory used when fewer than two command-line arguments are given. */
    private static final String DEFAULT_OUTPUT = "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\output2";

    /**
     * Configures and runs the job, then exits with 0 on success and 1 on failure.
     *
     * @param args optional: {@code args[0]} input path, {@code args[1]} output path;
     *             the defaults above are used for anything not supplied
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        job.setJarByClass(IndexSecSub.class);

        job.setMapperClass(IndexSecMapper.class);
        job.setReducerClass(IndexSecReduce.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // Report the job outcome to the caller instead of discarding it.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

2.需要求出每一个订单中成交金额最大的三笔

order001,u001,小米6,1999.9,2
order001,u001,雀巢咖啡,99.0,2
order001,u001,安慕希,250.0,2
order001,u001,经典红双喜,200.0,4
order001,u001,防水电脑包,400.0,2
order002,u002,小米手环,199.0,3
order002,u002,榴莲,15.0,10
order002,u002,苹果,4.5,20
order002,u002,肥皂,10.0,40
order003,u001,小米6,1999.9,2
order003,u001,雀巢咖啡,99.0,2
order003,u001,安慕希,250.0,2
order003,u001,经典红双喜,200.0,4
order003,u001,防水电脑包,400.0,2

2.1 代码

2.1.1 OrderMapper

/**
 * Mapper of the "top three items per order" demo.
 *
 * <p>Parses one comma-separated line into an {@link OrderBean} and emits it under the
 * value of the first field, which is passed to the bean as its {@code orderId}.
 */
public class OrderMapper extends Mapper<LongWritable, Text, Text, OrderBean> {
    /** Number of comma-separated fields that are read from a line. */
    private static final int FIELD_COUNT = 5;

    /**
     * Emits {@code (firstField, OrderBean)} for one input line.
     *
     * @param key     input key supplied by the input format (not used here)
     * @param value   one line: five comma-separated fields, the fourth parsed as a
     *                float and the fifth as an int
     * @param context used to write the output pair
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] split = value.toString().split(",");
        if (split.length < FIELD_COUNT) {
            // Blank or truncated line: skip it instead of failing the whole task.
            return;
        }

        context.write(new Text(split[0]),
                new OrderBean(split[0], split[1], split[2], Float.parseFloat(split[3]), Integer.parseInt(split[4])));
    }
}

2.1.2 OrderReduce

/**
 * Reducer of the "top three items per order" demo: sorts all items of one order with
 * {@link OrderBean#compareTo} and writes the first three.
 */
public class OrderReduce extends Reducer<Text, OrderBean, OrderBean, NullWritable> {
    /** Maximum number of items written per order. */
    private static final int TOP_N = 3;

    /**
     * Writes at most {@link #TOP_N} items of one order, in sorted order.
     *
     * @param key     the key the mapper grouped the items under
     * @param values  all items emitted under that key
     * @param context used to write each selected item
     */
    @Override
    protected void reduce(Text key, Iterable<OrderBean> values, Context context) throws IOException, InterruptedException {
        List<OrderBean> orderBeans = new ArrayList<>();

        for (OrderBean value : values) {
            // Store a copy of each value rather than the iterated instance itself.
            orderBeans.add(new OrderBean(value));
        }

        Collections.sort(orderBeans);

        // An order may contain fewer than TOP_N items; never index past the end.
        int limit = Math.min(TOP_N, orderBeans.size());
        for (int i = 0; i < limit; i++) {
            context.write(orderBeans.get(i), NullWritable.get());
        }
    }
}

2.1.3 OrderSub

/**
 * Driver for the "top three items per order" demo: wires {@link OrderMapper} and
 * {@link OrderReduce} into a job and runs it.
 */
public class OrderSub {
    /** Input directory used when no command-line argument is given. */
    private static final String DEFAULT_INPUT = "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\交金额最大的三笔\\input";

    /** Output directory used when fewer than two command-line arguments are given. */
    private static final String DEFAULT_OUTPUT = "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\交金额最大的三笔\\output";

    /**
     * Configures and runs the job, then exits with 0 on success and 1 on failure.
     *
     * @param args optional: {@code args[0]} input path, {@code args[1]} output path;
     *             the defaults above are used for anything not supplied
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        job.setJarByClass(OrderSub.class);
        job.setMapperClass(OrderMapper.class);
        job.setReducerClass(OrderReduce.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(OrderBean.class);

        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // Report the job outcome to the caller instead of discarding it.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

2.1.4 OrderBean

public class OrderBean implements WritableComparable {
    private String orderId;
    private String userId;
    private String comName;
    private Float price;
    private Integer nub;
    private Float subPrice;

    @Override
    public String toString() {
        return orderId+"\t"+userId+"\t"+comName+"\t"+price+"\t"+nub+subPrice;
    }


    public int compareTo(OrderBean o) {
        Float temp = o.getSubPrice() - this.getSubPrice();
        if (temp == 0){
            return this.getComName().compareTo(o.getComName());
        }else if (temp>0){
            return 1;
        }else{
            return 0;
        }
    }

    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(this.orderId);
        dataOutput.writeUTF(this.userId);
        dataOutput.writeUTF(this.comName);
        dataOutput.writeFloat(this.price);
        dataOutput.writeInt(this.nub);
        dataOutput.writeFloat(this.subPrice);
    }

    public void readFields(DataInput dataInput) throws IOException {
        this.orderId = dataInput.readUTF();
        this.userId = dataInput.readUTF();
        this.comName = dataInput.readUTF();
        this.price = dataInput.readFloat();
        this. nub = dataInput.readInt();
        this.subPrice = price*nub;
    }

    public OrderBean() {}

    public OrderBean(OrderBean ob){
        this.orderId = ob.getOrderId();
        this.userId = ob.getUserId();
        this.comName = ob.getComName();
        this.price = ob.getPrice();
        this.nub = ob.getNub();
        this.subPrice = ob.getSubPrice();
    }

    public OrderBean(String orderId, String userId, String comName, Float price, Integer nub) {
        this.orderId = orderId;
        this.userId = userId;
        this.comName = comName;
        this.price = price;
        this.nub = nub;
        this.subPrice = price*nub;
    }

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getUserId() {
        return userId;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public String getComName() {
        return comName;
    }

    public void setComName(String comName) {
        this.comName = comName;
    }

    public Float getPrice() {
        return price;
    }

    public void setPrice(Float price) {
        this.price = price;
    }

    public Integer getNub() {
        return nub;
    }

    public void setNub(Integer nub) {
        this.nub = nub;
    }

    public Float getSubPrice() {
        return subPrice;
    }

    public void setSubPrice(Float subPrice) {
        this.subPrice = subPrice;
    }
}

2.2 解释

  1. 排序
    1. bean 实现 WritableComparable 接口,并在 compareTo 中定义排序规则
          /**
           * Orders beans by descending subPrice, then by ascending comName.
           * Returns a negative number when this bean sorts first, a positive number
           * when it sorts after {@code o}, and zero only when both criteria are equal.
           */
          public int compareTo(OrderBean o) {
              // Arguments are swapped on purpose: a larger subPrice must sort first.
              int bySubPrice = Float.compare(o.getSubPrice(), this.getSubPrice());
              if (bySubPrice != 0) {
                  return bySubPrice;
              }
              return this.getComName().compareTo(o.getComName());
          }
    
    2. OrderReduce中使用
         // Typed list, so Collections.sort can use OrderBean.compareTo without unchecked warnings.
         List<OrderBean> orderBeans = new ArrayList<>();

         // Store a copy of each value rather than the iterated instance itself.
         for (OrderBean value : values) {
             orderBeans.add(new OrderBean(value));
         }

         // Sorts by the natural ordering defined in OrderBean.compareTo.
         Collections.sort(orderBeans);
    

2.2 输出

order001    u001    雀巢咖啡    99.0    2198.0
order001    u001    安慕希 250.0   2500.0
order001    u001    小米6 1999.9  23999.8
order002    u002    榴莲  15.0    10150.0
order002    u002    肥皂  10.0    40400.0
order002    u002    苹果  4.5 2090.0
order003    u001    小米6 1999.9  23999.8
order003    u001    雀巢咖啡    99.0    2198.0
order003    u001    安慕希 250.0   2500.0

2.3 优化

自定义分发规则

  1. 自定义 mapper分发的规则
/**
 * Partitioner that routes records by {@code orderId} only, so every item of one order
 * gets the same partition number.
 */
public class OptimizeOrderPartitione extends Partitioner<OrderBean, NullWritable> {

    /**
     * Picks the partition from the hash of the order id.
     *
     * @param orderBean    the map output key
     * @param nullWritable the map output value (not used here)
     * @param i            the divisor for the modulo, i.e. the number of partitions
     * @return a value in {@code [0, i)}
     */
    @Override
    public int getPartition(OrderBean orderBean, NullWritable nullWritable, int i) {
        // Masking with Integer.MAX_VALUE clears the sign bit, so a negative
        // hashCode cannot produce a negative partition number.
        return (orderBean.getOrderId().hashCode() & Integer.MAX_VALUE) % i;
    }
}
  2. 自定义 reduce对比的规则(分组比较器)
/**
 * Grouping comparator that compares two {@link OrderBean} keys by {@code orderId}
 * only.
 */
public class OptimizeOrderGroupingComparator extends WritableComparator {

    /** Registers {@link OrderBean} as the key class and asks the parent to create instances. */
    public OptimizeOrderGroupingComparator() {
        super(OrderBean.class, true);
    }

    /**
     * Compares the order ids of the two keys.
     *
     * @return zero when both keys carry the same order id, i.e. they are treated as the same
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        String leftOrderId = ((OrderBean) a).getOrderId();
        String rightOrderId = ((OrderBean) b).getOrderId();
        return leftOrderId.compareTo(rightOrderId);
    }
}

  3. bean 中的compareTo
/**
 * Orders beans by ascending orderId; beans with the same orderId are ordered by
 * descending subPrice.
 */
public int compareTo(OrderBean o) {
    int byOrderId = this.getOrderId().compareTo(o.getOrderId());
    if (byOrderId != 0) {
        return byOrderId;
    }
    // Arguments are swapped so that the larger subPrice sorts first.
    return Float.compare(o.getSubPrice(), this.getSubPrice());
}

代码

/**
 * Optimized "top three items per order" job: the {@link OrderBean} itself is the map
 * output key, and a custom partitioner and grouping comparator are registered so the
 * reducer no longer has to collect and sort the items of an order itself.
 */
public class OptimizeOrder {
    /** Parses each input line into an {@link OrderBean} key with an empty value. */
    public static class OpOrdMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {
        /** Number of comma-separated fields that are read from a line. */
        private static final int FIELD_COUNT = 5;

        NullWritable v = NullWritable.get();

        /**
         * Emits {@code (OrderBean, NullWritable)} for one comma-separated line.
         *
         * @param key     input key supplied by the input format (not used here)
         * @param value   one line: five comma-separated fields, the fourth parsed as
         *                a float and the fifth as an int
         * @param context used to write the output pair
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split(",");
            if (split.length < FIELD_COUNT) {
                // Blank or truncated line: skip it instead of failing the whole task.
                return;
            }

            context.write(new OrderBean(
                    split[0], split[1], split[2], Float.parseFloat(split[3]), Integer.parseInt(split[4])), v);
        }
    }

    /** Writes at most three records per reduce group. */
    public static class OpOrdReduce extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable> {
        /** Maximum number of records written per group. */
        private static final int TOP_N = 3;

        /**
         * Writes {@code key} once per iterated value and stops after {@link #TOP_N} writes.
         *
         * <p>NOTE(review): for this to output the three largest items, keys must
         * arrive sorted by orderId and then by descending subPrice, and the framework
         * must refresh {@code key} as the value iterator advances. Confirm against
         * the Hadoop Reducer documentation.
         *
         * @param key     the key of the current group
         * @param values  one entry per record of the group
         * @param context used to write the selected records
         */
        @Override
        protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            int written = 0;
            for (NullWritable value : values) {
                context.write(key, value);
                if (++written == TOP_N) {
                    return;
                }
            }
        }
    }

    /** Input directory used when no command-line argument is given. */
    private static final String DEFAULT_INPUT = "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\交金额最大的三笔\\input";

    /** Output directory used when fewer than two command-line arguments are given. */
    private static final String DEFAULT_OUTPUT = "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\交金额最大的三笔\\output2";

    /**
     * Configures and runs the job, then exits with 0 on success and 1 on failure.
     *
     * @param args optional: {@code args[0]} input path, {@code args[1]} output path;
     *             the defaults above are used for anything not supplied
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        job.setJarByClass(OptimizeOrder.class);

        job.setMapperClass(OpOrdMapper.class);
        job.setReducerClass(OpOrdReduce.class);

        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);

        // Register the custom partitioner.
        job.setPartitionerClass(OptimizeOrderPartitione.class);
        // Register the custom grouping comparator.
        job.setGroupingComparatorClass(OptimizeOrderGroupingComparator.class);

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // Report the job outcome to the caller instead of discarding it.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

3.共同好友分析

A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J

3.1 代码

  1. 第一步
public class MutualFriend {
    public static class MutualFriendMapper extends Mapper{
        Text k = new Text();
        Text v = new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split(":");
            String[] split1 = split[1].split(",");
            v.set(split[0]);
            for (String s : split1) {
                k.set(s);
                context.write(k,v);
            }
        }
    }


    public static class MutualFriendReduce extends Reducer{
        Text k = new Text();
        Text v = new Text();
        @Override
        protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException {
            List list = new ArrayList();
            for (Text value : values) {
                list.add(value.toString());
            }

            v.set(key);
            for (int i =0 ;i

结果:

I-K A
I-C A
I-B A
I-G A
I-F A
I-H A
I-O A
I-D A
K-C A
K-B A
K-G A
K-F A
K-H A
K-O A
K-D A
C-B A
C-G A
C-F A
C-H A
C-O A
C-D A
B-G A
B-F A
B-H A
B-O A
B-D A
G-F A
G-H A
G-O A
G-D A
F-H A
F-O A
F-D A
H-O A
H-D A
O-D A
A-F B
A-J B
A-E B
F-J B
F-E B
J-E B
A-E C
A-B C
A-H C
A-F C
A-G C
A-K C
E-B C
E-H C
E-F C
E-G C
E-K C
B-H C
B-F C
B-G C
B-K C
H-F C
H-G C
H-K C
F-G C
F-K C
G-K C
G-C D
G-K D
G-A D
G-L D
G-F D
G-E D
G-H D
C-K D
C-A D
C-L D
C-F D
C-E D
C-H D
K-A D
K-L D
K-F D
K-E D
K-H D
A-L D
A-F D
A-E D
A-H D
L-F D
L-E D
L-H D
F-E D
F-H D
E-H D
G-M E
G-L E
G-H E
G-A E
G-F E
G-B E
G-D E
M-L E
M-H E
M-A E
M-F E
M-B E
M-D E
L-H E
L-A E
L-F E
L-B E
L-D E
H-A E
H-F E
H-B E
H-D E
A-F E
A-B E
A-D E
F-B E
F-D E
B-D E
L-M F
L-D F
L-C F
L-G F
L-A F
M-D F
M-C F
M-G F
M-A F
D-C F
D-G F
D-A F
C-G F
C-A F
G-A F
O-C I
D-E L
E-F M
A-H O
A-I O
A-J O
A-F O
H-I O
H-J O
H-F O
I-J O
I-F O
J-F O
  2. 第二步
/**
 * Step two of the mutual-friend analysis: reads tab-separated {@code pair<TAB>name}
 * lines and collects all names of one pair into a single output line.
 */
public class MutualFriend2 {
    /** Splits each line at the tab into {@code (pair, name)}. */
    public static class MutualFriend2Mapper extends Mapper<LongWritable, Text, Text, Text> {
        Text k = new Text();
        Text v = new Text();

        /**
         * @param key     input key supplied by the input format (not used here)
         * @param value   one line of the form {@code pair<TAB>name}
         * @param context used to write {@code (pair, name)}
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split("\t");
            if (split.length < 2) {
                // Blank or malformed line: skip it instead of failing the whole task.
                return;
            }
            k.set(split[0]);
            v.set(split[1]);
            context.write(k, v);
        }
    }

    /** Joins all names of one pair, each followed by a tab. */
    public static class MutualFriend2Reduce extends Reducer<Text, Text, Text, Text> {
        Text k = new Text();
        Text v = new Text();

        /**
         * @param key     the pair
         * @param values  every name recorded for that pair
         * @param context used to write {@code (pair + ">", joinedNames)}
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            k.set(key + ">");
            StringBuilder joined = new StringBuilder();

            for (Text value : values) {
                joined.append(value.toString()).append("\t");
            }
            v.set(joined.toString());

            context.write(k, v);
        }
    }

    /** Input directory used when no command-line argument is given. */
    private static final String DEFAULT_INPUT = "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\共同好友\\output";

    /** Output directory used when fewer than two command-line arguments are given. */
    private static final String DEFAULT_OUTPUT = "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\共同好友\\output2";

    /**
     * Configures and runs the job, then exits with 0 on success and 1 on failure.
     *
     * @param args optional: {@code args[0]} input path, {@code args[1]} output path;
     *             the defaults above are used for anything not supplied
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(MutualFriend2.class);
        job.setMapperClass(MutualFriend2Mapper.class);
        job.setReducerClass(MutualFriend2Reduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // Report the job outcome to the caller instead of discarding it.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

结果(注意:下面的结果基于上面列出的第一步输出,其中配对没有统一顺序,所以同一对用户可能被拆成两个键,例如 A-D 和 D-A;生成配对时按字典序固定先后顺序即可把它们合并为一条):

A-B>    E   C   
A-D>    E   
A-E>    D   C   B   
A-F>    C   D   B   O   E   
A-G>    C   
A-H>    D   O   C   
A-I>    O   
A-J>    O   B   
A-K>    C   
A-L>    D   
B-D>    E   A   
B-F>    C   A   
B-G>    C   A   
B-H>    C   A   
B-K>    C   
B-O>    A   
C-A>    D   F   
C-B>    A   
C-D>    A   
C-E>    D   
C-F>    D   A   
C-G>    F   A   
C-H>    A   D   
C-K>    D   
C-L>    D   
C-O>    A   
D-A>    F   
D-C>    F   
D-E>    L   
D-G>    F   
E-B>    C   
E-F>    M   C   
E-G>    C   
E-H>    C   D   
E-K>    C   
F-B>    E   
F-D>    E   A   
F-E>    B   D   
F-G>    C   
F-H>    D   A   
F-J>    B   
F-K>    C   
F-O>    A   
G-A>    E   D   F   
G-B>    E   
G-C>    D   
G-D>    E   A   
G-E>    D   
G-F>    D   E   A   
G-H>    E   A   D   
G-K>    D   C   
G-L>    D   E   
G-M>    E   
G-O>    A   
H-A>    E   
H-B>    E   
H-D>    A   E   
H-F>    C   E   O   
H-G>    C   
H-I>    O   
H-J>    O   
H-K>    C   
H-O>    A   
I-B>    A   
I-C>    A   
I-D>    A   
I-F>    A   O   
I-G>    A   
I-H>    A   
I-J>    O   
I-K>    A   
I-O>    A   
J-E>    B   
J-F>    O   
K-A>    D   
K-B>    A   
K-C>    A   
K-D>    A   
K-E>    D   
K-F>    D   A   
K-G>    A   
K-H>    D   A   
K-L>    D   
K-O>    A   
L-A>    F   E   
L-B>    E   
L-C>    F   
L-D>    E   F   
L-E>    D   
L-F>    E   D   
L-G>    F   
L-H>    E   D   
L-M>    F   
M-A>    F   E   
M-B>    E   
M-C>    F   
M-D>    F   E   
M-F>    E   
M-G>    F   
M-H>    E   
M-L>    E   
O-C>    I   
O-D>    A   

你可能感兴趣的:(06_mapreduce小demo_自定义分组聚合规则)