MapReduce高阶训练 10

1. 社交粉丝数据分析

1.1 逻辑分析

以下是qq的好友列表数据,冒号前是一个用户,冒号后是该用户的所有好友(数据中的好友关系是单向的)

A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J

求出哪些人两两之间有共同好友,及他俩的共同好友都有谁?

MapReduce高阶训练 10_第1张图片
MapReduce高阶训练 10_第2张图片

  • 第一步:

    • map
      读一行 A:B,C,D,F,E,O
      输出 <B,A> <C,A> <D,A> <F,A> <E,A> <O,A>
      再读一行 B:A,C,E,K
      输出 <A,B> <C,B> <E,B> <K,B>
    • REDUCE
      拿到的数据比如 <C,A> <C,B> <C,E> <C,F> <C,G> ……
      输出: C  A-B-E-F-G-……(key为好友,value为所有拥有该好友的用户,用“-”拼接)





  • 第二步

    • map
      读入一行 C  A-B-E-F-G-……
      将用户排序后两两组合,输出 <A-B,C> <A-E,C> <A-F,C> ……
    • reduce
      读入数据 <A-B,C> <A-B,E>
      输出: A-B C,E

1.2 代码实现

  • 第一步
/**
 * Step one of the common-friends computation.
 *
 * <p>Every input line has the form {@code user:friend1,friend2,...}. The job inverts that
 * relation: for each friend it collects every user whose list contains that friend and writes
 * them as one string in which each user is followed by {@code -} (for example {@code A-B-E-}).
 */
public class ComonsFriendsStepOne  extends Configured implements Tool {
    /**
     * Configures and runs the job against the hard-coded local input and output directories.
     *
     * @param args remaining command-line arguments (not used)
     * @return 0 when the job completes successfully, 1 otherwise
     */
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = super.getConf();
        Job job = Job.getInstance(conf, ComonsFriendsStepOne.class.getSimpleName());
        // Same call the other drivers in this file make; lets the framework locate the job jar.
        job.setJarByClass(ComonsFriendsStepOne.class);
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\共同好友\\input"));
        job.setMapperClass(ComonsFriendsStepOneMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(ComonsFriendsStepOneReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\共同好友\\output"));
        boolean succeeded = job.waitForCompletion(true);
        return succeeded ? 0 : 1;
    }

    /** Emits one {@code (friend, user)} pair for every friend listed on a {@code user:friends} line. */
    public  static class ComonsFriendsStepOneMapper  extends Mapper<LongWritable,Text,Text,Text>{
        // Reused across records; context.write serializes the current contents immediately.
        private final Text friendKey = new Text();
        private final Text personValue = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split(":");
            if (split.length < 2) {
                // Blank line or a user without a friend list: nothing to invert.
                return;
            }
            personValue.set(split[0]);
            for (String friend : split[1].split(",")) {
                friendKey.set(friend);
                context.write(friendKey, personValue);
            }
        }
    }

    /** Concatenates all users that share the same friend, each followed by {@code -}. */
    public static class ComonsFriendsStepOneReducer extends Reducer<Text,Text,Text,Text>{
        @Override
        protected void reduce(Text friend, Iterable<Text> persons, Context context) throws IOException, InterruptedException {
            StringBuilder joined = new StringBuilder();
            for (Text person : persons) {
                // The trailing "-" is kept on purpose: step two splits this value on "-".
                joined.append(person).append("-");
            }
            context.write(friend,new Text(joined.toString()));
        }
    }

    /** Runs the job through ToolRunner and exits with the job's status code. */
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        int exitCode = ToolRunner.run(configuration,new ComonsFriendsStepOne(),args);
        System.exit(exitCode);
    }
}
  • 第二步
/**
 * Step two of the common-friends computation.
 *
 * <p>Reads the output of step one (a friend, a tab, then the users that list this friend, each
 * followed by {@code -}), builds every pair of those users and finally gathers, per pair, all
 * friends the two users have in common.
 */
public class ComonsFriendsStepTwo extends Configured implements Tool {
    /**
     * Configures and runs the job; the input directory is the output directory of step one.
     *
     * @param args remaining command-line arguments (not used)
     * @return 0 when the job completes successfully, 1 otherwise
     */
    @Override
    public int run(String[] args) throws Exception {

        Job job = Job.getInstance(super.getConf(), ComonsFriendsStepTwo.class.getSimpleName());
        // Same call the other drivers in this file make; lets the framework locate the job jar.
        job.setJarByClass(ComonsFriendsStepTwo.class);
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\共同好友\\output"));
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setMapperClass(ComonsFriendStepTwoMapper.class);
        job.setReducerClass(ComonsFriendStepTwoReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\共同好友\\outstep2"));
        boolean succeeded = job.waitForCompletion(true);
        return succeeded ? 0 : 1;
    }

    /**
     * Turns one step-one line into {@code (userX-userY, friend)} pairs.
     *
     * <p>Sample input lines:
     * <pre>
     * A   F-D-O-I-H-B-K-G-C-
     * B   E-A-J-F-
     * C   K-A-B-E-F-G-H-
     * </pre>
     */
    public  static  class ComonsFriendStepTwoMapper  extends Mapper<LongWritable,Text,Text,Text>{
        // Reused across records; context.write serializes the current contents immediately.
        private final Text pairKey = new Text();
        private final Text friendValue = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split("\t");
            if (split.length < 2) {
                // Blank or malformed line without a user list: no pairs can be formed.
                return;
            }
            friendValue.set(split[0]);
            String[] persons = split[1].split("-");
            // Sort so that a pair is always written in one order (b-c, never c-b).
            Arrays.sort(persons);
            for (int i = 0; i < persons.length - 1; i++) {
                for (int j = i + 1; j < persons.length; j++) {
                    pairKey.set(persons[i] + "-" + persons[j]);
                    context.write(pairKey, friendValue);
                }
            }
        }
    }

    /** Joins all common friends of one user pair, each followed by a tab character. */
    public static class ComonsFriendStepTwoReducer extends Reducer<Text,Text,Text,Text>{
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            StringBuilder joined = new StringBuilder();
            for (Text value : values) {
                joined.append(value.toString()).append("\t");
            }
            context.write(key,new Text(joined.toString()));
        }
    }

    /** Runs the job through ToolRunner and exits with the job's status code. */
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new Configuration(),new ComonsFriendsStepTwo(),args);
        System.exit(exitCode);
    }
}

2. 倒排索引建立

2.1 需求

有大量的文本(文档、网页),需要建立搜索索引

2.2 思路分析

首先将文档的内容全部读取出来,以“单词-文档名”作为key,value为1,组织成如下形式的数据
map端数据输出
hello-a.txt 1
hello-a.txt 1
hello-a.txt 1
reduce端数据输出
hello-a.txt 3

2.3 代码实现

/**
 * Builds a simple inverted index: for every {@code word--fileName} combination found in the
 * input directory, the job outputs how many times it occurs.
 */
public class IndexCreate extends Configured implements Tool {
    /** Runs the job through ToolRunner and exits with the job's status code. */
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new Configuration(),new IndexCreate(),args);
        System.exit(exitCode);
    }

    /**
     * Configures and runs the job against the hard-coded local input and output directories.
     *
     * @param args remaining command-line arguments (not used)
     * @return 0 when the job completes successfully, 1 otherwise
     */
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(super.getConf(), IndexCreate.class.getSimpleName());
        // Same call the other drivers in this file make; lets the framework locate the job jar.
        job.setJarByClass(IndexCreate.class);
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\倒排索引\\input"));
        job.setMapperClass(IndexCreateMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setReducerClass(IndexCreateReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\倒排索引\\outindex"));
        boolean bool = job.waitForCompletion(true);
        return bool?0:1;
    }

    /** Emits {@code (word--fileName, 1)} for every space-separated word of a line. */
    public static class IndexCreateMapper extends Mapper<LongWritable,Text,Text,IntWritable>{
        Text text = new Text();
        IntWritable v = new IntWritable(1);
        // Name of the file this mapper's split belongs to; constant for the whole task.
        private String name;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // The split is resolved once per task instead of once per record.
            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            name = fileSplit.getPath().getName();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            for (String word : line.split(" ")) {
                if (word.isEmpty()) {
                    // Blank lines and repeated spaces yield empty tokens; they are not words.
                    continue;
                }
                // Key is word--fileName, value is the constant 1.
                text.set(word+"--"+name);
                context.write(text,v);
            }
        }
    }

    /** Sums the occurrence counts of one {@code word--fileName} key. */
    public static class IndexCreateReducer extends Reducer<Text,IntWritable,Text,IntWritable>{
        IntWritable value = new IntWritable();
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int count = 0;
            // Loop variable deliberately does not reuse the name of the 'value' field.
            for (IntWritable partial : values) {
                count += partial.get();
            }
            value.set(count);
            context.write(key,value);
        }
    }
}

3. 自定义inputFormat合并小文件

3.1 需求

无论hdfs还是mapreduce,对于小文件都有损效率,实践中,又难免面临处理大量小文件的场景,此时,就需要有相应解决方案

3.2 分析

小文件的优化无非以下几种方式:
1、在数据采集的时候,就将小文件或小批数据合成大文件再上传HDFS
2、在业务处理之前,在HDFS上使用mapreduce程序对小文件进行合并
3、在mapreduce处理时,可采用CombineFileInputFormat(例如CombineTextInputFormat)提高效率

3.3 实现

本节实现的是上述第二种方式
程序的核心机制:
自定义一个InputFormat
改写RecordReader,实现一次读取一个完整文件封装为KV
在输出时使用SequenceFileOutputFormat输出合并文件
MapReduce高阶训练 10_第3张图片
MapReduce高阶训练 10_第4张图片

3.4 代码

  • 自定义InputFormat
/**
 * Input format whose files are never split and whose records are produced by
 * {@link WholeFileRecordReader}.
 */
public class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable>{
   /**
    * Always answers {@code false}, so a file is never divided into several splits.
    */
   @Override
   protected boolean isSplitable(JobContext context, Path file) {
      return false;
   }

   /**
    * Creates a {@link WholeFileRecordReader} and initializes it for the given split before
    * handing it back.
    */
   @Override
   public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException,InterruptedException {
      RecordReader<NullWritable, BytesWritable> wholeFileReader = new WholeFileRecordReader();
      wholeFileReader.initialize(split, context);
      return wholeFileReader;
   }
}
  • 自定义RecordReader
/**
 * Record reader that delivers the complete content of its file split as one single record.
 *
 * <p>How it works: {@link #nextKeyValue()} reads the data and builds the value;
 * {@link #getCurrentKey()} and {@link #getCurrentValue()} return the key and the value built by
 * that call. The key is always {@link NullWritable}.
 */
public class WholeFileRecordReader extends RecordReader<NullWritable, BytesWritable> {
   private FileSplit fileSplit;
   private Configuration conf;
   private BytesWritable value = new BytesWritable();
   // Set once the single record has been produced.
   private boolean processed = false;

   /** Remembers the split and the job configuration; no I/O happens here. */
   @Override
   public void initialize(InputSplit split, TaskAttemptContext context)
         throws IOException, InterruptedException {
      this.fileSplit = (FileSplit) split;
      this.conf = context.getConfiguration();
   }

   /**
    * Loads the whole split into the value on the first call.
    *
    * @return {@code true} on the first call, {@code false} on every later call
    * @throws IOException if the file cannot be read or is too large for a single byte array
    */
   @Override
   public boolean nextKeyValue() throws IOException, InterruptedException {
      if (processed) {
         return false;
      }
      long length = fileSplit.getLength();
      Path file = fileSplit.getPath();
      if (length > Integer.MAX_VALUE) {
         // A plain (int) cast would silently truncate the length and corrupt the record.
         throw new IOException("File is too large to be read as a single record ("
               + length + " bytes): " + file);
      }
      byte[] contents = new byte[(int) length];
      FileSystem fs = file.getFileSystem(conf);
      FSDataInputStream in = null;
      try {
         in = fs.open(file);
         IOUtils.readFully(in, contents, 0, contents.length);
         value.set(contents, 0, contents.length);
      } finally {
         IOUtils.closeStream(in);
      }
      processed = true;
      return true;
   }

   @Override
   public NullWritable getCurrentKey() throws IOException,
         InterruptedException {
      return NullWritable.get();
   }

   @Override
   public BytesWritable getCurrentValue() throws IOException,
         InterruptedException {
      return value;
   }

   /** Progress is all-or-nothing because there is exactly one record. */
   @Override
   public float getProgress() throws IOException {
      return processed ? 1.0f : 0.0f;
   }

   /** Nothing to release: the input stream is already closed inside {@link #nextKeyValue()}. */
   @Override
   public void close() throws IOException {
   }
}
  • 自定义MapReduce逻辑
/**
 * Job that reads input files through {@link WholeFileInputFormat} and writes
 * {@code (file path, file content)} records with {@code SequenceFileOutputFormat}.
 */
public class SmallFilesToSequenceFileConverter extends Configured implements Tool {
   /** Pairs every incoming file content with the path of the file it came from. */
   static class SequenceFileMapper extends
         Mapper<NullWritable, BytesWritable, Text, BytesWritable> {
      private Text filenameKey;

      /** Resolves the path of the current split once per task. */
      @Override
      protected void setup(Context context) throws IOException,
            InterruptedException {
         InputSplit split = context.getInputSplit();
         Path path = ((FileSplit) split).getPath();
         filenameKey = new Text(path.toString());
      }

      @Override
      protected void map(NullWritable key, BytesWritable value,
            Context context) throws IOException, InterruptedException {
         context.write(filenameKey, value);
      }
   }

   /**
    * Configures and runs the job against the hard-coded local input and output directories.
    *
    * @param args remaining command-line arguments (not used)
    * @return 0 when the job completes successfully, 1 otherwise
    */
   @Override
   public int run(String[] args) throws Exception {
      // Use the configuration injected by ToolRunner so that generic options (-D, -conf, ...)
      // take effect; fall back to a fresh one when run() is invoked without ToolRunner.
      Configuration conf = getConf();
      if (conf == null) {
         conf = new Configuration();
      }
      Job job = Job.getInstance(conf,"combine small files to sequencefile");
      job.setJarByClass(SmallFilesToSequenceFileConverter.class);
      job.setInputFormatClass(WholeFileInputFormat.class);
      WholeFileInputFormat.addInputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\自定义inputformat_小文件合并\\input"));
      job.setOutputFormatClass(SequenceFileOutputFormat.class);
      SequenceFileOutputFormat.setOutputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\自定义inputformat_小文件合并\\output"));
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(BytesWritable.class);
      job.setMapperClass(SequenceFileMapper.class);
      return job.waitForCompletion(true) ? 0 : 1;
   }

   /** Runs the job through ToolRunner and exits with the job's status code. */
   public static void main(String[] args) throws Exception {
      int exitCode = ToolRunner.run(new SmallFilesToSequenceFileConverter(),
            args);
      System.exit(exitCode);
   }
}

4. 自定义outputFormat输出文件分文件夹

4.1 需求

现在有一些订单的评论数据,需求,将订单的好评与差评进行区分开来,将最终的数据分开到不同的文件夹下面去,数据内容参见资料文件夹,其中数据第九个字段表示好评,中评,差评。0:好评,1:中评,2:差评

4.2 分析

程序的关键点是要在一个mapreduce程序中根据数据的不同输出两类结果到不同目录,这类灵活的输出需求可以通过自定义outputformat来实现
MapReduce高阶训练 10_第5张图片

4.3 实现

1、在mapreduce中访问外部资源
2、自定义outputformat,改写其中的recordwriter,改写具体输出数据的方法write()

  • 自定义一个outputformat
/**
 * Output format that distributes records over two fixed files, depending on the field at
 * index 9 of the tab-separated record: records whose field equals {@code "0"} go to
 * {@code out2\2.txt}, all other records go to {@code out1\1.txt}.
 */
public class MyOutPutFormat extends FileOutputFormat<Text,NullWritable> {
    /**
     * Opens both target files and returns a writer that routes records between them.
     *
     * @throws IOException if one of the target files cannot be created
     */
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
        Path enhancePath = new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\自定义outputformat\\out1\\1.txt");
        Path toCrawlPath = new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\自定义outputformat\\out2\\2.txt");
        // Resolve the file system from each path: FileSystem.get(conf) returns the *default*
        // file system, which rejects file:/// paths whenever the default is not the local one.
        FSDataOutputStream enhanceOut = enhancePath.getFileSystem(context.getConfiguration()).create(enhancePath);
        FSDataOutputStream toCrawlOut;
        try {
            toCrawlOut = toCrawlPath.getFileSystem(context.getConfiguration()).create(toCrawlPath);
        } catch (IOException | RuntimeException e) {
            // Do not leak the stream that was already opened.
            try {
                enhanceOut.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
        return new MyRecordWriter(enhanceOut,toCrawlOut);
    }

    /** Writes each record, followed by CRLF, to one of two already opened streams. */
    static class MyRecordWriter extends RecordWriter<Text, NullWritable>{

        private static final byte[] LINE_SEPARATOR = {'\r', '\n'};

        FSDataOutputStream enhanceOut = null;
        FSDataOutputStream toCrawlOut = null;

        public MyRecordWriter(FSDataOutputStream enhanceOut, FSDataOutputStream toCrawlOut) {
            this.enhanceOut = enhanceOut;
            this.toCrawlOut = toCrawlOut;
        }

        /**
         * Routes the record by the field at index 9: {@code "0"} goes to {@code toCrawlOut},
         * anything else to {@code enhanceOut}.
         */
        @Override
        public void write(Text key, NullWritable value) throws IOException, InterruptedException {
            boolean statusIsZero = key.toString().split("\t")[9].equals("0");
            FSDataOutputStream target = statusIsZero ? toCrawlOut : enhanceOut;
            // Write the record's own bytes instead of re-encoding through String.getBytes(),
            // whose result depends on the platform default charset.
            target.write(key.getBytes(), 0, key.getLength());
            target.write(LINE_SEPARATOR);
        }

        /** Closes both streams; the second one is closed even when closing the first fails. */
        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            try {
                if(toCrawlOut!=null){
                    toCrawlOut.close();
                }
            } finally {
                if(enhanceOut!=null){
                    enhanceOut.close();
                }
            }
        }
    }
}
  • 开发mapreduce处理流程
/**
 * Driver of the job that forwards every input line unchanged to {@link MyOutPutFormat}.
 */
public class MyOwnOutputFormatMain extends Configured implements Tool {
    /**
     * Configures and runs the job against the hard-coded local input and output directories.
     *
     * @param args remaining command-line arguments (not used)
     * @return 0 when the job completes successfully, 1 otherwise
     */
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = super.getConf();
        Job job = Job.getInstance(conf, MyOwnOutputFormatMain.class.getSimpleName());
        job.setJarByClass(MyOwnOutputFormatMain.class);
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\自定义outputformat\\input"));
        job.setMapperClass(MyOwnMapper.class);
        job.setMapOutputKeyClass(Text.class);
        // Was setOutputValueClass (a copy-paste slip): the map output value class is meant here.
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputFormatClass(MyOutPutFormat.class);
        // An output directory still has to be configured; according to the original author's
        // note, the success marker file ends up in this directory.
        MyOutPutFormat.setOutputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\自定义outputformat\\out2"));
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        boolean succeeded = job.waitForCompletion(true);
        return succeeded ? 0 : 1;
    }

    /** Passes each input line through as the output key with a {@link NullWritable} value. */
    public static class MyOwnMapper extends Mapper<LongWritable,Text,Text,NullWritable>{
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // The line is forwarded untouched; nothing in this mapper uses its fields.
            context.write(value,NullWritable.get());
        }
    }

    /** Runs the job through ToolRunner and exits with the job's status code. */
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        int exitCode = ToolRunner.run(configuration,new MyOwnOutputFormatMain(),args);
        System.exit(exitCode);
    }
}

5. 分组求topN

  • 自定义GroupingComparator求取topN
  • GroupingComparator是mapreduce当中reduce端的一个功能组件,主要的作用是决定哪些数据作为一组,调用一次reduce的逻辑,默认是每个不同的key,作为多个不同的组,每个组调用一次reduce逻辑,
  • 我们可以自定义GroupingComparator实现不同的key作为同一个组,调用一次reduce逻辑

5.1 需求

MapReduce高阶训练 10_第6张图片

5.2 分析

1、利用“订单id和成交金额”作为key,可以将map阶段读取到的所有订单数据按照id分区,按照金额排序,发送到reduce
2、在reduce端利用groupingcomparator将订单id相同的kv聚合成组,然后取第一个即是最大值
MapReduce高阶训练 10_第7张图片

5.3 代码实现

  • 定义OrderBean
    定义一个OrderBean,里面定义两个字段,第一个字段是我们的orderId,第二个字段是我们的金额(注意金额一定要使用Double或者DoubleWritable类型,否则没法按照金额顺序排序)
/**
 * Composite key made of an order id and a price.
 *
 * <p>Natural ordering: ascending by order id; for equal ids, descending by price, so the most
 * expensive entry of an order sorts first. {@link #equals(Object)} and {@link #hashCode()} are
 * consistent with that ordering.
 */
public class OrderBean implements WritableComparable<OrderBean> {
    private String orderId;
    private Double price;

    /**
     * Compares by order id first; equal ids are ordered by price, larger price first.
     */
    @Override
    public int compareTo(OrderBean o) {
        int byOrderId = this.orderId.compareTo(o.orderId);
        if (byOrderId != 0) {
            return byOrderId;
        }
        // Same order: reverse the price comparison so that higher prices come first.
        return o.price.compareTo(this.price);
    }

    /** Two beans are equal when both the order id and the price are equal. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof OrderBean)) {
            return false;
        }
        OrderBean other = (OrderBean) obj;
        boolean sameId = orderId == null ? other.orderId == null : orderId.equals(other.orderId);
        boolean samePrice = price == null ? other.price == null : price.equals(other.price);
        return sameId && samePrice;
    }

    @Override
    public int hashCode() {
        int result = orderId == null ? 0 : orderId.hashCode();
        result = 31 * result + (price == null ? 0 : price.hashCode());
        return result;
    }

    /** Serializes the order id followed by the price. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(orderId);
        out.writeDouble(price);
    }

    /** Restores the fields in the same order in which {@link #write(DataOutput)} stores them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.orderId =  in.readUTF();
        this.price = in.readDouble();
    }

    public OrderBean() {
    }

    public OrderBean(String orderId, Double price) {
        this.orderId = orderId;
        this.price = price;
    }

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public Double getPrice() {
        return price;
    }

    public void setPrice(Double price) {
        this.price = price;
    }

    /** Returns the order id and the price separated by a tab. */
    @Override
    public String toString() {
        return  orderId +"\t"+price;
    }
}
  • 自定义分区
    按照订单id进行分区,把所有订单id相同的数据,都发送到同一个reduce中去
/**
 * Partitioner that depends on the order id only, so all keys with the same order id receive the
 * same partition number.
 */
public class OrderPartition extends Partitioner<OrderBean,NullWritable> {
    @Override
    public int getPartition(OrderBean orderBean, NullWritable nullWritable, int i) {
        String orderId = orderBean.getOrderId();
        // Clear the sign bit so that the remainder below can never be negative.
        int nonNegativeHash = orderId.hashCode() & Integer.MAX_VALUE;
        return nonNegativeHash % i;
    }
}
  • 自定义groupingComparator
    按照我们自己的逻辑进行分组:通过比较订单id,将订单id相同的数据放到同一个组里面去。经过分组之后,组内的数据已经全部是排好序的,我们只需要取前topN即可
/**
 * Comparator for {@link OrderBean} keys that looks at the order id only, so keys with the same
 * order id compare as equal regardless of their price.
 */
public class MyGroupIngCompactor extends WritableComparator {
    /**
     * Registers {@link OrderBean} as the key type handled by this comparator; the {@code true}
     * flag asks the base class to create key instances.
     */
    public MyGroupIngCompactor() {
        super(OrderBean.class,true);
    }

    /** Orders two {@link OrderBean} keys by their order id alone. */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        String leftOrderId = ((OrderBean) a).getOrderId();
        String rightOrderId = ((OrderBean) b).getOrderId();
        return leftOrderId.compareTo(rightOrderId);
    }
}
  • 程序main函数入口
/**
 * Driver of the "top N per order" job.
 *
 * <p>Keys are {@link OrderBean}s, partitioned by {@link OrderPartition} and grouped by
 * {@link MyGroupIngCompactor}. The reducer writes the first N keys of every group; N is read
 * from the configuration entry {@link #TOP_N_KEY} and defaults to 1.
 */
public class GroupingCompactorMain extends Configured implements Tool {
    /** Configuration key holding how many records per order are written. */
    public static final String TOP_N_KEY = "grouping.topn";
    /** Number of records per order written when {@link #TOP_N_KEY} is not set. */
    public static final int DEFAULT_TOP_N = 1;

    /**
     * Configures and runs the job against the hard-coded local input and output directories.
     *
     * @param args remaining command-line arguments (not used)
     * @return 0 when the job completes successfully, 1 otherwise
     */
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(super.getConf(), GroupingCompactorMain.class.getSimpleName());
        // Same call the other drivers in this file make; lets the framework locate the job jar.
        job.setJarByClass(GroupingCompactorMain.class);
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\自定义groupingComparator\\input"));
        job.setMapperClass(MyGroupingMapper.class);
        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setPartitionerClass(OrderPartition.class);
        job.setGroupingComparatorClass(MyGroupIngCompactor.class);
        job.setReducerClass(MyGroupingReducer.class);
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);
        job.setNumReduceTasks(2);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\自定义groupingComparator\\output"));
        boolean succeeded = job.waitForCompletion(true);
        return succeeded ? 0 : 1;
    }

    /** Builds an {@link OrderBean} from column 0 (order id) and column 2 (price) of each line. */
    public static class MyGroupingMapper extends Mapper<LongWritable,Text,OrderBean,NullWritable>{

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split("\t");
            OrderBean orderBean = new OrderBean(split[0], Double.valueOf(split[2]));
            context.write(orderBean,NullWritable.get());
        }
    }

    /** Writes the first N keys of every order group. */
    public static class MyGroupingReducer extends Reducer<OrderBean,NullWritable,OrderBean,NullWritable>{
        private int topN = DEFAULT_TOP_N;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Never go below one record per group, which is what the job always produced.
            topN = Math.max(1, context.getConfiguration().getInt(TOP_N_KEY, DEFAULT_TOP_N));
        }

        @Override
        protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            int emitted = 0;
            // While the values are iterated the framework refreshes 'key' with the key of the
            // current record, so writing 'key' inside the loop walks through the group in
            // sorted order. With N = 1 this writes exactly the first key of the group.
            for (NullWritable ignored : values) {
                if (emitted >= topN) {
                    break;
                }
                context.write(key,NullWritable.get());
                emitted++;
            }
        }
    }

    /** Runs the job through ToolRunner and exits with the job's status code. */
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new Configuration(),new GroupingCompactorMain(),args);
        System.exit(exitCode);
    }
}

6. MapReduce的其他补充

  • 多job串联
    一个稍复杂点的处理逻辑往往需要多个mapreduce程序串联处理,多job的串联可以借助mapreduce框架的JobControl实现
  • 实例代码
 ControlledJob cJob1 = new ControlledJob(job1.getConfiguration());
        ControlledJob cJob2 = new ControlledJob(job2.getConfiguration());
        ControlledJob cJob3 = new ControlledJob(job3.getConfiguration());
        cJob1.setJob(job1);
        cJob2.setJob(job2);
        cJob3.setJob(job3);
        // Declare the dependencies: job2 waits for job1, job3 waits for job2.
        cJob2.addDependingJob(cJob1);
        cJob3.addDependingJob(cJob2);
        JobControl jobControl = new JobControl("RecommendationJob");
        jobControl.addJob(cJob1);
        jobControl.addJob(cJob2);
        jobControl.addJob(cJob3);


        // Run the JobControl on its own thread and poll until every job has finished.
        Thread jobControlThread = new Thread(jobControl);
        jobControlThread.start();
        while (!jobControl.allFinished()) {
            Thread.sleep(500);
        }
        // "Finished" includes failed jobs, so inspect the failures before reporting a status.
        boolean anyJobFailed = !jobControl.getFailedJobList().isEmpty();
        jobControl.stop();

        return anyJobFailed ? 1 : 0;

你可能感兴趣的:(Hadoop)