Parquet_1. 使用谓词下推和映射来优化 Job

文章中涉及的 Github 源码 : https://github.com/Hanmourang/hiped2/blob/master/src/main/java/hip/ch3/parquet/AvroProjectionParquetMapReduce.java

映射下推[projection pushdown]和谓词下推[predicate pushdown]包括一个执行引擎,用来将映射和谓词推送到存储格式中以便于在底层尽可能来优化操作。结果就是提高了时间和空间上的效率,由于与查询无关的列都会被摒弃,并且不需要提供给执行引擎。

这对柱状存储无疑是非常有效的,因为 Pushdown 允许存储格式跳过整个与查询无关的列组,并且柱状存储格式操作起来也更加高效。

接下来我们将看看如何在 Hadoop 管道中使用 Pushdown。

在编码之前,你需要在 Hive 和 Pig 中启动对 Parquet 提供的开箱即用的投影/映射下推。在 MapReduce 程序中,有些手动操作你需要加入到 Driver 代码中来启动下推操作。从下面的高亮部分可以看到具体操作。
Hive --> Predicates [set hive.optimize.ppd = true; ]
Pig   --> Projection [  以后补充]


具体代码实现如下:

package hip.ch3.parquet;

import com.google.common.collect.Lists;
import hip.ch3.avro.gen.Stock;
import hip.ch3.avro.gen.StockAvg;
import hip.util.Cli;
import hip.util.CliCommonOpts;
import org.apache.avro.Schema;
import org.apache.commons.math.stat.descriptive.moment.Mean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import parquet.avro.AvroParquetInputFormat;
import parquet.avro.AvroParquetOutputFormat;
import parquet.column.ColumnReader;
import parquet.filter.ColumnPredicates;
import parquet.filter.ColumnRecordFilter;
import parquet.filter.RecordFilter;
import parquet.filter.UnboundRecordFilter;

import java.io.IOException;
import java.util.List;

public class AvroProjectionParquetMapReduce extends Configured implements Tool {

  /**
   * Main entry point for the example.
   *
   * @param args arguments
   * @throws Exception when something goes wrong
   */
  public static void main(final String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new AvroProjectionParquetMapReduce(), args);
    System.exit(res);
  }

  /**
   * The MapReduce driver - sets up and launches the job, enabling both
   * predicate pushdown (via an {@link UnboundRecordFilter}) and projection
   * pushdown (via a reduced Avro schema) on the Parquet input.
   *
   * @param args the command-line arguments
   * @return the process exit code
   * @throws Exception if something goes wrong
   */
  public int run(final String[] args) throws Exception {

    Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.MrIoOpts.values()).build();
    int result = cli.runCmd();

    if (result != 0) {
      return result;
    }

    Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
    Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

    Configuration conf = super.getConf();

    // Job.getInstance(conf) is the non-deprecated replacement for "new Job(conf)"
    Job job = Job.getInstance(conf);
    job.setJarByClass(AvroProjectionParquetMapReduce.class);

    job.setInputFormatClass(AvroParquetInputFormat.class);
    AvroParquetInputFormat.setInputPaths(job, inputPath);

    // Predicate pushdown: records filtered out by GoogleStockFilter are
    // skipped at the storage layer and reach the mapper as null.
    AvroParquetInputFormat.setUnboundRecordFilter(job, GoogleStockFilter.class);

    // Projection pushdown: derive a reduced schema from the original one,
    // keeping only the fields this job actually reads.
    Schema projection = Schema.createRecord(Stock.SCHEMA$.getName(),
        Stock.SCHEMA$.getDoc(), Stock.SCHEMA$.getNamespace(), false);
    List<Schema.Field> fields = Lists.newArrayList();
    for (Schema.Field field : Stock.SCHEMA$.getFields()) {
      // only the stock symbol and the opening price are projected
      if ("symbol".equals(field.name()) || "open".equals(field.name())) {
        fields.add(new Schema.Field(field.name(), field.schema(), field.doc(),
            field.defaultValue(), field.order()));
      }
    }
    projection.setFields(fields);
    // register the projection for the job
    AvroParquetInputFormat.setRequestedProjection(job, projection);


    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputFormatClass(AvroParquetOutputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);
    AvroParquetOutputFormat.setSchema(job, StockAvg.SCHEMA$);

    return job.waitForCompletion(true) ? 0 : 1;
  }

  /**
   * Predicate-pushdown filter: keeps only records whose "symbol" column
   * equals "GOOG". Parquet instantiates this class reflectively, so it
   * must expose a public no-arg constructor.
   */
  public static class GoogleStockFilter implements UnboundRecordFilter {

    private final UnboundRecordFilter filter;

    public GoogleStockFilter() {
      // filter predicate selecting records whose stock symbol equals "GOOG"
      filter = ColumnRecordFilter.column("symbol", ColumnPredicates.equalTo("GOOG"));
    }

    @Override
    public RecordFilter bind(Iterable<ColumnReader> readers) {
      return filter.bind(readers);
    }
  }

  /**
   * Mapper: still receives full Stock objects, but thanks to the projection
   * only "symbol" and "open" are populated; emits symbol -> opening price.
   */
  public static class Map extends Mapper<Void, Stock, Text, DoubleWritable> {

    @Override
    public void map(Void key,
                    Stock value,
                    Context context) throws IOException, InterruptedException {
      // records removed by the predicate pushdown arrive as null, so guard
      if (value != null) {
        context.write(new Text(value.getSymbol().toString()),
            new DoubleWritable(value.getOpen()));   // only symbol and open survive the projection; other fields are null
      }
    }
  }

  /**
   * Reducer: averages the opening prices per symbol and writes a StockAvg
   * record to the Parquet output (the key slot is unused, hence null).
   */
  public static class Reduce extends Reducer<Text, DoubleWritable, Void, StockAvg> {

    @Override
    protected void reduce(Text key, Iterable<DoubleWritable> values, Context context)
        throws IOException, InterruptedException {
      Mean mean = new Mean();
      for (DoubleWritable val : values) {
        mean.increment(val.get());
      }
      StockAvg avg = new StockAvg();
      avg.setSymbol(key.toString());
      avg.setAvg(mean.getResult());
      context.write(null, avg);
    }
  }
}


运行上面的代码,可以看到股票代码为 'GOOG' 的股票平均值:

$ hip hip.ch3.parquet.AvroProjectionParquetMapReduce \
    --input stocks.parquet \
    --output output

$ hip --nolib parquet.tools.Main cat output/part-r-00000.parquet
symbol = GOOG
avg = 417.47799999999995

你可能感兴趣的:(Hive,Pig,Parquet)