hive 实现 udf row_number 以及遇到的问题

为hive的每条数据添加row_number, 首先添加行号,必须考虑到数据必须放在一个reduce中去执行。先上代码
package xx.xxxxx.hive.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.udf.UDFType;

@UDFType(deterministic = false)
public class RowNumber extends UDF {
     private static int MAX_VALUE = 50;
    private static String comparedColumn[] = new String[MAX_VALUE];
    private static int rowNum = 1;
   
    public int evaluate(Object ...args) {
        String columnValue[] = new String[args.length];
        for (int i = 0; i < args.length; i++){
            columnValue[i] = args[i].toString();
        }
        if (rowNum == 1) {
            for (int i = 0; i < columnValue.length; i++)
                comparedColumn[i] = columnValue[i];
        }

        for (int i = 0; i < columnValue.length; i++) {
            if (!comparedColumn[i].equals(columnValue[i])) {
                for (int j = 0; j < columnValue.length; j++) {
                    comparedColumn[j] = columnValue[j];
                }
                rowNum = 1;
                return rowNum++;
            }
        }
        return rowNum++;
    }
} 
打包jar包,并创建函数。
add jar /home/hdbatch/jars/iclickhiveudf.jar;
create temporary function row_number as 'cn.iclick.hive.udf.RowNumber';
但是用法要注意,假设我要对一个表的数据进行标注行号,两条sql语句,
create table test_tony as select row_number(1), tid from(select distinct tid from cookie  where i_date=20131105)t order by tid;
上边这条语句会标注行号错误,会产生11个reduce,所以会打11份相同的row number,所以就会有错误,为什么会有不同的解释呢??? 看explain sql语句, 原因是 编写non-deterministic的UDF时遇到的谓词下推错误。
具体详见:  http://blog.sina.com.cn/s/blog_6ff05a2c01017myn.html
正确的写法:
create table test_tony as select row_number(1) rownumber, tid from (select  distinct tid from cookie where i_date=20131105 order by tid) t ;
(一)
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-2 depends on stages: Stage-1
  Stage-0 is a root stage

STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Alias -> Map Operator Tree:
        t:cookie_label_weight
          TableScan
            alias: cookie_label_weight
            Select Operator
              expressions:
                    expr: tid
                    type: string
              outputColumnNames: tid
              Group By Operator
                bucketGroup: false
                keys:
                      expr: tid
                      type: string
                mode: hash
                outputColumnNames: _col0
                Reduce Output Operator
                  key expressions:
                        expr: _col0
                        type: string
                  sort order: +
                  Map-reduce partition columns:
                        expr: _col0
                        type: string
                  tag: -1
      Reduce Operator Tree:
        Group By Operator
          bucketGroup: false
          keys:
                expr: KEY._col0
                type: string
          mode: mergepartial
          outputColumnNames: _col0
          Select Operator
            expressions:
                  expr: _col0
                  type: string
            outputColumnNames: _col0
            Select Operator
              expressions:
                    expr: row_number(1)
                    type: int
                    expr: _col0
                    type: string
                      type: int
                    expr: _col0
                    type: string
              outputColumnNames: _col0, _col1
              File Output Operator
                compressed: false
                GlobalTableId: 0
                table:
                    input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                    output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat

  Stage: Stage-2
    Map Reduce
      Alias -> Map Operator Tree:
        maprfs:/user/hdbatch/tmp/hive/hive_2013-11-08_18-13-02_418_6171072878206847672/-mr-10002
            Reduce Output Operator
              key expressions:
                    expr: _col1
                    type: string
              sort order: +
              tag: -1
              value expressions:
                    expr: _col0
                    type: int
                    expr: _col1
                    type: string
      Reduce Operator Tree:
        Extract
          File Output Operator
            compressed: false
            GlobalTableId: 0
            table:
                input format: org.apache.hadoop.mapred.TextInputFormat
                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat

  Stage: Stage-0
    Fetch Operator
      limit: -1
(二)
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-2 depends on stages: Stage-1
  Stage-0 is a root stage
  
STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Alias -> Map Operator Tree:
        t:cookie_label_weight
          TableScan
            alias: cookie_label_weight
            Select Operator
              expressions:
                    expr: tid
                    type: string
              outputColumnNames: tid
              Group By Operator
                bucketGroup: false
                keys:
                      expr: tid
                      type: string
                mode: hash
                outputColumnNames: _col0
                Reduce Output Operator
                  key expressions:
                        expr: _col0
                        type: string
                  sort order: +
                  Map-reduce partition columns:
                        expr: _col0
                        type: string
                  tag: -1
      Reduce Operator Tree:
        Group By Operator
          bucketGroup: false
          keys:
                expr: KEY._col0
                type: string
          mode: mergepartial
          outputColumnNames: _col0
          Select Operator
            expressions:
                  expr: _col0
                  type: string
            outputColumnNames: _col0
            File Output Operator
              compressed: false
              GlobalTableId: 0
              table:
                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
  
  Stage: Stage-2
    Map Reduce
      Alias -> Map Operator Tree:
        maprfs:/user/hdbatch/tmp/hive/hive_2013-11-08_18-02-33_734_4047092562620720785/-mr-10002
            Reduce Output Operator
              key expressions:
                    expr: _col0
                    type: string
              sort order: +
              tag: -1
              value expressions:
                    expr: _col0
                    type: string
      Reduce Operator Tree:
        Extract
          Select Operator
            expressions:
                  expr: row_number(1)
                  type: int
                  expr: _col0
                  type: string
            outputColumnNames: _col0, _col1
            File Output Operator
              compressed: false
              GlobalTableId: 0
              table:
                  input format: org.apache.hadoop.mapred.TextInputFormat
                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
  
  Stage: Stage-0
    Fetch Operator
      limit: -1








你可能感兴趣的:(hive,hadoop)