Hive UDAF: cube

For a while I had wanted to build an automated solution on Hadoop that cubes data automatically and also computes year-over-year comparisons. I had considered doing it with a UDAF but never got around to implementing it. Recently a project required aggregating data automatically according to configuration, with a fixed format for the result table, which fits the auto-cube scenario exactly, so I put together a demo. It is still being tested.


package com.taobao.voc.hive.udf;
/**
 * Description: sums the measures for every combination of the input dimensions (a full cube).
 * @param dim1, dim2 [, ..., dim10], 'measure1,measure2,...,measureN'
 * @return an array; each element is one output record, with the dimensions in the same order as the input arguments
 * @comment the measures currently have to be packed into a single comma-separated string of integers
 * @author 松坡
 * @version 1.0
 */
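
/*
 * Illustrative usage from Hive -- a sketch only: the function name sum_cube, the
 * jar name and the table/column names below are made up for this example.
 *
 *   ADD JAR sum_cube.jar;
 *   CREATE TEMPORARY FUNCTION sum_cube AS 'com.taobao.voc.hive.udf.SumCube';
 *   SELECT sum_cube(province, category, concat(cnt, ',', amt)) FROM fact_daily;
 *
 * Each element of the returned array is one cube row of the form
 * "dim1,dim2,...,measure1,measure2,...", where "all" in a dimension slot means
 * that dimension has been rolled up. The array can then be flattened into
 * individual rows with LATERAL VIEW explode(...).
 */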
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;

public class SumCube extends UDAF {

    public static class Evaluator implements UDAFEvaluator {
        private boolean mEmpty;
        private double mSum;
        private Long mcount;
        private ArrayList<String> dim_list = new ArrayList<String>();
        private String split_str = "_";     // separates the index list from the values, e.g. "01_a,b"
        private String sub_split_str = ","; // separates individual dimensions / measures
        private Map<String, String> hm_out = new HashMap<String, String>();
        private Map<String, String> hm_ret = new HashMap<String, String>();
        private String[] dim_array;
        private static int dim_num = 0;
        private ArrayList<String> ret_al = new ArrayList<String>();

        public Evaluator() {
            super();
            init();
        }

        public void init() {
            mSum = 0;
            mcount = new Long(0);
            mEmpty = true;
        }

        /**
         * Element-wise addition of two packed measure strings,
         * e.g. getAddStr("10,20", "1,2") returns "11,22".
         */
        public static String getAddStr(String src_old, String src_new) {
            String ret_str = "";
            if (src_old == null || src_old.equals("")) {
                ret_str = src_new;
            } else {
                String[] l_old = src_old.split(",");
                String[] l_new = src_new.split(",");

                for (int i = 0; i < l_new.length; i++) {
                    ret_str = ret_str
                            + (Long.parseLong(l_old[i]) + Long.parseLong(l_new[i]))
                            + ",";
                }
                ret_str = ret_str.substring(0, ret_str.length() - 1);
            }
            return ret_str;
        }

        public boolean iterate(String... args) {
            String in_num = "";

            if (args.length > 0) {
                in_num = args[args.length - 1]; // the last argument is the packed measure string to aggregate
                dim_array = new String[args.length - 1];
                dim_num = args.length - 1;
                // copy the dimension values into an array
                for (int a = 0; a < args.length - 1; a++) {
                    dim_array[a] = args[a];
                }
            }

            // build the list of dimension-index combinations once, on the first row
            if (mcount == 0) {
                StringBuffer sb_tmp = new StringBuffer();

                for (int i = 0; i < dim_array.length; i++) {
                    sb_tmp.append(i).append(sub_split_str);
                }
                String dim_src = sb_tmp.toString();

                dim_list = getDim(dim_src.substring(0, dim_src.length() - 1));
            }

            // accumulate the measures under every combination key, e.g. "01_a,b"
            for (int i = 0; i < dim_list.size(); i++) {
                StringBuffer sb_tmp_1 = new StringBuffer();
                String dim_cube = "";
                int num1 = 0;

                if (dim_list.get(i).contains("ALL")) {
                    sb_tmp_1.append("ALL").append(split_str);
                } else {
                    sb_tmp_1.append(dim_list.get(i)).append(split_str);

                    for (int j = 0; j < dim_list.get(i).length(); j++) {
                        num1 = Integer.parseInt(dim_list.get(i).substring(j, j + 1));
                        sb_tmp_1.append(dim_array[num1]).append(sub_split_str);
                    }
                }
                dim_cube = sb_tmp_1.toString().substring(0,
                        sb_tmp_1.toString().length() - 1);

                if (hm_out.containsKey(dim_cube)) {
                    hm_out.put(dim_cube, getAddStr(hm_out.get(dim_cube), in_num));
                } else {
                    hm_out.put(dim_cube, in_num);
                }
            }

            mcount++;

            return true;
        }

        public Map<String, String> terminatePartial() {
            // rewrite the internal keys ("01_a,b") into final row prefixes ("a,b,")
            // before handing the partial result to the merge phase
            Map<String, String> hm_ext = new HashMap<String, String>();
            for (Map.Entry<String, String> entry : hm_out.entrySet()) {
                String key = entry.getKey();
                String val = entry.getValue();
                hm_ext.put(getSrcDim(key, dim_num), val);
            }
            return hm_ext;
        }

        public boolean merge(Map<String, String> hm_merge) {
            // combine a partial result from another task into the running totals
            for (Map.Entry<String, String> entry : hm_merge.entrySet()) {
                String key = entry.getKey();
                String value = entry.getValue();
                if (hm_ret.containsKey(key)) {
                    hm_ret.put(key, getAddStr(hm_ret.get(key), value));
                } else {
                    hm_ret.put(key, value);
                }
            }
            return true;
        }

        public ArrayList<String> terminate() {
            // one array element per cube row: "<dims...,><measures...>"
            // (the key already ends with a trailing comma)
            for (Map.Entry<String, String> entry : hm_ret.entrySet()) {
                ret_al.add(entry.getKey() + entry.getValue());
            }
            return ret_al;
        }


        /**
         * Enumerates every non-empty combination of the dimension indexes in
         * dim_src (e.g. "0,1,2") and appends the grand-total marker "ALL".
         */
        public ArrayList<String> getDim(String dim_src) {
            String src_in = dim_src;

            String[] src_in_array = src_in.split(",");
            ArrayList<String> src_out_array = new ArrayList<String>();
            String slipt_str = ",";

            int j = 0;
            int flag = 0;
            int flag2 = 0;
            String tmp_new = "";
            String[] last_item_arry = null;
            StringBuffer tmp_sb = new StringBuffer();

            // pass i builds all combinations of size i + 1 from the combinations
            // of size i produced in the previous pass
            for (int i = 0; i < src_in_array.length; i++) {
                tmp_sb = new StringBuffer();
                j = i;
                if (i == 0) {
                    // size-1 combinations: the indexes themselves
                    while (j < src_in_array.length) {
                        tmp_sb.append(src_in_array[j]).append(slipt_str);
                        j++;
                    }
                } else {
                    for (int k = 0; k < last_item_arry.length; k++) {
                        for (int l = k; l < src_in_array.length; l++) {
                            if (last_item_arry[k].contains(src_in_array[l])) {
                                continue;
                            } else {
                                // check whether this combination was already produced
                                // with its indexes in a different order
                                for (int f = 0; f < tmp_sb.toString().split(slipt_str).length; f++) {
                                    tmp_new = last_item_arry[k].concat(src_in_array[l]);
                                    flag = 0;
                                    for (int d = 0; d < tmp_new.length(); d++) {
                                        if (tmp_sb.toString().split(slipt_str)[f]
                                                .contains(tmp_new.substring(d, d + 1))) {
                                            flag++;
                                            flag2 = 1;
                                        }
                                    }
                                    if (flag == tmp_new.length()) {
                                        flag2 = flag;
                                        break;
                                    }
                                }

                                if (flag <= i && flag2 < tmp_new.length()) {
                                    tmp_sb.append(last_item_arry[k])
                                            .append(src_in_array[l])
                                            .append(slipt_str);
                                } else {
                                    flag2 = 1;
                                }
                            }
                        }
                    }
                }
                src_out_array.add(tmp_sb.toString());
                last_item_arry = tmp_sb.toString().split(slipt_str);
            }

            // flatten the per-size lists into a single list of combinations
            ArrayList<String> out_array = new ArrayList<String>();
            String tmp_str = "";
            for (int e = 0; e < src_out_array.size(); e++) {
                tmp_str = src_out_array.get(e);
                for (int w = 0; w < tmp_str.split(slipt_str).length; w++) {
                    out_array.add(tmp_str.split(slipt_str)[w]);
                }
            }
            out_array.add("ALL");
            return out_array;
        }
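        /*
         * For reference, a worked example of the method above (traced by hand):
         * getDim("0,1,2") returns ["0", "1", "2", "01", "02", "12", "012", "ALL"],
         * i.e. every non-empty combination of the three dimension indexes plus the
         * grand-total marker "ALL" -- 2^3 entries in total.
         */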


        /**
         * Translates an intermediate key such as "02_a,c" back into a full output
         * row prefix: one slot per dimension, "all" for the rolled-up ones.
         */
        public static String getSrcDim(String arg, int num) {
            String ret = "";
            String tmp1 = "";
            String[] tmp3 = new String[num];

            for (int r1 = 0; r1 < num; r1++) {
                tmp3[r1] = "all";
            }

            if (!arg.contains("ALL")) {
                tmp1 = arg.split("_")[0];                       // the dimension index list, e.g. "02"
                String[] tmp2 = arg.split("_")[1].split(",");   // the corresponding dimension values
                int tmp_f = 0;

                for (int r2 = 0; r2 < tmp1.length(); r2++) {
                    tmp_f = (int) tmp1.charAt(r2) - 48;         // digit character -> slot index
                    tmp3[tmp_f] = tmp2[r2];
                }
            }
            for (int r3 = 0; r3 < num; r3++) {
                ret = ret + tmp3[r3] + ",";
            }
            return ret;
        }

    }

}
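
To sanity check the evaluator outside of Hive, it can be driven by hand through the same iterate / terminatePartial / merge / terminate sequence that Hive would call. The little test class below is only a sketch under that assumption -- it is not how Hive actually invokes the class, and the class name SumCubeLocalTest and the sample values are made up for illustration:

package com.taobao.voc.hive.udf;

public class SumCubeLocalTest {
    public static void main(String[] args) {
        // simulate one map-side evaluator processing two rows:
        // dimensions (province, category) plus two packed measures "cnt,amt"
        SumCube.Evaluator mapSide = new SumCube.Evaluator();
        mapSide.iterate("zhejiang", "book", "10,20");
        mapSide.iterate("zhejiang", "food", "1,2");

        // hand the partial result to a reduce-side evaluator, as Hive would
        SumCube.Evaluator reduceSide = new SumCube.Evaluator();
        reduceSide.merge(mapSide.terminatePartial());

        // each line is one cube row "dim1,dim2,measure1,measure2",
        // e.g. "zhejiang,all,11,22" and "all,all,11,22"
        for (String row : reduceSide.terminate()) {
            System.out.println(row);
        }
    }
}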
