Using a Hive UDF to generate Waterdrop configs for loading Hive data into ClickHouse

The business requires loading Hive data into ClickHouse through Waterdrop, but writing a config file by hand for every table is tedious, so a Hive UDF was written to generate the config automatically.

1. Version info:

 Waterdrop: 1.5.0

 Spark: 3.0.0

 Hive: 3.0.0

2. The pom.xml is as follows:


<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>hdp-udf</groupId>
    <artifactId>hdp-udf</artifactId>
    <version>4.1</version>
    <packaging>jar</packaging>

    <name>hdp-udf</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <hadoop.version>3.1.0</hadoop.version>
        <hive.version>3.1.0</hive.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>log4j</groupId>
                    <artifactId>log4j</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>2.1.0.2.6.4.76-1</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-service</artifactId>
            <version>2.1.0</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>com.crgecent</groupId>
            <artifactId>crgt-util</artifactId>
            <version>1.1.0</version>
        </dependency>

        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>2.9.0</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>1.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.11</artifactId>
            <version>1.1.1</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.25</version>
        </dependency>
        <dependency>
            <groupId>ru.yandex.clickhouse</groupId>
            <artifactId>clickhouse-jdbc</artifactId>
            <version>0.2</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.62</version>
        </dependency>
    </dependencies>

    <build>
        <finalName>hiveudf_${version}</finalName>
        <resources>
            <resource>
                <directory>src/main/java</directory>
            </resource>
        </resources>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.3.1</version>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>

                    <execution>
                        <id>compile</id>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.0.0</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <repositories>
        <repository>
            <id>ownaliyunmaven</id>
            <name>own aliyun maven repository</name>
            <url>http://10.3.1.29:8081/repository/aliyun/</url>
        </repository>
        <repository>
            <id>ownmaven</id>
            <name>own maven repository</name>
            <url>http://10.3.1.29:8081/repository/maven-central/</url>
        </repository>

        <repository>
            <id>Hortonworks Repository</id>
            <url>http://repo.hortonworks.com/content/repositories/releases/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </repository>

        <repository>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
            </snapshots>
            <id>hortonworks.other</id>
            <name>Hortonworks Other Dependencies</name>
            <url>http://repo.hortonworks.com/content/groups/public</url>
        </repository>
    </repositories>
</project>


3. Waterdrop has issues reading partitioned Hive tables, so for now only non-partitioned tables are supported. If you need partitioned tables, the UDF can be modified to support them (a sketch of one possible approach appears after the Utils class in section 3.1). The UDF source:

package com.xxx;


import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.io.Text;

import java.sql.*;


/**
 * Reference: https://blog.csdn.net/xiao_jun_0820/article/details/76638198
 * Generates a ClickHouse CREATE TABLE statement and a Waterdrop config from a Hive table.
 * The input is expected to be the intermediate table whose name ends with _ck.
 * Written by wpp; deployed on host 06 under /home/admin/waterdrop.
 */
public class TableCreateGenericUDF extends GenericUDF {

    private static String driverName = "org.apache.hive.jdbc.HiveDriver";
    StringObjectInspector keyElementOI;

    @Override
    public Object evaluate(DeferredObject[] arg0) throws HiveException {

        String str = keyElementOI.getPrimitiveJavaObject(arg0[0].get());

        if (str == null || "null".equals(str)) {
            return new Text("Invalid argument: expected database.tablename");
        }
        StringBuffer res = new StringBuffer();
        System.out.println("input: " + str);
        String newStr = str.replace("'", "").replace("\"", "");

        System.out.println("table: " + newStr);
        if (newStr.split("\\.").length == 1) {
            return new Text("Please include the database name (database.tablename)");
        }

        String databaseName = newStr.split("\\.")[0];
        String tableName = newStr.split("\\.")[1]; // expected to end with _ck

        try {
            String createTableInfo = getCKCreateTableInfo(databaseName, tableName);
            res.append(createTableInfo);

            res.append("\n======================== config file " + tableName + ".conf ==================\n");
            String tableMetaInfo = TableParse.getTableMetaInfo(newStr);
            String ckConfInfo = getCKConfInfo(databaseName, tableName, tableMetaInfo);
            res.append(ckConfInfo);

        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("failed to build the result: " + e.getMessage());
        }

        return new Text(res.toString());
    }
    public static  String getCKConfInfo(String databaseName ,String tableName,String tableMetaInfo ) {

        String fullTableName = databaseName +"."+ tableName;

        String ckFullTableName = databaseName + "." + tableName.replaceAll("_ck$", "");

        String res = "spark {\n" +
                "  spark.app.name = \"" + tableName+"_2_ck\"\n" +
                "  spark.executor.instances = 6\n" +
                "  spark.executor.cores = 2\n" +
                "  spark.executor.memory = \"2g\"\n" +
                "  spark.sql.catalogImplementation = \"hive\"\n" +
                "}\n" +
                "\n" +
                "input {\n" +
                "   hive {\n" +
                "        pre_sql = \"select * from "+ fullTableName +"\" \n" +
                "        table_name = \""+tableName+"\"\n" +
                "    }\n" +
                "}\n" +
                "\n" +
                "filter {\n" +
                "    remove {\n" +
                "     #   source_field = [\"bizdate\"]\n" +
                "    }\n" +
                "    \n" +
                tableMetaInfo +"\n"+
                "}\n" +
                "\n" +
                "output {\n" +
                "   clickhouse {\n" +
                "        host = \"10.2.12.56:8123\"\n" +
                "        database = \""+databaseName+"\"\n" +
                "        table = \""+ ckFullTableName+"_cls\"\n" +
                "        username = \"root\"\n" +
                "        password = \"root\"\n" +
                "    }\n" +
                "}\n";

        return res;
    }

    public static  String getCKCreateTableInfo(String databaseName ,String tableName) throws SQLException{
        String dbTableName = databaseName +"."+tableName;
        try {
            Class.forName(driverName);
        } catch (ClassNotFoundException e) {
            // TODO Auto-generated catch block
            System.out.println("=================================");
            e.printStackTrace();
            System.exit(1);
        }
        Connection con = DriverManager.getConnection(Utils.HIVE_JDBC_URL, "admin", "admin");

        Statement stmt = con.createStatement();

        String sql = "describe  " + dbTableName ;
        System.out.println("Running: " + sql);
        ResultSet res = stmt.executeQuery(sql);
        StringBuffer result = new StringBuffer();
        StringBuffer resultCls = new StringBuffer();

        String ckTable = tableName.replaceAll("_ck$","");
        String ckDbTableName = databaseName +"." +ckTable;

        result.append("==============请确定主键和date类型的列,并把Nullable去掉=============\n");
        result.append("CREATE TABLE " +ckDbTableName +" on cluster crm_4shards_1replicas (\n");
        resultCls.append("CREATE TABLE " +ckDbTableName +"_cls on cluster crm_4shards_1replicas (\n");

        while (res.next()) {
            String dataKey =  res.getString(1);
            String dataType =  res.getString(2);
            System.out.println(dataKey + "\t" + dataType);
            String ckDataType = Utils.getParseType(dataType.toLowerCase());

            if(dataKey.equals("ckbizdate")){
                result.append(" `" + dataKey    + "` Date,\n" );
                resultCls.append(" `" + dataKey + "` Date,\n" );
            }else {
                result.append(" `" + dataKey + "` Nullable(" + ckDataType + "),\n");
                resultCls.append(" `" + dataKey + "` Nullable(" + ckDataType + "),\n");
            }
        }

        result =new StringBuffer(  result.substring(0,result.length()-2));

        resultCls =new StringBuffer(  resultCls.substring(0,resultCls.length()-2));
        result.append("\n)ENGINE = MergeTree(ckbizdate, id, 8192);\n\n");
        resultCls.append("\n)ENGINE = Distributed(crm_4shards_1replicas,"+databaseName+" ," +ckTable+ ",rand());");

        result.append(resultCls);

        res.close();
        stmt.close();
        con.close();

        return result.toString();
    }


    @Override
    public String getDisplayString(String[] arg0) {
        return "TableCreateGenericUDF(database.tablename)";
    }

    @Override
    public ObjectInspector initialize(ObjectInspector[] arg0) throws UDFArgumentException {

        if (arg0.length != 1) {
            throw new UDFArgumentException(" Expecting  one  arguments: database.tablename");
        }

        // 1. check that the argument has the expected type
        ObjectInspector key = arg0[0];

        if (!(key instanceof StringObjectInspector) ) {
            throw new UDFArgumentException("one argument  must be a string");
        }

        this.keyElementOI = (StringObjectInspector) key;

//        return PrimitiveObjectInspectorFactory.writableIntObjectInspector;
        return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
    }


}
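
Note: the TableParse class whose getTableMetaInfo method is called in evaluate() above is not included in this post. What follows is a minimal, hypothetical sketch of what it could look like, assuming it describes the Hive table over JDBC (just like getCKCreateTableInfo) and emits a Waterdrop convert block with new_type = "long" for integer columns, which matches the convert blocks in the sample config at the end of this post. The real implementation may differ.

package com.xxx;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

/**
 * Hypothetical sketch of the TableParse helper used by TableCreateGenericUDF.
 * Describes the Hive table and emits Waterdrop "convert" filter blocks for the
 * columns whose type should be cast before they are written to ClickHouse.
 */
public class TableParse {

    public static String getTableMetaInfo(String dbTableName) throws SQLException {
        try {
            Class.forName("org.apache.hive.jdbc.HiveDriver");
        } catch (ClassNotFoundException e) {
            throw new SQLException("Hive JDBC driver not found", e);
        }

        StringBuffer convertBlocks = new StringBuffer();
        Connection con = DriverManager.getConnection(Utils.HIVE_JDBC_URL, "admin", "admin");
        Statement stmt = con.createStatement();
        ResultSet res = stmt.executeQuery("describe " + dbTableName);

        while (res.next()) {
            String column = res.getString(1);
            String hiveType = res.getString(2).toLowerCase();

            // integer-like Hive columns are cast to long so they line up with Int64 in ClickHouse
            if (hiveType.equals("int") || hiveType.equals("bigint")
                    || hiveType.equals("smallint") || hiveType.equals("tinyint")) {
                convertBlocks.append("convert {\n")
                        .append("    source_field = \"").append(column).append("\"\n")
                        .append("    new_type = \"long\"\n")
                        .append("}\n");
            }
        }
        res.close();
        stmt.close();
        con.close();

        return convertBlocks.toString();
    }
}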

3.1 Utils.java

package com.xxx;

import java.util.regex.Pattern;

/**
 * @Author: wpp
 * @Date: 2019/10/21 19:41
 */
public class Utils {

    public static  String  HIVE_JDBC_URL= "jdbc:hive2://xx.xx.xx.xx:10000/default";

    public static String getParseType(String oriDataType){

        String dataType = null;

        // note: oriDataType is lower-cased, so all case labels must be lower case
        switch (oriDataType.toLowerCase()) {
            case "boolean":
                dataType = "Int64"; break;
            case "tinyint":
                dataType = "Int64"; break;
            case "smallint":
                dataType = "Int64"; break;
            case "int":
                dataType = "Int64"; break;
            case "bigint":
                dataType = "Int64"; break;
            case "float":
                dataType = "Float64"; break;
            case "double":
                dataType = "Float64"; break;
            case "decimal":
                dataType = "Float64"; break;
            case "string":
                dataType = "String"; break;
            case "datetime":
                dataType = "String"; break;
            case "timestamp":
                dataType = "String"; break;
            default:
                // sentinel value that makes unmapped Hive types obvious in the generated DDL
                dataType = "999999999";
        }

        return  dataType;
    }


    public static String getMysqlParseType(String oriDataType){

        String dataType = null;

        if(Pattern.matches(".*varchar.*",oriDataType)
            ||Pattern.matches(".*datetime.*",oriDataType)
            ||Pattern.matches(".*time.*",oriDataType)
            ||Pattern.matches(".*text.*",oriDataType)
        ){
            dataType="String";
        }else if(Pattern.matches(".*bigint.*.*unsigned.*",oriDataType)
                || Pattern.matches(".*tinyint.*.*unsigned.*",oriDataType)
                || Pattern.matches(".*int.*.*unsigned.*",oriDataType)

        ){
            dataType="UInt64";

        }else if(Pattern.matches(".*bigint.*",oriDataType)
                || Pattern.matches(".*tinyint.*",oriDataType)
                || Pattern.matches(".*int.*",oriDataType)
        ){
            dataType="Int64";
        }else if(Pattern.matches(".*float.*",oriDataType)
                || Pattern.matches(".*double.*",oriDataType)
                || Pattern.matches(".*decimal.*",oriDataType)
        ){
            dataType="Float64";
        }else {
            dataType="9999999999999";
        }

        return  dataType;
    }




}
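
As mentioned in section 3, the generated pre_sql reads the whole table, which is why only non-partitioned tables are supported. Below is a minimal sketch of how the input and filter blocks could be generated for a partitioned table instead. The partition column and value (bizdate, 20191013) are taken from the sample config later in this post and are placeholders only; this is an illustration of the idea, not part of the original UDF.

package com.xxx;

/**
 * Hypothetical sketch: builds the input/filter part of a Waterdrop config for a
 * partitioned Hive table by filtering a single partition in pre_sql and removing
 * the partition column before the data is written to ClickHouse.
 */
public class PartitionedConfSketch {

    public static String getPartitionedInputAndFilter(String fullTableName,
                                                      String tableName,
                                                      String partitionColumn,
                                                      String partitionValue,
                                                      String tableMetaInfo) {
        return "input {\n" +
                "   hive {\n" +
                "        pre_sql = \"select * from " + fullTableName +
                " where " + partitionColumn + "='" + partitionValue + "'\" \n" +
                "        table_name = \"" + tableName + "\"\n" +
                "    }\n" +
                "}\n" +
                "\n" +
                "filter {\n" +
                "    remove {\n" +
                "        source_field = [\"" + partitionColumn + "\"]\n" +
                "    }\n" +
                "\n" +
                tableMetaInfo + "\n" +
                "}\n";
    }

    // Example: generate the blocks for one daily partition.
    public static void main(String[] args) {
        System.out.println(getPartitionedInputAndFilter(
                "cdm_dwd.dwd_ord_coupon_base_df", "dwd_ord_coupon_base_df",
                "bizdate", "20191013", ""));
    }
}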

4. Create the permanent UDF function:

 

create function tableinfo as 'com.xxx.TableCreateGenericUDF'  using jar 'hdfs:///hiveudf/hiveudf_3.7-jar-with-dependencies.jar';

5. Run the following in the Hive CLI (non-partitioned tables only):

select default.tableinfo('database_name.hive_table_name_ck');

6. The result looks like the following (the main work is converting Hive integer and floating-point types to ClickHouse Int64 / Float64; a small illustration of the type mapping follows the sample config):

######
###### This config file is a demonstration of batch processing in waterdrop config
######

spark {

  spark.app.name = "dwd_ord_coupon_base_df_2_ck"
  spark.executor.instances = 6
  spark.executor.cores = 2
  spark.executor.memory = "2g"
  spark.sql.catalogImplementation = "hive"
}

input {


   hive {
        pre_sql = "select * from cdm_dwd.dwd_ord_coupon_base_df where bizdate='20191013' limit 10 "
        #result_table_name = "ads.ads_user_portrait_vertical_df"
        table_name = "dwd_ord_coupon_base_df"
    }



}

filter {
  # split data by specific delimiter

    remove {
        source_field = ["bizdate"]
    }


convert {
    source_field = "coupon_type"
    new_type = "long"
}
convert {
    source_field = "scene"
    new_type = "long"
}
convert {
    source_field = "status"
    new_type = "long"
}
convert {
    source_field = "threshold"
    new_type = "long"
}
convert {
    source_field = "discount"
    new_type = "long"
}
convert {
    source_field = "time_sharing"
    new_type = "long"
}
convert {
    source_field = "version"
    new_type = "long"
}

}

output {
  # write the result into the clickhouse output plugin

   clickhouse {
        host = "xx.xx.xx.xx:8123"
        database = "cdm_dwd"
        table = "cdm_dwd.dwd_ord_coupon_base_df2_cls"
       # fields = ["user_no","first_classification","second_classification","third_classification","fourth_classification","options","detail","create_time"]
        username = "root"
        password = "root"
    }

}
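
For reference, a small, illustrative example of the type mapping used for the generated ClickHouse DDL (using the Utils class from section 3.1):

package com.xxx;

// Illustrative only: shows how Utils.getParseType maps Hive column types
// to the ClickHouse types that end up in the generated CREATE TABLE statement.
public class TypeMappingExample {
    public static void main(String[] args) {
        System.out.println(Utils.getParseType("int"));       // Int64
        System.out.println(Utils.getParseType("bigint"));    // Int64
        System.out.println(Utils.getParseType("double"));    // Float64
        System.out.println(Utils.getParseType("decimal"));   // Float64
        System.out.println(Utils.getParseType("string"));    // String
        System.out.println(Utils.getParseType("timestamp")); // String
    }
}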

 
