自定义udf把mysql的表结构自动转换成clickhouse的表结构

1,因业务需求,需要把mysql的数据同步到clickhouse,但是clickhouse建表语法比较繁琐,故写了一个udf来自动生成建表语句

1.0 相关pom.xml


	4.0.0

	hdp-udf
	hdp-udf
	4.1
	jar

	hdp-udf
	http://maven.apache.org

	
		UTF-8
		3.1.0
		3.1.0
	

	
		
			org.apache.hadoop
			hadoop-client
			${hadoop.version}
			
				
					log4j
					log4j
				
				
					org.slf4j
					slf4j-log4j12
				
			
			provided
		
		
			org.apache.hadoop
			hadoop-common
			${hadoop.version}
			provided
		
		
			org.apache.hive
			hive-exec
			${hive.version}
			provided
		
		
			org.apache.hive
			hive-jdbc
			2.1.0.2.6.4.76-1
			provided
		
		
			org.apache.hive
			hive-service
			2.1.0
			provided
		

		
		
			com.crgecent
			crgt-util
			1.1.0
		

		
			redis.clients
			jedis
			2.9.0
		
		
			junit
			junit
			4.12
			test
		
		
			org.apache.kafka
			kafka-clients
			1.1.1
		
		
			org.apache.kafka
			kafka_2.11
			1.1.1
		
		
			mysql
			mysql-connector-java
			5.1.25
		
		
			ru.yandex.clickhouse
			clickhouse-jdbc
			0.2
		
		
		
			com.alibaba
			fastjson
			1.2.62
		
	

	
		hiveudf_${version}
		
			
				src/main/java
			
		
		
			
				net.alchim31.maven
				scala-maven-plugin
				3.3.1
				
					
						scala-compile-first
						process-resources
						
							add-source
							compile
						
					

					
						compile
						
							compile
							testCompile
						
					
				
			
			
				maven-assembly-plugin
				3.0.0
				
					
						jar-with-dependencies
					





				
				
					
						make-assembly
						package
						
							single
						
					
				
			
			
				org.apache.maven.plugins
				maven-compiler-plugin
				
					8
					8
				
			
		
	


	
		
			ownaliyunmaven
			own aliyun maven repository
			http://10.3.1.29:8081/repository/aliyun/
		
		
			ownmaven
			own maven repository
			http://10.3.1.29:8081/repository/maven-central/
		

		
			Hortonworks Repository
			http://repo.hortonworks.com/content/repositories/releases/
			
				true
			
			
				false
			
		

		
			
				true
			
			
				true
			
			hortonworks.other
			Hortonworks Other Dependencies
			http://repo.hortonworks.com/content/groups/public
		

	


package com.crgt;


import com.crgecent.common.util.DateUtil;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.io.Text;

import java.sql.*;
import java.util.Date;


/**
 * Hive generic UDF that, given a MySQL JDBC URL and a table name, emits the
 * ClickHouse DDL (local MergeTree table + Distributed table) plus an
 * INSERT ... FROM mysql(...) statement that mirrors the MySQL table.
 *
 * Usage from Hive: mysqlinfo('jdbc:mysql://host:port/db?user=u&amp;password=p', 'table_name')
 *
 * Reference: https://blog.csdn.net/xiao_jun_0820/article/details/76638198
 */
public class Mysql2CKGenericUDF extends GenericUDF {

    /** JDBC driver class used to reach the source MySQL instance. */
    private static final String DRIVER_NAME = "com.mysql.jdbc.Driver";

    /** Inspector for argument 0: the MySQL JDBC url. */
    StringObjectInspector keyElementOI;

    /** Inspector for argument 1: the MySQL table name. */
    StringObjectInspector tableElementOI;

    /**
     * Evaluates the UDF: resolves both string arguments and returns the generated
     * ClickHouse DDL as Text. Errors are reported inside the returned text so a
     * single bad row does not fail the whole Hive query.
     */
    @Override
    public Object evaluate(DeferredObject[] arg0) throws HiveException {

        String jdbcUrl = keyElementOI.getPrimitiveJavaObject(arg0[0].get());
        String tableName = tableElementOI.getPrimitiveJavaObject(arg0[1].get());

        if (jdbcUrl == null || tableName == null) {
            // Completed the previously truncated error message.
            return new Text("获取参数错误,需要两个非空参数: jdbcUrl, tableName");
        }

        StringBuilder res = new StringBuilder();
        try {
            res.append("\n========================mysql转成ck的建表语句 ")
               .append(tableName)
               .append(" ==================\n");
            res.append(getCKCreateTableInfo(jdbcUrl, tableName));
        } catch (Exception e) {
            // Do not swallow silently: surface the failure in the UDF output
            // so the caller sees why no DDL was produced.
            res.append("获取结果异常:").append(e.getMessage());
        }

        return new Text(res.toString());
    }

    /**
     * Connects to MySQL, reads the table definition via SHOW CREATE TABLE and
     * DESCRIBE, and builds:
     *   1. the original MySQL DDL (for reference),
     *   2. a ClickHouse MergeTree CREATE TABLE (columns Nullable except `id`),
     *   3. a matching Distributed table (suffix "_cls"),
     *   4. an INSERT ... FROM mysql(...) back-fill statement.
     *
     * @param cconn_str MySQL JDBC url of the form jdbc:mysql://host:port/db?user=u&amp;password=p
     * @param tableName source table; a trailing "_ck" is stripped for the ClickHouse name
     * @return all generated statements concatenated into one printable string
     * @throws SQLException on driver-loading or query failure
     */
    public static String getCKCreateTableInfo(String cconn_str, String tableName) throws SQLException {

        String databaseName = "rt_dwd";
        String dbTableName = tableName;
        try {
            Class.forName(DRIVER_NAME);
        } catch (ClassNotFoundException e) {
            // Previously System.exit(1): that would kill the entire Hive task JVM.
            // Raising lets Hive fail (or the caller report) just this invocation.
            throw new SQLException("MySQL JDBC driver not found: " + DRIVER_NAME, e);
        }

        String connStr = cconn_str + "&useUnicode=true&characterEncoding=UTF8";

        StringBuilder result = new StringBuilder();
        StringBuilder resultCls = new StringBuilder();
        StringBuilder insertSql = new StringBuilder();

        // NOTE(review): tableName is concatenated into SQL text. SHOW CREATE TABLE /
        // DESCRIBE cannot take '?' placeholders, so callers must only pass trusted
        // identifiers (this is an internal DBA tool).
        try (Connection con = DriverManager.getConnection(connStr);
             Statement stmt = con.createStatement()) {

            result.append("==============mysql原表的格式=============\n");
            try (ResultSet createRes = stmt.executeQuery("show create  table " + dbTableName)) {
                while (createRes.next()) {
                    // Column 2 of SHOW CREATE TABLE is the DDL text.
                    result.append(createRes.getString(2)).append("\n");
                }
            }

            String ckTable = tableName.replaceAll("_ck$", "");
            String ckDbTableName = databaseName + "." + ckTable;

            result.append("==============请确定主键和date类型的列,并把Nullable去掉=============\n");
            result.append("CREATE TABLE " + ckDbTableName + " on cluster crm_4shards_1replicas (\n");
            resultCls.append("CREATE TABLE " + ckDbTableName + "_cls on cluster crm_4shards_1replicas (\n");

            insertSql.append("INSERT INTO " + ckDbTableName + "_cls \nselect \n");

            // Crude parse of "jdbc:mysql://host:port/db?user=u&password=p" into
            // [host:port, db, user, password] for the ClickHouse mysql() table function.
            String[] jdbcUrlArr = cconn_str.replaceAll("jdbc:mysql://", "")
                    .replaceAll("\\?", "/")
                    .replaceAll("user=", "")
                    .replaceAll("&password=", "/")
                    .split("/");

            try (ResultSet descRes = stmt.executeQuery("describe  " + dbTableName)) {
                while (descRes.next()) {
                    String dataKey = descRes.getString(1);
                    String dataType = descRes.getString(2);
                    String ckDataType = Utils.getMysqlParseType(dataType.toLowerCase());

                    if (dataKey.equals("id")) {
                        // `id` is assumed to be the primary key — keep it non-Nullable
                        // because MergeTree sort keys cannot be Nullable.
                        result.append(" `" + dataKey + "` " + ckDataType + ",\n");
                        resultCls.append(" `" + dataKey + "` " + ckDataType + ",\n");
                    } else {
                        result.append(" `" + dataKey + "` Nullable(" + ckDataType + "),\n");
                        resultCls.append(" `" + dataKey + "` Nullable(" + ckDataType + "),\n");
                    }

                    insertSql.append(dataKey + ",\n");
                }
            }

            // Extra partition column derived from update_time/create_time.
            result.append(" `" + "ck_update_date" + "` Date,\n");
            resultCls.append(" `" + "ck_update_date" + "` Date,\n");
            insertSql.append("if(update_time is null ,toDate(create_time),toDate(update_time)) \n");

            // Strip the trailing ",\n" left by the last column.
            result.setLength(result.length() - 2);
            resultCls.setLength(resultCls.length() - 2);

            result.append("\n)ENGINE = MergeTree(ck_update_date, id, 8192);\n\n");
            resultCls.append("\n)ENGINE = Distributed(crm_4shards_1replicas," + databaseName + " ," + ckTable + ",rand());\n");
            insertSql.append("FROM mysql('" + jdbcUrlArr[0] + "', '" + jdbcUrlArr[1] + "', '" + dbTableName
                    + "', '" + jdbcUrlArr[2] + "', '" + jdbcUrlArr[3] + "') ;\n");

            result.append(resultCls);
            result.append("=============insert into ================\n");
            result.append(insertSql);
        }

        return result.toString();
    }

    /** Shown in EXPLAIN / error messages for this UDF. */
    @Override
    public String getDisplayString(String[] arg0) {
        return "Mysql2ckTableCreateGenericUDF(jdbcUrl,tableName)";
    }

    /**
     * Validates that exactly two string arguments are supplied and caches their
     * object inspectors; the UDF returns a writable string.
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arg0) throws UDFArgumentException {

        if (arg0.length != 2) {
            throw new UDFArgumentException(" Expecting  two  arguments: jdbcUrl ,tableName");
        }

        // 1. First argument must be a string (the JDBC url).
        ObjectInspector key = arg0[0];
        if (!(key instanceof StringObjectInspector)) {
            throw new UDFArgumentException("one argument  must be a string");
        }
        this.keyElementOI = (StringObjectInspector) key;

        // 2. Second argument must be a string (the table name).
        ObjectInspector table = arg0[1];
        if (!(table instanceof StringObjectInspector)) {
            throw new UDFArgumentException("one argument  must be a string");
        }
        this.tableElementOI = (StringObjectInspector) table;

        return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
    }

}

1.2 utils

package com.crgt;

import java.util.regex.Pattern;

/**
 * Type-mapping helpers shared by the table-conversion UDFs: translate Hive and
 * MySQL column types into their ClickHouse equivalents.
 *
 * @Author: wpp
 * @Date: 2019/10/21 19:41
 */
public class Utils {

    public static String HIVE_JDBC_URL = "jdbc:hive2://xx.xx.xx.xx:10000/default";

    /**
     * Maps a Hive column type to a ClickHouse type.
     *
     * Bug fix: the input is lower-cased before the switch, but the original
     * labels "TINYINT"/"SMALLINT" were upper-case and therefore unreachable —
     * those types fell through to the sentinel. Labels are now all lower-case.
     *
     * @param oriDataType Hive type name, any case (e.g. "BIGINT", "string")
     * @return ClickHouse type, or the sentinel "999999999" for unknown types
     *         (deliberately conspicuous so it is caught on review of the DDL)
     */
    public static String getParseType(String oriDataType) {

        String dataType;

        switch (oriDataType.toLowerCase()) {
            case "boolean":
            case "tinyint":
            case "smallint":
            case "int":
            case "bigint":
                dataType = "Int64";
                break;
            case "float":
            case "double":
            case "decimal":
                dataType = "Float64";
                break;
            case "string":
            case "datetime":
            case "timestamp":
                dataType = "String";
                break;
            default:
                // Conspicuous placeholder: forces a human to pick a type.
                dataType = "999999999";
        }

        return dataType;
    }

    /**
     * Maps a raw MySQL column type (e.g. "varchar(25)", "bigint(20) unsigned",
     * "datetime") to a ClickHouse type.
     *
     * Branch order matters: ".*time.*" also captures "timestamp", and the
     * unsigned-integer checks must run before the signed ones because
     * "bigint(20) unsigned" matches both.
     *
     * @param oriDataType MySQL type string, expected lower-case
     * @return ClickHouse type, or the sentinel "9999999999999" for unknown types
     */
    public static String getMysqlParseType(String oriDataType) {

        String dataType;

        if (Pattern.matches(".*varchar.*", oriDataType)
                || Pattern.matches(".*datetime.*", oriDataType)
                || Pattern.matches(".*time.*", oriDataType)
                || Pattern.matches(".*text.*", oriDataType)) {
            dataType = "String";
        } else if (Pattern.matches(".*bigint.*unsigned.*", oriDataType)
                || Pattern.matches(".*tinyint.*unsigned.*", oriDataType)
                || Pattern.matches(".*int.*unsigned.*", oriDataType)) {
            // ".*x.*.*unsigned.*" in the original is equivalent to ".*x.*unsigned.*".
            dataType = "UInt64";
        } else if (Pattern.matches(".*bigint.*", oriDataType)
                || Pattern.matches(".*tinyint.*", oriDataType)
                || Pattern.matches(".*int.*", oriDataType)) {
            dataType = "Int64";
        } else if (Pattern.matches(".*float.*", oriDataType)
                || Pattern.matches(".*double.*", oriDataType)
                || Pattern.matches(".*decimal.*", oriDataType)) {
            dataType = "Float64";
        } else {
            // Conspicuous placeholder: forces a human to pick a type.
            dataType = "9999999999999";
        }

        return dataType;
    }

}

 

2,创建hive的udf函数

create function mysqlinfo as 'com.crgt.Mysql2CKGenericUDF' using jar 'hdfs:///hiveudf/hiveudf_3.8-jar-with-dependencies.jar';

3,在hive上执行语句


select mysqlinfo("jdbc:mysql://10.xx.xx.xx:3306/order_center?user=user123&password=passwd123",'mysql_table_name');

4,执行结果如下:


========================mysql转成ck的建表语句 mysql_table_name ==================
==============mysql原表的格式=============
CREATE TABLE `mysql_table_name` (
  `id` varchar(25) NOT NULL COMMENT '自定义主键',
  `order_id` varchar(25) NOT NULL COMMENT '',
  `p_order_id` varchar(25) NOT NULL COMMENT '',
  `user_no` varchar(25) NOT NULL COMMENT '用户id',
  `wx_openid` varchar(100) DEFAULT NULL COMMENT '',
  `machine_id` varchar(50) DEFAULT NULL COMMENT '',
  `scene_type` tinyint(2) DEFAULT NULL COMMENT '',
  `order_type` tinyint(2) DEFAULT NULL COMMENT '',
  `merchant_shop_id` varchar(50) DEFAULT NULL COMMENT '',
  `shop_name` varchar(50) DEFAULT NULL COMMENT '',
  `site_name` varchar(50) DEFAULT NULL COMMENT '车站',
  `run_no` varchar(10) DEFAULT NULL COMMENT '',
  `section_no` varchar(5) DEFAULT NULL COMMENT '',
  `seat` varchar(10) DEFAULT NULL COMMENT '',
  `opening_time` datetime DEFAULT NULL COMMENT '',
  `riding_time` datetime DEFAULT NULL COMMENT '',
  `delivery_type` tinyint(2) DEFAULT NULL COMMENT ',0:自取,1:送达',
  `user_name` varchar(50) DEFAULT NULL COMMENT '用户名',
  `brands_name` varchar(200) DEFAULT NULL COMMENT '品牌名',
  `mobile` varchar(50) DEFAULT NULL COMMENT '',
  `invoice_type` tinyint(2) DEFAULT NULL COMMENT '0:不开发票,1:开票',
  `delivery_range` varchar(50) DEFAULT NULL COMMENT '',
  `refund_expire_time` datetime DEFAULT NULL COMMENT '',
  `box_count` int(5) DEFAULT NULL COMMENT '',
  `ware_count` int(5) DEFAULT NULL COMMENT '',
  `remark` varchar(150) DEFAULT NULL COMMENT '备注',
  `delivery_start_time` datetime DEFAULT NULL COMMENT '',
  `delivery_end_time` datetime DEFAULT NULL COMMENT '',
  `create_time` datetime DEFAULT NULL COMMENT '创建时间',
  `update_time` datetime DEFAULT NULL COMMENT '修改时间',
  PRIMARY KEY (`id`,`user_no`) USING BTREE,
  KEY `idx_order_id` (`order_id`,`user_no`) USING BTREE,
  KEY `idx_order_pid` (`p_order_id`,`user_no`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='点餐表' shardkey=user_no
==============请确定主键和date类型的列,并把Nullable去掉=============

CREATE TABLE rt_dwd.mysql_table_name on cluster crm_4shards_1replicas (
 `id` String,
 `order_id` Nullable(String),
 `p_order_id` Nullable(String),
 `user_no` Nullable(String),
 `wx_openid` Nullable(String),
 `machine_id` Nullable(String),
 `scene_type` Nullable(Int64),
 `order_type` Nullable(Int64),
 `merchant_shop_id` Nullable(String),
 `shop_name` Nullable(String),
 `site_name` Nullable(String),
 `run_no` Nullable(String),
 `section_no` Nullable(String),
 `seat` Nullable(String),
 `opening_time` Nullable(String),
 `riding_time` Nullable(String),
 `delivery_type` Nullable(Int64),
 `user_name` Nullable(String),
 `brands_name` Nullable(String),
 `mobile` Nullable(String),
 `invoice_type` Nullable(Int64),
 `delivery_range` Nullable(String),
 `refund_expire_time` Nullable(String),
 `box_count` Nullable(Int64),
 `ware_count` Nullable(Int64),
 `remark` Nullable(String),
 `delivery_start_time` Nullable(String),
 `delivery_end_time` Nullable(String),
 `create_time` Nullable(String),
 `update_time` Nullable(String),
 `ck_update_date` Date
)ENGINE = MergeTree(ck_update_date, id, 8192);

CREATE TABLE rt_dwd.mysql_table_name_cls on cluster crm_4shards_1replicas (
 `id` String,
 `order_id` Nullable(String),
 `p_order_id` Nullable(String),
 `user_no` Nullable(String),
 `wx_openid` Nullable(String),
 `machine_id` Nullable(String),
 `scene_type` Nullable(Int64),
 `order_type` Nullable(Int64),
 `merchant_shop_id` Nullable(String),
 `shop_name` Nullable(String),
 `site_name` Nullable(String),
 `run_no` Nullable(String),
 `section_no` Nullable(String),
 `seat` Nullable(String),
 `opening_time` Nullable(String),
 `riding_time` Nullable(String),
 `delivery_type` Nullable(Int64),
 `user_name` Nullable(String),
 `brands_name` Nullable(String),
 `mobile` Nullable(String),
 `invoice_type` Nullable(Int64),
 `delivery_range` Nullable(String),
 `refund_expire_time` Nullable(String),
 `box_count` Nullable(Int64),
 `ware_count` Nullable(Int64),
 `remark` Nullable(String),
 `delivery_start_time` Nullable(String),
 `delivery_end_time` Nullable(String),
 `create_time` Nullable(String),
 `update_time` Nullable(String),
 `ck_update_date` Date
)ENGINE = Distributed(crm_4shards_1replicas,rt_dwd ,mysql_table_name,rand());
=============insert into ================
INSERT INTO rt_dwd.mysql_table_name_cls 
select 
id,
order_id,
p_order_id,
user_no,
wx_openid,
machine_id,
scene_type,
order_type,
merchant_shop_id,
shop_name,
site_name,
run_no,
section_no,
seat,
opening_time,
riding_time,
delivery_type,
user_name,
brands_name,
mobile,
invoice_type,
delivery_range,
refund_expire_time,
box_count,
ware_count,
remark,
delivery_start_time,
delivery_end_time,
create_time,
update_time,
if(update_time is null ,toDate(create_time),toDate(update_time)) 
FROM mysql('xxxx:3306', 'order_center', 'mysql_table_name', 'user123', 'passwd123') ;

 

你可能感兴趣的:(HQL,UDF,clickhouse)