Writing Hive data to Elasticsearch or HBase with Spark

  • Migrating data from Hive or other relational databases to ES or HBase
    • Code
      • Dependencies
      • Socket entry point
      • MyServerThread implementation
      • Utility class PropertiesUtil
      • Configuration file config.properties
      • Client test: MyClient
      • Submit script

Migrating data from Hive or other relational databases to ES or HBase

Requirement: the job needs Hadoop/Hive resources, so the program is packaged as a jar and runs on a server in the Spark cluster. The web application connects to this jar over a socket, passes the task parameters, triggers the data migration, and finally receives the execution result.
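
The contract between the web application and the Spark-side jar is small: the client serializes a HashMap of parameters, writes it to the socket, and reads back a single line of the form code:msg. The sketch below shows that round trip from the web side; the host name, port and parameter values are placeholders, and the full MyServer and MyClient classes follow later in this post.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;
import java.net.Socket;
import java.util.HashMap;

public class MigrationCallSketch {
    public static void main(String[] args) throws Exception {
        HashMap<String, Object> params = new HashMap<>();
        params.put("method", "doImportToEs");              // or "doImportToHBase"
        params.put("sql", "select * from db.some_table");  // source query (placeholder)
        params.put("src_table_type", "hive");              // hive / oracle / mysql
        params.put("table_type", "es");                    // es / hbase
        params.put("index", "some_index");                 // target ES index (placeholder)
        params.put("mapping_id", "pk_id");                 // document id / rowkey column(s)

        try (Socket socket = new Socket("spark-host", 2018)) {   // placeholder host, default port
            ObjectOutputStream oos = new ObjectOutputStream(socket.getOutputStream());
            oos.writeObject(params);                              // the server expects a HashMap
            oos.flush();

            BufferedReader in = new BufferedReader(new InputStreamReader(socket.getInputStream()));
            String line = in.readLine();                          // single "code:msg" line written by MyServerThread
            String[] parts = line.split(":", 2);                  // code before the first ':', message after
            System.out.println("code=" + parts[0] + ", msg=" + parts[1]);
        }
    }
}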

Code

Dependencies

Maven properties and dependencies from the pom.xml. The Spark, JDBC and connector artifacts are declared with provided scope because they are expected to be on the cluster classpath at runtime.


    <properties>
        <spark.version>2.1.2</spark.version>
        <java.version>1.7</java.version>
        <hbase.version>1.2.0-cdh5.8.4</hbase.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.47</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.4</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.2.4</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch-spark-20_2.11</artifactId>
            <version>6.3.2</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>com.oracle</groupId>
            <artifactId>ojdbc6</artifactId>
            <version>11.2.0.3</version>
            <scope>provided</scope>
        </dependency>

        <!-- ZooKeeper / HBase -->
        <dependency>
            <groupId>org.apache.zookeeper</groupId>
            <artifactId>zookeeper</artifactId>
            <version>3.4.13</version>
            <type>pom</type>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <version>${hbase.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>${hbase.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
    </dependencies>

Socket entry point

import java.net.ServerSocket;
import java.net.Socket;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.apache.spark.sql.SparkSession;

public class MyServer {

    public static void main(String[] args) throws Exception
    {
        int port = 2018;
        if(args != null && args.length>0){
            port = Integer.parseInt(args[0]);
        }

        SparkSession spark = SparkSession.builder().appName("SparkUtil") // .master("local[*]")
                .config("spark.sql.hive.verifyPartitionPath",true) //解决分区损坏问题
                .config("spark.hadoop.yarn.timeline-service.enabled",false)//java.lang.NoClassDefFoundError: com/sun/jersey/api/client/config/ClientConfig
                .config("spark.sql.broadcastTimeout",PropertiesUtil.getProperty("spark.sql.broadcastTimeout","3600"))
                .config("spark.network.timeout",PropertiesUtil.getProperty("spark.network.timeout"))
                .config("spark.sql.parquet.compression.codec",PropertiesUtil.getProperty("spark.sql.parquet.compression.codec","gzip"))
                .config("es.index.auto.create", "false")//不自动创建,提前创建索引
                .config("es.nodes",PropertiesUtil.getProperty("es.nodes"))
                .config("es.port",PropertiesUtil.getProperty("es.port"))
                .config("es.batch.size.bytes",PropertiesUtil.getProperty("es.batch.size.bytes"))
                .config("es.batch.size.entries",PropertiesUtil.getProperty("es.batch.size.entries","100")) // 100 解决Could not write all entries
                .config("es.batch.write.refresh",PropertiesUtil.getProperty("es.batch.write.refresh","false"))
                .config("es.batch.write.retry.count",PropertiesUtil.getProperty("es.batch.write.retry.count","30")) // 3 解决Could not write all entries
                .config("es.batch.write.retry.wait",PropertiesUtil.getProperty("es.batch.write.retry.wait","100")) // 10s 解决Could not write all entries
                .config("es.http.timeout",PropertiesUtil.getProperty("es.http.timeout"))
                .config("es.http.retries",PropertiesUtil.getProperty("es.http.retries"))
                .config("es.action.heart.beat.lead",PropertiesUtil.getProperty("es.action.heart.beat.lead"))
                .config("es.nodes.wan.only",PropertiesUtil.getProperty("es.nodes.wan.only","true"))
                .config("es.nodes.data.only",PropertiesUtil.getProperty("es.nodes.data.only","true"))
                .config("es.nodes.discovery",PropertiesUtil.getProperty("es.nodes.discovery","true"))
                .config("es.input.use.sliced.partitions",PropertiesUtil.getProperty("es.input.use.sliced.partitions","50000"))
                .config("es.input.max.docs.per.partition",PropertiesUtil.getProperty("es.input.max.docs.per.partition","100000"))
                .config("es.net.http.header.Accept-Languag",PropertiesUtil.getProperty("es.net.http.header.Accept-Languag","gzip"))
                .enableHiveSupport().getOrCreate();

        ServerSocket serverSocket = new ServerSocket(port); // accepts connection requests from clients
        System.out.println("Server is waiting for client connections...");
        // create the worker thread pool
        ThreadPoolExecutor executor = new ThreadPoolExecutor(50, 80, 200, TimeUnit.MILLISECONDS,
                new ArrayBlockingQueue<Runnable>(5));

        // accept clients in a loop so multiple requests can be served concurrently
        while(true){
            Socket socket = serverSocket.accept(); // block until a client connects
            MyServerThread thread = new MyServerThread(socket,spark); // wrap the connection in a task
            executor.execute(thread);
            //thread.start();
        }
        }
    }
}
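
One design note on the pool above: with 50 core threads, a cap of 80 threads and a 5-slot ArrayBlockingQueue, a burst of more than roughly 85 concurrent migration requests is rejected by the default AbortPolicy with a RejectedExecutionException. If it is preferable for the accept loop to slow down instead of failing a request, one possible tweak (a sketch, not part of the original code) is to plug in a CallerRunsPolicy:

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

// Same pool parameters as in MyServer.main, but a rejected task runs on the
// accept-loop thread itself, which naturally throttles new connections.
ThreadPoolExecutor executor = new ThreadPoolExecutor(
        50, 80, 200, TimeUnit.MILLISECONDS,
        new ArrayBlockingQueue<Runnable>(5),
        new ThreadPoolExecutor.CallerRunsPolicy());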

MyServerThread implementation

public class MyServerThread implements Runnable, Serializable {
    public static final String SUCCESS = "1";
    public static final String FAIL = "0";
    public static final String DS_HIVE = "hive";
    public static final String DS_ORACLE = "oracle";
    public static final String DS_ES = "es";
    public static final String DS_HBASE = "hbase";
    public static final String DS_MYSQL = "mysql";

    public transient Map translateMaps = new HashMap<>();

    private transient Socket socket;
    private transient SparkSession spark;

    public MyServerThread(Socket socket,SparkSession spark){
        this.socket = socket;
        this.spark = spark;
    }

    public void run() {
        BufferedReader bufferedReader = null;
        BufferedWriter bufferedWriter = null;
        ObjectInputStream ois = null;
        Map map = null;
        HashMap paramsMap = null;
        try{
            ois = new ObjectInputStream(socket.getInputStream());
            paramsMap = (HashMap) ois.readObject();
            System.out.println("来自客户端的数据:"+paramsMap);

            if(paramsMap != null) {
                String method = PropertiesUtil.getStrValue(paramsMap, "method");
                if ("doImportToEs".equals(method)) {
                    map = doImportToEs(paramsMap, spark);
                }
                if ("doImportToHBase".equals(method)) {
                    map = doImportToHBase(paramsMap, spark);
                }
                bufferedWriter = new BufferedWriter(new OutputStreamWriter(socket.getOutputStream()));
                bufferedWriter.write(map.get("code") + ":" + map.get("msg") + "\n");
                bufferedWriter.flush();
            }
        } catch(Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if(ois != null){
                    ois.close();
                }
                if(bufferedReader != null){
                    bufferedReader.close();
                }
                if(bufferedWriter != null){
                    bufferedWriter.close();
                }
                if(socket != null){
                    socket.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    // import the query result into Elasticsearch
    private Map doImportToEs(Map paramsMap, SparkSession spark){
        long start = System.currentTimeMillis();
        Map result = new HashMap();
        long totalCount = 0L;
        try {
            String sql = PropertiesUtil.getStrValue(paramsMap,"sql");
            String tableCode = PropertiesUtil.getStrValue(paramsMap,"table_code");
            String index = PropertiesUtil.getStrValue(paramsMap,"index");
            String srcTableType = PropertiesUtil.getStrValue(paramsMap,"src_table_type");
            String tableType = PropertiesUtil.getStrValue(paramsMap,"table_type");
            String mappingId = PropertiesUtil.getStrValue(paramsMap,"mapping_id");
            // Hive source -> ES
            if (DS_HIVE.equalsIgnoreCase(srcTableType)) {
                Dataset toTransDs = spark.sql(sql).persist();
                totalCount = toTransDs.count();
                if(StringUtils.isEmpty(mappingId)) {
                    JavaEsSparkSQL.saveToEs(toTransDs, index + "/" + index);
                } else {
                    // use the given column as the document id (es.mapping.id); see ConfigurationOptions for more settings
                    JavaEsSparkSQL.saveToEs(toTransDs, index + "/" + index,ImmutableMap.of("es.mapping.id", mappingId));
                }
                toTransDs.unpersist(); // release the cached dataset
            }

            if(DS_ORACLE.equalsIgnoreCase(srcTableType) && DS_ES.equalsIgnoreCase(tableType)){
                // connect to the Oracle database
                Map options = new HashMap();
                options.put("driver",PropertiesUtil.getProperty("oracle.driver"));
                options.put("url",PropertiesUtil.getStrValue(paramsMap,"url"));
                options.put("user",PropertiesUtil.getStrValue(paramsMap,"user"));
                options.put("password",PropertiesUtil.getStrValue(paramsMap,"password"));
                options.put("dbtable",tableCode);
                Dataset oraDs = spark.read().format("jdbc").options(options).load();
                oraDs.createOrReplaceTempView(tableCode);
                Dataset oraTable = spark.sql(sql);//String sqltext="(" + sql + ") t";
                oraTable.show();
                totalCount = oraTable.count();
                if(StringUtils.isEmpty(mappingId)) {
                    JavaEsSparkSQL.saveToEs(oraTable, index + "/" + index);
                } else {
                    // use the given column as the document id (es.mapping.id); see ConfigurationOptions for more settings
                    JavaEsSparkSQL.saveToEs(oraTable, index + "/" + index,ImmutableMap.of("es.mapping.id", mappingId));
                }
            }

            if(DS_MYSQL.equalsIgnoreCase(srcTableType) && DS_ES.equalsIgnoreCase(tableType)) {
                Properties connectionProperties = new Properties();
                connectionProperties.put("user", PropertiesUtil.getStrValue(paramsMap,"user"));
                connectionProperties.put("password", PropertiesUtil.getStrValue(paramsMap,"password"));
                String url = PropertiesUtil.getStrValue(paramsMap,"url");
                Dataset mysqlTable = spark.read().jdbc(url, tableCode, connectionProperties);
                mysqlTable.show();
                totalCount = mysqlTable.count();
                if(StringUtils.isEmpty(mappingId)) {
                    JavaEsSparkSQL.saveToEs(mysqlTable, index + "/" + index);
                } else {
                    // use the given column as the document id (es.mapping.id); see ConfigurationOptions for more settings
                    JavaEsSparkSQL.saveToEs(mysqlTable, index + "/" + index,ImmutableMap.of("es.mapping.id", mappingId));
                }
            }

            result.put("code",totalCount);//SUCCESS
            long end = System.currentTimeMillis();
            result.put("msg","保存到ES执行完成,源数据量:"+totalCount+"条,耗时:"+(end-start)/1000+"s");//注意不要用英文:
            System.out.println(result);
        } catch (Exception e) {
            result.put("code",FAIL);
            result.put("msg",e.getMessage());
            e.printStackTrace();
        }
        return result;
    }

    private Map doImportToHBase(Map paramsMap, SparkSession spark){
        long start = System.currentTimeMillis();
        Map result = new HashMap();
        long totalCount = 0L;
        long rowCount = 0L;

        try {
            String sql = PropertiesUtil.getStrValue(paramsMap,"sql");
            String srcTableCode = PropertiesUtil.getStrValue(paramsMap,"src_table_code");
            String tableCode = PropertiesUtil.getStrValue(paramsMap,"table_code");
            String srcTableType = PropertiesUtil.getStrValue(paramsMap,"src_table_type");
            String tableType = PropertiesUtil.getStrValue(paramsMap,"table_type");
            String mappingId = PropertiesUtil.getStrValue(paramsMap,"mapping_id");

            if(StringUtils.isEmpty(mappingId)) {
                result.put("code",FAIL);
                result.put("msg","Rowkey column is NULL!");
                return result;
            }

            if (DS_HIVE.equalsIgnoreCase(srcTableType)) {
                Dataset toTransDs = spark.sql(sql).persist();
                totalCount = toTransDs.count();
                rowCount = saveToHBase(toTransDs,tableCode,mappingId); // write the rows to HBase
                toTransDs.unpersist(); // release the cached dataset
            }

            if(DS_ORACLE.equalsIgnoreCase(srcTableType) && DS_HBASE.equalsIgnoreCase(tableType)){
                // connect to the Oracle database
                Map options = new HashMap();
                options.put("driver",PropertiesUtil.getProperty("oracle.driver"));
                options.put("url",PropertiesUtil.getStrValue(paramsMap, "url"));
                options.put("user",PropertiesUtil.getStrValue(paramsMap, "user"));
                options.put("password",PropertiesUtil.getStrValue(paramsMap, "password"));
                options.put("dbtable",srcTableCode);
                Dataset oraDs = spark.read().format("jdbc").options(options).load();
                oraDs.createOrReplaceTempView(srcTableCode);
                Dataset oraTable = spark.sql(sql);//String sqltext="(" + sql + ") t";
                oraTable.show();
                totalCount = oraTable.count();
                rowCount = saveToHBase(oraTable,tableCode,mappingId); // write the rows to HBase
            }

            if(DS_MYSQL.equalsIgnoreCase(srcTableType) && DS_HBASE.equalsIgnoreCase(tableType)){
                // connect to the MySQL database
                Properties connectionProperties = new Properties();
                connectionProperties.put("user", PropertiesUtil.getStrValue(paramsMap,"user"));
                connectionProperties.put("password", PropertiesUtil.getStrValue(paramsMap,"password"));
                String url = PropertiesUtil.getStrValue(paramsMap,"url");
                Dataset mysqlTable = spark.read().jdbc(url, srcTableCode, connectionProperties);
                mysqlTable.show();
                totalCount = mysqlTable.count();
                rowCount = saveToHBase(mysqlTable,tableCode,mappingId); // write the rows to HBase
            }

            long end = System.currentTimeMillis();
            result.put("code",SUCCESS);
            String msgs = "保存到HBase执行完成,源数据量:"+totalCount+"条,耗时:"+(end-start)/1000+"s";//注意不要用英文:
            if(totalCount != rowCount){
                result.put("code",FAIL);
                msgs += ";保存到HBase数据量:" + rowCount + ",数据量不一致";
            }
            result.put("msg",msgs);
            System.out.println(result);
        } catch (Exception e) {
            result.put("code",FAIL);
            result.put("msg",e.getMessage());
            e.printStackTrace();
        }
        return result;
    }

    private Long saveToHBase(Dataset resultDs, String tableCode, final String mappingId) throws Exception {
        String tableName = tableCode.toLowerCase(); // HBase table name
        final String columnFamily = "columnFamily"; // single fixed column family name

        Configuration config = spark.sparkContext().hadoopConfiguration();
        config.set("hbase.zookeeper.quorum", PropertiesUtil.getProperty("hbase.zookeeper.quorum"));
        config.set("hbase.zookeeper.property.clientPort",PropertiesUtil.getProperty("hbase.zookeeper.property.clientPort"));
        config.set("hbase.master", PropertiesUtil.getProperty("hbase.master"));
        config.set("hbase.cluster.distributed", PropertiesUtil.getProperty("hbase.cluster.distributed","true"));
        config.set("zookeeper.session.timeout", PropertiesUtil.getProperty("zookeeper.session.timeout","30000"));
        config.set("hbase.hregion.majorcompaction", PropertiesUtil.getProperty("hbase.hregion.majorcompaction","0"));
        config.set("hbase.regionserver.regionSplitLimit", PropertiesUtil.getProperty("hbase.regionserver.regionSplitLimit","1"));
        config.set("dfs.client.socket-timeout", PropertiesUtil.getProperty("dfs.client.socket-timeout","60000"));
        config.set("hbase.regionserver.handler.count", PropertiesUtil.getProperty("hbase.regionserver.handler.count","20"));

        Job hbaseJob = Job.getInstance(config,"spark-hbase");
        hbaseJob.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.TableOutputFormat.class);
        hbaseJob.getConfiguration().set(TableInputFormat.INPUT_TABLE, tableName);
        hbaseJob.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, tableName);

        Connection connection = ConnectionFactory.createConnection(config);
        Admin admin = connection.getAdmin();

        // drop the existing table so no stale data is kept
        String regex = "^" + tableName + "$";
        TableName[] tableNames = admin.listTableNames(regex); // list tables matching the exact name
        for (TableName tname : tableNames) {
            admin.disableTable(tname); // disable the table
            admin.deleteTable(tname); // delete the table
        }

        // create the table
        TableName htableName = TableName.valueOf(tableName);
        HTableDescriptor hbaseTable = new HTableDescriptor(htableName);
        hbaseTable.addFamily(new HColumnDescriptor(columnFamily)); // add the single column family
        admin.createTable(hbaseTable);

        final String[] mids = mappingId.split(",");
        JavaPairRDD hbasePuts = resultDs.javaRDD().mapToPair(new PairFunction() {
            @Override
            public Tuple2 call(Row row) throws Exception {
                StructType schema = row.schema();
                Iterator it = schema.iterator();
                Iterator it1 = schema.iterator();
                // collect the rowkey column values
                int i = 0;
                List values = new ArrayList<>();
                while (it.hasNext()) {
                    StructField next = it.next();
                    for(String mid : mids){
                        if(mid.equalsIgnoreCase(next.name())){
                            values.add(row.getString(i));
                        }
                    }
                    i++;
                }
                Put put = new Put(Bytes.toBytes(StringUtils.join(values,":"))); // join the mapping_id columns into the rowkey
                // set the cell values
                i = 0;
                while (it1.hasNext()) {
                    StructField next = it1.next();
                    Object value = row.get(i++);
                    String v = (value == null) ? "" :String.valueOf(value);
                    put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes(next.name()), Bytes.toBytes(v));
                }

                return new Tuple2(new ImmutableBytesWritable(), put);
            }
        });
        // write the puts to HBase
        hbasePuts.saveAsNewAPIHadoopDataset(hbaseJob.getConfiguration());

        // audit: count the rows actually written, via the aggregation coprocessor
        String coprocessorClassName = "org.apache.hadoop.hbase.coprocessor.AggregateImplementation";
        admin.disableTable(htableName);
        hbaseTable.addCoprocessor(coprocessorClassName);
        admin.modifyTable(htableName, hbaseTable);
        admin.enableTable(htableName);

        AggregationClient ac = new AggregationClient(config);
        Scan scan = new Scan();
        scan.addFamily(Bytes.toBytes(columnFamily));
        ColumnInterpreter longColumnInterpreter = new LongColumnInterpreter();
        long rowCount = 0;
        try {
            rowCount = ac.rowCount(htableName,longColumnInterpreter,scan);
        } catch (Throwable throwable) {
            throwable.printStackTrace();
        }
        System.out.println("hbase rowCount:"+rowCount);

        admin.close();
        connection.close();

        return rowCount;
    }
}
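
The row-count audit in saveToHBase depends on the AggregateImplementation coprocessor being loadable on the region servers. If the coprocessor cannot be used on a given cluster, a client-side scan with FirstKeyOnlyFilter is a slower but dependency-free fallback; the method below is a sketch against the standard HBase 1.x client API and could sit next to saveToHBase (connection, htableName and columnFamily are the same variables used there):

import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
import org.apache.hadoop.hbase.util.Bytes;

    // Count rows by scanning only the first cell of each row.
    private long countRowsByScan(Connection connection, TableName htableName,
                                 String columnFamily) throws Exception {
        long rowCount = 0;
        try (Table table = connection.getTable(htableName)) {
            Scan scan = new Scan();
            scan.addFamily(Bytes.toBytes(columnFamily));
            scan.setFilter(new FirstKeyOnlyFilter()); // return just one cell per row
            try (ResultScanner scanner = table.getScanner(scan)) {
                for (Result ignored : scanner) {
                    rowCount++;
                }
            }
        }
        return rowCount;
    }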

Utility class PropertiesUtil

public class PropertiesUtil {
    private static Properties prop = null;
    static{
        // load config.properties from the classpath
        try {
            prop = new Properties();
            ClassLoader loader = Thread.currentThread().getContextClassLoader();
            InputStream inputStream = loader.getResourceAsStream("config.properties");
            prop.load(inputStream);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static String getProperty(String property){
        return prop.getProperty(property);
    }

    public static String getProperty(String property,String defaultValue){
        String value = prop.getProperty(property);
        if(value == null || "".equals(value)){
            value = defaultValue;
        }
        return value;
    }

    public static String getStrValue(Map map, String key) {
        if (map == null || map.isEmpty() || StringUtils.isBlank(key)) {
            return "";
        }
        Object t = map.get(key);
        if (t != null) {
            if(t instanceof Integer){
                return t+"";
            }
            return t.toString();
        } else {
            for (Object o : map.keySet()) {
                String name = (String) o;
                if (name.toLowerCase().equals(key.toLowerCase())) {
                    Object value = map.get(o);
                    if (value == null) {
                        return "";
                    }
                    return value.toString();
                }
            }
        }

        return "";
    }
}
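
A quick usage sketch (the property keys are the ones defined in config.properties below; the parameter-map lookup is case-insensitive thanks to getStrValue):

import java.util.HashMap;

String esNodes = PropertiesUtil.getProperty("es.nodes");                                     // value from config.properties
String codec   = PropertiesUtil.getProperty("spark.sql.parquet.compression.codec", "gzip");  // falls back to "gzip" when missing or empty

HashMap<String, Object> params = new HashMap<>();
params.put("Method", "doImportToEs");
String method = PropertiesUtil.getStrValue(params, "method"); // "doImportToEs" despite the different case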

Configuration file config.properties

# Spark settings
spark.sql.broadcastTimeout=3600
spark.network.timeout=1400s
spark.sql.parquet.compression.codec=snappy

################################################################

# Source types: hive, oracle, mysql
# Oracle settings (used when the source type is oracle)
oracle.driver=oracle.jdbc.driver.OracleDriver

################################################################
# Target types: es, hbase
# Elasticsearch settings (used when the target type is es)
es.nodes=hostes
es.port=9200
# 4000: mitigates "Could not write all entries"
es.batch.size.bytes=20000000
es.batch.size.entries=5000
es.batch.write.refresh=true
es.batch.write.retry.count=50
es.batch.write.retry.wait=500
es.http.timeout=5m
es.http.retries=50
es.action.heart.beat.lead=50
es.nodes.wan.only=true
es.nodes.data.only=false
es.nodes.discovery=false
es.input.use.sliced.partitions=50000
es.input.max.docs.per.partition=100000
es.net.http.header.Accept-Languag=gzip

# HBase settings (used when the target type is hbase)
hbase.zookeeper.quorum=hosthbase
hbase.zookeeper.property.clientPort=2181
hbase.master=hosthbase:60000
hbase.cluster.distributed=true
zookeeper.session.timeout=1200000
hbase.hregion.majorcompaction=0
hbase.regionserver.regionSplitLimit=150
dfs.client.socket-timeout=60000
hbase.regionserver.handler.count=50

Client test: MyClient

public class MyClient {

    private Socket socket;

    public MyClient (String address, int port) throws IOException {
        socket = new Socket();
        socket.connect(new InetSocketAddress(InetAddress.getByName(address), port));
    }

    public Socket getSocket(){
        return socket;
    }

    public static void main(String[] args){

        Socket socket = null;
        BufferedReader buffer = null;
        OutputStream outputStream = null;
        ObjectOutputStream oos = null;
        InputStream inputStream = null;
        try{
            MyClient myClient = new MyClient ("hostss",2018);
            socket = myClient.getSocket();
            outputStream = socket.getOutputStream();
            inputStream = socket.getInputStream();

            oos = new ObjectOutputStream(outputStream);
            Map paramMap = new HashMap();
            paramMap.put("method","doImportToHBase");
            paramMap.put("sql","select * from hive_test.big_data_test where p_day = 20181220");
            paramMap.put("table_code","big_data_test_hbase");
            paramMap.put("src_table_type","hive");
            paramMap.put("table_type","hbase");
            paramMap.put("mapping_id","pk_id,phone_no");
            oos.writeObject(paramMap);
            oos.flush();

            buffer = new BufferedReader(new InputStreamReader(socket.getInputStream()));
            String line = buffer.readLine();
            System.out.println("来自服务端的数据:"+line);

        } catch (IOException e) {
            e.printStackTrace();
        } finally{
            try {
                if (oos != null){
                    oos.close();
                }
                if (outputStream != null) {
                    outputStream.close();
                }
                if (inputStream != null) {
                    inputStream.close();
                }
                if (socket != null){
                    socket.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
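
With the parameter map above, a successful run prints a single line starting with 1: followed by the summary built in doImportToHBase (source row count and elapsed time); a failure returns 0: followed by the exception message.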

Submit script

The jar is launched with spark-submit on the cluster. Because the Elasticsearch, HBase and JDBC dependencies are declared as provided, they must already be on the cluster classpath (or be passed to spark-submit via --jars). The application jar path and the trailing port argument below are placeholders.

nohup /home/hadoop/sparkForThrift/bin/spark-submit \
--class com.sd.data.spark.MyServer \
--driver-java-options -Xss10m \
--conf spark.executor.extraJavaOptions="-XX:+UseParNewGC -XX:+UseConcMarkSweepGC -XX:-CMSConcurrentMTEnabled -XX:CMSInitiatingOccupancyFraction=70 -XX:+CMSParallelRemarkEnabled" \
--driver-memory 50G \
--executor-memory 50G \
--executor-cores 5 \
--conf spark.default.parallelism=1000 \
--name spark-es-hbase \
--queue root.ses \
--master yarn \
/path/to/spark-es-hbase.jar 2018 \
>> spark-es-hbase.log 2>&1 &
