Importing Hive Data into HBase with Spark

Overview:

The previous post, MapReduce Hive 导入数据到 HBase, described how to use MapReduce to import textfile- and rcfile-format Hive table data into HBase. The MapReduce approach has two drawbacks. First, it runs slowly when the data volume is very large. Second, Hive tables come in several storage formats; besides the two above, ORC and Parquet are also widely used, with high compression ratios and query performance, and whenever a table's storage format changes the MapReduce code may have to be modified and repackaged. Spark can do the same Hive-to-HBase import quickly and independently of the Hive storage format, killing two birds with one stone, so I wrote the Spark program below.

I won't repeat the project setup here; it is the same as in the previous post. The only difference is that after creating the Maven project you need to add Scala support: right-click the project, choose Add Framework Support, and check Scala in the dialog that pops up.
[Figure 1: Add Framework Support dialog with Scala checked]

The code is as follows.

1. common package

object Constants {

  // --------------------------------------------- develop.properties key ------------------------------------------
  val ZOOKEEPER_CLIENT_PORT: String = "hbase.zookeeper.property.clientPort"
  val ZOOKEEPER_QUORUM: String = "hbase.zookeeper.quorum"
  val HBASE_MASTER: String = "hbase.master"
  val ZOOKEEPER_ZNODE_PARENT: String = "zookeeper.znode.parent"
  val HIVE_URL: String = "hive.url"
  val HIVE_DRIVER_CLASS: String = "hive.driver.class"
  val HIVE_USER: String = "hive.user"
  val HIVE_PASSWORD: String = "hive.password"
  val HIVE_DATA_INPUT_PATH: String = "hive.data.input.path"

  // --------------------------------------------- constant key ------------------------------------------
  val DESC_TABLE: String = "DESC FORMATTED "
  val SELECT: String = "select "
  val FROM: String = " from "
  val WHERE: String = " where "
  val PARTITION_PREFIX: String = "dt = "
  val HBASE_COLUMN_FAMILY: String = "info"
  val FIELDS_TERMINATED: Char = '\001'
  val COL_NAME: String = "col_name"
  val APP_NAME_PREFIX: String = "hive2hbase-"
  val SHARP: String = "#"
  val POINT: String = "."
}
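
These constants are the keys read from a develop.properties file on the classpath (loaded by ConfigureContext in the next section). A minimal sketch of such a file is shown below; the hosts, ports, and credentials are placeholders to be replaced with your own environment's values (or with Maven filter placeholders, since the pom below enables per-profile resource filtering):

hbase.zookeeper.property.clientPort=2181
hbase.zookeeper.quorum=zk1,zk2,zk3
hbase.master=hmaster-host:16000
zookeeper.znode.parent=/hbase
hive.url=jdbc:hive2://hiveserver2-host:10000/
hive.driver.class=org.apache.hive.jdbc.HiveDriver
hive.user=hive
hive.password=hive
hive.data.input.path=/user/hive/warehouse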

2. config package

import java.util.ResourceBundle

object ConfigureContext {

  // Load develop.properties from the classpath and return its entries as a Map
  def loadConfig(): Map[String, String] = {
    val bundle = ResourceBundle.getBundle("develop")
    var configMap: Map[String, String] = Map()
    val keys = bundle.getKeys
    while (keys.hasMoreElements) {
      val key = keys.nextElement()
      configMap += ((key, bundle.getString(key)))
    }
    configMap
  }

  def main(args: Array[String]): Unit = {
    val map = loadConfig()
    map.keys.foreach { i =>
      println("key = " + i, "value = " + map(i))
    }
  }
}

3. datasync package

import java.sql.{Connection, DriverManager, ResultSet}
import java.text.SimpleDateFormat
import java.util
import java.util.Calendar

import com.sqyc.bigdata.common.Constants
import com.sqyc.bigdata.config.ConfigureContext
import com.sqyc.bigdata.utils.MD5Utils
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, HBaseAdmin, Put}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
import scala.util.control._

object Hive2HBase {

  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      throw new IllegalArgumentException("Please input parameters: hiveDatabase hiveTable hiveIgnoreFields [hivePartition]")
    }

    val formatter = new SimpleDateFormat("yyyy-MM-dd")
    val date = Calendar.getInstance()
    val configMap: Map[String, String] = ConfigureContext.loadConfig()

    val hiveDatabase: String = args(0)
    val hiveTable: String = args(1)
    val hiveIgnoreFields: String = args(2)
    var hivePartition: String = null
    if (args.length == 4) {
      hivePartition = args(3)
    } else {
      date.add(Calendar.DAY_OF_MONTH, -1)
      hivePartition = formatter.format(date.getTime)
    }
    
    // Create the HBase table (if it does not already exist)
    createTable(hiveTable, configMap)

    // Get the Hive table's column names
    val columnList = getColumnList(hiveDatabase, hiveTable, hiveIgnoreFields, configMap)

    // Build the select SQL from the column list
    val columns: String = getSelectColumns(columnList)
    val sql = getSql(hiveDatabase, hiveTable, hivePartition, columns)

    // Query the data via Spark SQL
    val spark = SparkSession.builder().appName(Constants.APP_NAME_PREFIX + hiveTable).enableHiveSupport().getOrCreate()
    val dataDF = spark.sql(sql).toDF()

    // Write the data to HBase
    dataDF.foreachPartition(it => {
      val conn = getHBaseConnection(configMap)
      val tableName = TableName.valueOf(hiveTable)
      val table = conn.getTable(tableName)
      it.foreach(row => {
        def checkValue(v: Any): String = if (v == null || v.toString.trim.isEmpty) "null" else v.toString

        // Row key: MD5 of the first selected column
        val rowkey = MD5Utils.string2MD5(row(0).toString).getBytes()
        val columnFamily = Constants.HBASE_COLUMN_FAMILY.getBytes()
        val put = new Put(rowkey)
        for (i <- 0 until columnList.size) {
          put.addColumn(columnFamily, columnList(i).getBytes, checkValue(row(i)).getBytes())
        }
        table.put(put)
      })
      table.close()
      conn.close()
    })
  }

  def getColumnList(hiveDatabase: String, hiveTable: String, hiveIgnoreFields: String, configMap: Map[String, String]) = {
    Class.forName(configMap(Constants.HIVE_DRIVER_CLASS))
    val conn = DriverManager.getConnection(configMap(Constants.HIVE_URL) + hiveDatabase, configMap(Constants.HIVE_USER), configMap(Constants.HIVE_PASSWORD))
    var columnList = new ListBuffer[String]
    val ps = conn.prepareStatement(Constants.DESC_TABLE + hiveDatabase + Constants.POINT + hiveTable)
    val rs = ps.executeQuery

    val breakWhile = new Breaks
    val continueWhile = new Breaks
    val ignoreList = hiveIgnoreFields.split(",").toList
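    // DESC FORMATTED lists the columns after a "# col_name" header row; the next row that
    // starts with "#" (e.g. "# Partition Information") marks the end of the column section,
    // so we collect names until then, skipping blank rows and the ignored fields.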
    while (rs.next) {
      if (startColumnsDescRow(rs)) {
        breakWhile.breakable {
          while (rs.next()) {
            continueWhile.breakable {
              val colName = rs.getString(Constants.COL_NAME)
              if (colName == null || colName.trim().equals("") || ignoreList.contains(colName)) {
                continueWhile.break()
              } else if (colName.startsWith(Constants.SHARP)) {
                breakWhile.break()
              } else {
                columnList.append(colName)
              }
            }
          }
        }
      }
    }
    if (conn != null) conn.close()
    columnList
  }

  def startColumnsDescRow(rs: ResultSet): Boolean = {
    val colName = rs.getString(Constants.COL_NAME)
    colName != null && colName.trim == "# col_name"
  }
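
  // Illustrative DESC FORMATTED excerpt (hypothetical columns), showing the rows parsed above:
  //   # col_name              data_type            comment
  //   id                      bigint
  //   name                    string
  //
  //   # Partition Information
  //   # col_name              data_type            comment
  //   dt                      string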

  def getSelectColumns(columnList: ListBuffer[String]): String = columnList.mkString(",")

  def getSql(hiveDatabase: String, hiveTable: String, hivePartition: String, columns: String): String = {
    val sql = new StringBuilder()
      .append(Constants.SELECT)
      .append(columns)
      .append(Constants.FROM)
      .append(hiveDatabase)
      .append(Constants.POINT)
      .append(hiveTable)
      .append(Constants.WHERE)
      .append(Constants.PARTITION_PREFIX + "'" + hivePartition + "'")
    sql.toString()
  }
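
  // Example (hypothetical names): getSql("test_db", "user_info", "2018-06-01", "id,name,age")
  // returns: select id,name,age from test_db.user_info where dt = '2018-06-01'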

  def getHBaseConnection(configMap: Map[String, String]) = {
    val conf = HBaseConfiguration.create
    conf.set("hbase.zookeeper.property.clientPort", configMap(Constants.ZOOKEEPER_CLIENT_PORT))
    conf.set("hbase.zookeeper.quorum", configMap(Constants.ZOOKEEPER_QUORUM))
    conf.set("hbase.master", configMap(Constants.HBASE_MASTER))
    conf.set("zookeeper.znode.parent", configMap(Constants.ZOOKEEPER_ZNODE_PARENT))
    ConnectionFactory.createConnection(conf)
  }

  def createTable(hiveTable: String, configMap: Map[String, String]): Unit = {
    val conn = getHBaseConnection(configMap)
    val admin = conn.getAdmin.asInstanceOf[HBaseAdmin]
    val tableName = TableName.valueOf(hiveTable)
    if (!admin.tableExists(tableName)) {
      // Create the table with a single column family if it does not exist yet
      val desc = new HTableDescriptor(tableName)
      val columnDesc = new HColumnDescriptor(Constants.HBASE_COLUMN_FAMILY)
      desc.addFamily(columnDesc)
      admin.createTable(desc)
    }
    admin.close()
    conn.close()
  }
}
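
The job above builds the row key with MD5Utils.string2MD5 from the com.sqyc.bigdata.utils package, but that helper is not listed in this post. A minimal sketch, assuming it simply hex-encodes the MD5 digest of the input string (the original implementation may differ):

package com.sqyc.bigdata.utils

import java.security.MessageDigest

object MD5Utils {

  // Hex-encode the MD5 digest of the input string; used above to build the HBase row key
  def string2MD5(input: String): String = {
    val digest = MessageDigest.getInstance("MD5").digest(input.getBytes("UTF-8"))
    digest.map("%02x".format(_)).mkString
  }
}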

4. pom.xml


<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
    <spark.version>2.1.2</spark.version>
    <hadoop.version>2.7.3</hadoop.version>
    <hive.version>1.2.0</hive.version>
    <scala.version>2.11.8</scala.version>
    <hbase.version>1.2.6</hbase.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-hive_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-jdbc</artifactId>
        <version>${hive.version}</version>
    </dependency>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-server</artifactId>
        <version>${hbase.version}</version>
    </dependency>
</dependencies>

<profiles>
    <profile>
        <id>dev</id>
        <activation>
            <activeByDefault>true</activeByDefault>
        </activation>
        <build>
            <filters>
                <filter>src/main/filters/dev.properties</filter>
            </filters>
            <resources>
                <resource>
                    <directory>src/main/resources</directory>
                </resource>
            </resources>
        </build>
    </profile>
    <profile>
        <id>online</id>
        <build>
            <filters>
                <filter>src/main/filters/online.properties</filter>
            </filters>
            <resources>
                <resource>
                    <directory>src/main/resources</directory>
                </resource>
            </resources>
        </build>
    </profile>
    <profile>
        <id>test</id>
        <build>
            <filters>
                <filter>src/main/filters/test.properties</filter>
            </filters>
            <resources>
                <resource>
                    <directory>src/main/resources</directory>
                </resource>
            </resources>
        </build>
    </profile>
</profiles>

<build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <resources>
        <resource>
            <directory>src/main/resources</directory>
            <includes>
                <include>**/*.properties</include>
            </includes>
            <filtering>true</filtering>
        </resource>
    </resources>
    <plugins>
        <plugin>
            <groupId>org.scala-tools</groupId>
            <artifactId>maven-scala-plugin</artifactId>
            <version>2.15.2</version>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                    <configuration>
                        <args>
                            <arg>-dependencyfile</arg>
                            <arg>${project.build.directory}/.scala_dependencies</arg>
                        </args>
                    </configuration>
                </execution>
            </executions>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-surefire-plugin</artifactId>
            <version>2.10</version>
            <configuration>
                <useFile>false</useFile>
                <disableXmlReport>true</disableXmlReport>
                <includes>
                    <include>**/*Test.*</include>
                    <include>**/*Suite.*</include>
                </includes>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>2.4</version>
            <configuration>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
                <archive>
                    <manifest>
                        <mainClass>datasync.Hive2HBase</mainClass>
                    </manifest>
                </archive>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>

<reporting>
    <plugins>
        <plugin>
            <groupId>org.scala-tools</groupId>
            <artifactId>maven-scala-plugin</artifactId>
            <configuration>
                <scalaVersion>${scala.version}</scalaVersion>
            </configuration>
        </plugin>
    </plugins>
</reporting>

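The assembly plugin is bound to the package phase, so building with the desired profile produces a jar-with-dependencies under target/. For example, using the online profile defined above:

mvn clean package -Ponline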
5. Build the jar and submit it to Spark

For Spark on YARN (yarn-client mode), run a command like the following; the last arguments are the Hive database, table, comma-separated ignored fields, and an optional partition date:

spark-submit --master yarn-client --num-executors 2 --executor-memory 4g --driver-memory 4g --executor-cores 3 --class datasync.Hive2HBase xxx.jar dbname tbname ignoreFields [partition]

For a Spark standalone cluster, point --master at the standalone master URL instead:

spark-submit --master spark://ip:port --executor-memory 2g --driver-memory 2g --executor-cores 10 --total-executor-cores 20 --class datasync.Hive2HBase xxx.jar dbname tbname ignoreFields [partition]
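
After the job finishes you can spot-check the result from the HBase shell; the HBase table name is the Hive table name, as created by createTable above. For example:

scan 'tbname', {LIMIT => 10}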
