Overview:

The previous post, MapReduce Hive 导入数据到 HBase, covered how to import textfile- and rcfile-format Hive table data into HBase with MapReduce. That approach has two drawbacks. First, it runs slowly when the data volume is very large. Second, Hive tables can be stored in several formats; besides the two above, ORC and Parquet are also common and offer high compression ratios and fast queries, so whenever a Hive table's storage format changes, the MR code may need to be modified and repackaged. Importing Hive data into HBase with Spark instead runs faster and is independent of the table's storage format, killing two birds with one stone, so I wrote the Spark program below.

Project setup is the same as in the previous post, so I will not repeat it here. The only difference is that after creating the Maven project you need to add Scala support: right-click the project, choose Add Framework Support, and check Scala in the dialog.

The code is as follows:
object Constants {
  // --------------------------------------------- develop.properties keys ------------------------------------------
  val ZOOKEEPER_CLIENT_PORT: String = "hbase.zookeeper.property.clientPort"
  val ZOOKEEPER_QUORUM: String = "hbase.zookeeper.quorum"
  val HBASE_MASTER: String = "hbase.master"
  val ZOOKEEPER_ZNODE_PARENT: String = "zookeeper.znode.parent"
  val HIVE_URL: String = "hive.url"
  val HIVE_DRIVER_CLASS: String = "hive.driver.class"
  val HIVE_USER: String = "hive.user"
  val HIVE_PASSWORD: String = "hive.password"
  val HIVE_DATA_INPUT_PATH: String = "hive.data.input.path"

  // --------------------------------------------- constant keys ------------------------------------------
  val DESC_TABLE: String = "DESC FORMATTED "
  val SELECT: String = "select "
  val FROM: String = " from "
  val WHERE: String = " where "
  val PARTITION_PREFIX: String = "dt = "
  val HBASE_COLUMN_FAMILY: String = "info"
  val FIELDS_TERMINATED: Char = '\001'
  val COL_NAME: String = "col_name"
  val APP_NAME_PREFIX: String = "hive2hbase-"
  val SHARP: String = "#"
  val POINT: String = "."
}
import java.util.ResourceBundle

object ConfigureContext {
  // Load develop.properties from the classpath and return its entries as a Map
  def loadConfig(): Map[String, String] = {
    val bundle = ResourceBundle.getBundle("develop")
    var configMap: Map[String, String] = Map()
    val keys = bundle.getKeys
    while (keys.hasMoreElements) {
      val key = keys.nextElement()
      configMap += ((key, bundle.getString(key)))
    }
    configMap
  }

  def main(args: Array[String]): Unit = {
    val map = loadConfig()
    map.keys.foreach { i =>
      println("key = " + i + ", value = " + map(i))
    }
  }
}
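ConfigureContext loads a develop.properties bundle from the classpath (in this project it is filtered per environment by the dev/online/test profiles in the pom further down). For reference, a develop.properties for this code might look like the sketch below; the hosts, ports, and credentials are placeholders, so replace them with your own cluster's values:

hbase.zookeeper.property.clientPort=2181
hbase.zookeeper.quorum=zk1,zk2,zk3
hbase.master=hmaster-host:16000
zookeeper.znode.parent=/hbase
hive.url=jdbc:hive2://hive-host:10000/
hive.driver.class=org.apache.hive.jdbc.HiveDriver
hive.user=hive
hive.password=hive
hive.data.input.path=/user/hive/warehouse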
import java.sql.{Connection, DriverManager, ResultSet}
import java.text.SimpleDateFormat
import java.util
import java.util.Calendar
import com.sqyc.bigdata.common.Constants
import com.sqyc.bigdata.config.ConfigureContext
import com.sqyc.bigdata.utils.MD5Utils
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, HBaseAdmin, Put}
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
import scala.util.control._
object Hive2HBase {

  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      throw new IllegalArgumentException("Please input parameters.【hiveDatabase hiveTable hiveIgnoreFields [hivePartition]】")
    }
    val formatter = new SimpleDateFormat("yyyy-MM-dd")
    val date = Calendar.getInstance()
    val configMap: Map[String, String] = ConfigureContext.loadConfig()

    val hiveDatabase: String = args(0)
    val hiveTable: String = args(1)
    val hiveIgnoreFields: String = args(2)
    // Partition to import; defaults to yesterday if not passed on the command line
    var hivePartition: String = null
    if (args.length == 4) {
      hivePartition = args(3)
    } else {
      date.add(Calendar.DAY_OF_MONTH, -1)
      hivePartition = formatter.format(date.getTime)
    }

    // Create the HBase table if it does not exist yet
    createTable(hiveTable, configMap)

    // Fetch the column list of the Hive table
    val columnList = getColumnList(hiveDatabase, hiveTable, hiveIgnoreFields, configMap)

    // Build the query
    val columns: String = getSelectColumns(columnList)
    val sql = getSql(hiveDatabase, hiveTable, hivePartition, columns)

    // Query the data
    val spark = SparkSession.builder().appName(Constants.APP_NAME_PREFIX + hiveTable).enableHiveSupport().getOrCreate()
    val dataDF = spark.sql(sql).toDF()

    // Write the data to HBase, one connection per partition
    dataDF.foreachPartition(it => {
      val conn = getHBaseConnection(configMap)
      val tableName = TableName.valueOf(hiveTable)
      val table = conn.getTable(tableName)
      it.foreach(row => {
        def checkValue(v: Any): String = if (v == null || v.toString.trim.isEmpty) "null" else v.toString
        // Row key: MD5 of the first selected column
        val rowkey = MD5Utils.string2MD5(row(0).toString).getBytes()
        val columnFamily = Constants.HBASE_COLUMN_FAMILY.getBytes()
        val put = new Put(rowkey)
        for (i <- 0 until columnList.size) {
          put.addColumn(columnFamily, columnList(i).getBytes, checkValue(row(i)).getBytes())
        }
        table.put(put)
      })
      table.close()
      conn.close()
    })
  }
  def getColumnList(hiveDatabase: String, hiveTable: String, hiveIgnoreFields: String, configMap: Map[String, String]) = {
    Class.forName(configMap(Constants.HIVE_DRIVER_CLASS))
    val conn = DriverManager.getConnection(configMap(Constants.HIVE_URL) + hiveDatabase, configMap(Constants.HIVE_USER), configMap(Constants.HIVE_PASSWORD))
    val columnList = new ListBuffer[String]
    // DESC FORMATTED lists the columns between the "# col_name" header row and the next "#" section
    val ps = conn.prepareStatement(Constants.DESC_TABLE + hiveDatabase + Constants.POINT + hiveTable)
    val rs = ps.executeQuery
    val breakWhile = new Breaks
    val continueWhile = new Breaks
    val ignoreList = hiveIgnoreFields.split(",").toList
    while (rs.next) {
      if (startColumnsDescRow(rs)) {
        breakWhile.breakable {
          while (rs.next()) {
            continueWhile.breakable {
              val colName = rs.getString(Constants.COL_NAME)
              if (colName == null || colName.trim().equals("") || ignoreList.contains(colName)) {
                continueWhile.break()
              } else if (colName.startsWith(Constants.SHARP)) {
                breakWhile.break()
              } else {
                columnList.append(colName)
              }
            }
          }
        }
      }
    }
    rs.close()
    ps.close()
    if (conn != null) conn.close()
    columnList
  }
  def startColumnsDescRow(rs: ResultSet) = {
    val colName = rs.getString(Constants.COL_NAME)
    colName.trim == "# col_name"
  }
  def getSelectColumns(columnList: ListBuffer[String]) = {
    val columns = new StringBuilder()
    for (column <- columnList) {
      columns.append(column)
      columns.append(",")
    }
    columns.deleteCharAt(columns.length - 1).toString()
  }
  def getSql(hiveDatabase: String, hiveTable: String, hivePartition: String, columns: String) = {
    val sql = new StringBuilder()
      .append(Constants.SELECT)
      .append(columns)
      .append(Constants.FROM)
      .append(hiveDatabase)
      .append(Constants.POINT)
      .append(hiveTable)
      .append(Constants.WHERE)
      .append(Constants.PARTITION_PREFIX + "'" + hivePartition + "'")
    sql.toString()
  }
  def getHBaseConnection(configMap: Map[String, String]) = {
    val conf = HBaseConfiguration.create
    conf.set(Constants.ZOOKEEPER_CLIENT_PORT, configMap(Constants.ZOOKEEPER_CLIENT_PORT))
    conf.set(Constants.ZOOKEEPER_QUORUM, configMap(Constants.ZOOKEEPER_QUORUM))
    conf.set(Constants.HBASE_MASTER, configMap(Constants.HBASE_MASTER))
    conf.set(Constants.ZOOKEEPER_ZNODE_PARENT, configMap(Constants.ZOOKEEPER_ZNODE_PARENT))
    ConnectionFactory.createConnection(conf)
  }
  def createTable(hiveTable: String, configMap: Map[String, String]): Unit = {
    val conn = getHBaseConnection(configMap)
    val admin = conn.getAdmin.asInstanceOf[HBaseAdmin]
    val tableName = TableName.valueOf(hiveTable)
    if (!admin.tableExists(tableName)) {
      // Create the table with a single column family if it does not exist yet
      val desc = new HTableDescriptor(tableName)
      val columnDesc = new HColumnDescriptor(Constants.HBASE_COLUMN_FAMILY)
      desc.addFamily(columnDesc)
      admin.createTable(desc)
    }
    admin.close()
    conn.close()
  }
}
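A note on the write path: the loop above calls table.put once per row. The HBase client Table API also accepts a list of Puts, so if write throughput becomes a bottleneck, the foreachPartition body could buffer rows and flush them in batches. The sketch below is one possible variation, not part of the original program; it reuses the table and columnList names from the code above, and the batch size of 500 is an arbitrary choice:

      // Inside dataDF.foreachPartition(it => { ... }), after obtaining `table`:
      def checkValue(v: Any): String = if (v == null || v.toString.trim.isEmpty) "null" else v.toString
      val columnFamily = Constants.HBASE_COLUMN_FAMILY.getBytes()
      val puts = new java.util.ArrayList[Put]()
      it.foreach(row => {
        val put = new Put(MD5Utils.string2MD5(row(0).toString).getBytes())
        for (i <- 0 until columnList.size) {
          put.addColumn(columnFamily, columnList(i).getBytes, checkValue(row(i)).getBytes())
        }
        puts.add(put)
        // Flush every 500 rows to keep client-side memory bounded
        if (puts.size() >= 500) {
          table.put(puts)
          puts.clear()
        }
      })
      // Flush whatever is left in the buffer
      if (!puts.isEmpty) table.put(puts)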
The relevant parts of pom.xml are as follows:

<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
    <spark.version>2.1.2</spark.version>
    <hadoop.version>2.7.3</hadoop.version>
    <hive.version>1.2.0</hive.version>
    <scala.version>2.11.8</scala.version>
    <hbase.version>1.2.6</hbase.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId><artifactId>spark-core_2.11</artifactId><version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId><artifactId>spark-sql_2.11</artifactId><version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId><artifactId>spark-hive_2.11</artifactId><version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId><artifactId>hadoop-client</artifactId><version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hive</groupId><artifactId>hive-jdbc</artifactId><version>${hive.version}</version>
    </dependency>
    <dependency>
        <groupId>org.scala-lang</groupId><artifactId>scala-library</artifactId><version>${scala.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId><artifactId>hbase-server</artifactId><version>${hbase.version}</version>
    </dependency>
</dependencies>

<profiles>
    <profile>
        <id>dev</id>
        <activation><activeByDefault>true</activeByDefault></activation>
        <build>
            <filters><filter>src/main/filters/dev.properties</filter></filters>
            <resources><resource><directory>src/main/resources</directory></resource></resources>
        </build>
    </profile>
    <profile>
        <id>online</id>
        <build>
            <filters><filter>src/main/filters/online.properties</filter></filters>
            <resources><resource><directory>src/main/resources</directory></resource></resources>
        </build>
    </profile>
    <profile>
        <id>test</id>
        <build>
            <filters><filter>src/main/filters/test.properties</filter></filters>
            <resources><resource><directory>src/main/resources</directory></resource></resources>
        </build>
    </profile>
</profiles>

<build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <resources>
        <resource>
            <directory>src/main/resources</directory>
            <includes><include>**/*.properties</include></includes>
            <filtering>true</filtering>
        </resource>
    </resources>
    <plugins>
        <plugin>
            <groupId>org.scala-tools</groupId>
            <artifactId>maven-scala-plugin</artifactId>
            <version>2.15.2</version>
            <executions>
                <execution>
                    <goals><goal>compile</goal><goal>testCompile</goal></goals>
                    <configuration>
                        <args><arg>-dependencyfile</arg><arg>${project.build.directory}/.scala_dependencies</arg></args>
                    </configuration>
                </execution>
            </executions>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-surefire-plugin</artifactId>
            <version>2.10</version>
            <configuration>
                <useFile>false</useFile>
                <disableXmlReport>true</disableXmlReport>
                <includes><include>**/*Test.*</include><include>**/*Suite.*</include></includes>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>2.4</version>
            <configuration>
                <descriptorRefs><descriptorRef>jar-with-dependencies</descriptorRef></descriptorRefs>
                <archive><manifest><mainClass>datasync.Hive2HBase</mainClass></manifest></archive>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals><goal>single</goal></goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>

<reporting>
    <plugins>
        <plugin>
            <groupId>org.scala-tools</groupId>
            <artifactId>maven-scala-plugin</artifactId>
            <configuration><scalaVersion>${scala.version}</scalaVersion></configuration>
        </plugin>
    </plugins>
</reporting>
Submit the job with a command like the following (Spark on YARN, client mode). Note that the program expects the arguments hiveDatabase hiveTable hiveIgnoreFields [hivePartition]:
spark-submit --master yarn --deploy-mode client --num-executors 2 --executor-memory 4g --driver-memory 4g --executor-cores 3 --class datasync.Hive2HBase xxx.jar dbname tbname ignoreFields partition
If you are running against a Spark standalone cluster instead, submit like this:
spark-submit --master spark://ip:port --executor-memory 2g --driver-memory 2g --executor-cores 10 --total-executor-cores 20 --class datasync.Hive2HBase xxx.jar dbname tbname ignoreFields partition
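Once the job finishes, a quick way to sanity-check the result is to look at the target table in the HBase shell (the table name below is a placeholder; use the Hive table name you imported):

hbase shell
# row count of the imported table
count 'tbname'
# peek at a few rows of the info column family
scan 'tbname', {LIMIT => 5}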