How Spark SQL reads Hive's underlying files - Original: optimization ideas for slow Spark SQL writes into Hive

In the earlier article 《spark sql 写入hive较慢原因分析》 (an analysis of why Spark SQL writes Hive partition files slowly), the causes were already covered. Here are several optimization ideas for reference:

(1) Have Spark write the Hive table's underlying partition files directly, then register the partitions with ALTER TABLE ... ADD PARTITION statements:

spark.sql(s"alter table legend.test_log_hive_text add partition (name_par='${dirName}')")

(2) Have Spark write the files to an HDFS directory first, then load the data into Hive with a hive command-line script:

hive -e "load data inpath '/test/test_log_hive/name_par=test$i' overwrite into table legend.test_log_hive_text partition(name_par='test$i') "

(3) Modify the Spark configuration to specify the Hive metastore version and the location of the corresponding jars. The Spark source excerpt below shows that the supported Hive versions range from 0.12.0 through 2.3.3; adjust the spark.sql.hive.metastore.version and spark.sql.hive.metastore.jars parameters accordingly:

private[spark] object HiveUtils extends Logging {

  def withHiveExternalCatalog(sc: SparkContext): SparkContext = {
    sc.conf.set(CATALOG_IMPLEMENTATION.key, "hive")
    sc
  }

  /** The version of hive used internally by Spark SQL. */
  val builtinHiveVersion: String = "1.2.1"

  val HIVE_METASTORE_VERSION = buildConf("spark.sql.hive.metastore.version")
    .doc("Version of the Hive metastore. Available options are " +
      s"0.12.0 through 2.3.3.")
    .stringConf
    .createWithDefault(builtinHiveVersion)

  // A fake config which is only here for backward compatibility reasons. This config has no effect
  // to Spark, just for reporting the builtin Hive version of Spark to existing applications that
  // already rely on this config.
  val FAKE_HIVE_VERSION = buildConf("spark.sql.hive.version")
    .doc(s"deprecated, please use ${HIVE_METASTORE_VERSION.key} to get the Hive version in Spark.")
    .stringConf
    .createWithDefault(builtinHiveVersion)

  val HIVE_METASTORE_JARS = buildConf("spark.sql.hive.metastore.jars")
    .doc(s"""
      | Location of the jars that should be used to instantiate the HiveMetastoreClient.
      | This property can be one of three options: "
      | 1. "builtin"
      |   Use Hive ${builtinHiveVersion}, which is bundled with the Spark assembly when
      |   -Phive is enabled. When this option is chosen,
      |   spark.sql.hive.metastore.version must be either
      |   ${builtinHiveVersion} or not defined.
      | 2. "maven"
      |   Use Hive jars of specified version downloaded from Maven repositories.
      | 3. A classpath in the standard format for both Hive and Hadoop.
      """.stripMargin)
    .stringConf
    .createWithDefault("builtin")

Given my actual requirements, I adopted the second approach. My real usage scenario: Oracle GoldenGate reads upstream DB log data in real time and pushes it to Kafka; a streaming job saves the change logs into an HBase table in real time; the HBase table merges each day's operation logs to produce a T-1 snapshot table; Spark then reads the HBase table and syncs the data into the offline warehouse for offline analysis (HBase is mainly used to handle updates and deletes). The demo follows:

package cn.com.spark.hbase.hive

import java.net.URI
import java.util

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Result, Scan}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.spark.SparkConf
import org.apache.spark.sql.types.{DataTypes, StringType, StructField}
import org.apache.spark.sql.{RowFactory, SparkSession}
import org.slf4j.LoggerFactory

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

object HbaseToHive {

  val log = LoggerFactory.getLogger(HbaseToHive.getClass)

  // private val hdfsPath = "/user/hive/warehouse/legend.db/test_log_hive_text"
  private val hdfsPath = "/test/test_log_hive"

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName(s"${this.getClass.getSimpleName}")
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sparkConf.set("spark.broadcast.compress", "true")
    sparkConf.set("spark.rdd.compress", "true")
    sparkConf.set("spark.hadoop.mapreduce.output.fileoutputformat.compress", "false")
    // sparkConf.set("spark.io.compression.codec", "org.apache.spark.io.SnappyCompressionCodec")
    sparkConf.registerKryoClasses(Array(classOf[ImmutableBytesWritable]))

    val spark = SparkSession
      .builder()
      .config(sparkConf)
      .appName(s"${this.getClass.getSimpleName}")
      .enableHiveSupport()
      .getOrCreate()

    // HBase connection and full-table scan configuration
    val conf = HBaseConfiguration.create()
    // conf.set("hbase.zookeeper.quorum", "node1:2181,node2:2181,node3:2181")
    conf.set("hbase.zookeeper.quorum", "30.4.137.224:2181,30.4.137.228:2181,30.4.137.229:2181")
    conf.set(TableInputFormat.INPUT_TABLE, "test:test_log_hive")
    val scan = new Scan()
    val proto = ProtobufUtil.toScan(scan)
    conf.set(TableInputFormat.SCAN, Base64.encodeBytes(proto.toByteArray))

    // Read the HBase snapshot table as an RDD of (rowKey, Result)
    val hBaseRDD = spark.sparkContext.newAPIHadoopRDD(
      conf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result])

    // Schema of the target table
    val list = new util.ArrayList[StructField]()
    val rowKey = DataTypes.createStructField("rowKey", StringType, true)
    val name = DataTypes.createStructField("name", StringType, true)
    val age = DataTypes.createStructField("age", StringType, true)
    val mobile = DataTypes.createStructField("mobile", StringType, true)
    val addr = DataTypes.createStructField("addr", StringType, true)
    list.add(rowKey)
    list.add(name)
    list.add(age)
    list.add(mobile)
    list.add(addr)
    val schema = DataTypes.createStructType(list)

    // Convert each HBase Result into a Row
    val mapHbaseRDD = hBaseRDD.map(x => {
      val result = x._2
      val rowKey = Bytes.toString(result.getRow)
      val name = result.getValue(Bytes.toBytes("info"), Bytes.toBytes("name"))
      val age = result.getValue(Bytes.toBytes("info"), Bytes.toBytes("age"))
      val mobile = result.getValue(Bytes.toBytes("info"), Bytes.toBytes("mobile"))
      val addr = result.getValue(Bytes.toBytes("info"), Bytes.toBytes("addr"))
      RowFactory.create(rowKey, Bytes.toString(name), Bytes.toString(age), Bytes.toString(mobile), Bytes.toString(addr))
    })

    val df = spark.createDataFrame(mapHbaseRDD, schema)

    // Key each row by the partition column (name) and write one output directory per key
    df.rdd.map(r => {
      (r.getString(1), r.mkString(","))
    }).repartition(3).saveAsHadoopFile(hdfsPath, classOf[String], classOf[String],
      classOf[RDDMultipleTextOutputFormat])

    // Build one LOAD DATA statement per first-level partition directory
    val dirs = getDirs(hdfsPath)
    val loadSql = dirs.map(dir => {
      val dirNames = dir.split("/")
      val dirName = dirNames(dirNames.length - 1)
      s"load data inpath '${dir}' overwrite into table legend.test_log_hive_text partition (name_par='${dirName}')"
    })

    // Group the statements (30 per group) and save them as a script for the hive CLI
    val loadSqlMap = spliceList(loadSql.toList, 30)
    val loadSqlGroups = new ArrayBuffer[String]
    loadSqlMap.foreach(x => {
      loadSqlGroups += x._2.mkString(";") + System.lineSeparator()
    })
    spark.sparkContext.makeRDD(loadSqlGroups).repartition(1).saveAsTextFile(hdfsPath + "/" + "load_sql")

    // Alternatives that were tried, kept here for reference:
    //
    // spark.sql("use legend")
    // spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
    // spark.sql("create table test_log_hive_text(rowKey STRING, name STRING,age STRING,mobile STRING,addr " +
    //   "STRING) partitioned by(name_par STRING) row format delimited fields terminated by ','")
    //
    // for (dirPath <- dirs) {
    //   val dirNames = dirPath.split("/")
    //   val dirName = dirNames(dirNames.length - 1).split("=")(1)
    //   spark.sql(s"alter table legend.test_log_hive_text add partition (name_par='${dirName}')")
    // }
    //
    // df.repartition(5)
    // df.createTempView("result")
    //
    // spark.sql("use legend")
    // spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
    // spark.sql("insert into legend.test_log_hive partition(name_par) select rowKey,name,age,mobile,addr,name as name_par from result")
    // df.write.mode(SaveMode.Overwrite).format("parquet").partitionBy("name").insertInto("test.test_log")
    //
    // spark.sql("use legend")
    // spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
    // spark.sql("load data inpath '/test/test_log_hive' OVERWRITE INTO TABLE legend.test_log_hive_text PARTITION " +
    //   "(create_day='2019-04-28') ")
    // spark.sql("insert overwrite table legend.test_log_hive_orc PARTITION(name_par) select rowKey,name,age,mobile," +
    //   "addr,name as name_par from test_log_hive_text where create_day='2019-04-28' ")
  }

  // List the first-level directories under a path
  def getDirs(path: String): Array[String] = {
    getFilesAndDirs(path).filter(getHdfs(path).getFileStatus(_).isDirectory)
      .map(_.toString)
  }

  // List the first-level files and directories under a path
  def getFilesAndDirs(path: String): Array[Path] = {
    val fs = getHdfs(path).listStatus(new Path(path))
    FileUtil.stat2Paths(fs)
  }

  // Create a FileSystem handle for the given path
  def getHdfs(path: String): FileSystem = {
    val conf = new Configuration()
    FileSystem.get(URI.create(path), conf)
  }

  /**
   * Split a list into groups of at most splitSize elements.
   *
   * @param datas     the statements to group
   * @param splitSize the maximum number of elements per group
   * @return a map from group index to the statements in that group
   */
  def spliceList(datas: List[String], splitSize: Int): mutable.HashMap[String, List[String]] = {
    if (datas == null || splitSize < 1) return null
    val totalSize = datas.size
    val count = if (totalSize % splitSize == 0) totalSize / splitSize
    else totalSize / splitSize + 1
    val map = new mutable.HashMap[String, List[String]]()
    for (i <- 0 until count) {
      val cols = datas.slice(i * splitSize, if (i == count - 1) totalSize
      else splitSize * (i + 1))
      map(i.toString) = cols
    }
    map
  }
}

package cn.com.spark.hbase.hive

import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat

class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any] {

  // Route each record into a sub-directory named after its key, so every partition value
  // ends up in its own directory under the output path.
  override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String = {
    // ("name_par=" + key + "/" + name)
    (key + "/" + name)
  }

  // Drop the key from the output; only the value (the CSV line) is written to the file.
  override def generateActualKey(key: Any, value: Any): String = {
    null
  }
}
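For clarity, here is a hedged usage sketch (not part of the original demo) showing how the output format above splits output by key: pairs of (partition value, CSV line) are saved with saveAsHadoopFile, and each distinct key ends up in its own sub-directory such as /test/test_log_hive/<value>/part-00000. The sample records are illustrative only.

package cn.com.spark.hbase.hive

import org.apache.spark.{SparkConf, SparkContext}

object MultipleOutputUsageSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("MultipleOutputUsageSketch"))

    // (partition value, CSV line) pairs; illustrative data only.
    val rows = sc.parallelize(Seq(
      ("zhangsan", "r1,zhangsan,20,13800000000,addr1"),
      ("lisi", "r2,lisi,30,13900000000,addr2")))

    // One sub-directory per key, one part file per task, mirroring the demo's call.
    rows.saveAsHadoopFile("/test/test_log_hive", classOf[String], classOf[String],
      classOf[RDDMultipleTextOutputFormat])
  }
}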

The demo reads the HBase data, saves it into one HDFS directory per partition-column value, and finally loads the data into the Hive table with a hive command-line script (the generated load_sql file can be executed with the hive CLI, as in approach (2)).
