Original: Optimization ideas for slow Spark SQL writes to Hive

In the earlier article "spark sql 写入hive较慢原因分析" (an analysis of why Spark SQL writes to Hive slowly), the causes of slow writes to Hive partition files were already analyzed. Here are several optimization ideas for reference:
(1) Have Spark write the underlying partition files of the Hive table directly, then register each partition with an add partition statement:

spark.sql(s"alter table legend.test_log_hive_text add partition (name_par='${dirName}')")

(2) Have Spark write the files to an HDFS directory, then load the data into Hive with a hive CLI script:

 hive -e  "load data inpath '/test/test_log_hive/name_par=test$i' overwrite into table legend.test_log_hive_text partition(name_par='test$i') "

(3) Modify the Spark configuration to specify the Hive metastore version and the location of its jars. From the Spark source below you can see that the supported Hive versions range from 0.12.0 through 2.3.3; set the spark.sql.hive.metastore.version and spark.sql.hive.metastore.jars parameters accordingly.

private[spark] object HiveUtils extends Logging {

  def withHiveExternalCatalog(sc: SparkContext): SparkContext = {
    sc.conf.set(CATALOG_IMPLEMENTATION.key, "hive")
    sc
  }

  /** The version of hive used internally by Spark SQL. */
  val builtinHiveVersion: String = "1.2.1"

  val HIVE_METASTORE_VERSION = buildConf("spark.sql.hive.metastore.version")
    .doc("Version of the Hive metastore. Available options are " +
        s"0.12.0 through 2.3.3.")
    .stringConf
    .createWithDefault(builtinHiveVersion)

  // A fake config which is only here for backward compatibility reasons. This config has no effect
  // to Spark, just for reporting the builtin Hive version of Spark to existing applications that
  // already rely on this config.
  val FAKE_HIVE_VERSION = buildConf("spark.sql.hive.version")
    .doc(s"deprecated, please use ${HIVE_METASTORE_VERSION.key} to get the Hive version in Spark.")
    .stringConf
    .createWithDefault(builtinHiveVersion)

  val HIVE_METASTORE_JARS = buildConf("spark.sql.hive.metastore.jars")
    .doc(s"""
      | Location of the jars that should be used to instantiate the HiveMetastoreClient.
      | This property can be one of three options: "
      | 1. "builtin"
      |   Use Hive ${builtinHiveVersion}, which is bundled with the Spark assembly when
      |   -Phive is enabled. When this option is chosen,
      |   spark.sql.hive.metastore.version must be either
      |   ${builtinHiveVersion} or not defined.
      | 2. "maven"
      |   Use Hive jars of specified version downloaded from Maven repositories.
      | 3. A classpath in the standard format for both Hive and Hadoop.
      """.stripMargin)
    .stringConf
    .createWithDefault("builtin")

Based on actual requirements, the author adopted the second approach. The actual usage scenario: Oracle GoldenGate (GG) reads change logs from the upstream DB in real time and pushes them to Kafka; a stream-processing job saves the change logs to an HBase table in real time; the HBase table merges each day's operation logs to produce a T-1 snapshot table; Spark then reads the HBase table and syncs the data into the offline warehouse for offline analysis (HBase is mainly used to handle updates and deletes). A demo follows:

package cn.com.spark.hbase.hive

import java.net.URI
import java.util

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Result, Scan}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.spark.SparkConf
import org.apache.spark.sql.types.{DataTypes, StringType, StructField}
import org.apache.spark.sql.{RowFactory, SparkSession}
import org.slf4j.LoggerFactory

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

object HbaseToHive {

  val log = LoggerFactory.getLogger(HbaseToHive.getClass)

  //  private val hdfsPath = "/user/hive/warehouse/legend.db/test_log_hive_text"
  private val hdfsPath = "/test/test_log_hive"

  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf()
      .setAppName(s"${this.getClass.getSimpleName}")

    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sparkConf.set("spark.broadcast.compress", "true")
    sparkConf.set("spark.rdd.compress", "true")
    sparkConf.set("spark.hadoop.mapreduce.output.fileoutputformat.compress", "false")
    //    sparkConf.set("spark.io.compression.codec", "org.apache.spark.io.SnappyCompressionCodec")
    sparkConf.registerKryoClasses(Array(classOf[ImmutableBytesWritable]))


    val spark = SparkSession
      .builder()
      .config(sparkConf)
      .appName(s"${this.getClass.getSimpleName}")
      .enableHiveSupport()
      .getOrCreate()


    val conf = HBaseConfiguration.create()
    //    conf.set("hbase.zookeeper.quorum", "node1:2181,node2:2181,node3:2181")
    conf.set("hbase.zookeeper.quorum", "30.4.137.224:2181,30.4.137.228:2181,30.4.137.229:2181")
    conf.set(TableInputFormat.INPUT_TABLE, "test:test_log_hive")

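    // Full-table Scan, serialized to Base64 and put into the job conf as required by TableInputFormat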
    val scan = new Scan()
    val proto = ProtobufUtil.toScan(scan)
    conf.set(TableInputFormat.SCAN, Base64.encodeBytes(proto.toByteArray))

    val hBaseRDD = spark.sparkContext.newAPIHadoopRDD(
      conf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result])

    val list = new util.ArrayList[StructField]()
    val rowKey = DataTypes.createStructField("rowKey", StringType, true)
    val name = DataTypes.createStructField("name", StringType, true)
    val age = DataTypes.createStructField("age", StringType, true)
    val mobile = DataTypes.createStructField("mobile", StringType, true)
    val addr = DataTypes.createStructField("addr", StringType, true)
    list.add(rowKey)
    list.add(name)
    list.add(age)
    list.add(mobile)
    list.add(addr)

    val schema = DataTypes.createStructType(list)

    val mapHbaseRDD = hBaseRDD.map(x => {
      val result = x._2
      val rowKey = Bytes.toString(result.getRow)
      val name = result.getValue(Bytes.toBytes("info"), Bytes.toBytes("name"))
      val age = result.getValue(Bytes.toBytes("info"), Bytes.toBytes("age"))
      val mobile = result.getValue(Bytes.toBytes("info"), Bytes.toBytes("mobile"))
      val addr = result.getValue(Bytes.toBytes("info"), Bytes.toBytes("addr"))
      RowFactory.create(rowKey, Bytes.toString(name), Bytes.toString(age), Bytes.toString(mobile), Bytes.toString(addr))
    })

    val df = spark.createDataFrame(mapHbaseRDD, schema)

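    // Key each row by the partition column (name); RDDMultipleTextOutputFormat below uses the key
    // as the output sub-directory name, so one directory per distinct name value is produced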
    df.rdd.map(r => {
      (r.getString(1), r.mkString(","))
    }).repartition(3).saveAsHadoopFile(hdfsPath, classOf[String], classOf[String],
      classOf[RDDMultipleTextOutputFormat])

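    // List the first-level output directories (one per partition value) and build a
    // LOAD DATA statement for each of them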
    val dirs = getDirs(hdfsPath)
    val loadSql = dirs.map(dir => {
      val dirNames = dir.split("/")
      val dirName = dirNames(dirNames.length - 1)
      s"load data inpath '${dir}' overwrite into table legend.test_log_hive_text partition (name_par='${dirName}')"
    })

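    // Group the LOAD statements into batches of 30, join each batch with ';' and save the
    // result under hdfsPath + "/load_sql" so it can later be run with the hive CLI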
    val loadSqlMap = spliceList(loadSql.toList, 30)
    val loadSqlGroups = new ArrayBuffer[String]
    loadSqlMap.foreach(x => {
      loadSqlGroups += x._2.mkString(";") + System.lineSeparator()
    })

    spark.sparkContext.makeRDD(loadSqlGroups).repartition(1).saveAsTextFile(hdfsPath + "/" + "load_sql")

    //
    //    spark.sql("use legend")
    //    spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
    //    spark.sql("create table test_log_hive_text(rowKey STRING, name STRING,age STRING,mobile STRING,addr " +
    //      "STRING) partitioned by(name_par STRING) row format delimited fields terminated by ','")
    //
    //    for (dirPath <- dirs) {
    //      val dirNames = dirPath.split("/")
    //      val dirName = dirNames(dirNames.length - 1).split("=")(1)
    //      spark.sql(s"alter table legend.test_log_hive_text add partition (name_par='${dirName}')")
    //    }


    //    df.repartition(5)
    //    df.createTempView("result")
    //
//        spark.sql("use legend")
//        spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
//        spark.sql("insert into legend.test_log_hive partition(name_par) select rowKey,name,age,mobile,addr,name as name_par from result")

    //    df.write.mode(SaveMode.Overwrite).format("parquet").partitionBy("name").insertInto("test.test_log")


    //    spark.sql("use legend")
    //    spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
    //    spark.sql("load data inpath '/test/test_log_hive' OVERWRITE  INTO TABLE legend.test_log_hive_text PARTITION
    // " +
    //      "(create_day='2019-04-28') ")
    //    spark.sql("insert overwrite table legend.test_log_hive_orc PARTITION(name_par) select rowKey,name,age,
    // mobile," +
    //      "addr,name as name_par from test_log_hive_text where create_day='2019-04-28' ")

  }

  // List the first-level sub-directories under a path
  def getDirs(path: String): Array[String] = {
    getFilesAndDirs(path).filter(getHdfs(path).getFileStatus(_).isDirectory)
      .map(_.toString)
  }

  // List the first-level files and directories under a path
  def getFilesAndDirs(path: String): Array[Path] = {
    val fs = getHdfs(path).listStatus(new Path(path))
    FileUtil.stat2Paths(fs)
  }

  // Create a FileSystem for the given path
  def getHdfs(path: String): FileSystem = {
    val conf = new Configuration()
    FileSystem.get(URI.create(path), conf)
  }

  /**
   * Split a list into groups of at most splitSize elements.
   *
   * @param datas     the list to split
   * @param splitSize the maximum size of each group
   * @return a map from group index to group
   */
  def spliceList(datas: List[String], splitSize: Int): mutable.HashMap[String, List[String]] = {
    if (datas == null || splitSize < 1) return null
    val totalSize = datas.size
    val count = if (totalSize % splitSize == 0) totalSize / splitSize
    else totalSize / splitSize + 1
    val map = new mutable.HashMap[String, List[String]]()
    // Start at 0, otherwise the first splitSize elements would be dropped
    for (i <- 0 until count) {
      val cols = datas.slice(i * splitSize, if (i == count - 1) totalSize
      else splitSize * (i + 1))
      map(i.toString) = cols
    }
    map
  }
}

package cn.com.spark.hbase.hive

import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat

class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any] {
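  // Route each record to a sub-directory named after its key (the partition value);
  // `name` here is the default output file name such as part-00000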
  override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String = {
    //    ("name_par=" + key + "/" + name)
    (key + "/" + name)
  }

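  // Return null so that the key is not written into the output; only the value
  // (the comma-joined row) appears in the data files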
  override def generateActualKey(key: Any, value: Any): String = {
    null
  }
}

The demo's main job is to read the HBase data, save it to HDFS directories split by the value of the partition column, and finally load the data into the Hive table with a hive CLI script.
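
To actually run the generated statements, instead of only persisting them to HDFS one could also invoke the hive CLI directly from the driver, for example (a sketch only, assuming the hive command is available on the machine running the job and reusing loadSqlGroups from the demo above):

import scala.sys.process._

// Execute each batch of LOAD statements with the hive CLI (assumes `hive` is on the PATH)
loadSqlGroups.foreach { batch =>
  Seq("hive", "-e", batch).!
}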
