在《spark sql 写入hive较慢原因分析》中已经分析了spark sql 写入hive分区文件慢的原因,笔者提供几种优化思路供参考:
(1)spark 直接生成hive库表底层分区文件,然后再使用add partition语句添加分区信息
spark.sql(s"alter table legend.test_log_hive_text add partition (name_par='${dirName}')")
(2)spark 生成文件存放到HDFS目录下,使用hive脚本命令,load数据到hive中
hive -e "load data inpath '/test/test_log_hive/name_par=test$i' overwrite into table legend.test_log_hive_text partition(name_par='test$i') "
(3)修改spark配置文件,指定hive metastore版本及jar所在位置。查看spark源码可以看到,spark支持的hive metastore版本在0.12.0到2.3.3之间,需要修改spark.sql.hive.metastore.version及spark.sql.hive.metastore.jars这两个参数,相关源码如下:
private[spark] object HiveUtils extends Logging {
def withHiveExternalCatalog(sc: SparkContext): SparkContext = {
sc.conf.set(CATALOG_IMPLEMENTATION.key, "hive")
sc
}
/** The version of hive used internally by Spark SQL. */
val builtinHiveVersion: String = "1.2.1"
val HIVE_METASTORE_VERSION = buildConf("spark.sql.hive.metastore.version")
.doc("Version of the Hive metastore. Available options are " +
s"0.12.0 through 2.3.3.")
.stringConf
.createWithDefault(builtinHiveVersion)
// A fake config which is only here for backward compatibility reasons. This config has no effect
// to Spark, just for reporting the builtin Hive version of Spark to existing applications that
// already rely on this config.
val FAKE_HIVE_VERSION = buildConf("spark.sql.hive.version")
.doc(s"deprecated, please use ${HIVE_METASTORE_VERSION.key} to get the Hive version in Spark.")
.stringConf
.createWithDefault(builtinHiveVersion)
val HIVE_METASTORE_JARS = buildConf("spark.sql.hive.metastore.jars")
.doc(s"""
| Location of the jars that should be used to instantiate the HiveMetastoreClient.
| This property can be one of three options: "
| 1. "builtin"
| Use Hive ${builtinHiveVersion}, which is bundled with the Spark assembly when
| -Phive is enabled. When this option is chosen,
| spark.sql.hive.metastore.version must be either
| ${builtinHiveVersion} or not defined.
| 2. "maven"
| Use Hive jars of specified version downloaded from Maven repositories.
| 3. A classpath in the standard format for both Hive and Hadoop.
""".stripMargin)
.stringConf
.createWithDefault("builtin")
笔者根据自己的实际需求采用的是第二种方法。笔者的实际使用场景:Oracle GG实时读取上游DB日志数据并推送到kafka,流处理程序实时保存变化日志到hbase表中,hbase表每天合并操作日志生成T-1日切片表,再使用spark读取hbase表数据,同步到离线库中供离线分析使用(主要是借用hbase完成数据的更新、删除)。以下是demo:
package cn.com.spark.hbase.hive
import java.net.URI
import java.util
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Result, Scan}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.spark.SparkConf
import org.apache.spark.sql.types.{DataTypes, StringType, StructField}
import org.apache.spark.sql.{RowFactory, SparkSession}
import org.slf4j.LoggerFactory
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
object HbaseToHive {
val log = LoggerFactory.getLogger(HbaseToHive.getClass)
// private val hdfsPath = "/user/hive/warehouse/legend.db/test_log_hive_text"
private val hdfsPath = "/test/test_log_hive"
/**
 * Entry point of the demo job.
 *
 * Steps performed:
 *  1. Scans the HBase table `test:test_log_hive` through `TableInputFormat`.
 *  2. Converts each HBase `Result` into a row (rowKey, name, age, mobile, addr), all strings.
 *  3. Writes the rows as comma-separated text under `hdfsPath`, keyed by the `name` column so
 *     that `RDDMultipleTextOutputFormat` places every distinct name in its own sub-directory.
 *  4. Generates one Hive `load data inpath ...` statement per sub-directory, groups the
 *     statements in batches of 30 and saves them under `hdfsPath/load_sql`.
 *
 * The job only writes the load statements; it does not execute them against Hive.
 *
 * @param args command-line arguments (not used)
 */
def main(args: Array[String]): Unit = {
  val appName = s"${this.getClass.getSimpleName}"

  val sparkConf = new SparkConf().setAppName(appName)
  sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  sparkConf.set("spark.broadcast.compress", "true")
  sparkConf.set("spark.rdd.compress", "true")
  sparkConf.set("spark.hadoop.mapreduce.output.fileoutputformat.compress", "false")
  // sparkConf.set("spark.io.compression.codec", "org.apache.spark.io.SnappyCompressionCodec")
  sparkConf.registerKryoClasses(Array(classOf[ImmutableBytesWritable]))

  val spark = SparkSession
    .builder()
    .config(sparkConf)
    .appName(appName)
    .enableHiveSupport()
    .getOrCreate()

  // Always release the session, including when any step below throws.
  try {
    // HBase scan configuration handed to TableInputFormat.
    val conf = HBaseConfiguration.create()
    // conf.set("hbase.zookeeper.quorum", "node1:2181,node2:2181,node3:2181")
    conf.set("hbase.zookeeper.quorum", "30.4.137.224:2181,30.4.137.228:2181,30.4.137.229:2181")
    conf.set(TableInputFormat.INPUT_TABLE, "test:test_log_hive")
    // TableInputFormat expects the Scan serialized as a Base64-encoded protobuf string.
    val scan = new Scan()
    conf.set(TableInputFormat.SCAN, Base64.encodeBytes(ProtobufUtil.toScan(scan).toByteArray))

    val hBaseRDD = spark.sparkContext.newAPIHadoopRDD(
      conf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result])

    // Every column is exported as a nullable string; the order must match the row built below.
    val fields = Array("rowKey", "name", "age", "mobile", "addr")
      .map(column => DataTypes.createStructField(column, StringType, true))
    val schema = DataTypes.createStructType(fields)

    val mapHbaseRDD = hBaseRDD.map { case (_, result) =>
      // Reads one qualifier of the "info" column family as a string.
      def cell(qualifier: String): String =
        Bytes.toString(result.getValue(Bytes.toBytes("info"), Bytes.toBytes(qualifier)))

      RowFactory.create(Bytes.toString(result.getRow), cell("name"), cell("age"), cell("mobile"), cell("addr"))
    }
    val df = spark.createDataFrame(mapHbaseRDD, schema)

    // Key = name (column index 1), value = the whole row as CSV. The key selects the output
    // sub-directory via RDDMultipleTextOutputFormat.
    // NOTE(review): a row without info:name presumably yields a null key and therefore a
    // directory literally named "null" - confirm whether such rows can occur.
    df.rdd
      .map(r => (r.getString(1), r.mkString(",")))
      .repartition(3)
      .saveAsHadoopFile(hdfsPath, classOf[String], classOf[String],
        classOf[RDDMultipleTextOutputFormat])

    // One load statement per first-level directory; the directory name is the partition value.
    val loadSql = getDirs(hdfsPath).map { dir =>
      val dirName = dir.split("/").last
      s"load data inpath '${dir}' overwrite into table legend.test_log_hive_text partition (name_par='${dirName}')"
    }

    // Batch the statements (30 per output line, joined by ';').
    val loadSqlGroups = spliceList(loadSql.toList, 30)
      .values
      .map(group => group.mkString(";") + System.lineSeparator())
      .toList
    spark.sparkContext.makeRDD(loadSqlGroups).repartition(1).saveAsTextFile(hdfsPath + "/" + "load_sql")

    // ---- Alternative 1 (disabled): write straight into the table location, then register
    // ---- each partition with "alter table ... add partition".
    // spark.sql("use legend")
    // spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
    // spark.sql("create table test_log_hive_text(rowKey STRING, name STRING,age STRING,mobile STRING,addr " +
    // "STRING) partitioned by(name_par STRING) row format delimited fields terminated by ','")
    //
    // for (dirPath <- dirs) {
    // val dirNames = dirPath.split("/")
    // val dirName = dirNames(dirNames.length - 1).split("=")(1)
    // spark.sql(s"alter table legend.test_log_hive_text add partition (name_par='${dirName}')")
    // }

    // ---- Alternative 2 (disabled): dynamic-partition insert through Spark SQL.
    // df.repartition(5)
    // df.createTempView("result")
    //
    // spark.sql("use legend")
    // spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
    // spark.sql("insert into legend.test_log_hive partition(name_par) select rowKey,name,age,mobile,addr,name as name_par from result")
    // df.write.mode(SaveMode.Overwrite).format("parquet").partitionBy("name").insertInto("test.test_log")

    // ---- Alternative 3 (disabled): load into a text table, then insert-overwrite into ORC.
    // spark.sql("use legend")
    // spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
    // spark.sql("load data inpath '/test/test_log_hive' OVERWRITE INTO TABLE legend.test_log_hive_text PARTITION " +
    // "(create_day='2019-04-28') ")
    // spark.sql("insert overwrite table legend.test_log_hive_orc PARTITION(name_par) select rowKey,name,age,mobile," +
    // "addr,name as name_par from test_log_hive_text where create_day='2019-04-28' ")
  } finally {
    spark.stop()
  }
}
/**
 * Lists the first-level sub-directories of `path`.
 *
 * The FileSystem is resolved once and the `FileStatus` entries returned by `listStatus`
 * are reused, instead of resolving the FileSystem and calling `getFileStatus` again for
 * every entry.
 *
 * @param path directory to inspect
 * @return the string form of each direct child of `path` that is a directory
 */
def getDirs(path: String): Array[String] = {
  val fs = getHdfs(path)
  fs.listStatus(new Path(path)).collect {
    case status if status.isDirectory => status.getPath.toString
  }
}
/**
 * Lists the first-level entries (files and directories) of `path`.
 *
 * @param path directory to inspect
 * @return the paths of all direct children of `path`
 */
def getFilesAndDirs(path: String): Array[Path] =
  FileUtil.stat2Paths(getHdfs(path).listStatus(new Path(path)))
/**
 * Obtains the FileSystem responsible for `path`, using a freshly created
 * Hadoop `Configuration`.
 *
 * @param path path (or URI string) whose file system is wanted
 * @return the FileSystem returned by `FileSystem.get` for that URI
 */
def getHdfs(path: String): FileSystem = {
  val target = URI.create(path)
  val hadoopConf = new Configuration()
  FileSystem.get(target, hadoopConf)
}
/**
 * Splits a list into consecutive chunks of at most `splitSize` elements.
 *
 * Chunk `i` (0-based) holds `datas.slice(i * splitSize, (i + 1) * splitSize)` and is stored
 * under the key `i.toString`. The previous implementation started its loop at 1 and thus
 * silently dropped the first chunk (and returned nothing at all when the list fitted into
 * a single chunk).
 *
 * @param datas     elements to split; `null` is treated as empty
 * @param splitSize maximum number of elements per chunk; values below 1 yield an empty map
 * @return a map from chunk index (as string) to the chunk; empty for invalid or empty input
 */
def spliceList(datas: List[String], splitSize: Int): mutable.HashMap[String, List[String]] = {
  val chunks = new mutable.HashMap[String, List[String]]()
  if (datas != null && splitSize >= 1) {
    datas.grouped(splitSize).zipWithIndex.foreach { case (chunk, index) =>
      chunks(index.toString) = chunk
    }
  }
  chunks
}
}
package cn.com.spark.hbase.hive
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
/**
 * Text output format that routes each record into a sub-directory named after its key
 * and writes only the value to the file.
 */
class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any] {

  /**
   * Builds the relative output file name for a record: `<key>/<name>`.
   *
   * @param key   record key, used as the directory name
   * @param value record value (not used)
   * @param name  leaf file name passed in by the framework
   */
  override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
    // Variant with a Hive-style partition directory: s"name_par=$key/$name"
    s"$key/$name"

  /** Returns null as the key to write, so the key is not emitted alongside the value. */
  override def generateActualKey(key: Any, value: Any): String = null
}
demo主要功能是读取hbase数据并按照分区字段值,分别保存到hdfs目录上,最后使用hive命令脚本load数据到hive表中