There are two ways to sync to Hive:
1. with the run_sync_tool.sh script;
2. by enabling Hive sync in code when writing through the Spark DataSource.
The code-based approach, shown first, does sync the metadata in the end, but it throws an exception.
// Imports below assume the Hudi 0.6.x package layout; adjust them to your Hudi version
import org.apache.hudi.DataSourceWriteOptions
import org.apache.hudi.config.{HoodieIndexConfig, HoodieWriteConfig}
import org.apache.hudi.hive.{MultiPartKeysValueExtractor, NonPartitionedExtractor}
import org.apache.hudi.index.HoodieIndex
import org.apache.hudi.keygen.{NonpartitionedKeyGenerator, SimpleKeyGenerator}
import org.apache.spark.sql.{SaveMode, SparkSession}

val spark = SparkSession
  .builder()
  .config(sparkConf)
  .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .getOrCreate()

val insertData = spark.read.parquet(inputDir)

// Options shared by partitioned and non-partitioned tables
// (sparkConf, inputDir, allHudiConfig, databaseName, tableName, flag, exists and
// writePath are defined elsewhere in the application)
val data = insertData.write.format("org.apache.hudi")
  // COPY_ON_WRITE or MERGE_ON_READ
  .option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY, allHudiConfig("hoodie.datasource.write.storage.type"))
  // record key (primary key) column
  .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, allHudiConfig("hoodie.datasource.write.recordkey.field"))
  // column holding the record update time (pre-combine field)
  .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, allHudiConfig("hoodie.datasource.write.precombine.field"))
  // payload class implementing the merge logic
  .option(DataSourceWriteOptions.PAYLOAD_CLASS_OPT_KEY, allHudiConfig("hoodie.datasource.write.payload.class"))
  // enable Hive sync
  .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY, "true")
  // Hive database
  .option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY, databaseName)
  // Hive table
  .option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY, tableName)
  // HiveServer2 JDBC URL; my environment uses Kerberos, so the principal has to be appended
  .option(DataSourceWriteOptions.HIVE_URL_OPT_KEY, "jdbc:hive2://master:10000/xh;principal=hive/[email protected]")

// 1. Check whether the current table is partitioned
if ("true".equals(flag)) {
  // partitioned table
  data
    // partition path field of the Hudi table
    .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "year")
    // move a record to the new partition if its partition value changes on update
    .option(HoodieIndexConfig.BLOOM_INDEX_UPDATE_PARTITION_PATH, "true")
    // use a global index
    .option(HoodieIndexConfig.INDEX_TYPE_PROP, HoodieIndex.IndexType.GLOBAL_BLOOM.name())
    // key generator for the Hudi record key
    .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY, classOf[SimpleKeyGenerator].getName)
    // partition column name of the Hive table
    .option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY, "dt")
    // hive_sync.partition_extractor_class
    .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY, classOf[MultiPartKeysValueExtractor].getName)
} else {
  // note: partitioned and non-partitioned tables need different key generators
  data.option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY, classOf[NonpartitionedKeyGenerator].getName)
    // hive_sync.partition_extractor_class
    .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY, classOf[NonPartitionedExtractor].getName)
}

data
  // shuffle parallelism
  .option("hoodie.insert.shuffle.parallelism", allHudiConfig("hoodie.insert.shuffle.parallelism"))
  .option("hoodie.upsert.shuffle.parallelism", allHudiConfig("hoodie.insert.shuffle.parallelism"))
  // table name
  .option(HoodieWriteConfig.TABLE_NAME, tableName)
  .mode(if (exists) SaveMode.Append else SaveMode.Overwrite)
  // base path of the Hudi table
  .save(writePath)
The code above does sync the Hive table, and the resulting data is correct, but it throws the message below (a quick beeline check is sketched after the log). A later, more detailed post describes the code-based sync in depth; see the follow-up post "hudi hive sync using code".
36516 [main] ERROR org.apache.hudi.hive.HiveSyncTool - Got runtime exception when hive syncing
org.apache.hudi.hive.HoodieHiveSyncException: Failed to get update last commit time synced to 20200918091211
at org.apache.hudi.hive.HoodieHiveClient.updateLastCommitTimeSynced(HoodieHiveClient.java:658)
at org.apache.hudi.hive.HiveSyncTool.syncHoodieTable(HiveSyncTool.java:128)
at org.apache.hudi.hive.HiveSyncTool.syncHoodieTable(HiveSyncTool.java:87)
at org.apache.hudi.HoodieSparkSqlWriter$.syncHive(HoodieSparkSqlWriter.scala:229)
at org.apache.hudi.HoodieSparkSqlWriter$.checkWriteStatus(HoodieSparkSqlWriter.scala:279)
at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:184)
at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:91)
at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:676)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:285)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:229)
at com.clb.utils.HoodieImportHelper$.hudiUpsert(HoodieImportHelper.scala:159)
at com.clb.HoodieImportHandler$$anonfun$importDataToHudi$1.apply$mcV$sp(HoodieImportHandler.scala:103)
at scala.util.control.Breaks.breakable(Breaks.scala:38)
at com.clb.HoodieImportHandler$.importDataToHudi(HoodieImportHandler.scala:65)
at com.clb.HoodieImportHandlerTest.testImportDataToHudiPartition(HoodieImportHandlerTest.scala:48)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:44)
at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:15)
at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:41)
at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:20)
at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:76)
at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:50)
at org.junit.runners.ParentRunner$3.run(ParentRunner.java:193)
at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:52)
at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:191)
at org.junit.runners.ParentRunner.access$000(ParentRunner.java:42)
at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:184)
at org.junit.runners.ParentRunner.run(ParentRunner.java:236)
at org.junit.runner.JUnitCore.run(JUnitCore.java:157)
at com.intellij.junit4.JUnit4IdeaTestRunner.startRunnerWithArgs(JUnit4IdeaTestRunner.java:68)
at com.intellij.rt.execution.junit.IdeaTestRunner$Repeater.startRunnerWithArgs(IdeaTestRunner.java:47)
at com.intellij.rt.execution.junit.JUnitStarter.prepareStreamsAndStart(JUnitStarter.java:242)
at com.intellij.rt.execution.junit.JUnitStarter.main(JUnitStarter.java:70)
Caused by: NoSuchObjectException(message:hid0101_cache_xdcs_pacs_hj.merge_test13 table not found)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.get_table_core(HiveMetaStore.java:1808)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.get_table(HiveMetaStore.java:1778)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.hive.metastore.RetryingHMSHandler.invoke(RetryingHMSHandler.java:107)
at com.sun.proxy.$Proxy39.get_table(Unknown Source)
at org.apache.hadoop.hive.metastore.HiveMetaStoreClient.getTable(HiveMetaStoreClient.java:1208)
at org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient.getTable(SessionHiveMetaStoreClient.java:131)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.invoke(RetryingMetaStoreClient.java:156)
at com.sun.proxy.$Proxy40.getTable(Unknown Source)
at org.apache.hudi.hive.HoodieHiveClient.updateLastCommitTimeSynced(HoodieHiveClient.java:654)
... 53 more
36551 [main] INFO org.apache.hadoop.hive.metastore.HiveMetaStore - 0: Shutting down the object store...
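Despite the exception, the Hive table and its metadata are created and the data is queryable. A minimal check, as a sketch only: xh and test are placeholder database/table names borrowed from the run_sync_tool.sh examples further down, and the JDBC URL is the Kerberized one used above.

# placeholder names; substitute the database/table your job actually wrote to
beeline -u 'jdbc:hive2://master:10000/xh;principal=hive/[email protected]' -e 'show create table xh.test'
beeline -u 'jdbc:hive2://master:10000/xh;principal=hive/[email protected]' -e 'select * from xh.test limit 10'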
Download the Hudi source code and build it:
git clone https://github.com/apache/hudi.git && cd hudi
mvn clean package -DskipTests -DskipITs
After the build finishes, locate the script:
cd hudi-hive-sync/
# the run_sync_tool.sh script lives here
Because my local Hadoop environment is CDH-6.2.0, run_sync_tool.sh does not work for me as-is, so I modified it.
Add or change the following configuration in the script:
export HADOOP_HOME=/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/lib/hadoop
export HIVE_HOME=/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/lib/hive
# around line 55 of the script; the original line is:
#HADOOP_HIVE_JARS=${HIVE_JARS}:${HADOOP_HOME}/share/hadoop/common/*:${HADOOP_HOME}/share/hadoop/mapreduce/*:${HADOOP_HOME}/share/hadoop/hdfs/*:${HADOOP_HOME}/share/hadoop/common/lib/*:${HADOOP_HOME}/share/hadoop/hdfs/lib/*
HADOOP_HIVE_JARS=${HIVE_JARS}:/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/lib/hadoop/client/*
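For context, HADOOP_HIVE_JARS matters only because the script puts it on the classpath of the HiveSyncTool JVM it launches at the end. Paraphrased (the exact line and variable order differ between Hudi versions, so check your copy of the script), the tail of run_sync_tool.sh does something like:

# paraphrased from the end of run_sync_tool.sh; names/order may differ in your version
java -cp ${HADOOP_HIVE_JARS}:${HADOOP_CONF_DIR}:${HUDI_HIVE_UBER_JAR} org.apache.hudi.hive.HiveSyncTool "$@"

so the CDH client jars (and, later, libfb303) have to be reachable through that variable.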
With this configuration in place, running the shell script fails with:
Exception in thread "main" java.lang.NoClassDefFoundError: com/facebook/fb303/FacebookService$Iface
at java.lang.ClassLoader.defineClass1(Native Method)
at java.lang.ClassLoader.defineClass(ClassLoader.java:763)
at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142)
at java.net.URLClassLoader.defineClass(URLClassLoader.java:467)
at java.net.URLClassLoader.access$100(URLClassLoader.java:73)
at java.net.URLClassLoader$1.run(URLClassLoader.java:368)
at java.net.URLClassLoader$1.run(URLClassLoader.java:362)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:361)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.hadoop.hive.metastore.MetaStoreUtils.getClass(MetaStoreUtils.java:1739)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:128)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:101)
at org.apache.hadoop.hive.ql.metadata.Hive.createMetaStoreClient(Hive.java:3815)
at org.apache.hadoop.hive.ql.metadata.Hive.getMSC(Hive.java:3867)
at org.apache.hadoop.hive.ql.metadata.Hive.getMSC(Hive.java:3847)
at org.apache.hadoop.hive.ql.metadata.Hive.getAllFunctions(Hive.java:4101)
at org.apache.hadoop.hive.ql.metadata.Hive.reloadFunctions(Hive.java:254)
at org.apache.hadoop.hive.ql.metadata.Hive.registerAllFunctionsOnce(Hive.java:237)
at org.apache.hadoop.hive.ql.metadata.Hive.<init>(Hive.java:394)
at org.apache.hadoop.hive.ql.metadata.Hive.create(Hive.java:338)
at org.apache.hadoop.hive.ql.metadata.Hive.getInternal(Hive.java:318)
at org.apache.hadoop.hive.ql.metadata.Hive.get(Hive.java:294)
at org.apache.hudi.hive.HoodieHiveClient.<init>(HoodieHiveClient.java:105)
at org.apache.hudi.hive.HiveSyncTool.<init>(HiveSyncTool.java:65)
at org.apache.hudi.hive.HiveSyncTool.main(HiveSyncTool.java:207)
Caused by: java.lang.ClassNotFoundException: com.facebook.fb303.FacebookService$Iface
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 30 more
The cause is that libfb303-0.9.3.jar is missing from the classpath.
Search for it locally: find / -name libfb303-0.9.3.jar
If it exists, copy it into a dedicated directory:
cp xxx/xxx/xx/libfb303-0.9.3.jar /opt/lib
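If find turns up nothing, the jar is also published on Maven Central under the coordinates org.apache.thrift:libfb303:0.9.3, so (assuming the host has outbound network access) it can be downloaded straight into the same directory:

# pull libfb303 0.9.3 from Maven Central into /opt/lib (check the version against your Hive build)
wget -P /opt/lib https://repo1.maven.org/maven2/org/apache/thrift/libfb303/0.9.3/libfb303-0.9.3.jar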
# then update run_sync_tool.sh again
HADOOP_HIVE_JARS=${HIVE_JARS}:/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/lib/hadoop/client/*:/opt/lib/*
Run the script.
Partitioned table sync:
# --base-path                  Hudi table path on HDFS
# --database / --table         Hive database and table name
# --partitioned-by             Hive partition column
# --jdbc-url                   HiveServer2 JDBC URL (Kerberos principal appended in my environment)
# --partition-value-extractor  hive_sync.partition_extractor_class
# --user / --pass              Hive user and password
sh /opt/hudi/hudi-hive-sync/run_sync_tool.sh \
  --base-path /test/sys001/xh/partition \
  --database xh \
  --table test \
  --partitioned-by dt \
  --jdbc-url 'jdbc:hive2://master:10000/xh;principal=hive/[email protected]' \
  --partition-value-extractor org.apache.hudi.hive.MultiPartKeysValueExtractor \
  --user hive \
  --pass hive
Non-partitioned table sync (same flags as above, minus --partitioned-by, and with the NonPartitionedExtractor):
sh /opt/hudi/hudi-hive-sync/run_sync_tool.sh \
  --base-path /test/sys001/xh/partition \
  --database xh \
  --table test2 \
  --jdbc-url 'jdbc:hive2://master:10000/xh;principal=hive/[email protected]' \
  --partition-value-extractor org.apache.hudi.hive.NonPartitionedExtractor \
  --user hive \
  --pass hive
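Once the tool exits successfully, the result can be checked over the same JDBC URL; a sketch, again using the placeholder names from the commands above (xh.test partitioned, xh.test2 non-partitioned):

# verify the partitions were registered and the non-partitioned table was created
beeline -u 'jdbc:hive2://master:10000/xh;principal=hive/[email protected]' -e 'show partitions xh.test'
beeline -u 'jdbc:hive2://master:10000/xh;principal=hive/[email protected]' -e 'describe formatted xh.test2'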