Spark: Getting Started with Reading and Writing the Hudi Data Lake

Preface

Development environment:

spark:3.2.1
hive:2.1
hudi:0.11.1
scala:2.12

Hudi table DDL (Hive external table)

CREATE EXTERNAL TABLE `tb1_trips_cow_w`(
  `_hoodie_commit_time` string,
  `_hoodie_commit_seqno` string,
  `_hoodie_record_key`   string,
  `_hoodie_partition_path` string,
  `_hoodie_file_name` string,
  begin_lat double,
  begin_lon double, 
  driver string,
  end_lat double, 
  end_lon double, 
  fare double,
  rider string, 
  ts bigint,
  uuid string,
  partitionpath string
  )
PARTITIONED BY (area string, county string, city string)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
  'org.apache.hudi.hadoop.HoodieParquetInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION 'hdfs://10.254.21.4:51000/user/hdpu/warehouse/kyuubi_hudi.db/tb1_trips_cow_w';
  
ALTER TABLE hudi.tb1_trips_cow_w ADD IF NOT EXISTS PARTITION (area='asia', county='india', city='chennai') LOCATION '/hudidatas/hudi-warehouse/tb1_trips_cow_w/asia/india/chennai';
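
Once the table and its partitions are registered in the metastore, a quick sanity check is to query it through Spark SQL with Hive support enabled. The sketch below is not part of the original job; the database name kyuubi_hudi is inferred from the warehouse path, and the metastore URI is taken from the article's environment, so adjust both for your cluster.

import org.apache.spark.sql.SparkSession

object QueryHudiViaHive {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("queryHudiViaHive")
      // Metastore URI from the article's environment (assumption: unchanged on your cluster)
      .config("hive.metastore.uris", "thrift://10.254.21.3:53083")
      // Let Hive's HoodieParquetInputFormat handle file-slice filtering
      // instead of Spark's native Parquet reader
      .config("spark.sql.hive.convertMetastoreParquet", "false")
      .enableHiveSupport()
      .getOrCreate()

    // Snapshot query against the Hive-registered Hudi table
    // (assumption: the table lives in the kyuubi_hudi database)
    spark.sql(
      """select _hoodie_commit_time, uuid, driver, rider, fare
        |from kyuubi_hudi.tb1_trips_cow_w
        |where area = 'asia'
        |limit 10""".stripMargin).show(truncate = false)

    spark.stop()
  }
}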

 

POM file



<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>hudi-test-scala</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <scala.version>2.12.10</scala.version>
        <scala.binary.version>2.12</scala.binary.version>
        <hadoop.version>2.7.7</hadoop.version>
        <spark.version>3.2.1</spark.version>
        <hoodie.version>0.11.1</hoodie.version>
    </properties>

    <dependencies>

        <dependency>
            <groupId>org.xerial.snappy</groupId>
            <artifactId>snappy-java</artifactId>
            <version>1.1.8.4</version>
        </dependency>

        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>

        <!-- Spark Core -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- Spark SQL -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- Structured Streaming + Kafka -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql-kafka-0-10_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <!-- Hadoop client -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <!-- Hudi Spark 3 bundle -->
        <dependency>
            <groupId>org.apache.hudi</groupId>
            <artifactId>hudi-spark3-bundle_2.12</artifactId>
            <version>${hoodie.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-avro_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <!-- Spark + Hive integration -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive-thriftserver_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpcore</artifactId>
            <version>4.4.13</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.12</version>
        </dependency>

        <!-- Hive JDBC and MySQL driver -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>3.1.3</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.49</version>
        </dependency>

    </dependencies>

    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>
        <plugins>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>

            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.0</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                        <configuration>
                            <args>
                                <arg>-dependencyfile</arg>
                                <arg>${project.build.directory}/.scala_dependencies</arg>
                            </args>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

Scala code

package com.hudi.spark.test


import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.QuickstartUtils._
import org.apache.hudi.config.HoodieWriteConfig._
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

import scala.collection.JavaConversions._


object HudiSparkTest {


  def main(args: Array[String]): Unit = {
    // Create the Spark SQL execution environment
    System.setProperty("HADOOP_USER_NAME", "hdpu")
    val conf = new SparkConf().setMaster("local[*]").setAppName("insertDatasToHudi")
    val spark = SparkSession.builder().config(conf)
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("hive.metastore.uris", "thrift://10.254.21.3:53083")
      .config("dfs.client.use.datanode.hostname", "true")
      .getOrCreate()

    // The serializer was set to Kryo above
    // Define variables: table name and data storage path
    val tableName: String = "tb1_trips_cow_w"
    val tablePath: String = "hdfs://10.254.21.4:51000/user/hdpu/warehouse/kyuubi_hudi.db/tb1_trips_cow_w"

    // Hudi's built-in quickstart data generator
    val generator = new DataGenerator()

    // Insert data
    insertData(spark, tableName, tablePath, "append", generator)

    // Snapshot query
//    queryData(spark, tablePath)

    // Query as of a point in time (time travel)
//     queryDataByTime(spark,tablePath)

    // Update data
//    updateData(spark,generator,tableName,tablePath)

    // Incremental query
//    incrementQueryData(spark,tablePath)

    // Stop the Spark session
    spark.stop()
  }

  /**
   * Insert data into the Hudi table
   *
   * @param spark
   * @param tableName
   * @param tablePath
   * @param savemode
   */
  def insertData(spark: SparkSession, tableName: String, tablePath: String, savemode: String, dataGen: DataGenerator): Unit = {
    // Import implicit conversions
    import spark.implicits._
    // Step 1: generate mock trip data
    //val generator: DataGenerator = new DataGenerator()
    val insertData = convertToStringList {
      dataGen.generateInserts(100)
    }
    val dataDF = spark.read.json(spark.sparkContext.parallelize(insertData, 2).toDS())
    // Step 2: write the data to the Hudi table
    dataDF.write
      .format("hudi")
      .mode(savemode)
      .option("hoodie.insert.shuffle.parallelism", "2")
      .option("hoodie.upsert.shuffle.parallelism", "2")
      // Hudi table write options
      .option(PRECOMBINE_FIELD.key(), "ts")
      .option(RECORDKEY_FIELD.key(), "uuid")
      .option(PARTITIONPATH_FIELD.key(), "partitionpath")
      .option(TBL_NAME.key(), tableName)
      .save(tablePath)
  }

  /**
   * Snapshot query
   *
   * @param spark
   * @param tablePath
   */
  def queryData(spark: SparkSession, tablePath: String): Unit = {
    // spark-shell
    val tripsSnapshotDF = spark.read
      .format("hudi")
      .load(tablePath)

    tripsSnapshotDF.createOrReplaceTempView("hudi_trips_snapshot")
    tripsSnapshotDF.printSchema()

    // Query via Spark SQL
    spark.sql("select * from  hudi_trips_snapshot where fare > 20.0").show()


    // Query trips with fare between 20 and 50 via the DataFrame API
    //    tripsSnapshotDF
    //      .filter($"fare" >= 20 && $"fare" <= 50)
    //      .select($"driver", $"rider", $"fare", $"begin_lat", $"begin_lon", $"partitionpath", $"_hoodie_commit_time")
    //      .orderBy($"fare".desc, $"_hoodie_commit_time".desc)
    //      .show(20, truncate = false)
  }

  /**
   * Time-travel query (read the table as of an instant)
   *
   * @param spark
   * @param tablepath
   */
  def queryDataByTime(spark: SparkSession, tablepath: String): Unit = {
    import org.apache.spark.sql.functions._
    // Option 1: timestamp string in yyyyMMddHHmmss format
//    val df1 = spark.read
//      .format("hudi")
//      .option("as.of.instant", "20220902092804")
//      .load(tablepath)
//      .sort(col("_hoodie_commit_time").desc)
//    df1.show(numRows = 5, truncate = false)

    // Option 2: timestamp string in yyyy-MM-dd HH:mm:ss format
//    val df2 = spark.read
//      .format("hudi")
//      .option("as.of.instant", "2022-09-02 09:28:04")
//      .load(tablepath)
//      .sort(col("_hoodie_commit_time").desc)
//    df2.show(numRows = 5, truncate = false)

    // Option 3: date string in yyyy-MM-dd format
    val df3 = spark.read
      .format("hudi")
      .option("as.of.instant", "2022-09-02")
      .load(tablepath)
    df3.show(numRows = 5, truncate = false)
  }

  /*
     Update Hudi data. The record keys being updated must already exist in the table,
     so this must reuse the same DataGenerator instance that produced the inserts.
   */
  def updateData(spark: SparkSession, dataGen: DataGenerator, tableName: String, tablePath: String): Unit = {
    import org.apache.hudi.QuickstartUtils._
    import spark.implicits._

    import scala.collection.JavaConverters._

    val updates = convertToStringList(dataGen.generateUpdates(100)) // note: generateUpdates, not generateInserts
    val updateDF = spark.read.json(spark.sparkContext.parallelize(updates.asScala, 2).toDS())

    import org.apache.hudi.DataSourceWriteOptions._
    import org.apache.hudi.config.HoodieWriteConfig._
    updateDF.write
      .mode("append")
      .format("hudi")
      .option("hoodie.insert.shuffle.parallelism", "2")
      .option("hoodie.upsert.shuffle.parallelism", "2")
      .option(PRECOMBINE_FIELD.key(), "ts")
      .option(RECORDKEY_FIELD.key(), "uuid")
      .option(PARTITIONPATH_FIELD.key(), "partitionpath")
      .option(TBL_NAME.key(), tableName)
      .save(tablePath)
  }

  /**
   * Incremental query
   *
   * @param spark
   */
  def incrementQueryData(spark: SparkSession, path: String): Unit = {
    import spark.implicits._
    spark.read
      .format("hudi")
      .load(path)
      .createOrReplaceTempView("view_temp_hudi_trips")
    val commits: Array[String] = spark
      .sql(
        """
          |select
          |  distinct(_hoodie_commit_time) as commitTime
          |from
          |  view_temp_hudi_trips
          |order by
          |  commitTime DESC
          |""".stripMargin
      )
      .map(row => row.getString(0))
      .take(50)
    val beginTime = commits(commits.length - 1) // commit time we are interested in
    println(s"beginTime = ${beginTime}")

    // b. Use that commit time as the lower bound for an incremental query
    val tripsIncrementalDF = spark.read
      .format("hudi")
      // Set the query type to incremental
      .option(QUERY_TYPE.key(), QUERY_TYPE_INCREMENTAL_OPT_VAL)
      // Begin instant time for the incremental read
      .option(BEGIN_INSTANTTIME.key(), beginTime)
      .load(path)

    // c. Register the incremental result as a temp view and query trips with fare > 20
    tripsIncrementalDF.createOrReplaceTempView("hudi_trips_incremental")
    spark
      .sql(
        """
          |select
          |  `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts
          |from
          |  hudi_trips_incremental
          |where
          |  fare > 20.0
          |""".stripMargin
      )
      .show(10, truncate = false)


    //      spark.read()
    //      .format("org.apache.hudi")
    //      .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
    //      .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), )
    //        .option(DataSourceReadOptions.INCR_PATH_GLOB_OPT_KEY(), "/year=2020/month=*/day=*") // Optional, use glob pattern if querying certain partitions
    //        .load(tablePath); // For incremental query, pass in the root/base path of table
    //
    //        hudiIncQueryDF.createOrReplaceTempView("hudi_trips_incremental")
    //        spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from  hudi_trips_incremental where fare > 20.0").show()

  }


}
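
Instead of registering the Hive table and its partitions by hand (the CREATE TABLE and ALTER TABLE statements above), Hudi can also sync the table to the Hive metastore at write time. The snippet below is a minimal sketch rather than part of the article's job: the option keys are the standard Hudi 0.11 hive-sync options, but the database name (kyuubi_hudi), the metastore URI, the partition fields, and the dataDF DataFrame (the same one built in insertData) are assumptions that should be checked against your own setup.

    // Sketch: write with Hive sync enabled (assumed database/URI, see lead-in above)
    dataDF.write
      .format("hudi")
      .mode("append")
      .option(PRECOMBINE_FIELD.key(), "ts")
      .option(RECORDKEY_FIELD.key(), "uuid")
      .option(PARTITIONPATH_FIELD.key(), "partitionpath")
      .option(TBL_NAME.key(), tableName)
      // Hive sync: create/update the Hive table and partitions on each commit
      .option("hoodie.datasource.hive_sync.enable", "true")
      .option("hoodie.datasource.hive_sync.mode", "hms")
      .option("hoodie.datasource.hive_sync.metastore.uris", "thrift://10.254.21.3:53083")
      .option("hoodie.datasource.hive_sync.database", "kyuubi_hudi")
      .option("hoodie.datasource.hive_sync.table", tableName)
      .option("hoodie.datasource.hive_sync.partition_fields", "area,county,city")
      .option("hoodie.datasource.hive_sync.partition_extractor_class",
        "org.apache.hudi.hive.MultiPartKeysValueExtractor")
      .save(tablePath)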

