iceberg1.4.2+spark3.4.2+minio

在 IDEA 里面编写 Iceberg 的建表与数据写入程序,逻辑本身并不复杂,但官网没有给出包含 jar 包依赖的完整示例,最大的坑是各组件版本不兼容。下面给出一个经过验证的完整例子(Iceberg 1.4.2 + Spark 3.4.2 + MinIO)。

pom.xml文件需要引入的包

 
      org.scala-lang
      scala-library
      ${scala.version}
    
    
      junit
      junit
      4.4
      test
    
    
      org.specs
      specs
      1.2.5
      test
    
    
    

    
      org.apache.iceberg
      iceberg-core
      1.4.2
    

    
      io.minio
      minio
      8.5.7
    
    
    
      com.amazonaws
      aws-java-sdk-s3
      1.12.620
    
    
      org.apache.hadoop
      hadoop-aws
      3.2.2
    
    
      org.apache.hadoop
      hadoop-common
      3.2.2
    


    
    
      org.apache.iceberg
      iceberg-data
      1.4.2
    
    
    org.apache.spark
    spark-core_2.12
    3.4.2 
  
    
      org.apache.spark
      spark-sql_2.12
      3.4.2 
    
    
      org.apache.spark
      spark-streaming_2.12
      3.4.2 
    
    
    
      org.apache.iceberg
      iceberg-spark
      1.4.2
    
      
      
          org.apache.iceberg
          iceberg-spark-runtime-3.4_2.12
          1.4.2
      
    
      com.fasterxml.jackson.core
      jackson-databind
      2.14.2

    
    
      org.apache.iceberg
      iceberg-data
      1.4.2
    
    
    
      org.apache.hadoop
      hadoop-aws
      3.2.2
    
    
      org.apache.iceberg
      iceberg-aws
      1.4.2
    
    
      com.amazonaws
      aws-java-sdk-bundle
      1.11.375
    
    
      org.apache.iceberg
      iceberg-parquet
      1.4.2
    

以下示例给出完整代码:

def main(args: Array[String]): Unit = {
    // Build a local SparkSession wired to MinIO (S3-compatible) via the S3A
    // filesystem, with an Iceberg "hadoop" catalog named hadoop_prod whose
    // warehouse lives in the s3a://test1/ bucket.
    val spark: SparkSession = SparkSession.builder().master("local").appName("test")
      // --- S3A / MinIO connection settings (picked up by hadoop-aws) ---
      .config("spark.hadoop.fs.s3a.access.key", "minioadmin")
      .config("spark.hadoop.fs.s3a.secret.key", "minioadmin")
      // BUG FIX: the key was "spark.hadoop.spark.hadoop.fs.s3a.endpoint"
      // (prefix duplicated), so the endpoint was silently ignored and S3A
      // tried to reach real AWS instead of the local MinIO server.
      .config("spark.hadoop.fs.s3a.endpoint", "http://127.0.0.1:9000")
      .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
      // MinIO does not support virtual-host-style bucket addressing.
      .config("spark.hadoop.fs.s3a.path.style.access", "true")
      .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
      .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
      // --- Iceberg hadoop catalog named hadoop_prod ---
      .config("spark.sql.catalog.hadoop_prod", "org.apache.iceberg.spark.SparkCatalog")
      .config("spark.sql.catalog.hadoop_prod.type", "hadoop")
      // Per-catalog Hadoop overrides (prefix "hadoop." is stripped by Iceberg).
      .config("spark.sql.catalog.hadoop_prod.hadoop.fs.s3a.access.key", "minioadmin")
      .config("spark.sql.catalog.hadoop_prod.hadoop.fs.s3a.secret.key", "minioadmin")
      .config("spark.sql.catalog.hadoop_prod.hadoop.fs.s3a.endpoint", "http://127.0.0.1:9000")
      .config("spark.sql.catalog.hadoop_prod.warehouse", "s3a://test1/")
      // Enables Iceberg SQL extensions (MERGE INTO, time travel syntax, ...).
      .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
      .getOrCreate()

    // 1. Create the Iceberg table (run once), then insert some rows.
    //spark.sql("create table hadoop_prod.mydb.mytest (id int,name string,age int) using iceberg".stripMargin)

    spark.sql(
      """
        |insert into hadoop_prod.mydb.mytest values (1,"zs",18),(2,"ls",19),(3,"ww",20)
      """.stripMargin)

    // 2. SQL read: time travel to a specific snapshot id.
    // spark.sql("select * from hadoop_prod.mydb.mytest").show()
    // NOTE: the trailing ';' was removed — SparkSession.sql() rejects it with
    // a ParseException. Replace the snapshot id with one from your own table
    // (see the snapshots metadata table below).
    spark.sql(
      """
        |select * from hadoop_prod.mydb.mytest VERSION AS OF 4696493712637386339
      """.stripMargin).show()

    /**
      * 3. Besides SQL, the table can be queried through the DataFrame API.
      *    SQL is the recommended way; DataFrame access is shown for reference.
      */
    // Option A: read Iceberg metadata tables (snapshots, history, manifests, files).
    val frame1: DataFrame = spark.table("hadoop_prod.mydb.mytest.snapshots")
    frame1.show()
    val frame2: DataFrame = spark.table("hadoop_prod.mydb.mytest.history")
    frame2.show()
    // spark.read.option("snapshot-id","4696493712637386339").format("iceberg").load("s3a://test/mydb/mytest")
    // Option B: load the table data itself with the DataFrame reader.
    val frame3: DataFrame = spark.read.format("iceberg").load("hadoop_prod.mydb.mytest")
    frame3.show()

    spark.stop()
}

你可能感兴趣的:(spark)