Spark DataFrame替换column中值

 

 话不多说，直接上代码：


import java.util.UUID

import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.functions._

object TestReplaceAndFill {

  /**
   * Reads `test.test1` from MySQL over JDBC, fills nulls in columns A-D,
   * replaces the value "haha" with "hehe" in columns C and D, appends a
   * per-row UUID column, and overwrites the result into `test.test2`.
   */
  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession
      .builder()
      .master("local[*]")
      .appName("TestReplaceAndFill")
      .config("spark.sql.shuffle.partitions", "10")
      // NOTE(review): Hive support is not required for a pure JDBC job —
      // confirm whether it can be dropped in your environment.
      .enableHiveSupport()
      .getOrCreate()

    // Connection options shared by both the read and the write side,
    // so host/credentials are defined exactly once.
    val jdbcOptions = Map(
      "driver" -> "com.mysql.jdbc.Driver",
      "url" -> "jdbc:mysql://127.0.0.1:3306",
      "user" -> "root",
      "password" -> "root"
    )

    val resultDF = spark.read
      .format("jdbc")
      .options(jdbcOptions + ("dbtable" -> "test.test1"))
      .load()

    // UDF producing a fresh lowercase UUID (dashes stripped) per row.
    // Marked non-deterministic so the optimizer will not re-evaluate or
    // collapse it — re-evaluation could assign a different UUID to the
    // same row across stages or task retries.
    val generateUUID =
      udf(() => UUID.randomUUID().toString.replace("-", "").toLowerCase)
        .asNondeterministic()

    resultDF
      // Replace nulls in columns "A", "B", "C", "D" with per-column defaults.
      .na.fill(Map(
        "A" -> "A_unknown",
        "B" -> 1.0,
        "C" -> "C_unknown",
        "D" -> "D_unknown"
      ))
      // Replace the value "haha" with "hehe" in columns "C" and "D".
      .na.replace("C" :: "D" :: Nil, Map("haha" -> "hehe"))
      // Append a column named "UUID" filled by the UDF above.
      .withColumn("UUID", generateUUID())
      .write
      .mode(SaveMode.Overwrite) // drops and recreates test.test2
      .format("jdbc")
      .options(jdbcOptions + ("dbtable" -> "test.test2"))
      .save()

    spark.stop()
  }

}

表test1:

Spark DataFrame替换column中值_第1张图片

表test2:

Spark DataFrame替换column中值_第2张图片

你可能感兴趣的:(Spark)