In Spark SQL you often need to derive new columns from the existing data. This post walks through four ways to add a column to a DataFrame; choose whichever fits your use case.
package com.longi.bigdata.spark.etl
import com.longi.bigdata.spark.udf.DataFrameForCreatingNewColumnUDF
import com.longi.bigdata.spark.utils.SparkSessionCreate
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.functions._
/**
* Author: whn
* Date: 2019-12-17 13:39
* Version: 1.0
* Function: four ways to add a new column to a DataFrame
*/
object DataFrameForCreatingNewColumn {
case class TestCase(a: String, b: String, c: Int)
def main(args: Array[String]): Unit = {
val spark = SparkSessionCreate.getSparkSession("DataFrameForCreatingNewColumn", "local[6]")
spark.udf.register("add_col", DataFrameForCreatingNewColumnUDF.addCol _)
import spark.implicits._
val inputDF = spark.sparkContext.parallelize(Seq(
("WYC", "a", 81), ("WYC", "a", 77), ("WYC", "a", 81),
("WYC", "a", 66), ("WYC", "a", 99), ("WYC", "a", 22),
("WYC", "b", 92), ("WYC", "b", 92), ("WYC", "b", 92),
("CJJ", "b", 22), ("CJJ", "b", 33), ("CJJ", "b", 13),
("CJJ", "b", 66), ("CJJ", "b", 88), ("CJJ", "b", 11),
("CJJ", "b", 99), ("CJJ", "b", 99), ("CJJ", "b", 12),
("CJJ", "a", 66), ("CJJ", "a", 66), ("CJJ", "a", 33),
("CJJ", "a", 55), ("CJJ", "a", 99), ("CJJ", "a", 64),
("CJJ", "a", 66), ("CJJ", "a", 97), ("CJJ", "a", 66),
("WHN", "c", 32), ("WHN", "c", 26), ("WHN", "c", 84),
("WHN", "c", 88), ("WHN", "c", 88), ("WHN", "c", 34),
("WHN", "c", 88), ("WHN", "c", 77), ("WHN", "c", 88)
)).map(xs => TestCase(xs._1, xs._2, xs._3)).toDF("name", "class", "scores")
val res1 = createNewColumnByCreateDataFrameMethod(inputDF, spark)
// res1.show()
val res2 = createNewColumnByUDF(inputDF, spark)
// res2.show()
val res3 = createNewColumnBySQL(inputDF, spark)
// res3.show()
val res4 = createNewColumnForIndex(inputDF, spark)
res4.show()
}
// TODO Method 1: add a column via createDataFrame
def createNewColumnByCreateDataFrameMethod(inputDF: DataFrame, spark: SparkSession): DataFrame = {
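// Rebuild each Row with an extra pass/fail field derived from the scores column (index 2)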
val newRdd: RDD[Row] = inputDF.rdd.map((row: Row) => {
if (row.getInt(2) >= 60) Row(row.getString(0), row.getString(1), row.getInt(2), "pass")
else Row(row.getString(0), row.getString(1), row.getInt(2), "fail")
})
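// Extend the original schema with the new nullable string column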
val schema = inputDF.schema.add("newCol", StringType, nullable = true)
spark.createDataFrame(newRdd, schema)
}
// TODO Method 2: add a column with a UDF
// -> A UDF registered via spark.udf.register("fun_name", func) can be used in df.selectExpr("fun_name(colName)") to add a column
// -> A UDF created via val addColUDF: UserDefinedFunction = udf(addCol _) can be used with the withColumn method to add a column
def createNewColumnByUDF(inputDF: DataFrame, spark: SparkSession): DataFrame = {
// With withColumn, giving the new column the same name as an existing one replaces that column instead of adding a new one
val newDF = inputDF.withColumn("newCol", DataFrameForCreatingNewColumnUDF.addColUDF(col("scores")))
// Equivalent, using the UDF registered with spark.udf.register:
// val newDF2 = inputDF.selectExpr("name", "class", "scores", "add_col(scores) AS newCol")
newDF
}
// TODO Method 3: add a column with SQL
def createNewColumnBySQL(inputDF: DataFrame, spark: SparkSession): DataFrame = {
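// Register the DataFrame as a temporary view so it can be queried with spark.sql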
inputDF.createOrReplaceTempView("temp")
val newDF = spark.sql(
"""
|SELECT name, class, scores,
| case when scores >= 60 then 'pass'
| when scores < 60 then 'fail'
| else 'impossible'
| end AS newCol
|FROM temp
""".stripMargin)
newDF
}
// TODO Method 4: add a sequence-number (index) column
def createNewColumnForIndex(inputDF: DataFrame, spark: SparkSession): DataFrame = {
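// row_number() numbers the rows within each (name, class) partition, ordered by ascending scores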
val wd = Window.partitionBy("name", "class").orderBy("scores")
val newDF = inputDF.withColumn("index", row_number().over(wd))
// val newDF1 = inputDF.withColumn("index", monotonically_increasing_id()) // monotonically increasing ids: unique, but not consecutive
newDF
}
}
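The two helpers imported at the top, SparkSessionCreate and DataFrameForCreatingNewColumnUDF, are not part of the listing. Below is a minimal sketch of what they might look like: the names getSparkSession, addCol, and addColUDF are taken from the imports and call sites above, while the bodies are assumptions (addCol is assumed to implement the same >= 60 pass/fail rule used in methods 1 and 3).

// --- SparkSessionCreate.scala (assumed implementation) ---
package com.longi.bigdata.spark.utils

import org.apache.spark.sql.SparkSession

object SparkSessionCreate {
  // Build (or reuse) a SparkSession with the given app name and master URL
  def getSparkSession(appName: String, master: String): SparkSession =
    SparkSession.builder()
      .appName(appName)
      .master(master)
      .getOrCreate()
}

// --- DataFrameForCreatingNewColumnUDF.scala (assumed implementation) ---
package com.longi.bigdata.spark.udf

import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udf

object DataFrameForCreatingNewColumnUDF {
  // Plain Scala function, registered with spark.udf.register for use in selectExpr / SQL
  def addCol(scores: Int): String = if (scores >= 60) "pass" else "fail"

  // The same function wrapped with udf(...) so it can be used with withColumn
  val addColUDF: UserDefinedFunction = udf(addCol _)
}

With these two files in place, all four methods compile and produce the original columns name, class, scores plus the new column.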