spark自定义注册函数

自定义函数的原因

因为在一些情况下,Spark SQL 里面自带的一些函数可能满足不了需求,而一些操作可能又需要多次去执行,比如对矿井下测出来的一些数据需要进行分离解析,得出具体的字段来存放到表里。如果频繁地使用 Spark 的 API 进行多次操作,代码也会很繁琐,这时候就可以考虑通过自定义注册函数来解析数据,下面是简单的代码操作。

 import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
 import org.apache.spark.sql.{Row, SparkSession}
 import org.apache.spark.sql.functions.udf
 object DataFramewithColumnApp {

   /**
    * Entry point: builds a local SparkSession, runs the withColumn demo,
    * then stops the session.
    */
   def main(args: Array[String]): Unit = {
     val sparkSession = SparkSession.builder()
       .appName("DataFramewithColumnApp")
       .master("local[2]")
       .getOrCreate()
     // functionApp(sparkSession) // alternative demo: SQL + registered UDF
     withColumu(sparkSession)
     sparkSession.stop()
   }

   /**
    * Demo 1: register a UDF and call it from Spark SQL to add derived fields.
    *
    * Loads tab-separated lines ("name\tdate\tnumber") from a local file,
    * builds a DataFrame of (name, number), registers the "parsename" UDF and
    * queries it through a temp view so the parsed tokens appear as columns.
    */
   def functionApp(sparkSession: SparkSession): Unit = {
     // NOTE(review): hard-coded local Windows path — adjust for your environment.
     val info = sparkSession.sparkContext.textFile("file:///C:\\Users\\HJ\\Desktop/parse.txt")
     // Keep column 0 (the "fruit/color" name) and column 2 (the count) of each tab-split line.
     val rdd = info.map(_.split("\t")).map(x => Row(x(0), x(2).toLong))
     val struct = StructType(Array(
       StructField("name", StringType, true),
       StructField("number", LongType, false)
     ))
     val DF = sparkSession.createDataFrame(rdd, struct)
     // SQL-callable UDF: returns the num-th "/"-separated token of name.
     sparkSession.udf.register("parsename", (name: String, num: Int) => {
       DFwithColumnUtils.parseUtils(name, num)
     })
     DF.createOrReplaceTempView("infos")
     sparkSession.sql("select name,parsename(name,0)as fruit,parsename(name,1) from infos").show
   }

   /**
    * Demo 2: add derived columns with DataFrame.withColumn, using
    * org.apache.spark.sql.functions.udf.
    *
    * The UDF splits "name" on "/" into an array, which is then indexed
    * per derived column.
    */
   def withColumu(sparkSession: SparkSession): Unit = {
     val info = sparkSession.sparkContext.textFile("file:///C:\\Users\\HJ\\Desktop/parse.txt")
     val rdd = info.map(_.split("\t")).map(x => Row(x(0), x(2).toLong))
     val struct = StructType(Array(
       StructField("name", StringType, true),
       StructField("number", LongType, false)
     ))
     val DF = sparkSession.createDataFrame(rdd, struct)
     // The lambda's parameters each bind to a column; the returned array is
     // indexed below with (0)/(1) to pick out the fruit and color tokens.
     // NOTE(review): the original author reported a compile error when adding a
     // second non-column parameter (e.g. an index Int) — every udf parameter
     // must be supplied as a Column at the call site; verify against Spark docs.
     val temp = (name: String) => name.split("/")
     val parsename1 = udf(temp)
     // Column name "friut" kept as-is (typo present in the original output schema).
     DF.withColumn("friut", parsename1(DF("name"))(0))
       .withColumn("color", parsename1(DF("name"))(1))
       .show()
   }
 }
 object DFwithColumnUtils {
   /**
    * Splits `name` on "/" and returns the token at position `num`.
    *
    * @param name a "/"-delimited string, e.g. "apple/red"
    * @param num  zero-based index of the token to return
    * @return the selected token; throws ArrayIndexOutOfBoundsException
    *         when `num` is outside the token range
    */
   def parseUtils(name: String, num: Int): String = name.split("/")(num)
 }

parse.txt

apple/red 2018-12-12 6
orange/yellow 2018-03-12 7
banana/yellow 2018-07-12 3
pear/white 2018-05-12 10

结果

 使用withColumn
  +-------------+------+------+------+
  |         name|number| friut| color|
  +-------------+------+------+------+
  |    apple/red|     6| apple|   red|
  |orange/yellow|     7|orange|yellow|
  |banana/yellow|     3|banana|yellow|
  |   pear/white|    10|  pear| white|
  +-------------+------+------+------+

注册为临时表,使用sql
 sql
  +-------------+------+----------------------+
  |         name| fruit|UDF:parsename(name, 1)|
  +-------------+------+----------------------+
  |    apple/red| apple|                   red|
  |orange/yellow|orange|                yellow|
  |banana/yellow|banana|                yellow|
  |   pear/white|  pear|                 white|
  +-------------+------+----------------------+

你可能感兴趣的:(spark,udf)