[Spark]根据父子节点构建树

数据集中有 child 和 parent 两列,需要根据这种父子关系构建树,为每个节点找出其根节点以及完整的调用链。

 一、递归遍历

import org.apache.spark.sql.DataFrame;
// SQLContext built on top of `sc`; `sc` is not defined in this snippet and is
// presumably the SparkContext provided by spark-shell — confirm for other environments.
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

// Sample rows: (child, parent, grandparent, chain, dt). In this seed data the
// grandparent column simply repeats the parent and chain is "child-parent".
val chainRows = Seq(
  ("a", "b", "b", "a-b", "20190201"),
  ("b", "c", "c", "b-c", "20190201"),
  ("c", "d", "d", "c-d", "20190201"),
  ("d", "e", "e", "d-e", "20190201")
)

val df = sqlContext
  .createDataFrame(chainRows)
  .toDF("child", "parent", "grandparent", "chain", "dt")

// Working frame: the same data with every column renamed to carry a "1" suffix,
// so its columns can be told apart from those of `df` when the two are joined.
// Declared as `var` because findroot reassigns it.
var result = df.selectExpr(
  "child as child1",
  "parent as parent1",
  "grandparent as grandparent1",
  "chain as chain1",
  "dt as dt1"
)

/**
 * Climbs the child -> parent links one join at a time until no row of the
 * working frame has a non-null `grandparent1`.
 *
 * Each step left-joins the edge table `df` (columns `child`, `parent`, `dt`)
 * with the current frame on `df.parent = frame.child1` and rebuilds the
 * working columns:
 *   - `parent1`      : the matched row's `parent1`, or the row's own `parent` when unmatched
 *   - `grandparent1` : the matched row's `grandparent1` (null when unmatched)
 *   - `chain1`       : `child` followed by the matched row's `chain1`, joined with '-'
 *
 * The shared top-level `result` variable is kept pointing at the most recent
 * frame, as before, so code that reads it after the call still works.
 *
 * NOTE(review): `concat_ws` skips nulls, so a row whose parent has no entry of
 * its own gets `chain1 = child` only; the final chains therefore appear to end
 * one node before the root held in `parent1` — confirm this is intended.
 *
 * @param dataframe working frame with columns `child1`, `parent1`,
 *                  `grandparent1`, `chain1`, `dt1`
 * @return the frame in which every `grandparent1` is null
 */
def findroot(dataframe:DataFrame) : DataFrame = {
    // Decide termination from the frame that was actually passed in, not from
    // whatever the global `result` happens to hold.
    val pending = dataframe.where("grandparent1 is not null").take(1)
    result = dataframe
    if (pending.isEmpty) {
        dataframe
    } else {
        val climbed = df
            .join(dataframe, df("parent") === dataframe("child1"), "left_outer")
            .selectExpr("child as child1","NVL(parent1, parent) as parent1","grandparent1","concat_ws('-',child,chain1) as chain1","dt as dt1")
        result = climbed
        findroot(climbed)
    }
}

// Resolve the working frame all the way up and print the outcome to the console.
val output = findroot(result)

output.show()

运行结果如下:

[Spark]根据父子节点构建树_第1张图片

二、优化

  import org.apache.spark.sql.functions._
  import org.apache.spark.sql.DataFrame
  /**
   * Resolves the root ancestor and the full ancestor chain for the rows of an
   * edge list.
   *
   * Every row starts with `chain = child->parent`, `root = parent` and a
   * cursor column `tmp = parent`. Each iteration left-joins the working frame
   * with the original edges on `source.child = result.tmp`, appends the newly
   * found ancestor to `chain`, moves `root` to it, and advances `tmp`. The
   * loop ends once `tmp` is null for every row.
   *
   * `df` is cached for the duration of the call and unpersisted before
   * returning. Each intermediate frame is cached and not unpersisted here, so
   * the returned frame is backed by a cached frame whenever at least one
   * iteration ran.
   *
   * @param df       edge list with columns `child` and `parent`
   * @param maxDepth maximum number of climbing iterations. The default,
   *                 `Int.MaxValue`, keeps the original unbounded behaviour;
   *                 pass a finite bound when the input may contain a cycle,
   *                 because a cycle keeps `tmp` non-null forever.
   * @return rows with columns `child`, `parent`, `chain` (ancestors joined by
   *         "->") and `root`
   * @throws IllegalStateException if rows are still unresolved after
   *                               `maxDepth` iterations
   */
  def findRoot(df:DataFrame, maxDepth:Int = Int.MaxValue):DataFrame = {
    import df.sparkSession.implicits._

    // True while at least one row still has an ancestor left to look up.
    def hasPending(frame: DataFrame): Boolean =
      frame.where("tmp is not null").take(1).nonEmpty

    df.cache()
    var result = df
      .withColumn("chain", concat_ws("->", df("child"), df("parent")))
      .withColumn("root", df("parent"))
      .withColumn("tmp", df("parent"))

    var depth = 0
    while (hasPending(result)) {
      if (depth >= maxDepth) {
        // Release the input cache before failing, mirroring the normal exit path.
        df.unpersist()
        throw new IllegalStateException(
          s"findRoot: rows still unresolved after $maxDepth iterations; the parent links may contain a cycle")
      }
      depth += 1
      result = result.as("result")
        .join(
          df.as("source"),
          ($"source.child" === $"result.tmp").and($"result.tmp".isNotNull),
          "left_outer")
        .select(
          col("result.child"),
          col("result.parent"),
          // concat_ws skips nulls, so finished rows keep their chain unchanged.
          concat_ws("->", col("result.chain"), col("source.parent")).as("chain"),
          // Keep the previous root once there is no further ancestor.
          coalesce(col("source.parent"), col("result.root")).as("root"),
          col("source.parent").as("tmp"))
      result.cache()
    }

    df.unpersist()
    result.drop("tmp")
  }

// SQLContext built on top of `sc`; `sc` is not defined in this snippet and is
// presumably the SparkContext provided by spark-shell — confirm for other environments.
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

// Sample edge list: (child, parent).
val edgeRows = Seq(
  ("a", "b"),
  ("b", "c"),
  ("c", "d"),
  ("d", "e")
)

val df = sqlContext
  .createDataFrame(edgeRows)
  .toDF("child", "parent")

// Resolve roots and chains, then print the outcome to the console.
val result = findRoot(df)
result.show()

运行结果如下:

[Spark]根据父子节点构建树_第2张图片

 

你可能感兴趣的:(spark,spark,大数据,nosql)