数据集中有两列 child 和 parent，需要依据此父子节点来构建树，找到根节点以及调用链。
// Example 1: build a sample edge table and a renamed working copy for findroot.
import org.apache.spark.sql.DataFrame

val sqlContext = new org.apache.spark.sql.SQLContext(sc)

// Sample rows: (child, parent, grandparent, chain, dt).
val edges = Seq(
  ("a", "b", "b", "a-b", "20190201"),
  ("b", "c", "c", "b-c", "20190201"),
  ("c", "d", "d", "c-d", "20190201"),
  ("d", "e", "e", "d-e", "20190201")
)
val df = sqlContext
  .createDataFrame(edges)
  .toDF("child", "parent", "grandparent", "chain", "dt")

// Mutable accumulator that findroot reads and rewrites. Columns are suffixed
// with "1" so they remain distinguishable from df's columns after the join.
var result = df.selectExpr(
  "child as child1",
  "parent as parent1",
  "grandparent as grandparent1",
  "chain as chain1",
  "dt as dt1"
)
// Recursively joins `df` back onto the accumulated `result` until no row still
// carries a non-null `grandparent1`, i.e. every chain has been extended as far
// as it can go; the final accumulated frame is returned.
// NOTE(review): this closes over the outer `var result` and `df` rather than
// working purely on its parameter — the termination probe on the next line
// reads the outer `result`, while `dataframe` is only consumed via the
// `result = dataframe` assignment below. Presumably both always hold the same
// frame at the call sites; confirm before refactoring.
def findroot(dataframe:DataFrame) : DataFrame = {
// Termination probe: take(1) avoids scanning the whole frame just to test emptiness.
val r = result.where("grandparent1 is not null").take(1)
if(r.isEmpty) {
return result
}
// Re-point the shared accumulator at the argument, then extend each chain by one
// hop: the left join keeps rows whose parent has no further ancestor, NVL falls
// back to the current parent when the joined side is null, and concat_ws
// prepends the child onto the running '-'-separated chain.
result = dataframe
result = df.join(result,df("parent") === result("child1"),"left_outer").selectExpr("child as child1","NVL(parent1, parent) as parent1","grandparent1","concat_ws('-',child,chain1) as chain1","dt as dt1")
// Recurse on the freshly joined frame (call is in tail position).
findroot(result)
}
// Drive the recursion from the seeded accumulator and print the resolved chains.
val output = findroot(result)
output.show
运行结果如下:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.DataFrame
/**
 * Iteratively walks a (child, parent) edge list upward until every node's
 * topmost ancestor is found.
 *
 * @param df input frame with at least the columns `child` and `parent`
 * @return a frame with columns `child`, `parent`, `chain` (the full
 *         "child->parent->...->root" path) and `root` (the topmost ancestor)
 *
 * NOTE(review): assumes the child/parent graph is acyclic — a cycle would keep
 * `tmp` non-null forever and the loop would never terminate. Confirm upstream
 * data guarantees this, or add an iteration cap.
 */
def findRoot(df:DataFrame):DataFrame = {
  import df.sparkSession.implicits._
  df.cache()
  // `tmp` is the frontier: the ancestor still to be looked up. Rows whose
  // `tmp` is null have already reached their root.
  var result = df.withColumn("chain", concat_ws("->", df("child"), df("parent")))
    .withColumn("root", df("parent"))
    .withColumn("tmp", df("parent"))
  var sample = result.where("tmp is not null").take(1)
  while (!sample.isEmpty) {
    val previous = result // remembered so its cache can be released below
    result = result.as("result")
      .join(df.as("source"), ($"source.child" === $"result.tmp").and($"result.tmp".isNotNull), "left_outer")
      .select(col("result.child"), col("result.parent")
        // concat_ws silently skips the null source.parent of finished rows,
        // leaving their chain untouched.
        , concat_ws("->", col("result.chain"), col("source.parent")).as("chain")
        // keep the last known ancestor once the walk falls off the graph
        , coalesce(col("source.parent"), col("result.root")).as("root"), col("source.parent").as("tmp"))
    result.cache()
    // take(1) both materializes (hence caches) the new frame and probes
    // whether any walk is still unfinished.
    sample = result.where("tmp is not null").take(1)
    // Fix: the original cached every iteration's frame and never released it,
    // leaking storage memory on long chains. Drop the previous iteration's
    // cache now that the new result is materialized. (No-op on the first
    // pass, where `previous` was never cached.)
    previous.unpersist()
  }
  df.unpersist()
  result.drop("tmp")
}
// Example 2: a plain two-column edge list; findRoot resolves each node's
// root ancestor and its full "->"-separated chain.
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

val pairs = Seq(
  ("a", "b"),
  ("b", "c"),
  ("c", "d"),
  ("d", "e")
)
val df = sqlContext.createDataFrame(pairs).toDF("child", "parent")

val result = findRoot(df)
result.show
运行结果如下: