GraphX挖掘极大团

设计思路:

  1. 聚合每个节点的所有邻居,得到包含节点自身在内的邻居集合
  2. 对每条边求源节点与目标节点邻居集合的交集(共同邻居);当交集中至少有3个节点时,用排序后的交集生成极大团的ID,并发送至源节点和目标节点
  3. 获取收到极大团ID的节点

代码如下:

package mu.atlas.graph.community

import mu.atlas.graph.utils.BaseTool._
import org.apache.spark.graphx.{EdgeContext, Graph, VertexId}

import scala.reflect.ClassTag

/**
  * Mines clique-like communities from a graph and labels every member vertex
  * with the ids of the communities it belongs to.
  *
  * Created by zhoujiamu on 2019/8/28.
  */
object Clique {

  /** Smallest number of shared vertices (edge endpoints included) that yields a label. */
  private val MinCliqueSize = 3

  /**
    * Labels vertices with the ids of the communities found around their edges.
    *
    * Steps:
    *   1. every vertex collects the ids of its neighbours, plus its own id;
    *   2. for every edge, the neighbour sets of both endpoints are intersected;
    *      when the intersection holds at least [[MinCliqueSize]] vertices, a label
    *      derived from the sorted intersection is sent to both endpoints;
    *   3. vertices that received no label are dropped from the result.
    *
    * NOTE(review): the code does not check that the common neighbours of an edge
    * are adjacent to each other, so a label is only as "clique-like" as the shared
    * neighbourhood happens to be.
    *
    * The label is produced by `md5` / `md5ToLong` from `BaseTool` (not shown here);
    * ids are joined with "," before hashing so that distinct id sets cannot collapse
    * to the same input string (e.g. {1, 12, 123} and {11, 21, 23}).
    *
    * @param graph input graph; vertex attributes are ignored
    * @tparam VD vertex attribute type of the input graph
    * @tparam ED edge attribute type, carried over unchanged
    * @return the subgraph of labelled vertices, each holding its set of labels
    */
  def run[VD, ED: ClassTag](graph: Graph[VD, ED]): Graph[Set[VertexId], ED] = {

    // Seed each vertex with a singleton set holding its own id.
    val initGraph = graph.mapVertices { case (vid, _) => Set(vid) }

    // Exchange ids along every edge so each vertex learns its neighbours.
    val neighbourIds = initGraph.aggregateMessages[Set[VertexId]](
      ctx => {
        ctx.sendToSrc(ctx.dstAttr)
        ctx.sendToDst(ctx.srcAttr)
      },
      _ ++ _
    )

    // Neighbour set including the vertex itself; vertices without edges keep the singleton.
    val graphWithNeigs =
      initGraph.joinVertices(neighbourIds)((_, own, neighbours) => own ++ neighbours)

    val vertexWithLabel = graphWithNeigs.aggregateMessages[Set[VertexId]](
      ctx => {
        val common = ctx.srcAttr.intersect(ctx.dstAttr)
        // Hash only when a label is actually sent; most edges may fail the size check.
        if (common.size >= MinCliqueSize) {
          // Sorting makes the label independent of set iteration order.
          val label: Set[VertexId] = Set(md5ToLong(md5(common.toSeq.sorted.mkString(","))))
          ctx.sendToSrc(label)
          ctx.sendToDst(label)
        }
      },
      _ ++ _
    )

    graphWithNeigs
      .outerJoinVertices(vertexWithLabel)((_, _, labels) => labels.getOrElse(Set.empty[VertexId]))
      .subgraph(vpred = (_, labels) => labels.nonEmpty)
  }
}

测试代码:

package mu.atlas.graph.community

import org.junit._
import Assert._

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.Graph

/**
  * Checks Clique.run against a small hand-built graph.
  *
  * Created by zhoujiamu on 2019/8/28.
  */
class CliqueTest {

  /**
    * Builds an 8-edge graph in which {1, 2, 4} and {2, 3, 5} are the only triangles
    * and expects exactly two distinct labels in the result.
    */
  @Test
  def testRun(): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    val conf = new SparkConf().setMaster("local").setAppName("Kcore")
    val sc = new SparkContext(conf)

    // Always release the context, even when an assertion fails, so that other
    // tests in the same JVM can create their own.
    try {
      val rdd = sc.makeRDD(Array(
        1L -> 2L,
        1L -> 4L,
        2L -> 3L,
        2L -> 4L,
        2L -> 5L,
        2L -> 6L,
        3L -> 5L,
        4L -> 7L
      ))

      // The vertex attribute is ignored by Clique.run; any non-null placeholder works.
      val graph = Graph.fromEdgeTuples(rdd, defaultValue = 0)

      val clique = Clique.run(graph)

      println("-" * 30)
      clique.vertices.foreach(println)

      // Number of distinct labels across all labelled vertices.
      val res = clique.vertices
        .flatMap { case (_, labels) => labels }
        .distinct()
        .count()

      assertEquals(2L, res)
    } finally {
      sc.stop()
    }
  }

}

测试结果:

(4,Set(-3963192249907337487))
(1,Set(-3963192249907337487))
(3,Set(6304741602190573741))
(5,Set(6304741602190573741))
(2,Set(-3963192249907337487, 6304741602190573741))

2、3、5是一个极大团,1、2、4为另一个极大团,这两个极大团有公共的节点2

你可能感兴趣的:(Spark,知识图谱,Scala)