graphx pagerank 源码解析

1.在spark官网下载source code。





/**
 * PageRank algorithm implementation. There are two implementations of PageRank.
 * The first implementation uses the standalone `Graph` interface and runs PageRank
 * for a fixed number of iterations:
 * {{{
 * var PR = Array.fill(n)( 1.0 )
 * val oldPR = Array.fill(n)( 1.0 )
 * for( iter <- 0 until numIter ) {
 *   swap(oldPR, PR)
 *   for( i <- 0 until n ) {
 *     PR[i] = alpha + (1 - alpha) * inNbrs[i].map(j => oldPR[j] / outDeg[j]).sum
 *   }
 * }
 * }}}
 * The second implementation uses the `Pregel` interface and runs PageRank until
 * convergence:
 * {{{
 * var PR = Array.fill(n)( 1.0 )
 * val oldPR = Array.fill(n)( 0.0 )
 * while( max(abs(PR - oldPR)) > tol ) {
 *   swap(oldPR, PR)
 *   for( i <- 0 until n if abs(PR[i] - oldPR[i]) > tol ) {
 *     PR[i] = alpha + (1 - alpha) * inNbrs[i].map(j => oldPR[j] / outDeg[j]).sum
 *   }
 * }
 * }}}
 * `alpha` is the random reset probability (typically 0.15), `inNbrs[i]` is the set of
 * neighbors which link to `i` and `outDeg[j]` is the out degree of vertex `j`.
 * @note This is not the "normalized" PageRank and as a consequence pages that have no
 * inlinks will have a PageRank of alpha.
 */






def staticPageRank(numIter: Int, resetProb: Double = 0.15): Graph[Double, Double]

Run PageRank for a fixed number of iterations returning a graph with vertex attributes containing the PageRank and edge attributes the normalized edge weight.


def staticPersonalizedPageRank(src: VertexId, numIter: Int, resetProb: Double = 0.15): Graph[Double, Double]

Run Personalized PageRank for a fixed number of iterations, with all iterations originating at the source node, returning a graph with vertex attributes containing the PageRank and edge attributes the normalized edge weight.

def staticParallelPersonalizedPageRank(sources: Array[VertexId], numIter: Int, resetProb: Double = 0.15): Graph[Vector, Double]
Run parallel personalized PageRank for a given array of source vertices, such that all random walks are started relative to the source vertices


def pageRank(tol: Double, resetProb: Double = 0.15): Graph[Double, Double]
Run a dynamic version of PageRank returning a graph with vertex attributes containing the PageRank and edge attributes containing the normalized edge weight.

第5种(动态, 指定源顶点)

def personalizedPageRank(src: VertexId, tol: Double, resetProb: Double = 0.15): Graph[Double, Double]
Run personalized PageRank for a given vertex, such that all random walks are started relative to the source node.


  /**
   * Run PageRank for a fixed number of iterations returning a graph
   * with vertex attributes containing the PageRank and edge
   * attributes the normalized edge weight.
   * @tparam VD the original vertex attribute (not used)
   * @tparam ED the original edge attribute (not used)
   * @param graph the graph on which to compute PageRank
   * @param numIter the number of iterations of PageRank to run
   * @param resetProb the random reset probability (alpha)
   * @return the graph with each vertex containing the PageRank and each edge
   *         containing the normalized weight.
   */
  def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], numIter: Int,
    resetProb: Double = 0.15): Graph[Double, Double] = {
    // Fixed-iteration PageRank without a source vertex: `srcId` is left at its default,
    // so this simply forwards the three arguments to the general entry point.
    runWithOptions(graph = graph, numIter = numIter, resetProb = resetProb)
  }

  /**
   * Run PageRank for a fixed number of iterations returning a graph
   * with vertex attributes containing the PageRank and edge
   * attributes the normalized edge weight.
   * @tparam VD the original vertex attribute (not used)
   * @tparam ED the original edge attribute (not used)
   * @param graph the graph on which to compute PageRank
   * @param numIter the number of iterations of PageRank to run
   * @param resetProb the random reset probability (alpha)
   * @param srcId the source vertex for a Personalized Page Rank (optional)
   * @return the graph with each vertex containing the PageRank and each edge
   *         containing the normalized weight.
   */
  def runWithOptions[VD: ClassTag, ED: ClassTag](
      graph: Graph[VD, ED], numIter: Int, resetProb: Double = 0.15,
      srcId: Option[VertexId] = None): Graph[Double, Double] = {
    require(numIter > 0, s"Number of iterations must be greater than 0," +
      s" but got ${numIter}")
    require(resetProb >= 0 && resetProb <= 1, s"Random reset probability must belong" +
      s" to [0, 1], but got ${resetProb}")

    val personalized = srcId.isDefined
    // -1L is only a placeholder: `src` influences the result solely when `personalized` is true.
    val src: VertexId = srcId.getOrElse(-1L)

    // Initialize the PageRank graph with each edge attribute having
    // weight 1/outDegree and each vertex with attribute 1.0.
    // When running personalized pagerank, only the source vertex
    // has an attribute 1.0. All others are set to 0.
    var rankGraph: Graph[Double, Double] = graph
      // Associate the out-degree with each vertex; vertices missing from outDegrees get 0.
      .outerJoinVertices(graph.outDegrees) { (vid, vdata, deg) => deg.getOrElse(0) }
      // Set the weight on each edge to 1 / (out-degree of its source vertex).
      // TripletFields.Src exposes the source and edge fields but not the destination field.
      .mapTriplets( e => 1.0 / e.srcAttr, TripletFields.Src )
      // Set the initial rank of every vertex: 1.0 for all vertices when no source vertex is
      // given; otherwise 1.0 for the source vertex and 0.0 for every other vertex.
      .mapVertices { (id, attr) =>
        if (!(id != src && personalized)) 1.0 else 0.0
      }

    def delta(u: VertexId, v: VertexId): Double = { if (u == v) 1.0 else 0.0 }

    // The reset term is the same in every iteration, so it is built once outside the loop.
    // Personalized runs give the reset mass to the source vertex only.
    val rPrb = if (personalized) {
      (src: VertexId, id: VertexId) => resetProb * delta(src, id)
    } else {
      (src: VertexId, id: VertexId) => resetProb
    }

    var iteration = 0
    while (iteration < numIter) {
      // Cache the current graph: it is read both by aggregateMessages and by the join below.
      rankGraph.cache()

      // Compute the outgoing rank contributions of each vertex, perform local preaggregation, and
      // do the final aggregation at the receiving vertices. Requires a shuffle for aggregation.
      val rankUpdates = rankGraph.aggregateMessages[Double](
        ctx => ctx.sendToDst(ctx.srcAttr * ctx.attr), _ + _, TripletFields.Src)

      // Apply the final rank updates to get the new ranks, using join to preserve ranks of vertices
      // that didn't receive a message. Requires a shuffle for broadcasting updated ranks to the
      // edge partitions.
      val prevRankGraph = rankGraph
      rankGraph = rankGraph.outerJoinVertices(rankUpdates) {
        (id, oldRank, msgSumOpt) => rPrb(src, id) + (1.0 - resetProb) * msgSumOpt.getOrElse(0.0)
      }.cache()

      rankGraph.edges.foreachPartition(x => {}) // also materializes rankGraph.vertices
      logInfo(s"PageRank finished iteration $iteration.")
      // The new graph is materialized and cached, so the previous one can be released.
      prevRankGraph.vertices.unpersist(blocking = false)
      prevRankGraph.edges.unpersist(blocking = false)

      iteration += 1
    }

    // SPARK-18847 If the graph has sinks (vertices with no outgoing edges) correct the sum of ranks
    normalizeRankSum(rankGraph, personalized)
  }


/**
 * Run Personalized PageRank for a fixed number of iterations, for a
 * set of starting nodes in parallel. Returns a graph with vertex attributes
 * containing the pagerank relative to all starting nodes (as a sparse vector) and
 * edge attributes the normalized edge weight
 * @tparam VD The original vertex attribute (not used)
 * @tparam ED The original edge attribute (not used)
 * @param graph The graph on which to compute personalized pagerank
 * @param numIter The number of iterations to run
 * @param resetProb The random reset probability
 * @param sources The list of sources to compute personalized pagerank from
 * @return the graph with vertex attributes
 *         containing the pagerank relative to all starting nodes (as a sparse vector
 *         indexed by the position of nodes in the sources list) and
 *         edge attributes the normalized edge weight
 */
def runParallelPersonalizedPageRank[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED],
  numIter: Int, resetProb: Double = 0.15,
  sources: Array[VertexId]): Graph[Vector, Double] = {
  require(numIter > 0, s"Number of iterations must be greater than 0," +
    s" but got ${numIter}")
  require(resetProb >= 0 && resetProb <= 1, s"Random reset probability must belong" +
    s" to [0, 1], but got ${resetProb}")
  require(sources.nonEmpty, s"The list of sources must be non-empty," +
    s" but got ${sources.mkString("[", ",", "]")}")

  // TODO if one sources vertex id is outside of the int range
  // we won't be able to store its activations in a sparse vector
  // NOTE(review): the vectors built below are indexed by position in `sources`, not by
  // vertex id, so this bound looks stricter than necessary -- confirm before relaxing it.
  require(sources.max <= Int.MaxValue.toLong,
    s"This implementation currently only works for source vertex ids at most ${Int.MaxValue}")

  // All-zero sparse vector with one slot per source, converted to a Breeze vector.
  val zero = Vectors.sparse(sources.size, List()).asBreeze
  // Map each source vertex id to the unit vector for its position in `sources`.
  val sourcesInitMap = sources.zipWithIndex.map { case (vid, i) =>
    val v = Vectors.sparse(sources.size, Array(i), Array(1.0)).asBreeze
    (vid, v)
  }.toMap
  val sc = graph.vertices.sparkContext
  val sourcesInitMapBC = sc.broadcast(sourcesInitMap)
  // Initialize the PageRank graph with each edge attribute having
  // weight 1/outDegree and each source vertex with attribute 1.0.
  var rankGraph = graph
    // Associate the degree with each vertex
    .outerJoinVertices(graph.outDegrees) { (vid, vdata, deg) => deg.getOrElse(0) }
    // Set the weight on the edges based on the degree
    .mapTriplets(e => 1.0 / e.srcAttr, TripletFields.Src)
    // Source vertices start from their own unit vector, every other vertex from all zeros.
    .mapVertices { (vid, attr) =>
      if (sourcesInitMapBC.value contains vid) {
        sourcesInitMapBC.value(vid)
      } else {
        zero
      }
    }

  var i = 0
  while (i < numIter) {
    val prevRankGraph = rankGraph
    // Propagates the message along outbound edges
    // and adding start nodes back in with activation resetProb
    val rankUpdates = rankGraph.aggregateMessages[BV[Double]](
      ctx => ctx.sendToDst(ctx.srcAttr *:* ctx.attr),
      (a : BV[Double], b : BV[Double]) => a +:+ b, TripletFields.Src)

    rankGraph = rankGraph.outerJoinVertices(rankUpdates) {
      (vid, oldRank, msgSumOpt) =>
        // Vertices that received no message contribute the zero vector.
        val popActivations: BV[Double] = msgSumOpt.getOrElse(zero) *:* (1.0 - resetProb)
        // Only source vertices receive reset mass, and only in their own slot.
        val resetActivations = if (sourcesInitMapBC.value contains vid) {
          sourcesInitMapBC.value(vid) *:* resetProb
        } else {
          zero
        }
        popActivations +:+ resetActivations
    }.cache()

    rankGraph.edges.foreachPartition(x => {}) // also materializes rankGraph.vertices
    // The new graph is materialized and cached, so the previous one can be released.
    prevRankGraph.vertices.unpersist(blocking = false)
    prevRankGraph.edges.unpersist(blocking = false)

    logInfo(s"Parallel Personalized PageRank finished iteration $i.")

    i += 1
  }

  // SPARK-18847 If the graph has sinks (vertices with no outgoing edges) correct the sum of ranks
  val rankSums = rankGraph.vertices.values.fold(zero)(_ +:+ _)
  // Divide element-wise by the per-source totals and convert back from Breeze.
  rankGraph.mapVertices { (vid, attr) =>
    Vectors.fromBreeze(attr /:/ rankSums)
  }
}


  /**
   * Run a dynamic version of PageRank returning a graph with vertex attributes containing the
   * PageRank and edge attributes containing the normalized edge weight.
   * @tparam VD the original vertex attribute (not used)
   * @tparam ED the original edge attribute (not used)
   * @param graph the graph on which to compute PageRank
   * @param tol the tolerance allowed at convergence (smaller => more accurate).
   * @param resetProb the random reset probability (alpha)
   * @return the graph with each vertex containing the PageRank and each edge
   *         containing the normalized weight.
   */
  def runUntilConvergence[VD: ClassTag, ED: ClassTag](
    graph: Graph[VD, ED], tol: Double, resetProb: Double = 0.15): Graph[Double, Double] = {
    // Tolerance-driven PageRank without a source vertex: `srcId` is left at its default,
    // so this simply forwards the three arguments to the general entry point.
    runUntilConvergenceWithOptions(graph = graph, tol = tol, resetProb = resetProb)
  }

  /**
   * Run a dynamic version of PageRank returning a graph with vertex attributes containing the
   * PageRank and edge attributes containing the normalized edge weight.
   * @tparam VD the original vertex attribute (not used)
   * @tparam ED the original edge attribute (not used)
   * @param graph the graph on which to compute PageRank
   * @param tol the tolerance allowed at convergence (smaller => more accurate).
   * @param resetProb the random reset probability (alpha)
   * @param srcId the source vertex for a Personalized Page Rank (optional)
   * @return the graph with each vertex containing the PageRank and each edge
   *         containing the normalized weight.
   */
  def runUntilConvergenceWithOptions[VD: ClassTag, ED: ClassTag](
      graph: Graph[VD, ED], tol: Double, resetProb: Double = 0.15,
      srcId: Option[VertexId] = None): Graph[Double, Double] = {
    require(tol >= 0, s"Tolerance must be no less than 0, but got ${tol}")
    require(resetProb >= 0 && resetProb <= 1, s"Random reset probability must belong" +
      s" to [0, 1], but got ${resetProb}")

    val personalized = srcId.isDefined
    // -1L is a placeholder used when no source vertex was supplied.
    val src: VertexId = srcId.getOrElse(-1L)

    // Initialize the pagerankGraph with each edge attribute
    // having weight 1/outDegree and each vertex with attribute (rank, lastDelta).
    val pagerankGraph: Graph[(Double, Double), Double] = graph
      // Associate the degree with each vertex
      .outerJoinVertices(graph.outDegrees) {
        (vid, vdata, deg) => deg.getOrElse(0)
      }
      // Set the weight on the edges based on the degree
      .mapTriplets( e => 1.0 / e.srcAttr )
      // Set the vertex attributes to (initialPR, delta = 0). The vertex whose id equals `src`
      // is tagged with delta = -Infinity so personalizedVertexProgram can recognise it.
      .mapVertices { (id, attr) =>
        if (id == src) (0.0, Double.NegativeInfinity) else (0.0, 0.0)
      }

    // Define the three functions needed to implement PageRank in the GraphX
    // version of Pregel

    // Adds the damped incoming sum to the old rank; the second component is the rank change.
    def vertexProgram(id: VertexId, attr: (Double, Double), msgSum: Double): (Double, Double) = {
      val (oldPR, lastDelta) = attr
      val newPR = oldPR + (1.0 - resetProb) * msgSum
      (newPR, newPR - oldPR)
    }

    // Same update, except that a vertex still carrying the -Infinity marker is set to rank 1.0.
    def personalizedVertexProgram(id: VertexId, attr: (Double, Double),
      msgSum: Double): (Double, Double) = {
      val (oldPR, lastDelta) = attr
      val newPR = if (lastDelta == Double.NegativeInfinity) {
        1.0
      } else {
        oldPR + (1.0 - resetProb) * msgSum
      }
      (newPR, newPR - oldPR)
    }

    // Forward the source's last rank change, scaled by the edge weight, only while that change
    // still exceeds the tolerance; otherwise send nothing along this edge.
    def sendMessage(edge: EdgeTriplet[(Double, Double), Double]) = {
      if (edge.srcAttr._2 > tol) {
        Iterator((edge.dstId, edge.srcAttr._2 * edge.attr))
      } else {
        Iterator.empty
      }
    }

    def messageCombiner(a: Double, b: Double): Double = a + b

    // The initial message received by all vertices in PageRank. In the non-personalized case
    // vertexProgram turns it into 0 + (1 - resetProb) * resetProb / (1 - resetProb) = resetProb.
    val initialMessage = if (personalized) 0.0 else resetProb / (1.0 - resetProb)

    // Execute a dynamic version of Pregel.
    val vp = if (personalized) {
      (id: VertexId, attr: (Double, Double), msgSum: Double) =>
        personalizedVertexProgram(id, attr, msgSum)
    } else {
      (id: VertexId, attr: (Double, Double), msgSum: Double) =>
        vertexProgram(id, attr, msgSum)
    }

    val rankGraph = Pregel(pagerankGraph, initialMessage, activeDirection = EdgeDirection.Out)(
      vp, sendMessage, messageCombiner)
      // Keep only the rank; the delta component is no longer needed.
      .mapVertices((vid, attr) => attr._1)

    // SPARK-18847 If the graph has sinks (vertices with no outgoing edges) correct the sum of ranks
    normalizeRankSum(rankGraph, personalized)
  }

  /**
   * Normalizes the sum of ranks to n (or 1 if personalized).
   *
   * @param rankGraph graph whose vertex attributes are the raw ranks
   * @param personalized when true, ranks are divided by their total so they sum to 1;
   *                     otherwise they are scaled by numVertices / total so they sum to
   *                     the number of vertices
   * @return a graph with the rescaled ranks as vertex attributes
   */
  private def normalizeRankSum(
      rankGraph: Graph[Double, Double], personalized: Boolean): Graph[Double, Double] = {
    val rankSum = rankGraph.vertices.values.sum()
    if (personalized) {
      rankGraph.mapVertices((id, rank) => rank / rankSum)
    } else {
      val numVertices = rankGraph.numVertices
      val correctionFactor = numVertices.toDouble / rankSum
      rankGraph.mapVertices((id, rank) => rank * correctionFactor)
    }
  }

