More code: https://github.com/xubo245/SparkLearning
1. Explanation
Use PageRank to rank the Berkeley-related links in the wiki dataset and take the top 10.
2. Code:
/**
  * @author xubo
  * ref http://spark.apache.org/docs/1.5.2/graphx-programming-guide.html
  * time 20160503
  */
package org.apache.spark.graphx.learning

import org.apache.log4j.{ Level, Logger }
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

object PageRankAboutBerkeleyWiki {
  def main(args: Array[String]) {
    // Suppress logging
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // Set up the runtime environment
    val conf = new SparkConf().setAppName("PageRankAboutBerkeleyWiki").setMaster("local")
    val sc = new SparkContext(conf)

    // Read the input data files
    val articles: RDD[String] = sc.textFile("file/data/graphx/input/graphx-wiki-vertices.txt")
    val links: RDD[String] = sc.textFile("file/data/graphx/input/graphx-wiki-edges.txt")

    // Load vertices and edges
    val vertices = articles.map { line =>
      val fields = line.split('\t')
      (fields(0).toLong, fields(1))
    }
    val edges = links.map { line =>
      val fields = line.split('\t')
      Edge(fields(0).toLong, fields(1).toLong, 0)
    }

    // Cache the graph
    // val graph = Graph(vertices, edges, "").persist(StorageLevel.MEMORY_ONLY_SER)
    val graph = Graph(vertices, edges, "").persist()
    // graph.unpersistVertices(false)

    // Test: print a few triplets
    println("**********************************************************")
    println("Fetch 5 triplets")
    println("**********************************************************")
    graph.triplets.take(5).foreach(println(_))

    // pageRank calls cache() internally, so the earlier persist can only use MEMORY_ONLY
    println("**********************************************************")
    println("Run PageRank to find the most valuable vertices")
    println("**********************************************************")
    val prGraph = graph.pageRank(0.001).cache()

    val titleAndPrGraph = graph.outerJoinVertices(prGraph.vertices) {
      (v, title, rank) => (rank.getOrElse(0.0), title)
    }

    titleAndPrGraph.vertices.top(10) {
      Ordering.by((entry: (VertexId, (Double, String))) => entry._2._1)
    }.foreach(t => println(t._2._2 + ": " + t._2._1))

    sc.stop()
  }
}
val prGraph = graph.pageRank(0.001).cache() is all that is needed. The argument 0.001 is the convergence tolerance: iteration stops once the rank change between two successive iterations falls below this threshold, so a smaller value gives higher precision at the cost of more iterations.
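If a fixed iteration count is preferred over a convergence tolerance, GraphX also provides staticPageRank. A minimal sketch, assuming the same graph value built in the code above (the iteration count 20 is an arbitrary example, not taken from the original program):

// staticPageRank runs a fixed number of iterations instead of iterating
// until the rank change falls below a tolerance; 0.15 is the standard
// reset (teleport) probability used by GraphX's PageRank implementations.
val staticPrGraph = graph.staticPageRank(20, 0.15)
staticPrGraph.vertices.take(5).foreach(println)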
3. Results:
**********************************************************
Fetch 5 triplets
**********************************************************
((146271392968588,Computer Consoles Inc.),(7097126743572404313,Berkeley Software Distribution),0)
((146271392968588,Computer Consoles Inc.),(8830299306937918434,University of California, Berkeley),0)
((625290464179456,List of Penguin Classics),(1735121673437871410,George Berkeley),0)
((1342848262636510,List of college swimming and diving teams),(8830299306937918434,University of California, Berkeley),0)
((1889887370673623,Anthony Pawson),(8830299306937918434,University of California, Berkeley),0)
**********************************************************
Run PageRank to find the most valuable vertices
**********************************************************
University of California, Berkeley: 1321.111754312097
Berkeley, California: 664.8841977233583
Uc berkeley: 162.50132743397873
Berkeley Software Distribution: 90.4786038848606
Lawrence Berkeley National Laboratory: 81.90404939641944
George Berkeley: 81.85226118457985
Busby Berkeley: 47.871998218019655
Berkeley Hills: 44.76406979519754
Xander Berkeley: 30.324075347288037
Berkeley County, South Carolina: 28.908336483710308
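If the input files are not at hand, the same pipeline can be exercised on a tiny hand-built graph. The sketch below reuses a few of the (id, title) pairs printed in the triplets above; it is only an illustration of the API, not part of the original program.

import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.{ SparkConf, SparkContext }

object TinyBerkeleyPageRank {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("TinyBerkeleyPageRank").setMaster("local"))

    // A few (id, title) pairs taken from the triplets shown above
    val vertices: RDD[(VertexId, String)] = sc.parallelize(Seq(
      (146271392968588L, "Computer Consoles Inc."),
      (7097126743572404313L, "Berkeley Software Distribution"),
      (8830299306937918434L, "University of California, Berkeley"),
      (1735121673437871410L, "George Berkeley")))

    // Each edge points from an article to a page it links to
    val edges: RDD[Edge[Int]] = sc.parallelize(Seq(
      Edge(146271392968588L, 7097126743572404313L, 0),
      Edge(146271392968588L, 8830299306937918434L, 0),
      Edge(1735121673437871410L, 8830299306937918434L, 0)))

    val graph = Graph(vertices, edges, "")
    val ranks = graph.pageRank(0.001).vertices

    // Join the ranks back onto the titles and print them, highest rank first
    graph.outerJoinVertices(ranks) { (v, title, rank) => (rank.getOrElse(0.0), title) }
      .vertices
      .top(4)(Ordering.by((entry: (VertexId, (Double, String))) => entry._2._1))
      .foreach(t => println(t._2._2 + ": " + t._2._1))

    sc.stop()
  }
}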
References
【1】 http://spark.apache.org/docs/1.5.2/graphx-programming-guide.html
【2】 https://github.com/xubo245/SparkLearning
【3】 炼数成金 (Dataguru) video course