有环图:包含一系列顶点连接的回路(环路)
无环图:不包含任何回路(环路)的图
DAG(Directed Acyclic Graph)即为有向无环图
1、对于每条边,矩阵中相应单元格值为1
2、对于每个自环(起点和终点为同一顶点的边),矩阵中相应单元格值为2,方便在行或列上求得顶点度数
弹性分布式属性图(Resilient Distributed Property Graph)
顶点和边都带属性的有向多重图
一份物理存储,两种视图
对Graph视图的所有操作,最终都会转换成其关联的Table视图的RDD操作来完成
// Sketch of the GraphX property graph: one graph exposed through three RDD-based views.
// VD is the vertex attribute type, ED is the edge attribute type.
// NOTE(review): signature listing only; the members are declared without definitions.
class Graph[VD, ED] {
// Vertex view, typed by the vertex attribute VD.
val vertices: VertexRDD[VD]
// Edge view, typed by the edge attribute ED.
val edges: EdgeRDD[ED]
// Triplet view: each element is typed with both the vertex (VD) and edge (ED) attributes.
val triplets: RDD[EdgeTriplet[VD, ED]]
}
import org.apache.spark.graphx._
// RDD is not exported by the graphx package; the explicit annotation below needs this import.
import org.apache.spark.rdd.RDD

// Assumes `sc` is the SparkContext provided by spark-shell.
// Vertex RDD: (vertex id, Int attribute) pairs.
val vertices: RDD[(VertexId, Int)] = sc.makeRDD(Seq((1L, 1), (2L, 2), (3L, 3)))
// Edge RDD: Edge(srcId, dstId, Int attribute).
val edges = sc.makeRDD(Seq(Edge(1L, 2L, 1), Edge(2L, 3L, 2)))
// Vertex and edge attributes are both Int, so this is a Graph[Int, Int].
val graph = Graph(vertices, edges)
import org.apache.spark.graphx.GraphLoader

// Build a graph from an edge-list file: each line describes one edge as "srcId dstId".
// Every vertex attribute and every edge attribute is set to 1.
val graph = {
  val edgeListPath = "file:///opt/spark/data/graphx/followers.txt"
  GraphLoader.edgeListFile(sc, edgeListPath)
}
// Step 1: import the GraphX API.
import org.apache.spark.graphx._
// Step 2: create the vertex RDD as (vertex id, attribute) pairs.
// The tuples must be wrapped in a single Seq(...); `seq(1L,1),(2L,2),...` does not compile.
val vertices = sc.makeRDD(Seq((1L, 1), (2L, 2), (3L, 3)))
// Step 3: create the edge RDD as Edge(srcId, dstId, attribute).
val edges = sc.makeRDD(Seq(Edge(1L, 2L, 1), Edge(2L, 3L, 2)))
// Step 4: build the graph object from the two RDDs.
val graph = Graph(vertices, edges)
// Step 5: fetch the graph's vertices.
graph.vertices.collect
// Step 6: fetch the graph's edges.
graph.edges.collect
属性图应用示例-1
// Vertices: (id, (user name, second string — looks like a role/position; confirm)).
val users = sc.makeRDD(Seq(
  (3L, ("rxin", "student")),
  (7L, ("jgonzal", "postdoc")),
  (5L, ("franklin", "professor")),
  (2L, ("istoica", "professor"))
))
// Edges: Edge(srcId, dstId, relationship label).
val relationship = sc.makeRDD(Seq(
  Edge(3L, 7L, "Colla"),
  Edge(5L, 3L, "Advisor"),
  Edge(2L, 5L, "Colleague"),
  Edge(5L, 7L, "Pi")
))
// Vertex attribute is (String, String) and edge attribute is String.
val graphUser = Graph(users, relationship)
scala> graphUser.triplets.collect
res9: Array[org.apache.spark.graphx.EdgeTriplet[(String, String),String]] = Array(((2,(istoica,professor)),(5,(franklin,professor)),Colleague), ((3,(rxin,student)),(7,(jgonzal,postdoc)),Colla), ((5,(franklin,professor)),(3,(rxin,student)),Advisor), ((5,(franklin,professor)),(7,(jgonzal,postdoc)),Pi))
属性图应用示例-2
找出大于30岁的用户
假设打call超过5次,表示真爱。请找出他(她)们
// Vertices: (id, (name, age)).
val users = sc.makeRDD(Seq(
  (1L, ("Alice", 28)),
  (2L, ("Bob", 27)),
  (3L, ("Charlie", 65)),
  (4L, ("David", 42)),
  (5L, ("Ed", 55)),
  (6L, ("Fran", 50))
))
// Edges: Edge(srcId, dstId, Int attribute); the notes treat the attribute as a "call" count.
val edges = sc.makeRDD(Seq(
  Edge(2L, 1L, 7),
  Edge(4L, 1L, 1),
  Edge(2L, 4L, 2),
  Edge(5L, 2L, 2),
  Edge(3L, 2L, 4),
  Edge(3L, 6L, 3),
  Edge(5L, 3L, 8),
  Edge(5L, 6L, 3)
))
val graphUser = Graph(users, edges)

// Users older than 30, reading the age by tuple position.
graphUser.vertices.filter(_._2._2 > 30).collect
// Same query, destructuring the vertex with a pattern match.
graphUser.vertices.filter { case (_, (_, age)) => age > 30 }.collect
// Print every triplet as "<source name> like <destination attribute> is <edge attribute>".
graphUser.triplets.collect.foreach { t =>
  println(s"${t.srcAttr._1} like ${t.dstAttr} is ${t.attr}")
}
Bob like (Alice,28) is 7
Bob like (David,42) is 2
Charlie like (Bob,27) is 4
Charlie like (Fran,50) is 3
David like (Alice,28) is 1
Ed like (Bob,27) is 2
Ed like (Charlie,65) is 8
Ed like (Fran,50) is 3
scala> graphUser.triplets.collect.filter(_.attr>5).foreach(x=>println(x.srcAttr._1+" like "+x.dstAttr+" is "+x.attr))
Bob like (Alice,28) is 7
Ed like (Charlie,65) is 8
顶点数量
边数量
度、入度、出度
// Basic graph statistics exposed by Graph: edge/vertex counts and per-vertex degrees.
// NOTE(review): signature listing only; the members are declared without definitions.
class Graph[VD, ED] {
// Number of edges.
val numEdges: Long
// Number of vertices.
val numVertices: Long
// In-degree, keyed by vertex.
val inDegrees: VertexRDD[Int]
// Out-degree, keyed by vertex.
val outDegrees: VertexRDD[Int]
// Degree, keyed by vertex.
val degrees: VertexRDD[Int]
}
类似于RDD的map操作
// Attribute-mapping operators, analogous to RDD.map; each returns a new Graph.
// NOTE(review): signature listing only; the methods are declared without bodies.
class Graph[VD, ED] {
// Maps each (vertex id, attribute) to a new vertex attribute VD2; the edge type ED is unchanged.
def mapVertices[VD2](map: (VertexId, VD) => VD2): Graph[VD2, ED]
// Maps each edge to a new edge attribute ED2; the vertex type VD is unchanged.
def mapEdges[ED2](map: Edge[ED] => ED2): Graph[VD, ED2]
// Like mapEdges, but the function receives the full triplet (typed with VD and ED).
def mapTriplets[ED2](map: EdgeTriplet[VD, ED] => ED2): Graph[VD, ED2]
}
// NOTE(review): tweeter_graph is not defined in these notes; presumably a graph whose
// vertex attribute is (name, age) and whose edge attribute is Int — confirm.

// New vertex attribute = (vertex id, name), written as a pattern-matching function.
val t1_graph = tweeter_graph.mapVertices { case (vertexId, (name, _)) => (vertexId, name) }
// Same mapping, written as a plain two-argument function.
val t2_graph = tweeter_graph.mapVertices { (vertexId, attr) => (vertexId, attr._1) }
// The function passed to mapEdges returns only the NEW edge attribute; the source and
// destination ids are kept by the graph. Returning Edge(e.srcId, e.dstId, ...) instead
// would store a whole Edge as the attribute, giving a Graph[VD, Edge[Double]].
val t3_graph = tweeter_graph.mapEdges(e => e.attr * 7.0)
scala> val t1_graph=userCallGraph.mapVertices{case(v,(n,a))=>(v,n) }
t1_graph: org.apache.spark.graphx.Graph[(org.apache.spark.graphx.VertexId, String),Int] = org.apache.spark.graphx.impl.GraphImpl@52ce11b5
scala> t1_graph.vertices.collect.foreach(println)
(4,(4,David))
(1,(1,Alice))
(6,(6,Fran))
(3,(3,Charlie))
(5,(5,Ed))
(2,(2,Bob))
scala> val t2_graph=userCallGraph.mapVertices{(id,attr)=>(id,attr._1)}
t2_graph: org.apache.spark.graphx.Graph[(org.apache.spark.graphx.VertexId, String),Int] = org.apache.spark.graphx.impl.GraphImpl@68e66fff
scala> t2_graph.vertices.collect.foreach(println)
(4,(4,David))
(1,(1,Alice))
(6,(6,Fran))
(3,(3,Charlie))
(5,(5,Ed))
(2,(2,Bob))
scala> val t3_graph=userCallGraph.mapEdges(e=>Edge(e.srcId,e.dstId,e.attr*7.0))
t3_graph: org.apache.spark.graphx.Graph[(String, Int),org.apache.spark.graphx.Edge[Double]] = org.apache.spark.graphx.impl.GraphImpl@3b1847de
scala> t3_graph.edges.collect.foreach(println)
Edge(2,1,Edge(2,1,49.0))
Edge(2,4,Edge(2,4,14.0))
Edge(3,2,Edge(3,2,28.0))
Edge(3,6,Edge(3,6,21.0))
Edge(4,1,Edge(4,1,7.0))
Edge(5,2,Edge(5,2,14.0))
Edge(5,3,Edge(5,3,56.0))
Edge(5,6,Edge(5,6,21.0))
scala> userCallGraph.reverse.triplets.collect.foreach(println)
((1,(Alice,28)),(2,(Bob,27)),7)
((1,(Alice,28)),(4,(David,42)),1)
((2,(Bob,27)),(3,(Charlie,65)),4)
((2,(Bob,27)),(5,(Ed,55)),2)
((3,(Charlie,65)),(5,(Ed,55)),8)
((4,(David,42)),(2,(Bob,27)),2)
((6,(Fran,50)),(3,(Charlie,65)),3)
((6,(Fran,50)),(5,(Ed,55)),3)
scala> userCallGraph.triplets.collect.foreach(println)
((2,(Bob,27)),(1,(Alice,28)),7)
((2,(Bob,27)),(4,(David,42)),2)
((3,(Charlie,65)),(2,(Bob,27)),4)
((3,(Charlie,65)),(6,(Fran,50)),3)
((4,(David,42)),(1,(Alice,28)),1)
((5,(Ed,55)),(2,(Bob,27)),2)
((5,(Ed,55)),(3,(Charlie,65)),8)
((5,(Ed,55)),(6,(Fran,50)),3)
scala> userCallGraph.subgraph(vpred=(id,attr)=>{println("sub in"+(id,attr));attr._2<60}).triplets.collect.foreach(println)
sub in(2,(Bob,27))
sub in(1,(Alice,28))
sub in(2,(Bob,27))
sub in(4,(David,42))
sub in(3,(Charlie,65))
sub in(3,(Charlie,65))
sub in(4,(David,42))
sub in(1,(Alice,28))
sub in(5,(Ed,55))
sub in(2,(Bob,27))
sub in(5,(Ed,55))
sub in(3,(Charlie,65))
sub in(5,(Ed,55))
sub in(6,(Fran,50))
((2,(Bob,27)),(1,(Alice,28)),7)
((2,(Bob,27)),(4,(David,42)),2)
((4,(David,42)),(1,(Alice,28)),1)
((5,(Ed,55)),(2,(Bob,27)),2)
((5,(Ed,55)),(6,(Fran,50)),3)
scala> userCallGraph.subgraph(epred=(ep)=>ep.srcAttr._2<65).triplets.collect.foreach(println)
((2,(Bob,27)),(1,(Alice,28)),7)
((2,(Bob,27)),(4,(David,42)),2)
((4,(David,42)),(1,(Alice,28)),1)
((5,(Ed,55)),(2,(Bob,27)),2)
((5,(Ed,55)),(3,(Charlie,65)),8)
((5,(Ed,55)),(6,(Fran,50)),3)
scala> val two=sc.makeRDD(Array((1L,"kgc.cn"),(2L,"qq.com"),(3L,"163.com")))
two: org.apache.spark.rdd.RDD[(Long, String)] = ParallelCollectionRDD[68] at makeRDD at <console>:27
scala> userCallGraph.joinVertices(two)((id,v,cmpy)=>(v._1+"@"+cmpy,v._2))
res18: org.apache.spark.graphx.Graph[(String, Int),Int] = org.apache.spark.graphx.impl.GraphImpl@fb48dbe
scala> res18.triplets.collect.foreach(println)
((2,(Bob@qq.com,27)),(1,(Alice@kgc.cn,28)),7)
((2,(Bob@qq.com,27)),(4,(David,42)),2)
((3,(Charlie@163.com,65)),(2,(Bob@qq.com,27)),4)
((3,(Charlie@163.com,65)),(6,(Fran,50)),3)
((4,(David,42)),(1,(Alice@kgc.cn,28)),1)
((5,(Ed,55)),(2,(Bob@qq.com,27)),2)
((5,(Ed,55)),(3,(Charlie@163.com,65)),8)
((5,(Ed,55)),(6,(Fran,50)),3)
计算用户粉丝数量
/** Vertex attribute for a user, extended with degree counters.
  *
  * @param name   user name
  * @param age    user age
  * @param inDeg  in-degree of the user's vertex
  * @param outDeg out-degree of the user's vertex
  */
case class User(
  name: String,
  age: Int,
  inDeg: Int,
  outDeg: Int
)
// Rewrite the vertex attributes: each (name, age) pair becomes a User whose
// degree counters start at 0.
val initialUserGraph: Graph[User, Int] = tweeter_graph.mapVertices { (_, nameAndAge) =>
  val (name, age) = nameAndAge
  User(name, age, inDeg = 0, outDeg = 0)
}
// Store each vertex's in-degree and out-degree inside its User attribute.
// A missing degree (None) defaults to 0.
val userGraph = initialUserGraph.outerJoinVertices(initialUserGraph.inDegrees) {
  (_, user, inDegOpt) => user.copy(inDeg = inDegOpt.getOrElse(0))
}.outerJoinVertices(initialUserGraph.outDegrees) {
  (_, user, outDegOpt) => user.copy(outDeg = outDegOpt.getOrElse(0))
}
// A vertex's in-degree is its number of fans.
userGraph.vertices.collect.foreach { case (id, property) =>
  println(s"User $id is ${property.name} and is liked by ${property.inDeg} people.")
}
scala> userCallGraph.outerJoinVertices(userCallGraph.inDegrees){case(id,u,indeg)=>User(u._1,u._2,indeg.getOrElse(0),0)}.outerJoinVertices(userCallGraph.outDegrees){case(id,u,outdeg)=>User(u.name,u.age,u.inDeg,outdeg.getOrElse(0))}
res23: org.apache.spark.graphx.Graph[User,Int] = org.apache.spark.graphx.impl.GraphImpl@491d8bfb
scala> res23.vertices.collect.foreach(println)
(4,User(David,42,1,1))
(1,User(Alice,28,2,0))
(6,User(Fran,50,2,0))
(3,User(Charlie,65,1,2))
(5,User(Ed,55,0,3))
(2,User(Bob,27,2,2))
scala> userCallGraph.pageRank(0.00000000001).vertices.collect.foreach(println)
(4,0.9688717814927128)
(1,1.7924127957615186)
(6,0.9969646507526428)
(3,0.6996243163176442)
(5,0.5451618049228396)
(2,0.9969646507526428)