如果一张图是对一片区域的描述的话,将这张图划分为很多个子图。当子图之内满足关联性尽可能大,而子图之间关联性尽可能低时,这样的子图可以称之为一个社区。
社区发现算法有很多,例如LPA,HANP,SLPA以及Louvain,不同的算法划分社区的效果不尽相同。Louvain算法是基于模块度的社区发现算法,该算法在效率和效果上都表现较好,并且能够发现层次性的社区结构,其优化目标是最大化整个社区网络的模块度。
模块度是评估一个社区网络划分好坏的度量方法,它的物理含义是社区内节点的连边数与随机情况下的边数只差,它的取值范围是 [−1/2,1)。可以简单地理解为社区内部边的权重减去所有与社区节点相连的边的权重和,对无向图更好理解,即社区内部边的度数减去社区内节点的总度数。
算法流程:
1、初始时将每个顶点当作一个社区,社区个数与顶点个数相同。
2、依次将每个顶点与之相邻顶点合并在一起,计算它们的模块度增益是否大于0,如果大于0,就将该结点放入该相邻结点所在社区。
3、迭代第二步,直至算法稳定,即所有顶点所属社区不再变化。
4、将各个社区所有节点压缩成为一个结点,社区内点的权重转化为新结点环的权重,社区间权重转化为新结点边的权重。
5、重复步骤1-3,直至算法稳定。
# coding=utf-8
import collections
import random
def load_graph(path):
G = collections.defaultdict(dict)
with open(path) as text:
for line in text:
vertices = line.strip().split()
v_i = int(vertices[0])
v_j = int(vertices[1])
w = float(vertices[2])
G[v_i][v_j] = w
G[v_j][v_i] = w
return G
class Vertex():
def __init__(self, vid, cid, nodes, k_in=0):
self._vid = vid
self._cid = cid
self._nodes = nodes
self._kin = k_in # 结点内部的边的权重
class Louvain():
def __init__(self, G):
self._G = G
self._m = 0 # 边数量
self._cid_vertices = {} # 需维护的关于社区的信息(社区编号,其中包含的结点编号的集合)
self._vid_vertex = {} # 需维护的关于结点的信息(结点编号,相应的Vertex实例)
for vid in self._G.keys():
self._cid_vertices[vid] = set([vid])
self._vid_vertex[vid] = Vertex(vid, vid, set([vid]))
self._m += sum([1 for neighbor in self._G[vid].keys() if neighbor > vid])
def first_stage(self):
mod_inc = False # 用于判断算法是否可终止
visit_sequence = self._G.keys()
random.shuffle(list(visit_sequence))
while True:
can_stop = True # 第一阶段是否可终止
for v_vid in visit_sequence:
v_cid = self._vid_vertex[v_vid]._cid
k_v = sum(self._G[v_vid].values()) + self._vid_vertex[v_vid]._kin
cid_Q = {}
for w_vid in self._G[v_vid].keys():
w_cid = self._vid_vertex[w_vid]._cid
if w_cid in cid_Q:
continue
else:
tot = sum(
[sum(self._G[k].values()) + self._vid_vertex[k]._kin for k in self._cid_vertices[w_cid]])
if w_cid == v_cid:
tot -= k_v
k_v_in = sum([v for k, v in self._G[v_vid].items() if k in self._cid_vertices[w_cid]])
delta_Q = k_v_in - k_v * tot / self._m # 由于只需要知道delta_Q的正负,所以少乘了1/(2*self._m)
cid_Q[w_cid] = delta_Q
cid, max_delta_Q = sorted(cid_Q.items(), key=lambda item: item[1], reverse=True)[0]
if max_delta_Q > 0.0 and cid != v_cid:
self._vid_vertex[v_vid]._cid = cid
self._cid_vertices[cid].add(v_vid)
self._cid_vertices[v_cid].remove(v_vid)
can_stop = False
mod_inc = True
if can_stop:
break
return mod_inc
def second_stage(self):
cid_vertices = {}
vid_vertex = {}
for cid, vertices in self._cid_vertices.items():
if len(vertices) == 0:
continue
new_vertex = Vertex(cid, cid, set())
for vid in vertices:
new_vertex._nodes.update(self._vid_vertex[vid]._nodes)
new_vertex._kin += self._vid_vertex[vid]._kin
for k, v in self._G[vid].items():
if k in vertices:
new_vertex._kin += v / 2.0
cid_vertices[cid] = set([cid])
vid_vertex[cid] = new_vertex
G = collections.defaultdict(dict)
for cid1, vertices1 in self._cid_vertices.items():
if len(vertices1) == 0:
continue
for cid2, vertices2 in self._cid_vertices.items():
if cid2 <= cid1 or len(vertices2) == 0:
continue
edge_weight = 0.0
for vid in vertices1:
for k, v in self._G[vid].items():
if k in vertices2:
edge_weight += v
if edge_weight != 0:
G[cid1][cid2] = edge_weight
G[cid2][cid1] = edge_weight
self._cid_vertices = cid_vertices
self._vid_vertex = vid_vertex
self._G = G
def get_communities(self):
communities = []
for vertices in self._cid_vertices.values():
if len(vertices) != 0:
c = set()
for vid in vertices:
c.update(self._vid_vertex[vid]._nodes)
communities.append(c)
return communities
def execute(self):
iter_time = 1
while True:
iter_time += 1
mod_inc = self.first_stage()
if mod_inc:
self.second_stage()
else:
break
return self.get_communities()
if __name__ == '__main__':
G = load_graph('s.txt')
algorithm = Louvain(G)
communities = algorithm.execute()
# 按照社区大小从大到小排序输出
communities = sorted(communities, key=lambda b: -len(b)) # 按社区大小排序
count = 0
for communitie in communities:
count += 1
print("社区", count, " ", communitie)
使用community安装python-louvain即可
pip install python-louvain
pip install networkx
最佳划分
community.best_partition(graph, partition=None, weight='weight', resolution=1.0)
Compute the partition of the graph nodes which maximises the modularity (or try…) using the Louvain heuristics.
This is the partition of highest modularity, i.e. the highest partition of the dendrogram generated by the Louvain algorithm.
import community
import networkx as nx
import matplotlib.pyplot as plt
#better with karate_graph() as defined in networkx example.
#erdos renyi don't have true community structure
G = nx.erdos_renyi_graph(30, 0.05)
#first compute the best partition
partition = community.best_partition(G)
#drawing
size = float(len(set(partition.values())))
pos = nx.spring_layout(G)
count = 0.
for com in set(partition.values()) :
count = count + 1.
list_nodes = [nodes for nodes in partition.keys()
if partition[nodes] == com]
nx.draw_networkx_nodes(G, pos, list_nodes, node_size = 20,
node_color = str(count / size))
nx.draw_networkx_edges(G,pos, alpha=0.5)
plt.show()