应师兄要求,阅读了有关重叠社区发现的文章Finding overlapping communities in networks by label propagation,文中提出了一种基于LPA的扩展算法COPRA,可以用于重叠社区的发现。简单来说,COPRA算法为每个节点添加了一个标签列表,列表的长度为算法的一个参数v,每个节点最多可以拥有v个标签,也即可以存在于最多v个社区之中。 此外,COPRA 也不再使用 LPA 算法的终止条件,因为算法往往不能收敛于所有节点的标签都不改变。于是算法改而跟踪每轮计算结束后网络上还剩下的标签集合的大小,当连续两轮中这个值不变,就认为满足终止条件。(注意:“两轮不变”不意味着一直不变,有可能在第三轮就发生变化)
该算法依然具有标签传播算法固有的随机性强、鲁棒性差、容易把所有顶点分配给一个社区等缺点,并且可能划分出一些无意义的社区。修正策略依然是:1、标签熵 2、迭代过程改进(参考邻居和自身标签)
文中给出了比较详细的伪代码(但代码的格式简直。。。),在其基础上尝试完成了该算法的实现工作,为了与LPA算法作比较,此处暂且使用karate等经典社区图集,显示结果也在此基础上稍作改动。
#coding=utf-8
from numpy import *
import time
import copy
def LoadAdjacentMatrixData(filename, vertices):
    """Read a tab-separated 0/1 adjacency matrix from `filename`.

    filename: path to a text file, one matrix row per line, entries
              separated by '\t'.
    vertices: number of nodes; the returned matrix is vertices x vertices,
              zero-filled, so short/missing rows simply leave zeros behind.
    Returns a list-of-lists of ints.
    """
    Adjmartrix = [[0 for col in range(vertices)] for row in range(vertices)]
    # 'with' guarantees the handle is closed; the original opened the file
    # and never closed it (resource leak).
    with open(filename, 'r') as file_object:
        for x, line in enumerate(file_object):
            t_list = line.strip().split('\t')
            for y in range(len(t_list)):
                Adjmartrix[x][y] = int(t_list[y])
    return Adjmartrix
def Degree_Sorting(Adjmartrix, vertices):
    """Scan the adjacency matrix once per row and collect, for every node,
    its degree and its neighbour list.

    Returns (degree_s, neighbours, m) where degree_s is a list of
    [node_id, degree] pairs in node-id order, neighbours[i] is the list of
    nodes adjacent to i, and m is the edge count (each edge seen twice in
    the symmetric matrix, hence the final halving).
    """
    degree_s = [[node, 0] for node in range(vertices)]
    neighbours = [[] for _ in range(vertices)]
    twice_edges = 0
    for row in range(vertices):
        for col in range(vertices):
            if Adjmartrix[row][col] == 1:
                degree_s[row][1] += 1
                twice_edges += 1
                neighbours[row].append(col)
    return degree_s, neighbours, twice_edges / 2
def Propagate(x, old, new, neighbours, v, asynchronous):
    """One COPRA update of node x's label dict.

    x: node id; old/new: per-node {label: belonging-coefficient} dicts
    (old is read, new[x] is written in place); neighbours: adjacency lists;
    v: max labels per node (coefficients below 1/v are pruned);
    asynchronous: when true, old[x] is overwritten immediately so later
    nodes in the same sweep already see x's fresh labels.
    """
    #new[x] = {}
    # Shuffle the neighbour list so ties are broken randomly.
    # NOTE(review): `random` here comes from `from numpy import *`, i.e.
    # numpy.random — confirm that is intended rather than stdlib random.
    random.shuffle(neighbours[x])
    # Accumulate belonging coefficients from the neighbours' label sets.
    for eachpoint in neighbours[x]:
        for eachlable in old[eachpoint]:
            b = old[eachpoint][eachlable]
            if eachlable in new[x]:
                new[x][eachlable] += b
            else:
                new[x].update({eachlable: b})
    if asynchronous:
        # Publish the (not yet normalized) result for same-sweep readers.
        old[x] = copy.deepcopy(new[x])
    Normalize(new[x])
    #print new[x]
    maxb = 0.0
    maxc = 0
    t = []
    # Drop candidates below the 1/v threshold; if ALL fall below, keep only
    # the label with the largest coefficient, otherwise renormalize the rest.
    for each in new[x]:
        if new[x][each] < 1/float(v):
            t.append(each)
        if new[x][each] >= maxb:  # ties: keep the last maximum seen
            #if new[x][each] > maxb:  # alternative: keep the first maximum
            maxb = new[x][each]
            maxc = each
    for i in range(len(t)):
        del new[x][t[i]]
    if len(new[x]) == 0:
        new[x][maxc] = 1
    else:
        Normalize(new[x])
def Normalize(x):
    """Rescale the dict's values in place so they sum to 1.

    When the values sum to zero (including the empty dict) the dict is
    left untouched.
    """
    total = sum(x.values())
    if total != 0:
        for key in x:
            x[key] = x[key] / total
def id_l(l):
    """Map id_x over a list of label dicts, yielding each node's label list."""
    return [id_x(labels) for labels in l]
def id_x(x):
    """Return the labels (keys) of one node's label dict as a list."""
    return [label for label in x]
def count(l):
    """Tally how many nodes carry each label.

    l: list of per-node {label: coefficient} dicts.
    Returns {label: number_of_nodes_holding_it}.
    """
    tally = {}
    for node_labels in l:
        for label in node_labels:
            tally[label] = tally.get(label, 0) + 1
    return tally
def mc(cs1, cs2):
    """Per-label minimum of two label-count dicts.

    Only labels present in BOTH dicts survive; each maps to the smaller
    of its two counts. Used by the COPRA stopping rule.
    """
    return {label: min(cs1[label], cs2[label]) for label in cs1 if label in cs2}
def Modulartiy(A, coms, sums, vertices):
    """Newman modularity Q = sum_c [ l_c/m - (d_c/(2m))^2 ].

    A: adjacency matrix; coms: {community_label: [nodes]}; sums: edge
    count m; vertices: total node count. Returns Q as a float.

    Fix: the original ran `li /= 2` and `(di*di)/(sums*4)` which are
    INTEGER divisions under Python 2, silently truncating both terms;
    float division is forced here so the result is exact.
    """
    Q = 0.0
    for eachc in coms:
        # l_c: edges inside the community (the double loop counts each twice)
        li = 0
        for eachp in coms[eachc]:
            for eachq in coms[eachc]:
                li += A[eachp][eachq]
        li /= 2.0
        # d_c: total degree of the community's nodes
        di = 0
        for eachp in coms[eachc]:
            for eachq in range(vertices):
                di += A[eachp][eachq]
        Q = Q + (li - (di * di) / (sums * 4.0))
    Q = Q / float(sums)
    return Q
def ExtendQ(A, coms, sums, k, o):
    """Extended modularity EQ for overlapping communities.

    A: adjacency matrix; coms: {community_label: [nodes]}; sums: edge
    count m; k: [node, degree] pairs (any order); o: per-node count of
    communities the node belongs to. Returns EQ as a float.

    Fix (per the author's own 4-26 update note): the direct formula loses
    float precision, so it is regrouped — accumulate the A-term and the
    degree-term separately and apply the 1/(2m) factors once at the end.
    `k` is also re-sorted by node id so indexing k[node] is always valid
    regardless of the caller's ordering.
    """
    s = float(2 * sums)
    # Index k by node id regardless of how the caller sorted it.
    k = sorted(k, key=lambda pair: pair[0])
    at = 0.0
    kt = 0.0
    for eachc in coms:
        for eachp in coms[eachc]:
            for eachq in coms[eachc]:
                denom = float(o[eachp] * o[eachq])
                at += A[eachp][eachq] / denom
                kt += k[eachp][1] * k[eachq][1] / denom
    return (at - kt / s) / s
def getcoms(degree_s, neighbours, sums, A, v, vertices):
    """Run COPRA label propagation and post-process into communities.

    degree_s: [node, degree] pairs; neighbours: adjacency lists;
    sums: edge count m; A: adjacency matrix; v: max labels per node
    (the COPRA parameter); vertices: node count.
    Prints iteration count, timing and the Q/EQ quality metrics, and
    returns the {community_label: [nodes]} dict.
    """
    label_new = [{} for i in range(vertices)]
    # Every node starts with itself as its only label, coefficient 1.
    label_old = [{i: 1} for i in range(vertices)]
    minl = {}
    oldmin = {}
    flag = True# asynchronous update mode
    itera = 0# iteration counter
    start = time.clock()# timing; NOTE(review): time.clock() was removed in Python 3.8
    # Propagation loop (synchronous/asynchronous depending on flag).
    while True:
        ''' if flag: flag = False else: flag = True '''
        itera += 1
        for each in degree_s:
            Propagate(each[0], label_old, label_new, neighbours, v, flag)
        # COPRA stopping rule: track per-label minimum node counts and stop
        # when that summary is unchanged between two consecutive rounds.
        if id_l(label_old) == id_l(label_new):
            minl = mc(minl, count(label_new))
        else:
            minl = count(label_new)
        if minl != oldmin:
            label_old = label_new
            oldmin = minl
        else:
            break
    print itera,label_old
    # Group nodes by label; sub tracks, per label, the set of labels that
    # have always co-occurred with it (used below to drop subset communities).
    coms = {}
    sub = {}
    for each in range(vertices):
        ids = id_x(label_old[each])
        for eachc in ids:
            if eachc in coms and eachc in sub:
                coms[eachc].append(each)
                #elif :
                sub.update({eachc: set(sub[eachc]) & set(ids)})
            else:
                coms.update({eachc:[each]})
                sub.update({eachc:ids})
    print 'lastiter',coms
    # o[i]: number of communities node i belongs to (consumed by ExtendQ).
    o = [0 for i in range(vertices)]
    for eachid in range(vertices):
        for eachl in coms:
            if eachid in coms[eachl]:
                o[eachid] += 1
    # Remove communities that are contained in others: for every label whose
    # members always co-carry label `each`, subtract `each`'s node set.
    for each in sub:
        if len(sub[each]):
            for eachc in sub[each]:
                if eachc != each:
                    coms[eachc] = list(set(coms[eachc]) - set(coms[each]))
    # Flatten non-empty communities into a 1-based clustering vector.
    clusterment = [0 for i in range(vertices)]
    a = 0
    for eachc in coms:
        if len(coms[eachc])!=0:
            for e in coms[eachc]:
                clusterment[e] = a + 1
            a += 1
    # Restore node-id order before handing degree_s to ExtendQ.
    degree_s = sorted(degree_s, key=lambda x: x[0], reverse=False)
    elapsed = (time.clock() - start)
    print 't=',elapsed
    print 'result=',coms
    print 'clusterment=',clusterment
    print 'Q =',Modulartiy(A, coms, sums,vertices)
    print 'EQ =',ExtendQ(A,coms,sums,degree_s,o)
    #print 'NMI=',NMI(coms,coms)
    return coms
if __name__ == '__main__':
    # Benchmark driver: for each test graph, run COPRA with every value of
    # the per-node label cap v and print the resulting metrics.
    # vertex counts, V (classic data sets kept for reference)
    #vertices = [34,115,105,62]
    #txtlist = ['karate.txt','football.txt','books.txt','dolphins.txt']
    vertices = [64,128,256,512]
    txtlist = ['RN1.txt','RN2.txt','RN3.txt','RN4.txt']
    testv = [2,3,4,5]
    for i in range(len(txtlist)):
        print txtlist[i],vertices[i]
        for ev in testv:
            print 'v=',ev
            A = LoadAdjacentMatrixData(txtlist[i],vertices[i])
            degree_s, neighbours, sums = Degree_Sorting(A,vertices[i])
            #print neighbours
            getcoms(degree_s, neighbours, sums,A,ev,vertices[i])
4-26日更新:
有bug,EQ函数的计算由于float精度的问题不能直接使用公式,需要先将公式变形再计算
def ExtendQ(A,coms,sums,k,o):
    """Corrected EQ (4-26 update): the formula is algebraically regrouped so
    the 1/(2m) factors are applied once at the end, avoiding the float
    precision loss of the direct form; k is also re-sorted by node id so
    k[node] indexing is valid regardless of the caller's ordering.
    """
    #k - [node, degree] pairs per node; o - number of communities per node
    s = float(2*sums)
    # index k by node id regardless of how the caller sorted it
    k = sorted(k, key=lambda x: x[0], reverse=False)
    at = 0
    kt = 0
    EQ = 0.0
    for eachc in coms:
        for eachp in coms[eachc]:
            for eachq in coms[eachc]:
                # each term is down-weighted by the membership counts o_p * o_q
                at += A[eachp][eachq] / float(o[eachp]*o[eachq])
                kt += k[eachp][1]*k[eachq][1] / float(o[eachp]*o[eachq])
    EQ = at - kt / s
    return EQ/s