中间被很多人转了,我是从机器之心公众号(almosthuman2014)看到的,最初来源应该是 Maël Fabien 大佬的博客,致谢
https://github.com/maelfabien/Machine_Learning_Tutorials
import numpy as np
import random
import networkx as nx
from IPython.display import Image
import matplotlib.pyplot as plt
# Build the karate club graph provided by networkx.
G_karate = nx.karate_club_graph()
# Compute a spring layout once and store the node coordinates in `pos`.
pos = nx.spring_layout(G_karate)
# Draw the labelled graph at those coordinates.
nx.draw(
    G_karate,
    pos=pos,
    with_labels=True,
    cmap=plt.get_cmap('rainbow'),
)
「空手道」图:Wayne W. Zachary 在 1970 到 1972 年这三年中研究的一个空手道俱乐部的社交网络。该网络包含了这个空手道俱乐部的 34 个成员,成员对之间的连接表示他们在俱乐部之外也有联系。在研究期间,管理员 JohnA 与教练 Mr.Hi(化名)之间出现了冲突,导致俱乐部一分为二。一半成员围绕 Mr.Hi 形成了一个新的俱乐部,另一半则找了一个新教练或放弃了空手道。基于收集到的数据,除了其中一个成员,Zachary 正确分配了所有成员在分裂之后所进入的分组。
图 G=(V, E) 由下列要素构成:
# Node count, read from the graph itself instead of being hard-coded.
n = G_karate.number_of_nodes()
G_karate.degree() # degree (number of adjacent nodes) of every node, returned as a DegreeView
''' 结果
DegreeView({0: 16, 1: 9, 2: 10, 3: 6, 4: 3, 5: 4, 6: 4, 7: 4, 8: 5, 9: 2, 10: 3, 11: 1, 12: 2, 13: 5, 14: 2, 15: 2, 16: 2, 17: 2, 18: 2, 19: 3, 20: 2, 21: 2, 22: 2, 23: 5, 24: 3, 25: 3, 26: 2, 27: 4, 28: 3, 29: 4, 30: 4, 31: 6, 32: 12, 33: 17})
'''
# Materialise the (node, degree) pairs.
degree_sequence = list(G_karate.degree())
# Pull out the degree column once, rather than rebuilding the array for every statistic.
degrees = np.array([degree for _, degree in degree_sequence])
# Edge count and summary statistics of the degree sequence.
nb_nodes = n
nb_arr = len(G_karate.edges())
avg_degree = np.mean(degrees)
med_degree = np.median(degrees)
max_degree = np.max(degrees)
min_degree = np.min(degrees)
# Print all of the figures.
print("Number of nodes : " + str(nb_nodes))
print("Number of edges : " + str(nb_arr))
print("Maximum degree : " + str(max_degree))
print("Minimum degree : " + str(min_degree))
print("Average degree : " + str(avg_degree))
print("Median degree : " + str(med_degree))
''' 结论:
Number of nodes : 34
Number of edges : 78
Maximum degree : 17
Minimum degree : 1
Average degree : 4.588235294117647 # 平均而言,该图中的每个人都连接了 4.6 个人
Median degree : 3.0
'''
# Plot the histogram of the degrees.
# The degree histogram matters: it helps identify which kind of graph we are looking at.
degree_freq = np.array(nx.degree_histogram(G_karate)).astype('float')
plt.figure(figsize=(12, 8))
plt.stem(degree_freq)
# Axis labels spelled correctly ("Frequency" / "Degree").
plt.ylabel("Frequency")
plt.xlabel("Degree")
plt.show()
存储图的方式有三种,最好的表示方式取决于用法和可用的内存,以及你想用它做什么:
1:[2,3,4]
表示与节点 1 相邻的节点有 2、3、4(即存在边 1–2、1–3、1–4)。图可能包含一些扩展:
两种主要的图类型:
定义
python生成 Erdos-Rényi 图的内置函数
# Generate an Erdos-Renyi random graph with n nodes and edge probability p.
# The fixed seed makes the generated graph reproducible.
n = 50
p = 0.2
G_erdos = nx.erdos_renyi_graph(n, p, seed=100)
# Plot the graph.
plt.figure(figsize=(12, 8))
nx.draw(G_erdos, node_size=10)
# Plot the histogram of its degrees on a separate figure.
degree_freq = np.array(nx.degree_histogram(G_erdos)).astype('float')
plt.figure(figsize=(12, 8))
plt.stem(degree_freq)
plt.ylabel("Frequency")  # spelling fixed (was "Frequence")
plt.xlabel("Degree")
plt.show()
# Get the list of (node, degree) pairs of the Erdos-Renyi graph.
degree_sequence_erdos = list(G_erdos.degree())
# Pull out the degree column once, rather than rebuilding the array for every statistic.
degrees_erdos = np.array([degree for _, degree in degree_sequence_erdos])
# Node count is read from the graph so it cannot drift from a reassigned `n`.
nb_nodes = G_erdos.number_of_nodes()
nb_arr = len(G_erdos.edges())
avg_degree = np.mean(degrees_erdos)
med_degree = np.median(degrees_erdos)
max_degree = np.max(degrees_erdos)
min_degree = np.min(degrees_erdos)
# Expected degree of a node: each of the other nb_nodes - 1 nodes is linked with probability p.
esp_degree = (nb_nodes - 1) * p
print("Number of nodes : " + str(nb_nodes))
print("Number of edges : " + str(nb_arr))
print("Maximum degree : " + str(max_degree))
print("Minimum degree : " + str(min_degree))
print("Average degree : " + str(avg_degree))
print("Expected degree : " + str(esp_degree))
print("Median degree : " + str(med_degree))
'''结果(以下输出对应 n = 200、p = 0.2 时的一次运行,与上方代码中的 n = 50 不一致):
Number of nodes : 200
Number of edges : 3949
Maximum degree : 56
Minimum degree : 25
Average degree : 39.49
Expected degree : 39.800000000000004
Median degree : 39.5
这里的平均度与期望度 (n-1)·p 非常接近:二者只因随机采样而存在微小偏差
'''
定义
python生成 Barabasi-Albert 图的内置函数
# Generate a Barabasi-Albert graph with n nodes, using m as the attachment parameter.
n = 150
m = 3
G_barabasi = nx.barabasi_albert_graph(n, m)
# Plot the graph.
plt.figure(figsize=(12, 8))
nx.draw(G_barabasi, node_size=10)
# This degree distribution is said to be scale-free, so the average degree is not informative.
degree_freq = np.array(nx.degree_histogram(G_barabasi)).astype('float')
plt.figure(figsize=(12, 8))
plt.stem(degree_freq)
plt.ylabel("Frequency")  # spelling fixed (was "Frequence")
plt.xlabel("Degree")
plt.show()
# Get the list of (node, degree) pairs of the Barabasi-Albert graph.
# (This section previously re-read G_erdos, so it printed the Erdos-Renyi figures again.)
degree_sequence_barabasi = list(G_barabasi.degree())
# Pull out the degree column once, rather than rebuilding the array for every statistic.
degrees_barabasi = np.array([degree for _, degree in degree_sequence_barabasi])
nb_nodes = G_barabasi.number_of_nodes()
nb_arr = len(G_barabasi.edges())
avg_degree = np.mean(degrees_barabasi)
med_degree = np.median(degrees_barabasi)
max_degree = np.max(degrees_barabasi)
min_degree = np.min(degrees_barabasi)
# (n - 1) * p is the Erdos-Renyi expectation and does not apply to this model.
# Assumes the generator ends up with m * (n - m) edges, i.e. a mean degree of
# 2 * m * (n - m) / n -- TODO confirm against the installed networkx version.
esp_degree = 2 * m * (nb_nodes - m) / nb_nodes
print("Number of nodes : " + str(nb_nodes))
print("Number of edges : " + str(nb_arr))
print("Maximum degree : " + str(max_degree))
print("Minimum degree : " + str(min_degree))
print("Average degree : " + str(avg_degree))
print("Expected degree : " + str(esp_degree))
print("Median degree : " + str(med_degree))
''' 结果(注意:以下数字与上文 Erdos-Rényi 部分的输出完全相同,应为复制粘贴所致,并非 n = 150、m = 3 的 Barabasi-Albert 图的实际输出):
Number of nodes : 200
Number of edges : 3949
Maximum degree : 56
Minimum degree : 25
Average degree : 39.49
Expected degree : 39.800000000000004
Median degree : 39.5
'''
Pathfinding(寻路):根据可用性和质量等条件确定最优路径。我们也将搜索算法包含在这一类别中。这可用于确定最快路由或流量路由。
更多最短路径问题的介绍:https://en.wikipedia.org/wiki/Shortest_path_problem
# Shortest path between every pair of nodes, as {source: {target: [nodes on the path]}}.
# dict() guarantees a mapping: networkx has announced that calling shortest_path without
# source/target will yield (node, paths) pairs instead of returning a dict; dict() accepts both.
dict(nx.shortest_path(G_karate))
'''结果
{0: {0: [0],
1: [0, 1],
2: [0, 2],
...
'''
# Shortest-path lengths from every node, collected as a list of (source, {target: length}) pairs.
[*nx.all_pairs_shortest_path_length(G_karate)]
'''结果
[(0,
{0: 0,
1: 1,
2: 1,
3: 1,
4: 1,
...
'''
from networkx.algorithms import tree
# Minimum spanning edges of the karate graph via Prim's algorithm, without edge data.
mst = tree.minimum_spanning_edges(
    G_karate,
    data=False,
    algorithm='prim',
)
# Collect the edges into a list, then show them in sorted order.
edgelist = [*mst]
sorted(edgelist)
'''
[(0, 1),
(0, 2),
(0, 3),
(0, 4),
(0, 5),
(0, 6),
'''
Community detection(社群检测):评估群体聚类的方式。这可用于划分客户或检测欺诈等。
# itertools is needed for islice below and is not imported anywhere else in this file.
import itertools

from networkx.algorithms import community
k = 1  # k = 1 means we expect to obtain 2 communities
comp = community.girvan_newman(G_karate)
# Take the first k results from the iterator and print each one as a tuple of sorted node lists.
for communities in itertools.islice(comp, k):
    print(tuple(sorted(c) for c in communities))
''' 得到一个属于每个社群的节点的列表:
([0, 1, 3, 4, 5, 6, 7, 10, 11, 12, 13, 16, 17, 19, 21], [2, 8, 9, 14, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33])
'''
# Requires the python-louvain package (shell command, not Python: pip install python-louvain).
# Compute the best partition with the Louvain method.
# The package is imported under an alias so that it does not rebind the name `community`,
# which this file also uses for networkx.algorithms.community.
import community as community_louvain
partition = community_louvain.best_partition(G_karate)
# Fresh layout for this figure (these were two statements fused on one line).
pos = nx.spring_layout(G_karate)
plt.figure(figsize=(8, 8))
plt.axis('off')
# Look the community up per node so colours line up with the order in which nodes are drawn.
node_colors = [partition[node] for node in G_karate.nodes()]
nx.draw_networkx_nodes(G_karate, pos, node_size=600, cmap=plt.cm.RdYlBu, node_color=node_colors)
nx.draw_networkx_edges(G_karate, pos, alpha=0.3)
# plt.show() does not take a graph; it displays the current figure(s).
plt.show()
# Connectivity tests for a directed graph.
# No `G` is defined anywhere in this file, so a directed copy of the karate graph
# is used as the example input for the two directed-graph tests.
G = G_karate.to_directed()
nx.is_weakly_connected(G)
nx.is_strongly_connected(G)
# Connectivity test for an undirected graph:
nx.is_connected(G_karate)
networkx 文档中有关连接性实现的问题:https://networkx.github.io/documentation/stable/reference/algorithms/component.html
from sklearn.cluster import AgglomerativeClustering

# Before applying hierarchical clustering, build the matrix of distances between every pair of nodes.
pcc_longueurs = list(nx.all_pairs_shortest_path_length(G_karate))
lengths_by_source = dict(pcc_longueurs)
# Size the matrix from the graph itself: the loose variable `n` was last set for another graph.
nodes = list(G_karate.nodes())
n_nodes = len(nodes)
distances = np.zeros((n_nodes, n_nodes))  # distances[i, j] is the length of the shortest path between i and j
for i, source in enumerate(nodes):
    for j, target in enumerate(nodes):
        distances[i, j] = lengths_by_source[source][target]
# Hierarchical clustering on the precomputed distance matrix.
# Newer scikit-learn names the parameter `metric`; older releases only know `affinity`,
# and reject the unknown keyword with a TypeError, hence the fallback.
try:
    model = AgglomerativeClustering(n_clusters=2, linkage='average', metric='precomputed')
except TypeError:
    model = AgglomerativeClustering(n_clusters=2, linkage='average', affinity='precomputed')
clustering = model.fit_predict(distances)
# Draw the graph, colouring each node by the cluster it was assigned to.
nx.draw(G_karate, node_color = clustering)
# Local clustering coefficient of every node of the Barabasi-Albert graph, as a list.
[*nx.clustering(G_barabasi).values()]
'''
0.13636363636363635,
0.2,
0.07602339181286549,
0.04843304843304843,
0.09,
0.055384615384615386,
0.07017543859649122,
'''
# Average the local coefficients into one value for the whole graph
# (these notes call it the global clustering coefficient).
np.asarray([*nx.clustering(G_barabasi).values()]).mean()
'''
0.0965577637155059
'''
Centrality(中心性):确定网络中节点的重要性。这可用于识别社交网络中有影响力的人或识别网络中潜在的攻击目标。
PageRank 是根据所连接的相邻节点,然后再根据它们各自的相邻节点估计当前节点的重要性
用途:
Neo4J 对 PageRank 算法的总结
案例
# PageRank of the karate graph's nodes, computed with a damping parameter of 0.9.
nx.pagerank(G_karate, alpha=0.9) # alpha is the damping parameter (0.85 by default)
''' 返回一个字典(节点 → PageRank 值),此处只列出前几项:
{0: 0.09923208031303203,
1: 0.0543403155825792,
2: 0.05919704684187155,
3: 0.036612460562853694,
'''
# Centrality values of the karate graph's nodes for four measures, each as a list.
c_degree = list(nx.degree_centrality(G_karate).values())
c_eigenvector = list(nx.eigenvector_centrality(G_karate).values())
c_closeness = list(nx.closeness_centrality(G_karate).values())
c_betweenness = list(nx.betweenness_centrality(G_karate).values())

# Plot the centrality of the nodes on a 2x2 grid.
# The figure and its axes are created in one call, so the panels always land on the
# new 18x12 figure instead of relying on that figure happening to be number 1.
f, axarr = plt.subplots(2, 2, figsize=(18, 12))
# Titles name the measure actually computed (eigenvector / closeness).
panels = [
    ('Degree Centrality', c_degree),
    ('Eigenvector Centrality', c_eigenvector),
    ('Closeness Centrality', c_closeness),
    ('Betweenness Centrality', c_betweenness),
]
# axarr.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for ax, (title, scores) in zip(axarr.flat, panels):
    nx.draw(G_karate, ax=ax, cmap=plt.get_cmap('inferno'), node_color=scores, node_size=300, pos=pos, with_labels=True)
    ax.set_title(title, size=16)
扩展阅读:
- Neo4j 的图算法全面指南,Mark Needham & Amy E. Hodler:https://go.neo4j.com/rs/710-RRC-335/images/Comprehensive-Guide-to-Graph-Algorithms-in-Neo4j-ebook-EN-US.pdf
- Networkx 文档:https://networkx.github.io/documentation/stable/