__init__:
from .neighbors import *
from .clustering import *
from .sparse import *
from .grid import *
from .fusion import *
from .attribute import *
# Public names exported by a star-import of this package.
# NOTE(review): ``fuse_mutli_sub_hg`` is defined in fusion.py but is not
# listed here - confirm whether that omission is intentional.
__all__ = [
    'gen_knn_hg',
    'gen_epsilon_ball_hg',
    'gen_clustering_hg',
    'gen_l1_hg',
    'gen_grid_neigh_hg',
    'concat_multi_hg',
    'gen_attribute_hg'
]
attribute.py
# coding=utf-8
import numpy as np
import scipy.sparse as sparse
from hyperg.hyperg import HyperG
from hyperg.utils import print_log
def gen_attribute_hg(n_nodes, attr_dict, X=None):
    """Build a hypergraph in which every attribute forms one hyperedge.

    :param n_nodes: int, number of nodes (rows of the incidence matrix)
    :param attr_dict: dict, {'attri_1': [node_idx_1, node_idx_2, ...], 'attri_2': [...]};
        maps each attribute to the indices of the nodes that carry it
    :param X: numpy array, shape = (n_samples, n_features) (optional);
        when given, its number of rows must equal ``n_nodes``
    :return: instance of HyperG
    """
    if X is not None:
        assert n_nodes == X.shape[0]

    # One hyperedge per attribute, numbered in the iteration order of the dict.
    n_edges = len(attr_dict)
    members_per_edge = [sorted(attr_dict[attr]) for attr in attr_dict]

    # Flatten the (node, edge) incidence pairs edge by edge.
    node_idx = np.asarray(
        [node for members in members_per_edge for node in members]
    )
    edge_idx = np.asarray(
        [edge for edge, members in enumerate(members_per_edge) for _ in members]
    )

    # Every incidence entry has weight 1.
    values = np.ones(len(node_idx))

    H = sparse.coo_matrix((values, (node_idx, edge_idx)), shape=(n_nodes, n_edges))
    return HyperG(H, X=X)
clustering.py
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import scipy.sparse as sparse
from hyperg.hyperg import HyperG
from hyperg.utils import print_log
def gen_clustering_hg(X, n_clusters, method="kmeans", with_feature=False, random_state=None):
    """Build a hypergraph in which every cluster forms one hyperedge.

    :param X: numpy array, shape = (n_samples, n_features)
    :param n_clusters: int, number of clusters (and therefore of hyperedges)
    :param method: str, clustering method; only "kmeans" is supported
    :param with_feature: bool, optional(default=False); attach ``X`` to the
        returned hypergraph as node features
    :param random_state: int, optional(default=None); forwarded to KMeans
    :return: instance of HyperG
    :raises ValueError: if ``method`` is not "kmeans"
    """
    if method != "kmeans":
        raise ValueError("{} method is not supported".format(method))
    labels = KMeans(n_clusters=n_clusters, random_state=random_state).fit(X).labels_

    assert n_clusters >= 1

    n_nodes = X.shape[0]
    # Node i belongs to exactly one hyperedge: the cluster it was assigned to.
    H = sparse.coo_matrix(
        (np.ones(n_nodes), (np.arange(n_nodes), labels)),
        shape=(n_nodes, n_clusters),
    )
    edge_weights = np.ones(n_clusters)

    feature_kwargs = {"X": X} if with_feature else {}
    return HyperG(H, w=edge_weights, **feature_kwargs)
fusion.py
import numpy as np
import scipy.sparse as sparse
from hyperg.hyperg import HyperG
#将多个超图链接一个超图
def concat_multi_hg(hg_list):
    """Concatenate multiple hypergraphs into one hypergraph.

    The incidence matrices are stacked horizontally and the hyperedge weight
    vectors are joined in the same order. Node features are taken from the
    first hypergraph that has any; every later hypergraph that has features
    must carry identical ones.

    :param hg_list: list, hypergraph instances to concatenate
    :return: instance of HyperG
    """
    incidence_blocks = [hg.incident_matrix() for hg in hg_list]
    weight_blocks = [hg.hyperedge_weights() for hg in hg_list]

    H = sparse.hstack(incidence_blocks)
    w = np.hstack(weight_blocks)

    shared_features = None
    for hg in hg_list:
        if hg.node_features() is None:
            continue
        if shared_features is None:
            shared_features = hg.node_features()
        else:
            assert (shared_features == hg.node_features()).all()

    return HyperG(H, X=shared_features, w=w)
#将多个小超图合成一个大超图
def fuse_mutli_sub_hg(hg_list):
    """Fuse several sub-hypergraphs into one block-diagonal hypergraph.

    The node and hyperedge indices of each sub-hypergraph are shifted by the
    total number of nodes / hyperedges of the sub-hypergraphs before it, so
    the parts do not overlap in the fused incidence matrix. Only the
    incidence structure is fused; the result is built as ``HyperG(H)``
    without weights or node features.

    :param hg_list: list, hypergraph instances to fuse
    :return: instance of HyperG
    :raises ValueError: if ``hg_list`` is empty
    """
    if len(hg_list) == 0:
        raise ValueError("hg_list must contain at least one hypergraph")

    rows, cols, data = [], [], []
    node_offset = 0
    edge_offset = 0
    for hg in hg_list:
        # Fetch the incidence matrix once and make sure it exposes
        # ``row``/``col``/``data`` (COO layout) before reading them.
        H_sub = hg.incident_matrix().tocoo()
        rows.append(H_sub.row + node_offset)
        cols.append(H_sub.col + edge_offset)
        data.append(H_sub.data)
        # Running totals are the index offsets of the next sub-hypergraph.
        node_offset += hg.num_nodes()
        edge_offset += hg.num_edges()

    H = sparse.coo_matrix(
        (np.concatenate(data), (np.concatenate(rows), np.concatenate(cols))),
        shape=(node_offset, edge_offset),
    )
    return HyperG(H)
grid.py
import numpy as np
import scipy.sparse as sparse
from hyperg.hyperg import HyperG
#用于生成一个表示网格邻域关系的超图
def gen_grid_neigh_hg(input_size):
    """Build the 3x3-neighbourhood hypergraph of a 2-D grid.

    Grid cells are numbered row by row (cell ``(r, c)`` has index
    ``r * w + c``). Every cell spawns one hyperedge that contains the cell
    itself and those of its eight surrounding cells that lie inside the grid.

    :param input_size: numpy array, shape = (2,), height and width of the grid
    :return: instance of HyperG
    """
    input_size = np.array(input_size).reshape(-1)
    assert input_size.shape[0] == 2
    h, w = input_size
    n_nodes = w * h

    node_set = np.arange(n_nodes)
    row = node_set // w
    col = node_set % w

    # Visit the nine relative offsets in row-major order and keep only the
    # (neighbour, centre) pairs whose neighbour falls inside the grid. Explicit
    # bounds checks replace the former ``np.bool`` mask, which no longer exists
    # in current NumPy releases.
    node_parts = []
    edge_parts = []
    for d_row in (-1, 0, 1):
        for d_col in (-1, 0, 1):
            n_row = row + d_row
            n_col = col + d_col
            inside = (n_row >= 0) & (n_row < h) & (n_col >= 0) & (n_col < w)
            node_parts.append((n_row * w + n_col)[inside])
            edge_parts.append(node_set[inside])

    node_idx = np.hstack(node_parts)
    edge_idx = np.hstack(edge_parts)
    values = np.ones_like(node_idx)

    # One hyperedge per grid cell.
    n_edges = n_nodes
    H = sparse.coo_matrix((values, (node_idx, edge_idx)), shape=(n_nodes, n_edges))
    return HyperG(H)
# When this module is run as a script, build the neighbourhood hypergraph of
# a 4 x 5 grid; the result is discarded.
if __name__ == "__main__":
    gen_grid_neigh_hg((4, 5))
neighbors.py
import numpy as np
from sklearn.metrics import pairwise_distances
import scipy.sparse as sparse
from hyperg.hyperg import HyperG
from hyperg.utils import print_log
#生成一个k最近邻超图
def gen_knn_hg(X, n_neighbors, is_prob=True, with_feature=False):
    """Build a k-nearest-neighbour hypergraph.

    Every sample spawns one hyperedge containing the sample itself and its
    nearest neighbours under the default metric of ``pairwise_distances``,
    ``n_neighbors + 1`` nodes in total.

    :param X: numpy array, shape = (n_samples, n_features)
    :param n_neighbors: int, number of neighbours per hyperedge; must be
        smaller than the number of samples
    :param is_prob: bool, optional(default=True); if True the incidence
        values are ``exp(-d^2 / avg_dist^2)`` where ``d`` is the distance to
        the hyperedge's centre sample, otherwise they are all 1
    :param with_feature: bool, optional(default=False); attach ``X`` to the
        returned hypergraph as node features
    :return: instance of HyperG
    :raises ValueError: if ``n_neighbors`` is not smaller than the number of
        samples
    """
    assert isinstance(X, (np.ndarray, list))
    assert n_neighbors > 0

    X = np.array(X)
    n_nodes = X.shape[0]
    n_edges = n_nodes

    if n_neighbors >= n_nodes:
        raise ValueError(
            "n_neighbors ({}) must be smaller than the number of samples ({})".format(
                n_neighbors, n_nodes
            )
        )

    m_dist = pairwise_distances(X)

    # Partitioning at kth=n_neighbors puts the n_neighbors + 1 smallest
    # distances of every row in columns 0..n_neighbors, with the largest of
    # them in the last kept column. This also allows
    # n_neighbors == n_samples - 1, which kth=n_neighbors + 1 rejected.
    m_neighbors = np.argpartition(m_dist, kth=n_neighbors, axis=1)[:, :n_neighbors + 1]
    m_neighbors_val = np.take_along_axis(m_dist, m_neighbors, axis=1)

    # A sample can be missing from its own neighbourhood when other samples
    # are at distance 0 from it. In that case replace the farthest kept
    # neighbour (last column) by the sample itself.
    own_idx = np.arange(n_nodes)
    lacks_self = ~np.any(m_neighbors == own_idx[:, None], axis=1)
    m_neighbors[lacks_self, -1] = own_idx[lacks_self]
    m_neighbors_val[lacks_self, -1] = 0.

    # COO coordinates: row i of m_neighbors lists the nodes of hyperedge i.
    node_idx = m_neighbors.reshape(-1)
    edge_idx = np.tile(np.arange(n_edges).reshape(-1, 1), (1, n_neighbors + 1)).reshape(-1)

    if not is_prob:
        values = np.ones(node_idx.shape[0])
    else:
        avg_dist = np.mean(m_dist)
        m_neighbors_val = m_neighbors_val.reshape(-1)
        values = np.exp(-np.power(m_neighbors_val, 2.) / np.power(avg_dist, 2.))

    H = sparse.coo_matrix((values, (node_idx, edge_idx)), shape=(n_nodes, n_edges))
    w = np.ones(n_edges)

    if with_feature:
        return HyperG(H, w=w, X=X)
    return HyperG(H, w=w)
def gen_epsilon_ball_hg(X, ratio, is_prob=True, with_feature=False):
    """Build an epsilon-ball hypergraph.

    Every sample spawns one hyperedge containing all samples whose distance
    to it is at most ``ratio`` times the mean of the pairwise distance matrix.

    :param X: numpy array, shape = (n_samples, n_features)
    :param ratio: float, fraction of the average pairwise distance used as
        the ball radius
    :param is_prob: bool, optional(default=True); if True the incidence
        values are ``exp(-d^2 / avg_dist^2)``, otherwise they are all 1
    :param with_feature: bool, optional(default=False); attach ``X`` to the
        returned hypergraph as node features
    :return: instance of HyperG
    """
    assert isinstance(X, (np.ndarray, list))
    assert ratio > 0

    X = np.array(X)
    n_nodes = X.shape[0]

    m_dist = pairwise_distances(X)
    avg_dist = np.mean(m_dist)
    radius = ratio * avg_dist

    # Row index = centre sample (hyperedge), column index = member node.
    inside_ball = np.where(m_dist <= radius)
    edge_idx, node_idx = inside_ball

    if is_prob:
        member_dist = m_dist[inside_ball]
        values = np.exp(-np.power(member_dist, 2.) / np.power(avg_dist, 2.))
    else:
        values = np.ones(node_idx.shape[0])

    # One hyperedge per sample, so the incidence matrix is square.
    H = sparse.coo_matrix((values, (node_idx, edge_idx)), shape=(n_nodes, n_nodes))
    edge_weights = np.ones(n_nodes)

    feature_kwargs = {"X": X} if with_feature else {}
    return HyperG(H, w=edge_weights, **feature_kwargs)
sparse.py
import numpy as np
from sklearn.metrics import pairwise_distances
import scipy.sparse as sparse
import cvxpy as cp
from cvxpy.error import SolverError
from hyperg.hyperg import HyperG
from hyperg.utils import print_log
# TODO: 1. elastic net hypergraph
def gen_l1_hg(X, gamma, n_neighbors, log=False, with_feature=False):
    """Build a hypergraph from L1-regularised reconstruction coefficients.

    Every sample spawns one hyperedge. The sample itself enters with value 1;
    each of its nearest neighbours enters with the non-negative coefficient
    obtained by minimising ``||P^T x - v||_2 + gamma * ||x||_1``, where the
    rows of ``P`` are the neighbours and ``v`` is the sample. Only the L1
    penalty is implemented here; there is no L2 (elastic-net) term.

    :param X: numpy array, shape = (n_samples, n_features)
    :param gamma: float, weight of the L1 penalty on the coefficients
    :param n_neighbors: int, number of nearest neighbours used to reconstruct
        each sample
    :param log: bool, optional(default=False); log the index of each
        hyperedge as it is processed
    :param with_feature: bool, optional(default=False); attach ``X`` to the
        returned hypergraph as node features
    :return: instance of HyperG
    :raises SolverError: if the solver returns no solution for a hyperedge
    """
    assert n_neighbors >= 1.
    assert isinstance(X, np.ndarray)
    assert X.ndim == 2

    n_nodes = X.shape[0]
    n_edges = n_nodes

    m_dist = pairwise_distances(X)
    m_neighbors = np.argsort(m_dist)[:, 0:n_neighbors+1]

    node_idx = []
    edge_idx = []
    values = []

    for i_edge in range(n_edges):
        if log:
            print_log("processing edge {} ".format(i_edge))

        # Drop the centre sample from its own candidate list; if it is not
        # there (other samples at distance 0), drop the farthest candidate.
        neighbors = m_neighbors[i_edge].tolist()
        if i_edge in neighbors:
            neighbors.remove(i_edge)
        else:
            neighbors = neighbors[:-1]

        if neighbors:
            P = X[neighbors, :]
            v = X[i_edge, :]

            # Non-negative sparse reconstruction of v from the rows of P.
            x = cp.Variable(P.shape[0], nonneg=True)
            objective = cp.Minimize(cp.norm((P.T@x).T-v, 2) + gamma * cp.norm(x, 1))
            prob = cp.Problem(objective)
            try:
                prob.solve()
            except SolverError:
                # Retry with SCS if the first solve raises.
                prob.solve(solver='SCS', verbose=False)

            if x.value is None:
                raise SolverError(
                    "no solution found for edge {} (status: {})".format(i_edge, prob.status)
                )
            coefficients = x.value.tolist()
        else:
            # A single-sample input has no neighbours to reconstruct from.
            coefficients = []

        # Record the edge index together with its members so the three
        # coordinate lists stay aligned even when fewer than n_neighbors
        # neighbours are available.
        members = [i_edge] + neighbors
        node_idx.extend(members)
        edge_idx.extend([i_edge] * len(members))
        values.extend([1.] + coefficients)

    node_idx = np.array(node_idx)
    edge_idx = np.array(edge_idx)
    values = np.array(values)

    H = sparse.coo_matrix((values, (node_idx, edge_idx)), shape=(n_nodes, n_edges))

    if with_feature:
        return HyperG(H, X=X)
    return HyperG(H)