Reading the Graph-Matching-Networks Reproduction (1): Embedding

Parameters

{'aggregator': 
	{'aggregation_type': 'sum',
	 'gated': True,
	 'graph_transform_sizes': [128],
	 'input_size': [32],
	 'node_hidden_sizes': [128]},
 'data': 
 	{'dataset_params': 
 		{'n_changes_negative': 2,
 		
		 'n_changes_positive': 1,
		 # number of edge substitutions for a pair to be considered positive (similar)
		 
		 'n_nodes_range': [20, 20],
		 'p_edge_range': [0.2, 0.2],
		 # generate graphs with 20 nodes and edge probability p_edge = 0.2
		 
		 'validation_dataset_size': 1000},
  	 'problem': 'graph_edit_distance'},
 'encoder': 
	 {'edge_hidden_sizes': None,
	  'node_feature_dim': 1,
	  'node_hidden_sizes': [32]},
 'evaluation': {'batch_size': 20},
 'graph_embedding_net': 
	 {'edge_hidden_sizes': [64, 64],
	 
	  'edge_net_init_scale': 0.1,
	  # initialize the message MLP with small weights to prevent the aggregated message vectors from blowing up;
	  # alternatively, layer normalization can be used to control their scale
	  
	  'layer_norm': False,
	  # layer norm was not used in the experiments, but it can sometimes help
	  
	  'n_prop_layers': 5,
	  'node_hidden_sizes': [64],
	  'node_state_dim': 32,
	  
	  'node_update_type': 'gru',
	  # 'mlp' and 'residual' are also supported
	  
	  'prop_type': 'matching',
	  # set to 'embedding' when using the embedding network
	  
	  'reverse_dir_param_different': False,
	  # set to True for directed graphs
	  
	  'share_prop_params': True,
	  # whether to share parameters across the propagation layers
	  
	  # set to False if the graph already contains reverse/bidirectional edges
	  'use_reverse_direction': True},
	  
 'graph_matching_net': 
	 {'edge_hidden_sizes': [64, 64],
	  'edge_net_init_scale': 0.1,
	  'layer_norm': False,
	  'n_prop_layers': 5,
	  'node_hidden_sizes': [64],
	  'node_state_dim': 32,
	  'node_update_type': 'gru',
	  'prop_type': 'matching',
	  'reverse_dir_param_different': False,
	  'share_prop_params': True,
	  'similarity': 'dotproduct',
	  'use_reverse_direction': True},
 'model_type': 'embedding',
 'seed': 8,
 'training': 
	 {'batch_size': 20,
	 
	  'clip_value': 10.0,
	  # gradient clipping to prevent exploding gradients
	  
	  'eval_after': 10,
	  # evaluate on the validation set every `eval_after * print_after` training steps
	  
	  'graph_vec_regularizer_weight': 1e-06,
	  # a small regularizer on the graph vector scale to avoid the graph vectors blowing up;
	  # if numerical issues in the model are particularly bad,
	  # `snt.LayerNorm` can be added to the output of every layer, the aggregated messages
	  # and the aggregated node representations to keep activations in a reasonable range

	  'learning_rate': 0.0001,
	  'loss': 'margin',
	  'margin': 1.0,
	  'mode': 'pair',
	  
	  'n_training_steps': 500000,
	  # controls the training duration
	  
	  'print_after': 100}}
	  # print training info every this many training steps

Generating fixed-size graphs
——————
build_datasets builds the training and validation datasets
(it looks as if the training set and the validation set are each just a single generator-style dataset object rather than a stored list of graphs?)
——————————
The default training mode is 'pair', i.e. a standard loss that compares the model output against a +1/-1 label (as opposed to the triplet loss).

training_data_iter = training_set.pairs(config['training']['batch_size'])
# 'batch_size' is 20
first_batch_graphs, _ = next(training_data_iter)

where

def pairs(self, batch_size):
    """Yields batches of pair data."""
    while True:
        batch_graphs = []
        batch_labels = []
        positive = True
        for _ in range(batch_size):  # 20
            g1, g2 = self._get_pair(positive)
            # a randomly generated graph and a graph modified from it
            batch_graphs.append((g1, g2))
            # this collects a batch of 20 graph pairs
            batch_labels.append(1 if positive else -1)
            # positive=True corresponds to the similar case
            positive = not positive
        packed_graphs = self._pack_batch(batch_graphs)
        # _pack_batch packs the batch of graphs into one GraphData (namedtuple) instance
        labels = np.array(batch_labels, dtype=np.int32)
        yield packed_graphs, labels
        # yield turns the function into a generator (iterator)

where

def _get_pair(self, positive):
    g = self._get_graph()
    # randomly generate a connected graph
    if self._permute:  # True
        # randomly permute the node labels of g, giving an isomorphic copy
        permuted_g = permute_graph_nodes(g)
    else:
        permuted_g = g
    n_changes = self._k_pos if positive else self._k_neg
    # positive=True -> n_changes = 1
    changed_g = substitute_random_edges(g, n_changes)
    # substitute n_changes random edges: remove n_changes existing edges and add n_changes new ones
    return permuted_g, changed_g
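
As a side note, here is a minimal sketch of what permute_graph_nodes and substitute_random_edges appear to do (illustrative only; the repo's real implementations may differ in details, e.g. they probably avoid re-adding a just-removed edge):

import networkx as nx
import numpy as np

def permute_graph_nodes_sketch(g):
    """Relabel the nodes of g with a random permutation (an isomorphic copy)."""
    n = g.number_of_nodes()
    perm = np.random.permutation(n)
    return nx.relabel_nodes(g, {i: int(perm[i]) for i in range(n)})

def substitute_random_edges_sketch(g, n):
    """Remove n existing edges and add n edges that were not there before."""
    g = g.copy()
    edges = list(g.edges())
    idx = np.random.choice(len(edges), n, replace=False)
    g.remove_edges_from([edges[i] for i in idx])
    added = 0
    while added < n:
        u, v = np.random.randint(g.number_of_nodes(), size=2)
        if u != v and not g.has_edge(u, v):
            g.add_edge(u, v)
            added += 1
    return g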

where _get_graph is as follows (but can 100 trials really guarantee a connected graph? For n = 20 and p = 0.2 the expected degree is about 3.8, so a single sample is connected with fairly high probability and 100 trials essentially always succeed; note, though, that if all trials failed the function would silently return None):

def _get_graph(self):
    """Generate one graph."""
    n_nodes = np.random.randint(self._n_min, self._n_max + 1)
    # still 20, since n_nodes_range is [20, 20]
    p_edge = np.random.uniform(self._p_min, self._p_max)
    # still 0.2, since p_edge_range is [0.2, 0.2]

    # sample up to 100 Erdos-Renyi graphs with 20 nodes and edge probability 0.2, and return the first connected one
    n_trials = 100
    for _ in range(n_trials):
        g = nx.erdos_renyi_graph(n_nodes, p_edge)
        if nx.is_connected(g):
            return g
def _pack_batch(self, graphs):
    Graphs = []
    for graph in graphs:
        for inergraph in graph:
            Graphs.append(inergraph)
    graphs = Graphs
    # the batch is now 40 individual graphs
    from_idx = []
    to_idx = []
    graph_idx = []
    n_total_nodes = 0
    n_total_edges = 0
    for i, g in enumerate(graphs):
        n_nodes = g.number_of_nodes()
        n_edges = g.number_of_edges()
        edges = np.array(g.edges(), dtype=np.int32)
        # from_idx records the source node of every edge (offset by the nodes packed so far)
        from_idx.append(edges[:, 0] + n_total_nodes)
        # to_idx records the target node of every edge
        to_idx.append(edges[:, 1] + n_total_nodes)
        # a vector of length n_nodes (= 20) filled with the current graph index i
        graph_idx.append(np.ones(n_nodes, dtype=np.int32) * i)
        n_total_nodes += n_nodes
        n_total_edges += n_edges
    GraphData = collections.namedtuple('GraphData', [
        'from_idx',
        'to_idx',
        'node_features',
        'edge_features',
        'graph_idx',
        'n_graphs'])
    return GraphData(
        from_idx=np.concatenate(from_idx, axis=0),
        to_idx=np.concatenate(to_idx, axis=0),
        # this task only cares about the structures, the graphs have no features.
        # setting higher dimension of ones to confirm code functioning
        # with high dimensional features.
        node_features=np.ones((n_total_nodes, 8), dtype=np.float32),
        edge_features=np.ones((n_total_edges, 4), dtype=np.float32),
        graph_idx=np.concatenate(graph_idx, axis=0),
        n_graphs=len(graphs),
    )

(Using yield saves memory, but generating batches this way still takes quite a while.)
Take the first batch from training_data_iter, i.e. next(training_data_iter), as an example.
packed_graphs, i.e. next(training_data_iter)[0], contains:
from_idx and to_idx are integer vectors of equal (but batch-dependent) length holding the source and target node indices of all edges; in this first batch the length is 1552.
edge_features has that same varying length as its first dimension; the total edge count per batch is roughly 1480~1690, here 1552, so edge_features is 1552x4.
node_features is an all-ones 800x8 matrix (800 = total number of nodes).
graph_idx is a vector of length 800.
n_graphs is 40.
labels, i.e. next(training_data_iter)[1], is
[ 1 -1 1 -1 1 -1 1 -1 1 -1 1 -1 1 -1 1 -1 1 -1 1 -1], of length 20.
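
To check these shapes yourself, something like the following works (the edge counts are batch-dependent):

packed_graphs, labels = next(training_data_iter)
print(packed_graphs.from_idx.shape)       # e.g. (1552,)
print(packed_graphs.to_idx.shape)         # e.g. (1552,)
print(packed_graphs.edge_features.shape)  # e.g. (1552, 4)
print(packed_graphs.node_features.shape)  # (800, 8)
print(packed_graphs.graph_idx.shape)      # (800,)
print(packed_graphs.n_graphs)             # 40
print(labels)                             # [ 1 -1  1 -1 ...], length 20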
————————————————
packed_graphs is kept as first_batch_graphs, and the column counts of its two _features matrices, 8 and 4, are (somewhat oddly, instead of reading them from the config) taken as node_feature_dim and edge_feature_dim to construct the model.
Here we first try the graph embedding model, i.e. a plain GNN (model_type = 'embedding').
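
The construction in the training script looks roughly like this (build_model is the repo's helper; I'm paraphrasing the call, so treat it as a sketch):

node_feature_dim = first_batch_graphs.node_features.shape[-1]   # 8
edge_feature_dim = first_batch_graphs.edge_features.shape[-1]   # 4
model, optimizer = build_model(config, node_feature_dim, edge_feature_dim)
model.to(device)

Printing the model gives the following structure: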

GraphEmbeddingNet(
  (_encoder): GraphEncoder(
    (MLP1): Sequential(
      (0): Linear(in_features=8, out_features=32, bias=True)
    )
    (MLP2): Sequential(
      (0): Linear(in_features=4, out_features=16, bias=True)
    )
  )
  (_aggregator): GraphAggregator(
    (MLP1): Sequential(
      (0): Linear(in_features=32, out_features=256, bias=True)
    )
    (MLP2): Sequential(
      (0): Linear(in_features=128, out_features=128, bias=True)
    )
  )
  (_prop_layers): ModuleList(
    (0): GraphPropLayer(
      (_message_net): Sequential(
        (0): Linear(in_features=80, out_features=64, bias=True)
        (1): ReLU()
        (2): Linear(in_features=64, out_features=64, bias=True)
      )
      (_reverse_message_net): Sequential(
        (0): Linear(in_features=80, out_features=64, bias=True)
        (1): ReLU()
        (2): Linear(in_features=64, out_features=64, bias=True)
      )
      (GRU): GRU(64, 32)
    )
    (1): GraphPropLayer(
      (_message_net): Sequential(
        (0): Linear(in_features=80, out_features=64, bias=True)
        (1): ReLU()
        (2): Linear(in_features=64, out_features=64, bias=True)
      )
      (_reverse_message_net): Sequential(
        (0): Linear(in_features=80, out_features=64, bias=True)
        (1): ReLU()
        (2): Linear(in_features=64, out_features=64, bias=True)
      )
      (GRU): GRU(64, 32)
    )
    (2): GraphPropLayer(
      (_message_net): Sequential(
        (0): Linear(in_features=80, out_features=64, bias=True)
        (1): ReLU()
        (2): Linear(in_features=64, out_features=64, bias=True)
      )
      (_reverse_message_net): Sequential(
        (0): Linear(in_features=80, out_features=64, bias=True)
        (1): ReLU()
        (2): Linear(in_features=64, out_features=64, bias=True)
      )
      (GRU): GRU(64, 32)
    )
    (3): GraphPropLayer(
      (_message_net): Sequential(
        (0): Linear(in_features=80, out_features=64, bias=True)
        (1): ReLU()
        (2): Linear(in_features=64, out_features=64, bias=True)
      )
      (_reverse_message_net): Sequential(
        (0): Linear(in_features=80, out_features=64, bias=True)
        (1): ReLU()
        (2): Linear(in_features=64, out_features=64, bias=True)
      )
      (GRU): GRU(64, 32)
    )
    (4): GraphPropLayer(
      (_message_net): Sequential(
        (0): Linear(in_features=80, out_features=64, bias=True)
        (1): ReLU()
        (2): Linear(in_features=64, out_features=64, bias=True)
      )
      (_reverse_message_net): Sequential(
        (0): Linear(in_features=80, out_features=64, bias=True)
        (1): ReLU()
        (2): Linear(in_features=64, out_features=64, bias=True)
      )
      (GRU): GRU(64, 32)
    )
  )
)
# a dict that supplies a default value (an empty list) for keys that are not present yet
accumulated_metrics = collections.defaultdict(list)
# at this point it is defaultdict(list, {})

Since graphs are compared in pairs, training_n_graphs_in_batch is doubled to 20 * 2 = 40.
The training loop begins:
1. Extract node_features, edge_features, from_idx, to_idx, graph_idx and labels from the batch, convert them to tensors and move them to CUDA.
2. Run them through the model's forward pass (GraphEmbeddingNet):
① First the GraphEncoder forward pass: the two _features matrices each go through their own MLP, and the output sizes of node_features and edge_features are determined by the respective hidden_sizes (32 for nodes, 16 for edges here).

node_states = node_features
layer_outputs = [node_states]

② Five propagation layers (messages are passed in both directions; each direction has its own MLP -> ReLU -> MLP message network).
Each layer updates node_states and appends the result to layer_outputs.
As for the edge information: in GraphPropLayer's forward pass, _compute_aggregated_messages calls graph_prop_once, where edge_features is concatenated with from_states and to_states into edge_inputs; the message net maps these to per-edge messages, and unsorted_segment_sum adds up the messages arriving at each node to give the forward aggregated_messages. The reverse direction is handled the same way to get reverse_aggregated_messages, the two are summed, and the result goes into _compute_node_update together with the current node states.
The GraphPropLayer modules themselves are built in GraphEmbeddingNet's _build_layer.
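
A minimal sketch of the forward-direction message computation described above (shapes match this config: 32-dim node states plus 16-dim encoded edge features give an 80-dim message-net input; the helper names follow my reading of the repo, not its exact code):

import torch

def unsorted_segment_sum(data, segment_ids, num_segments):
    """Sum the rows of `data` that share the same segment id (segment_ids must be a LongTensor)."""
    result = torch.zeros(num_segments, data.shape[1], device=data.device, dtype=data.dtype)
    idx = segment_ids.unsqueeze(-1).repeat(1, data.shape[1])
    return result.scatter_add(0, idx, data)

def graph_prop_once_sketch(node_states, from_idx, to_idx, message_net, edge_features):
    from_states = node_states[from_idx]                                   # sender state per edge
    to_states = node_states[to_idx]                                       # receiver state per edge
    edge_inputs = torch.cat([from_states, to_states, edge_features], -1)  # per-edge input, 32+32+16 = 80 dims
    messages = message_net(edge_inputs)                                   # per-edge message, 64 dims
    # sum the incoming messages at each receiving node
    return unsorted_segment_sum(messages, to_idx, node_states.shape[0])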

③ Finally the aggregator combines the node information into per-graph vectors.
First an MLP produces the 800x256 matrix node_states_g.

if self._gated:  # True
    gates = torch.sigmoid(node_states_g[:, :self._graph_state_dim])
    # the first 128 columns go through a sigmoid to form 800x128 gates, which multiply the last 128 columns
    node_states_g = node_states_g[:, self._graph_state_dim:] * gates

Then the segment sum over graphs is computed:

graph_states = unsorted_segment_sum(node_states_g, graph_idx, n_graphs) 

Inside unsorted_segment_sum, since len(graph_idx.shape) == 1, it computes
s = torch.prod(torch.tensor(node_states_g.shape[1:])).long().cuda(), i.e. tensor(128, device='cuda:0'), and tiles graph_idx horizontally into an 800x128 matrix segment_ids.

tensor = torch.zeros(*shape).cuda().scatter_add(0, segment_ids,node_states_g) 

For scatter_add, see the 2-D explanation at https://blog.csdn.net/weixin_43922901/article/details/102587924 :

torch.zeros(*shape).cuda()[segment_ids[i][j]][j] += node_states_g[i][j]  # along dim 0
torch.zeros(*shape).cuda()[i][segment_ids[i][j]] += node_states_g[i][j]  # along dim 1

Finally the result tensor is cast to node_states_g's dtype and returned; this is graph_states.
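
A tiny numeric example of this segment sum (illustrative):

import torch

node_states_g = torch.tensor([[1., 2.], [3., 4.], [5., 6.]])
graph_idx = torch.tensor([0, 0, 1])                  # rows 0, 1 belong to graph 0, row 2 to graph 1
segment_ids = graph_idx.unsqueeze(-1).repeat(1, 2)   # tiled to the feature width
out = torch.zeros(2, 2).scatter_add(0, segment_ids, node_states_g)
# out = [[4., 6.],
#        [5., 6.]]   -> per-graph sums of the node vectors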

(Since _aggregation_type is set to 'sum', the extra post-processing of graph_states that resets everything below -1e5 to 0, which is only needed for the 'max' aggregation, is skipped.)
graph_states then goes through the second MLP, giving the model's final output graph_vectors of size 40x128.

Next graph_vectors is reshaped to 20x256 and split along the feature dimension into two 20x128 matrices x and y, holding the first and second graph of each pair respectively.
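
The repo's reshape_and_split_tensor helper does roughly this (sketch):

def reshape_and_split_tensor(tensor, n_splits):
    # (40, 128) -> (20, 256) -> two (20, 128) tensors: graph 1 of each pair, graph 2 of each pair
    feature_dim = tensor.shape[-1]
    tensor = tensor.reshape(-1, feature_dim * n_splits)
    return [tensor[:, feature_dim * i: feature_dim * (i + 1)] for i in range(n_splits)]

x, y = reshape_and_split_tensor(graph_vectors, 2)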
The pairwise loss is then computed. For the margin variant it is torch.relu(margin - labels * (1 - euclidean_distance(x, y))), with margin = 1.0 (euclidean_distance appears to be the squared L2 distance ||x - y||_2^2).
Or the Hamming variant: first the approximate Hamming similarity of x and y is computed as below. (I initially didn't see how an inner product of two tanh's relates to XOR: tanh pushes every coordinate toward ±1, so tanh(x_i) * tanh(y_i) is close to +1 when the two signs agree and close to -1 when they differ; averaging over the coordinates therefore gives a smooth version of "fraction of matching bits minus fraction of differing bits".)

def approximate_hamming_similarity(x, y):
    """Approximate Hamming similarity."""
    return torch.mean(torch.tanh(x) * torch.tanh(y), dim=1)

Then the distance is computed as 0.25 * (labels - approximate_hamming_similarity(x, y))^2.
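
Putting the two variants together, the pairwise loss is roughly the following (a sketch; I'm assuming the repo's euclidean_distance is the squared L2 distance):

import torch

def euclidean_distance(x, y):
    return torch.sum((x - y) ** 2, dim=-1)   # squared L2 distance per pair

def pairwise_loss_sketch(x, y, labels, loss_type='margin', margin=1.0):
    labels = labels.float()
    if loss_type == 'margin':
        return torch.relu(margin - labels * (1 - euclidean_distance(x, y)))
    else:  # 'hamming'
        return 0.25 * (labels - approximate_hamming_similarity(x, y)) ** 2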
Next, the masks is_pos and is_neg record the positions of the similar and dissimilar samples.
Then comes the similarity metric: for the margin loss it is simply the negative Euclidean distance; for the hamming loss it is the exact Hamming similarity below. (The (x > 0) * (y > 0) product counts the coordinates where both embeddings are positive, i.e. it treats sign > 0 as the binary code of each coordinate.)

def exact_hamming_similarity(x, y):
    """Compute the binary Hamming similarity."""
    match = ((x > 0) * (y > 0)).float()
    return torch.mean(match, dim=1)

(The next two lines compute the average similarity over the positive pairs and over the negative pairs respectively; as far as I can tell they are only recorded as monitoring metrics and do not enter the loss.)

sim_pos = torch.sum(sim * is_pos) / (n_pos + 1e-8)
sim_neg = torch.sum(sim * is_neg) / (n_neg + 1e-8)

Then the graph-vector regularizer from the config is added: a small penalty on the mean squared graph vector (weight 1e-6) that keeps the embedding magnitudes from blowing up, as the config comment above described:

graph_vec_scale = torch.mean(graph_vectors ** 2)
loss += (config['training']['graph_vec_regularizer_weight'] *0.5 * graph_vec_scale)
# config['training']['graph_vec_regularizer_weight']=1e-6

Then comes backprop and optimization, with the values also recorded into accumulated_metrics for later printing. (Note that at this point loss is still a per-sample vector rather than a scalar, which makes plotting it later awkward; arguably one should log a scalar loss value instead. Storing a whole vector per step is wasteful, and is it even the right thing to record?)
(I tried plotting the curves: whether backprop uses the mean of loss or the raw per-sample loss makes little difference. ACC and AUC look nice and clearly trend upward, but the loss fluctuates as if it never converges.)
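
The backward/optimizer step inside the loop looks roughly like this (a sketch; I believe the repo backprops the per-sample loss vector and clips gradient values with clip_value = 10.0, but treat the exact calls as my paraphrase):

optimizer.zero_grad()
loss.backward(torch.ones_like(loss))   # per-sample loss vector; same gradient as loss.sum().backward()
torch.nn.utils.clip_grad_value_(model.parameters(), config['training']['clip_value'])
optimizer.step()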

As for the evaluation phase: it runs every eval_after * print_after = 10 * 100 = 1000 steps.
First the AUC, i.e. the area under the ROC curve, is computed.
Here `with torch.no_grad():` tells this block not to build the computation graph (no autograd bookkeeping for the chain rule), which shows up as the variables in this block having no grad_fn.

One model forward pass gives x and y, and compute_similarity directly produces the similarity scores.

scores = (scores - scores_min) / (scores_max - scores_min + 1e-8)
labels = (labels + 1) / 2

After this rescaling, the scores and labels go into sklearn's metrics.roc_curve to compute the AUC.
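
In code that step is roughly (sketch, using sklearn):

from sklearn import metrics

# scores are min-max scaled to [0, 1]; labels were mapped from {-1, 1} to {0, 1} above
fpr, tpr, _ = metrics.roc_curve(labels.cpu().numpy(), scores.cpu().numpy())
auc = metrics.auc(fpr, tpr)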
(I'll look at ACC in detail below, when we get to triplet-loss training.)

————————————
Alternatively, the triplet loss can be used:

training_data_iter = training_set.triplets(config['training']['batch_size'])
first_batch_graphs = next(training_data_iter)

where

def triplets(self, batch_size):
    """Yields batches of triplet data."""
    while True:
        batch_graphs = []
        for _ in range(batch_size):
            g1, g2, g3 = self._get_triplet()
            batch_graphs.append((g1, g2, g1, g3))
        yield self._pack_batch(batch_graphs)

The differences from pairs are that there are no labels, and the order in which graphs are stored in a batch: each triplet is packed as (g1, g2, g1, g3).
where

def _get_triplet(self):
    """Generate one triplet of graphs."""
    g = self._get_graph()
    if self._permute:
        permuted_g = permute_graph_nodes(g)
    else:
        permuted_g = g
    pos_g = substitute_random_edges(g, self._k_pos)  # k_pos = 1
    neg_g = substitute_random_edges(g, self._k_neg)  # k_neg = 2
    return permuted_g, pos_g, neg_g

The difference from _get_pair is that substitute_random_edges is used to build two modified graphs: one with one edge substituted (the positive example) and one with two edges substituted (the negative example).
For one triplet batch:
from_idx, to_idx: length 3360
edge_features: 3360x4
node_features: 1600x8
Feeding this into the model gives graph_vectors of size 80x128.
reshape_and_split_tensor then splits it into four 20x128 parts x_1, y, x_2, z for the loss (in fact x_1 == x_2, since each triplet was packed as (g1, g2, g1, g3)).
margin: torch.relu(margin + ||x_1 - y||_2 - ||x_2 - z||_2)
hamming: 0.125 * ((approximate_hamming_similarity(x_1, y) - 1)^2 + (approximate_hamming_similarity(x_2, z) + 1)^2)
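
A sketch of this triplet loss, under the same euclidean_distance assumption as before:

def triplet_loss_sketch(x_1, y, x_2, z, loss_type='margin', margin=1.0):
    if loss_type == 'margin':
        return torch.relu(margin + euclidean_distance(x_1, y) - euclidean_distance(x_2, z))
    else:  # 'hamming'
        return 0.125 * ((approximate_hamming_similarity(x_1, y) - 1) ** 2 +
                        (approximate_hamming_similarity(x_2, z) + 1) ** 2)

As in the pair case, the mean similarities and the graph-vector regularizer are then recorded: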

sim_pos = torch.mean(compute_similarity(config, x_1, y))
sim_neg = torch.mean(compute_similarity(config, x_2, z))
graph_vec_scale = torch.mean(graph_vectors ** 2)
loss += (config['training']['graph_vec_regularizer_weight'] *0.5 * graph_vec_scale)

Next, the evaluation phase also computes accuracy. Presumably because in the pair setting the pair loss itself already reflects accuracy, accuracy is computed uniformly in the triplet style:
a forward pass gives x_1, y, x_2, z; then sim_1 (the similarity between x_1 and y, which should be similar) and sim_2 (between x_2 and z, which should be dissimilar) are computed,
and the 0/1 vector of sim_1 > sim_2 is averaged to give the accuracy.
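
In code this is roughly (sketch; eval_vectors stands for the model output on an evaluation batch, and the helper names follow my reading of the repo):

with torch.no_grad():
    x_1, y, x_2, z = reshape_and_split_tensor(eval_vectors, 4)
    sim_1 = compute_similarity(config, x_1, y)     # should be large: positive pairs
    sim_2 = compute_similarity(config, x_2, z)     # should be small: negative pairs
    triplet_acc = torch.mean((sim_1 > sim_2).float())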

————————————
Optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.0001
    weight_decay: 1e-05
)
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
model.to(device)
