Reading the Keras implementation of the Graph Attention Network (GAT)

GraphAttention
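
Before reading the code, a quick recap of the equations its comments cite (numbering follows the GAT paper, Veličković et al., 2018):

$$\alpha_{ij} = \mathrm{softmax}_j\!\left(\mathrm{LeakyReLU}\!\left(\mathbf{a}^{T}\,[\mathbf{W}\vec{h}_i \,\Vert\, \mathbf{W}\vec{h}_j]\right)\right) \tag{3}$$

$$\vec{h}_i' = \sigma\Big(\sum_{j \in \mathcal{N}_i} \alpha_{ij}\,\mathbf{W}\vec{h}_j\Big) \tag{4}$$

$$\vec{h}_i' = \Big\Vert_{k=1}^{K}\, \sigma\Big(\sum_{j \in \mathcal{N}_i} \alpha_{ij}^{k}\,\mathbf{W}^{k}\vec{h}_j\Big) \tag{5}$$

$$\vec{h}_i' = \sigma\Big(\frac{1}{K}\sum_{k=1}^{K}\sum_{j \in \mathcal{N}_i} \alpha_{ij}^{k}\,\mathbf{W}^{k}\vec{h}_j\Big) \tag{6}$$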

from __future__ import absolute_import

from keras import activations, constraints, initializers, regularizers
from keras import backend as K
from keras.layers import Layer, Dropout, LeakyReLU

# The heart of this file is the call() method below, which lays the GAT computation out in full

class GraphAttention(Layer):

    def __init__(self,  # Constructor; the meaning of each argument is explained where it is assigned below
                 F_,
                 attn_heads=1,
                 attn_heads_reduction='concat',  # {'concat', 'average'}
                 dropout_rate=0.5,  # Dropout rate; with few training samples and many parameters a model overfits easily, and dropout acts as a regularizer
                 activation='relu',  # Output activation (ReLU by default)
                 use_bias=True,  # Whether to add a bias term
                 kernel_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 attn_kernel_initializer='glorot_uniform',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 attn_kernel_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 attn_kernel_constraint=None,
                 **kwargs):
        if attn_heads_reduction not in {'concat', 'average'}:
            raise ValueError('Possible reduction methods: concat, average')

        self.F_ = F_  # Number of output features (F' in the paper), i.e. the dimensionality of each head's weighted-sum output
        self.attn_heads = attn_heads  # Number of attention heads (K in the paper)
        self.attn_heads_reduction = attn_heads_reduction  # How the heads are combined: Eq. 5 (concat) or Eq. 6 (average) in the paper
        self.dropout_rate = dropout_rate  # Internal dropout rate, i.e. the fraction of units dropped
        self.activation = activations.get(activation)  # Eq. 4 in the paper: nonlinearity applied after the weighted sum
        self.use_bias = use_bias

        self.kernel_initializer = initializers.get(kernel_initializer)  # Initializer for the weight kernels W
        self.bias_initializer = initializers.get(bias_initializer)  # Initializer for the biases
        self.attn_kernel_initializer = initializers.get(attn_kernel_initializer)  # Initializer for the attention kernels a

        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.attn_kernel_regularizer = regularizers.get(attn_kernel_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)

        self.kernel_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)
        self.attn_kernel_constraint = constraints.get(attn_kernel_constraint)
        self.supports_masking = False

        # Populated by build()
        self.kernels = []       # Layer kernels for attention heads
        self.biases = []        # Layer biases for attention heads
        self.attn_kernels = []  # Attention kernels for attention heads

        if attn_heads_reduction == 'concat':
            # Output will have shape (..., K * F')
            self.output_dim = self.F_ * self.attn_heads  # Concatenating the K heads multiplies the output dimensionality
        else:
            # Output will have shape (..., F')
            self.output_dim = self.F_  # Averaging keeps it at F'

        super(GraphAttention, self).__init__(**kwargs)

    # A note for first-time readers: a layer runs __init__ first, then build() is called exactly once, right before the first call(); subsequent uses skip build().
    # When composing existing layers you never write build() yourself (Keras sets self.built = True); it is only needed for custom layers that own weights.
    def build(self, input_shape):
        assert len(input_shape) >= 2
        F = input_shape[0][-1]  # Dimensionality of the input node features

        # Initialize weights for each attention head
        for head in range(self.attn_heads):
            # Layer kernel
            kernel = self.add_weight(shape=(F, self.F_),
                                     initializer=self.kernel_initializer,
                                     regularizer=self.kernel_regularizer,
                                     constraint=self.kernel_constraint,
                                     name='kernel_{}'.format(head))
            self.kernels.append(kernel)

            # Layer bias
            if self.use_bias:
                bias = self.add_weight(shape=(self.F_, ),
                                       initializer=self.bias_initializer,
                                       regularizer=self.bias_regularizer,
                                       constraint=self.bias_constraint,
                                       name='bias_{}'.format(head))
                self.biases.append(bias)

            # Attention kernels
            attn_kernel_self = self.add_weight(shape=(self.F_, 1),
                                               initializer=self.attn_kernel_initializer,
                                               regularizer=self.attn_kernel_regularizer,
                                               constraint=self.attn_kernel_constraint,
                                               name='attn_kernel_self_{}'.format(head),)
            attn_kernel_neighs = self.add_weight(shape=(self.F_, 1),
                                                 initializer=self.attn_kernel_initializer,
                                                 regularizer=self.attn_kernel_regularizer,
                                                 constraint=self.attn_kernel_constraint,
                                                 name='attn_kernel_neigh_{}'.format(head))
            self.attn_kernels.append([attn_kernel_self, attn_kernel_neighs])
        self.built = True
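
    # (Lifecycle in short: layer = GraphAttention(8) runs __init__; the first
    #  layer([X, A]) call triggers build(input_shape) to create the weights
    #  above, then call(); later invocations go straight to call().)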

    def call(self, inputs):  # The core logic of the layer lives here
        X = inputs[0]  # Node features (N x F): one F-dimensional feature vector per node
        A = inputs[1]  # Adjacency matrix (N x N)

        outputs = []  # Collects each head's output
        for head in range(self.attn_heads):  # Iterate over the K attention heads
            kernel = self.kernels[head]  # W in the paper (F x F'): projects the F input features up to F' dimensions
            attention_kernel = self.attn_kernels[head]  # Attention kernel a in the paper (2F' x 1), stored as its two halves [a_1, a_2]

            # Compute inputs to attention network
            features = K.dot(X, kernel)  # (N x F'): the projected features W h_i, before the weighted sum

            # Compute feature combinations. Note how the code differs from the paper here:
            # instead of concatenating [Wh_i || Wh_j] for all N^2 pairs, the attention vector is
            # split as a = [a_1; a_2], since a^T [Wh_i || Wh_j] = a_1^T Wh_i + a_2^T Wh_j.
            # Two (N x 1) products plus a broadcast add replace the quadratic number of concatenations.
            attn_for_self = K.dot(features, attention_kernel[0])    # (N x 1), [a_1]^T [Wh_i]
            attn_for_neighs = K.dot(features, attention_kernel[1])  # (N x 1), [a_2]^T [Wh_j]

            # Attention head a(Wh_i, Wh_j) = a^T [[Wh_i], [Wh_j]]: the raw attention logits e_ij
            dense = attn_for_self + K.transpose(attn_for_neighs)  # (N x N) via broadcasting

            # Add the LeakyReLU nonlinearity (Eq. 3 in the paper)
            dense = LeakyReLU(alpha=0.2)(dense)

            # Mask values before the softmax (masked attention, cf. Vaswani et al., 2017)
            mask = -10e9 * (1.0 - A)  # Entries with an edge stay 0; entries without one get a huge negative value, which the softmax squashes to ~0 (effectively -inf)
            dense += mask
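            # Tiny illustration (hypothetical 2-node graph): if A = [[1, 1], [0, 1]],
            # then mask = [[0, 0], [-1e10, 0]], so after the softmax below node 1
            # places ~0 attention on node 0; non-edges are effectively ignored.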

            # Row-wise softmax yields the attention coefficients alpha_ij
            dense = K.softmax(dense)  # (N x N)

            # Step two: the weighted sum that aggregates neighbour features.
            # Apply dropout to both the attention coefficients and the features
            dropout_attn = Dropout(self.dropout_rate)(dense)  # (N x N)
            dropout_feat = Dropout(self.dropout_rate)(features)  # (N x F')

            # Linear combination with neighbors' features: the weighted sum itself
            node_features = K.dot(dropout_attn, dropout_feat)  # (N x F')

            if self.use_bias:
                node_features = K.bias_add(node_features, self.biases[head])

            # Add output of attention head to final output
            outputs.append(node_features)  # Collect this head's (N x F') output

        # Aggregate the heads' output according to the reduction method: Eq. 5 (concat) or Eq. 6 (average), as noted above
        if self.attn_heads_reduction == 'concat':
            output = K.concatenate(outputs)  # (N x KF')
        else:
            output = K.mean(K.stack(outputs), axis=0)  # (N x F')

        output = self.activation(output)
        return output

    def compute_output_shape(self, input_shape):
        output_shape = input_shape[0][0], self.output_dim
        return output_shape
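
To make the a = [a_1; a_2] trick concrete, here is a minimal standalone NumPy sketch (made-up shapes and data, independent of the layer above) verifying that the broadcast add reproduces the pairwise concatenation:

import numpy as np

N, F_prime = 4, 3
rng = np.random.default_rng(0)
features = rng.normal(size=(N, F_prime))  # stands in for W h_i, one row per node
a_1 = rng.normal(size=(F_prime, 1))       # "self" half of the attention kernel
a_2 = rng.normal(size=(F_prime, 1))       # "neighbour" half

# Broadcast trick: e[i, j] = a_1^T (W h_i) + a_2^T (W h_j)
attn_self = features @ a_1                # (N x 1)
attn_neigh = features @ a_2               # (N x 1)
dense = attn_self + attn_neigh.T          # (N x N) via broadcasting

# Reference: build e[i, j] explicitly from the concatenated pairs [Wh_i || Wh_j]
a = np.vstack([a_1, a_2]).ravel()         # (2F',)
ref = np.empty((N, N))
for i in range(N):
    for j in range(N):
        ref[i, j] = a @ np.concatenate([features[i], features[j]])
assert np.allclose(dense, ref)            # identical, without N^2 concatenations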

utils

from __future__ import print_function

import os
import pickle as pkl
import sys

import networkx as nx
import numpy as np
import scipy.sparse as sp


def parse_index_file(filename):  # Read the file into a list, one integer per line
    """Parse index file."""
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index


def sample_mask(idx, l):  # Usage is explained where load_data calls it below
    """Create mask."""
    mask = np.zeros(l)
    mask[idx] = 1
    return np.array(mask, dtype=bool)  # np.bool was removed in NumPy 1.24; the builtin bool is equivalent
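
# A quick sanity check of what sample_mask produces:
#   >>> sample_mask([0, 1, 2], 5)
#   array([ True,  True,  True, False, False])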


def load_data(dataset_str):
    """Load data."""
    FILE_PATH = os.path.abspath(__file__)
    # Absolute path of this .py file
    DIR_PATH = os.path.dirname(FILE_PATH)
    # Directory that contains it
    DATA_PATH = os.path.join(DIR_PATH, 'data/')
    # Joined path to the data files

    """
    加载数据,在data文件夹中可以看见共有Cora,citeseer,pubmed三种数据集,每个分为8个文件(x,y,tx,ty,allx,ally,graph,index)
    .x :训练数据的特征向量。
    .y :训练集的标签。
    .tx :测试集的特征向量。
    .ty :训练集的标签。
    .allx :训练与测试集的特征向量。
    .ally :训练与测试集的标签
    .graph :图
    .index :测试数据的ID索引
    """

    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    # Read every pickled file into the objects list
    for i in range(len(names)):
        with open("{}ind.{}.{}".format(DATA_PATH, dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    # print(x.shape)  ->  (140, 1433); y is (140, 7); likewise tx (1000, 1433), ty (1000, 7), allx (1708, 1433), ally (1708, 7)
    # 7 is the number of classes (the labels are one-hot); 1433 is the feature length (a sparse bag-of-words style vector)

    # Indices of the test nodes; note that they are NOT in order
    test_idx_reorder = parse_index_file("{}ind.{}.test.index".format(DATA_PATH, dataset_str))
    # Sort the test indices into ascending order
    test_idx_range = np.sort(test_idx_reorder)

    # Citeseer contains many isolated nodes and needs special handling; since that dataset is not used here, the details are skipped
    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder),
                                    max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    # sp.vstack stacks the two sparse matrices vertically, i.e. row-wise
    features = sp.vstack((allx, tx)).tolil()
    # print(features.shape)  ->  (2708, 1433)
    # Put the (shuffled) test rows back into their proper index positions
    features[test_idx_reorder, :] = features[test_idx_range, :]
    # adj is simply the graph's adjacency matrix (sparse)
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    # Stack the labels the same way
    labels = np.vstack((ally, ty))
    # ... and reorder the test rows identically
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    # tolist() turns the array into a plain Python list, which can be indexed directly
    idx_test = test_idx_range.tolist()
    # Training indices: the first len(y) nodes
    idx_train = range(len(y))
    # idx_train is range(0, 140)
    idx_val = range(len(y), len(y) + 500)
    # idx_val is range(140, 640)

    # A word on the masks: sample_mask builds a vector whose length is its second argument, sets the positions named by its first argument to 1, and returns it as booleans.
    # train_mask therefore has length 2708 with its first 140 entries True
    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    # Zero matrices for the train/val/test label splits
    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    # Fill each split matrix with labels wherever its mask is True
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]
    # Return the adjacency matrix, feature matrix, per-split labels, and masks
    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
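
# For Cora the returned shapes are: adj (2708, 2708), features (2708, 1433),
# y_train/y_val/y_test (2708, 7) one-hot matrices that are zero outside their
# split, and train_mask/val_mask/test_mask (2708,) boolean vectors.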


def preprocess_features(features):  # Row-normalize the raw features
    """Row-normalize the feature matrix and return it as a dense matrix."""
    # Row sums of the feature matrix
    rowsum = np.array(features.sum(1))
    # Reciprocal of each row sum, flattened into a vector
    r_inv = np.power(rowsum, -1).flatten()
    # Rows summing to zero produce inf reciprocals; reset those to 0
    r_inv[np.isinf(r_inv)] = 0.
    # Diagonal matrix D^-1 built from the reciprocals
    r_mat_inv = sp.diags(r_inv)
    # Matrix product D^-1 X: every nonzero row now sums to 1
    features = r_mat_inv.dot(features)
    return features.todense()
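
# Worked example of the normalization above:
#   X = [[1, 1, 2],            row sums = [4, 0]
#        [0, 0, 0]]
#   D^-1 = diag([0.25, 0])     (the zero row's inf reciprocal was reset to 0)
#   D^-1 X = [[0.25, 0.25, 0.5],
#             [0.  , 0.  , 0. ]]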


if __name__ == "__main__":
    A, X, Y_train, Y_val, Y_test, idx_train, idx_val, idx_test = load_data('cora')  # Smoke test: load the Cora data

Finally, the GAT usage example: gat.py

from __future__ import division

import numpy as np
from keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from keras.layers import Input, Dropout
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l2

from keras_gat import GraphAttention
from keras_gat.utils import load_data, preprocess_features

# Read data
A, X, Y_train, Y_val, Y_test, idx_train, idx_val, idx_test = load_data('cora')
# A: adjacency matrix; X: N x F feature matrix; Y_*: per-split label matrices; idx_*: boolean masks for the train/val/test nodes

# Parameters: hyperparameters, tuned one by one once the model was in place
N = X.shape[0]                # Number of nodes
F = X.shape[1]                # Dimensionality of the original features
n_classes = Y_train.shape[1]  # Number of label classes
F_ = 8                        # Output dimensionality F' of the first GAT layer
n_attn_heads = 8              # Number of attention heads K
dropout_rate = 0.6            # Dropout rate
l2_reg = 5e-4/2               # Factor for l2 regularization
learning_rate = 5e-3          # Learning rate for Adam, an adaptive alternative to plain SGD
epochs = 10000                # Number of training epochs, i.e. passes over the data
es_patience = 100             # Patience for early stopping

# Preprocessing operations
X = preprocess_features(X)  # Row-normalize the feature matrix
A = A + np.eye(A.shape[0])  # Add self-loops so each node attends to itself as well as to its neighbours

# Model definition (as per Section 3.3 of the paper, which covers both the transductive and inductive setups)
X_in = Input(shape=(F,))
A_in = Input(shape=(N,))

# Input dropout followed by the first GAT layer (K = 8 concatenated heads, ELU activation)
dropout1 = Dropout(dropout_rate)(X_in)
graph_attention_1 = GraphAttention(F_,
                                   attn_heads=n_attn_heads,
                                   attn_heads_reduction='concat',
                                   dropout_rate=dropout_rate,
                                   activation='elu',
                                   kernel_regularizer=l2(l2_reg),
                                   attn_kernel_regularizer=l2(l2_reg))([dropout1, A_in])
# Dropout followed by the second GAT layer (a single averaged head with a softmax over the classes)
dropout2 = Dropout(dropout_rate)(graph_attention_1)
graph_attention_2 = GraphAttention(n_classes,
                                   attn_heads=1,
                                   attn_heads_reduction='average',
                                   dropout_rate=dropout_rate,
                                   activation='softmax',
                                   kernel_regularizer=l2(l2_reg),
                                   attn_kernel_regularizer=l2(l2_reg))([dropout2, A_in])

# Build model
model = Model(inputs=[X_in, A_in], outputs=graph_attention_2)  # Wire up the model's inputs and outputs
optimizer = Adam(lr=learning_rate)  # Optimizer and its learning rate
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',  # Cross-entropy loss
              weighted_metrics=['acc'])
model.summary()  # Print a summary of the model

# Callbacks: to curb overfitting, Keras offers early stopping, which halts training once the monitored metric has failed to improve for es_patience epochs; ModelCheckpoint saves the best weights (here as an .h5 file)
es_callback = EarlyStopping(monitor='val_weighted_acc', patience=es_patience)
tb_callback = TensorBoard(batch_size=N)
mc_callback = ModelCheckpoint('.\\logs\\best_model.h5',
                              monitor='val_weighted_acc',
                              save_best_only=True,
                              save_weights_only=True)

# Train model: sample_weight=idx_train confines the loss to training nodes, and batch_size=N makes each step a full-batch pass over the graph
validation_data = ([X, A], Y_val, idx_val)
model.fit([X, A],
          Y_train,
          sample_weight=idx_train,
          epochs=epochs,
          batch_size=N,
          validation_data=validation_data,
          shuffle=False,  # Shuffling data means shuffling the whole graph
          callbacks=[es_callback, tb_callback, mc_callback])

# Load best model: restore the best weights saved by ModelCheckpoint
model.load_weights('.\\logs\\best_model.h5')

# Evaluate the model's quality by its accuracy on the test split
eval_results = model.evaluate([X, A],
                              Y_test,
                              sample_weight=idx_test,
                              batch_size=N,
                              verbose=0)
print('Done.\n'
      'Test loss: {}\n'
      'Test accuracy: {}'.format(*eval_results))
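
A closing note on the masking: passing the boolean index vectors as sample_weight is what confines the loss to the labelled nodes. A simplified NumPy sketch of the idea (hypothetical numbers; Keras's exact weighting and normalization differ in detail):

import numpy as np

# Hypothetical per-node cross-entropy losses for a 5-node graph
per_node_loss = np.array([0.3, 1.2, 0.7, 0.9, 0.4])
train_mask = np.array([1.0, 1.0, 0.0, 0.0, 0.0])  # only the first two nodes are labelled

# Weighting by the mask removes the unlabelled nodes from the objective
masked_loss = (per_node_loss * train_mask).sum() / train_mask.sum()
print(masked_loss)  # 0.75 = (0.3 + 1.2) / 2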
