A Detailed Walkthrough of the Graph Attention Networks (GAT) Code

Dataset Overview

The Cora dataset - download link
https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz

The Cora dataset - contents
Sample features, labels, and the adjacency matrix.
The dataset contains 2708 samples; each sample is a scientific paper, and every paper belongs to one of 7 classes:
1) Case_Based; 2) Genetic_Algorithms; 3) Neural_Networks; 4) Probabilistic_Methods; 5) Reinforcement_Learning; 6) Rule_Learning; 7) Theory

Each paper is represented by a 1433-dimensional word vector, so every sample has 1433 features. Each element of the word vector corresponds to one word and takes only the values 0 and 1: 0 means the corresponding word does not appear in the paper, 1 means it does. All words come from a dictionary of 1433 words.

Every paper either cites at least one other paper or is cited by another paper; in other words, every sample is linked to some other sample, and no sample is completely isolated. Viewing the samples as nodes of a graph, the graph is connected and has no isolated nodes.

File Format

The downloaded archive contains three files: cora.cites, cora.content, and README.

README describes the dataset; cora.content holds the information for each individual paper; cora.cites records the citations between papers.

cora.content has 2708 lines, one per sample, i.e. one per paper. As shown below, each line consists of three parts: the paper ID, e.g. 31336; the paper's word vector, a 1433-bit binary string; and the paper's class, e.g. Neural_Networks.
31336 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 Neural_Networks
1061127 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 Rule_Learning
1106406 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 Reinforcement_Learning

cora.cites has 5429 lines. Each line contains two paper IDs: the first is the cited paper, and the second is the paper that cites it. For example:
35 1033
35 103482
35 103515

Viewing the papers as nodes of a graph, these 5429 lines are the graph's 5429 edges.
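Before looking at the repository's loader, it helps to see how little is needed to parse these two files directly. A minimal sketch, assuming the archive has been extracted to ./cora/ (the path and variable names here are illustrative, not part of the original code):

```python
import numpy as np

# Parse cora.content: <paper_id> <1433 binary word flags> <class_label>
ids, feats, labels = [], [], []
with open("cora/cora.content") as f:
    for line in f:
        parts = line.strip().split()
        ids.append(parts[0])
        feats.append([int(v) for v in parts[1:-1]])
        labels.append(parts[-1])
features = np.array(feats)                      # shape (2708, 1433)
id_to_idx = {pid: i for i, pid in enumerate(ids)}

# Parse cora.cites: <cited_id> <citing_id>, one edge per line
adj = np.zeros((len(ids), len(ids)), dtype=np.int8)
with open("cora/cora.cites") as f:
    for line in f:
        cited, citing = line.strip().split()
        adj[id_to_idx[citing], id_to_idx[cited]] = 1  # 5429 directed edges
```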

Loading the Data

```python
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = process.load_data(dataset)

nb_nodes = features.shape[0]   # 2708
ft_size = features.shape[1]    # 1433
nb_classes = y_train.shape[1]  # 7
```


```python
def load_data(dataset_str): # {'pubmed', 'citeseer', 'cora'}
    """Load data."""
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    # Read each of the pickled data files in turn
    for i in range(len(names)):
        with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))
    # Unpack the objects into a tuple
    x, y, tx, ty, allx, ally, graph = tuple(objects)
    # Indices of the test instances
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended
    
    # Stack all feature rows (allx + tx) and restore the original node ordering
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    # Example contents of `features` (a sparse matrix; (row, col) -> value):
    #   (0, 19)    1.0
    #   (0, 81)    1.0
    #   ...
    #   (1, 1349)  1.0
    # Build the adjacency matrix from the citation graph, which is stored as
    # a dict of adjacency lists, e.g.:
    # graph: {0: [633, 1862, 2582], 1: [2, 652, 654], 2: [1986, 332, 1666, 1, 1454], 3: [2544], ...}
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]
    # Index ranges for the train / validation / test splits
    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y)+500)
    # Build boolean masks over all nodes
    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])
    # Fill in the y values for each split (zeros everywhere else)
    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    print(adj.shape)
    print(features.shape)

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
```
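The loader relies on two small helpers that the walkthrough does not show, parse_index_file and sample_mask. In the original repository they look roughly like this:

```python
def parse_index_file(filename):
    """Parse ind.<dataset>.test.index: one integer node index per line."""
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index

def sample_mask(idx, l):
    """Boolean mask of length l that is True exactly at the given indices."""
    mask = np.zeros(l)
    mask[idx] = 1
    return np.array(mask, dtype=np.bool)
```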


Preprocessing the Features

```python
def preprocess_features(features):
    """Row-normalize feature matrix and convert to tuple representation"""
    rowsum = np.array(features.sum(1))       # sum of each row
    r_inv = np.power(rowsum, -1).flatten()   # 1 / rowsum
    r_inv[np.isinf(r_inv)] = 0.              # guard against division by zero for empty rows
    r_mat_inv = sp.diags(r_inv)              # diagonal matrix of inverse row sums
    features = r_mat_inv.dot(features)       # D^-1 X: each non-empty row now sums to 1
    return features.todense(), sparse_to_tuple(features)
```
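To see what the row normalization computes, here is a tiny self-contained example; the 3x3 matrix is made up purely for illustration:

```python
import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.array([[1., 0., 1.],
                            [0., 0., 0.],   # empty row: naive division would give inf
                            [2., 2., 0.]]))
rowsum = np.array(X.sum(1))                 # [[2.], [0.], [4.]]
r_inv = np.power(rowsum, -1).flatten()      # [0.5, inf, 0.25]
r_inv[np.isinf(r_inv)] = 0.                 # [0.5, 0.,  0.25]
X_norm = sp.diags(r_inv).dot(X)             # D^-1 X
print(X_norm.todense())
# [[0.5  0.   0.5 ]
#  [0.   0.   0.  ]
#  [0.5  0.5  0.  ]]
```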

The GAT Model

```python
hid_units = [8] # numbers of hidden units per each attention head in each layer
n_heads = [8, 1] # additional entry for the output layer

class GAT(BaseGAttN):
    def inference(inputs, nb_classes, nb_nodes, training, attn_drop, ffd_drop,
            bias_mat, hid_units, n_heads, activation=tf.nn.elu, residual=False):
        attns = []
        # First layer: compute n_heads[0] = 8 attention heads
        for _ in range(n_heads[0]):
            attns.append(layers.attn_head(inputs, bias_mat=bias_mat,
                out_sz=hid_units[0], activation=activation,
                in_drop=ffd_drop, coef_drop=attn_drop, residual=False))
        # Concatenate the 8 attention heads along the feature axis
        h_1 = tf.concat(attns, axis=-1)
        for i in range(1, len(hid_units)):
            h_old = h_1
            attns = []
            for _ in range(n_heads[i]):
                attns.append(layers.attn_head(h_1, bias_mat=bias_mat,
                    out_sz=hid_units[i], activation=activation,
                    in_drop=ffd_drop, coef_drop=attn_drop, residual=residual))
            h_1 = tf.concat(attns, axis=-1)
        # Output layer: n_heads[-1] heads projecting to nb_classes outputs
        out = []
        for i in range(n_heads[-1]):
            out.append(layers.attn_head(h_1, bias_mat=bias_mat,
                out_sz=nb_classes, activation=lambda x: x,
                in_drop=ffd_drop, coef_drop=attn_drop, residual=False))
        logits = tf.add_n(out) / n_heads[-1]
    
        return logits
```


Model and Parameter Setup

train_mask initially has shape (2708,); a leading batch dimension is added below, turning it into (1, 2708). The same is done for the features, the adjacency matrix, and the y_* arrays.

```python
nb_nodes = features.shape[0]
ft_size = features.shape[1]
nb_classes = y_train.shape[1]

adj = adj.todense()  # convert the sparse adjacency to a dense matrix

features = features[np.newaxis]
adj = adj[np.newaxis]
y_train = y_train[np.newaxis]
y_val = y_val[np.newaxis]
y_test = y_test[np.newaxis]
train_mask = train_mask[np.newaxis]  # (2708,) -> (1, 2708)
val_mask = val_mask[np.newaxis]
test_mask = test_mask[np.newaxis]

biases = process.adj_to_bias(adj, [nb_nodes], nhood=1)
```
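adj_to_bias converts the adjacency matrix into the additive attention mask (bias_mat) used inside every attention head: pairs of nodes within nhood hops of each other (including each node itself) get 0, all other pairs get a large negative value, so the later softmax drives their attention coefficients to zero. A sketch of what it looks like in the original repository:

```python
def adj_to_bias(adj, sizes, nhood=1):
    nb_graphs = adj.shape[0]
    mt = np.empty(adj.shape)
    for g in range(nb_graphs):
        mt[g] = np.eye(adj.shape[1])
        # Multiply by (A + I) nhood times to reach the nhood-hop neighbourhood
        for _ in range(nhood):
            mt[g] = np.matmul(mt[g], (adj[g] + np.eye(adj.shape[1])))
        # Binarize: 1.0 for reachable pairs, 0.0 otherwise
        for i in range(sizes[g]):
            for j in range(sizes[g]):
                if mt[g][i][j] > 0.0:
                    mt[g][i][j] = 1.0
    # 0 where attention is allowed, -1e9 where it is masked out
    return -1e9 * (1.0 - mt)
```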

```python
with tf.Graph().as_default():
    with tf.name_scope('input'):
        # Placeholders for the graph inputs and dropout/training flags
        ftr_in = tf.placeholder(dtype=tf.float32, shape=(batch_size, nb_nodes, ft_size))
        bias_in = tf.placeholder(dtype=tf.float32, shape=(batch_size, nb_nodes, nb_nodes))
        lbl_in = tf.placeholder(dtype=tf.int32, shape=(batch_size, nb_nodes, nb_classes))
        msk_in = tf.placeholder(dtype=tf.int32, shape=(batch_size, nb_nodes))
        attn_drop = tf.placeholder(dtype=tf.float32, shape=())
        ffd_drop = tf.placeholder(dtype=tf.float32, shape=())
        is_train = tf.placeholder(dtype=tf.bool, shape=())
    
    logits = model.inference(ftr_in, nb_classes, nb_nodes, is_train,
                                attn_drop, ffd_drop,
                                bias_mat=bias_in,
                                hid_units=hid_units, n_heads=n_heads,
                                residual=residual, activation=nonlinearity)
    # Flatten the batch dimension so that every node becomes one row
    log_resh = tf.reshape(logits, [-1, nb_classes])
    lab_resh = tf.reshape(lbl_in, [-1, nb_classes])
    msk_resh = tf.reshape(msk_in, [-1])
    # Loss and evaluation metric, both masked per split
    loss = model.masked_softmax_cross_entropy(log_resh, lab_resh, msk_resh)
    accuracy = model.masked_accuracy(log_resh, lab_resh, msk_resh)
    
    # Training op
    train_op = model.training(loss, lr, l2_coef)
    # Saver for checkpointing the model
    saver = tf.train.Saver()
    # Initialize variables:
    # global_variables_initializer returns an op that initializes all global variables in the graph
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

    # Early-stopping bookkeeping: the best validation loss starts at +inf,
    # the best validation accuracy at 0
    vlss_mn = np.inf
    vacc_mx = 0.0
    curr_step = 0
```
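The walkthrough stops here, just before the epoch loop. For completeness, an abridged sketch of the training loop that follows in the original script, still inside the same Graph block (nb_epochs, patience, and the 0.6 dropout rates are hyper-parameters defined elsewhere in the script):

```python
    with tf.Session() as sess:
        sess.run(init_op)
        for epoch in range(nb_epochs):
            # One full-batch training step, with dropout enabled
            _, loss_tr, acc_tr = sess.run([train_op, loss, accuracy],
                feed_dict={ftr_in: features, bias_in: biases,
                           lbl_in: y_train, msk_in: train_mask,
                           is_train: True, attn_drop: 0.6, ffd_drop: 0.6})
            # One validation pass, with dropout disabled
            loss_vl, acc_vl = sess.run([loss, accuracy],
                feed_dict={ftr_in: features, bias_in: biases,
                           lbl_in: y_val, msk_in: val_mask,
                           is_train: False, attn_drop: 0.0, ffd_drop: 0.0})
            # Early stopping on the best validation loss/accuracy seen so far
            if acc_vl >= vacc_mx or loss_vl <= vlss_mn:
                vacc_mx = max(acc_vl, vacc_mx)
                vlss_mn = min(loss_vl, vlss_mn)
                curr_step = 0
            else:
                curr_step += 1
                if curr_step == patience:
                    break
```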

GAT Model Details

Loading the data

adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
adj: the adjacency matrix
features: the feature values x of all nodes
y_*: the y values of each of the three splits
*_mask: boolean masks over the nodes; for each split they hide the y values that do not belong to that split (see the sketch after this list)
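The masking is what lets a single set of logits serve all three splits: the loss and the accuracy only count nodes whose mask entry is 1. In BaseGAttN these two functions look roughly like this:

```python
def masked_softmax_cross_entropy(logits, labels, mask):
    """Softmax cross-entropy, averaged over the unmasked nodes only."""
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)
    mask = tf.cast(mask, dtype=tf.float32)
    mask /= tf.reduce_mean(mask)  # rescale so masked-out nodes do not dilute the mean
    loss *= mask
    return tf.reduce_mean(loss)

def masked_accuracy(logits, labels, mask):
    """Accuracy over the unmasked nodes only."""
    correct = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
    accuracy_all = tf.cast(correct, tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    mask /= tf.reduce_mean(mask)
    accuracy_all *= mask
    return tf.reduce_mean(accuracy_all)
```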

GAT Model Construction

```python
class GAT(BaseGAttN):
    def inference(inputs, nb_classes, nb_nodes, training, attn_drop, ffd_drop,
            bias_mat, hid_units, n_heads, activation=tf.nn.elu, residual=False):
        # attns collects the outputs of the first layer's attention heads.
        # The first layer's heads (and their weights) are built directly on
        # the input features; every later layer is built on the output of
        # the layer before it.
        attns = []
        for _ in range(n_heads[0]):
            attns.append(layers.attn_head(inputs, bias_mat=bias_mat,
                out_sz=hid_units[0], activation=activation,
                in_drop=ffd_drop, coef_drop=attn_drop, residual=False))
        # Concatenate the first layer's heads along the feature axis
        h_1 = tf.concat(attns, axis=-1)
        
        # Hidden layers 2 .. len(hid_units): each builds its attention heads
        # on the previous layer's output and concatenates them into a new h_1
        # (with hid_units = [8] this loop does not run)
        for i in range(1, len(hid_units)):
            h_old = h_1  # not actually used
            attns = []
            for _ in range(n_heads[i]):
                attns.append(layers.attn_head(h_1, bias_mat=bias_mat,
                    out_sz=hid_units[i], activation=activation,
                    in_drop=ffd_drop, coef_drop=attn_drop, residual=residual))
            h_1 = tf.concat(attns, axis=-1)
        out = []
        # Output layer: project down to nb_classes with a linear (identity)
        # activation; the softmax is applied later, inside the loss
        for i in range(n_heads[-1]):
            out.append(layers.attn_head(h_1, bias_mat=bias_mat,
                out_sz=nb_classes, activation=lambda x: x,
                in_drop=ffd_drop, coef_drop=attn_drop, residual=False))

        # Average over the K output heads, i.e. the 1/K factor in the paper
        logits = tf.add_n(out) / n_heads[-1]

        return logits
```
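With the Cora settings used above (hid_units = [8], n_heads = [8, 1], nb_classes = 7), the tensor shapes flow as follows:

```python
# inputs:  (1, 2708, 1433)  one graph, 2708 nodes, 1433 features each
# layer 1: 8 heads, each (1, 2708, 8); concatenated -> h_1: (1, 2708, 64)
# output:  1 head with out_sz = nb_classes         -> (1, 2708, 7)
# logits:  average over the output heads           -> (1, 2708, 7)
```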




Implementation of the Attention Head (attn_head)

```python
def attn_head(seq, out_sz, bias_mat, activation, in_drop=0.0, coef_drop=0.0, residual=False):
    with tf.name_scope('my_attn'):
        # Apply dropout to the input features first
        if in_drop != 0.0:
            seq = tf.nn.dropout(seq, 1.0 - in_drop)
        # 1x1 convolution of the input features down to the output
        # dimension, i.e. the shared linear transform W*h
        seq_fts = tf.layers.conv1d(seq, out_sz, 1, use_bias=False)
        
        
        # simplest self-attention possible
        # Two further 1x1 convolutions of the transformed features, each
        # producing one scalar score per node
        f_1 = tf.layers.conv1d(seq_fts, 1, 1)
        f_2 = tf.layers.conv1d(seq_fts, 1, 1)
        # Combine the two scores into pairwise attention logits.
        # tf.transpose with perm=[0, 2, 1] swaps the last two axes, so
        # f_1 + transpose(f_2) broadcasts into an (N x N) score matrix.
        logits = f_1 + tf.transpose(f_2, [0, 2, 1])
        # LeakyReLU on the logits, add the neighbourhood mask bias_mat
        # (-1e9 for non-neighbours), then softmax-normalize per node
        coefs = tf.nn.softmax(tf.nn.leaky_relu(logits) + bias_mat)
        
        # Dropout on the attention coefficients and on the transformed features
        if coef_drop != 0.0:
            coefs = tf.nn.dropout(coefs, 1.0 - coef_drop)
        if in_drop != 0.0:
            seq_fts = tf.nn.dropout(seq_fts, 1.0 - in_drop)

        # Weighted sum: multiply the attention coefficients with the
        # transformed features W*h, then add a bias
        vals = tf.matmul(coefs, seq_fts)
        ret = tf.contrib.layers.bias_add(vals)
        
        # Optional residual connection: add the input features back in,
        # projecting them with a 1x1 convolution if the dimensions differ.
        # With this, every term of the paper's formula is in place.
        if residual:
            if seq.shape[-1] != ret.shape[-1]:
                ret = ret + conv1d(seq, ret.shape[-1], 1)
            else:
                ret = ret + seq

        return activation(ret)  # final nonlinearity
```
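To make the role of bias_mat concrete, here is a tiny NumPy illustration for a single node with three candidate neighbours, where the second one is masked out (the numbers are made up):

```python
import numpy as np

scores = np.array([1.0, 2.0, 3.0])   # leaky_relu(logits) for one row
bias = np.array([0.0, -1e9, 0.0])    # -1e9: node 1 is not a neighbour
masked = scores + bias

e = np.exp(masked - masked.max())    # numerically stable softmax
coefs = e / e.sum()
print(coefs)                         # approx [0.119, 0.0, 0.881]
```

After the softmax, the masked entry's coefficient is effectively zero, so that node contributes nothing to the weighted sum vals = coefs @ seq_fts.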
