Dataset Introduction
Cora dataset - download link
https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz
Cora dataset - contents
Sample features, labels, and the adjacency matrix.
The dataset contains 2708 samples, each of which is a scientific paper. The samples are divided into 7 classes:
1) Case_Based; 2) Genetic_Algorithms; 3) Neural_Networks; 4) Probabilistic_Methods; 5) Reinforcement_Learning; 6) Rule_Learning; 7) Theory
Each paper is represented by a 1433-dimensional word vector, so each sample has 1433 features. Each element of the word vector corresponds to one word and takes only the values 0 or 1: 0 means the corresponding word does not appear in the paper, 1 means it does. All words come from a dictionary of 1433 words.
Every paper cites, or is cited by, at least one other paper, so no sample is completely disconnected from the rest. Viewing the papers as nodes of a graph, there are no isolated nodes (note that this alone does not guarantee the whole graph forms a single connected component).
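A quick way to verify the no-isolated-nodes property is to load cora.cites with networkx. A sketch, assuming the archive was extracted into a `cora/` directory:
```python
import networkx as nx

# Build the directed citation graph straight from cora.cites
G = nx.read_edgelist("cora/cora.cites", create_using=nx.DiGraph)
print(G.number_of_nodes(), G.number_of_edges())  # expected: 2708 5429
# every paper cites or is cited at least once, so no node is isolated
print([n for n in G if G.degree(n) == 0])        # expected: []
```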
File Format
The downloaded archive contains three files: cora.cites, cora.content, and README.
README describes the dataset; cora.content holds the per-paper information; cora.cites holds the citation records between papers.
cora.content has 2708 lines, one per sample (i.e., one per paper). As shown below, each line has three parts: the paper ID (e.g. 31336), the paper's word vector (1433 binary values), and the paper's class (e.g. Neural_Networks).
```
31336   0 0 … 0 0 0 0 0 0 0 0 0 0 0 0   Neural_Networks
1061127 0 0 … 0 0 0 0 0 0 0 0 0 0 0 0   Rule_Learning
1106406 0 0 … 0 0 0 0 0 0 0 0 0 0 0 0   Reinforcement_Learning
```
cora.cites has 5429 lines. Each line contains two paper IDs; the paper with the second ID cites the paper with the first ID (i.e., the first ID is the cited paper, the second the citing paper). For example:
```
35 1033
35 103482
35 103515
```
Viewing the papers as nodes of a graph, these 5429 lines are the 5429 edges between the nodes.
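The two raw files can also be parsed directly with plain Python. A minimal sketch (paths again assume the archive was extracted into `cora/`):
```python
import numpy as np

# Parse cora.content: <paper_id> <1433 binary word flags> <class_label>
ids, feats, labels = [], [], []
with open("cora/cora.content") as f:
    for line in f:
        parts = line.strip().split()
        ids.append(parts[0])
        feats.append([int(v) for v in parts[1:-1]])
        labels.append(parts[-1])

feats = np.array(feats)
print(feats.shape)       # (2708, 1433)
print(len(set(labels)))  # 7 classes

# Parse cora.cites: each row is (cited paper, citing paper)
edges = np.genfromtxt("cora/cora.cites", dtype=np.int64)
print(edges.shape)       # (5429, 2)
```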
Loading the Data
```python
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = process.load_data(dataset)

nb_nodes = features.shape[0]   # 2708
ft_size = features.shape[1]    # 1433
nb_classes = y_train.shape[1]  # 7
```
The implementation of `load_data` (from the repository's `process.py`):
```python
import sys
import pickle as pkl

import networkx as nx
import numpy as np
import scipy.sparse as sp


def load_data(dataset_str): # {'pubmed', 'citeseer', 'cora'}
    """Load data."""
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    # Read each pickled object from disk
    for i in range(len(names)):
        with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    # Unpack the objects into a tuple
    x, y, tx, ty, allx, ally, graph = tuple(objects)

    # Read the (shuffled) indices of the test instances
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended

    # Stack all feature rows and restore them to graph node order
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
```
Printing the sparse `features` matrix at this point shows its nonzero entries as (node, word-index) pairs, all with value 1.0:
```
(0, 19) 1.0
(0, 81) 1.0
(0, 146) 1.0
(0, 315) 1.0
(0, 774) 1.0
(0, 877) 1.0
(0, 1194) 1.0
(0, 1247) 1.0
(0, 1274) 1.0
(1, 19) 1.0
(1, 88) 1.0
(1, 149) 1.0
(1, 212) 1.0
(1, 233) 1.0
(1, 332) 1.0
(1, 336) 1.0
(1, 359) 1.0
(1, 472) 1.0
(1, 507) 1.0
(1, 548) 1.0
(1, 687) 1.0
(1, 763) 1.0
(1, 808) 1.0
(1, 889) 1.0
(1, 1058) 1.0
(1, 1177) 1.0
(1, 1254) 1.0
(1, 1257) 1.0
(1, 1262) 1.0
(1, 1332) 1.0
(1, 1339) 1.0
(1, 1349) 1.0
```
The function then extracts the graph's adjacency matrix. `graph` is a dict of adjacency lists, mapping each node ID to the list of its neighbors:
```
graph:{0: [633, 1862, 2582], 1: [2, 652, 654], 2: [1986, 332, 1666, 1, 1454], 3: [2544], 4: [2176, 1016, 2176, 1761, 1256, 2175], 5: [1629, 2546, 1659, 1659], 6: [1416, 1602, 1042, 373], 7: [208], 8: [281, 1996, 269], 9: [2614, 723, 723], 10: [476, 2545],
```
```python
    # Build the adjacency matrix from the dict of adjacency lists
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    # Stack all labels and restore them to graph node order
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    # Index ranges of the train / validation / test splits
    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y)+500)

    # Boolean masks over all nodes, one per split
    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    # Per-split label matrices: zeros everywhere except the split's own rows
    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    print(adj.shape)       # (2708, 2708)
    print(features.shape)  # (2708, 1433)

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
```
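For completeness, the two helpers used above, `parse_index_file` and `sample_mask`, are small utilities defined in the same `process.py`; their implementations are along these lines (a sketch):
```python
def parse_index_file(filename):
    """Read a file containing one integer index per line."""
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index


def sample_mask(idx, l):
    """Boolean mask of length l that is True exactly at positions idx."""
    mask = np.zeros(l)
    mask[idx] = 1
    return np.array(mask, dtype=np.bool)
```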
Feature Preprocessing
```python
def preprocess_features(features):
    """Row-normalize feature matrix and convert to tuple representation"""
    rowsum = np.array(features.sum(1))      # number of words in each paper
    r_inv = np.power(rowsum, -1).flatten()  # 1 / rowsum
    r_inv[np.isinf(r_inv)] = 0.             # guard against empty rows (1/0 -> inf)
    r_mat_inv = sp.diags(r_inv)
    features = r_mat_inv.dot(features)      # each row now sums to 1
    return features.todense(), sparse_to_tuple(features)
```
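Row normalization simply divides each row by its sum. A minimal self-contained check of the same computation (toy data, independent of Cora):
```python
import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.array([[1., 0., 1.],
                            [0., 2., 2.]]))
rowsum = np.array(X.sum(1))             # [[2.], [4.]]
r_inv = np.power(rowsum, -1).flatten()  # [0.5, 0.25]
r_inv[np.isinf(r_inv)] = 0.             # rows that sum to 0 stay all-zero
X_norm = sp.diags(r_inv).dot(X)
print(X_norm.toarray())  # [[0.5, 0. , 0.5], [0. , 0.5, 0.5]]
```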
The GAT Model
```python
hid_units = [8]   # numbers of hidden units per each attention head in each layer
n_heads = [8, 1]  # additional entry for the output layer
```
```python
class GAT(BaseGAttN):
    def inference(inputs, nb_classes, nb_nodes, training, attn_drop, ffd_drop,
            bias_mat, hid_units, n_heads, activation=tf.nn.elu, residual=False):
        attns = []
        # First layer: n_heads[0] (= 8) independent attention heads
        for _ in range(n_heads[0]):
            attns.append(layers.attn_head(inputs, bias_mat=bias_mat,
                out_sz=hid_units[0], activation=activation,
                in_drop=ffd_drop, coef_drop=attn_drop, residual=False))
        # Concatenate the 8 heads' outputs: 8 heads x 8 units = 64 features
        h_1 = tf.concat(attns, axis=-1)
        # Any further hidden layers (none for the default hid_units=[8])
        for i in range(1, len(hid_units)):
            h_old = h_1
            attns = []
            for _ in range(n_heads[i]):
                attns.append(layers.attn_head(h_1, bias_mat=bias_mat,
                    out_sz=hid_units[i], activation=activation,
                    in_drop=ffd_drop, coef_drop=attn_drop, residual=residual))
            h_1 = tf.concat(attns, axis=-1)
        # Output layer: n_heads[-1] heads with linear activation, averaged
        out = []
        for i in range(n_heads[-1]):
            out.append(layers.attn_head(h_1, bias_mat=bias_mat,
                out_sz=nb_classes, activation=lambda x: x,
                in_drop=ffd_drop, coef_drop=attn_drop, residual=False))
        logits = tf.add_n(out) / n_heads[-1]
        return logits
```
Model and Parameter Configuration
`train_mask` has shape (2708,). A leading batch dimension is added to every array, e.g. (2708,) → (1, 2708), because the model expects batched inputs (here the batch holds the single Cora graph); `y_train` and the other arrays are expanded the same way:
```python
nb_nodes = features.shape[0]
ft_size = features.shape[1]
nb_classes = y_train.shape[1]

adj = adj.todense()  # convert the sparse adjacency matrix to a dense matrix

# Add a batch dimension to every array
features = features[np.newaxis]
adj = adj[np.newaxis]
y_train = y_train[np.newaxis]
y_val = y_val[np.newaxis]
y_test = y_test[np.newaxis]
train_mask = train_mask[np.newaxis]  # (2708,) -> (1, 2708)
val_mask = val_mask[np.newaxis]
test_mask = test_mask[np.newaxis]

# Turn the adjacency matrix into an additive attention mask (see below)
biases = process.adj_to_bias(adj, [nb_nodes], nhood=1)
```
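`adj_to_bias` converts the adjacency matrix into the `bias_mat` used inside the attention: entries reachable within `nhood` hops (self-loops included) become 0 and everything else becomes a large negative number, so non-neighbors vanish after the softmax. A sketch of this helper, following the shape of the repository's `process.py`:
```python
import numpy as np

def adj_to_bias(adj, sizes, nhood=1):
    """adj has shape (nb_graphs, N, N); returns 0 for reachable pairs,
    -1e9 for unreachable ones, for use as an additive softmax mask."""
    nb_graphs = adj.shape[0]
    mt = np.empty(adj.shape)
    for g in range(nb_graphs):
        mt[g] = np.eye(adj.shape[1])
        # multiply in (A + I) nhood times to mark the nhood-hop neighborhood
        for _ in range(nhood):
            mt[g] = np.matmul(mt[g], (adj[g] + np.eye(adj.shape[1])))
        # binarize: any positive entry means "reachable"
        for i in range(sizes[g]):
            for j in range(sizes[g]):
                if mt[g][i][j] > 0.0:
                    mt[g][i][j] = 1.0
    return -1e9 * (1.0 - mt)
```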
```python
with tf.Graph().as_default():
    with tf.name_scope('input'):
        # Placeholders for the inputs
        ftr_in = tf.placeholder(dtype=tf.float32, shape=(batch_size, nb_nodes, ft_size))
        bias_in = tf.placeholder(dtype=tf.float32, shape=(batch_size, nb_nodes, nb_nodes))
        lbl_in = tf.placeholder(dtype=tf.int32, shape=(batch_size, nb_nodes, nb_classes))
        msk_in = tf.placeholder(dtype=tf.int32, shape=(batch_size, nb_nodes))
        attn_drop = tf.placeholder(dtype=tf.float32, shape=())
        ffd_drop = tf.placeholder(dtype=tf.float32, shape=())
        is_train = tf.placeholder(dtype=tf.bool, shape=())

    logits = model.inference(ftr_in, nb_classes, nb_nodes, is_train,
                             attn_drop, ffd_drop,
                             bias_mat=bias_in,
                             hid_units=hid_units, n_heads=n_heads,
                             residual=residual, activation=nonlinearity)

    # Flatten the batch so that every node is one row
    log_resh = tf.reshape(logits, [-1, nb_classes])
    lab_resh = tf.reshape(lbl_in, [-1, nb_classes])
    msk_resh = tf.reshape(msk_in, [-1])

    # Loss and evaluation metric
    loss = model.masked_softmax_cross_entropy(log_resh, lab_resh, msk_resh)
    accuracy = model.masked_accuracy(log_resh, lab_resh, msk_resh)

    # Training op
    train_op = model.training(loss, lr, l2_coef)

    # Model checkpointing
    saver = tf.train.Saver()

    # global_variables_initializer returns an op that initializes
    # all global variables in the graph
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

    # Early-stopping bookkeeping: the best validation loss so far starts
    # at +inf, the best validation accuracy at 0
    vlss_mn = np.inf
    vacc_mx = 0.0
    curr_step = 0
```
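Training then proceeds inside the same Graph scope. A condensed sketch of the session loop with early stopping (simplified from the repository's `execute_cora.py`; `nb_epochs`, `patience`, and `checkpt_file` are assumed to be defined with the other hyperparameters):
```python
with tf.Session() as sess:
    sess.run(init_op)
    for epoch in range(nb_epochs):
        # one training step on the whole graph (batch_size = 1)
        _, loss_tr, acc_tr = sess.run([train_op, loss, accuracy],
            feed_dict={ftr_in: features, bias_in: biases, lbl_in: y_train,
                       msk_in: train_mask, is_train: True,
                       attn_drop: 0.6, ffd_drop: 0.6})
        # validation pass with dropout switched off
        loss_vl, acc_vl = sess.run([loss, accuracy],
            feed_dict={ftr_in: features, bias_in: biases, lbl_in: y_val,
                       msk_in: val_mask, is_train: False,
                       attn_drop: 0.0, ffd_drop: 0.0})
        # keep the checkpoint with the best validation metrics so far
        if acc_vl >= vacc_mx or loss_vl <= vlss_mn:
            vacc_mx = max(acc_vl, vacc_mx)
            vlss_mn = min(loss_vl, vlss_mn)
            curr_step = 0
            saver.save(sess, checkpt_file)
        else:
            curr_step += 1
            if curr_step == patience:  # stop after `patience` epochs without improvement
                break
```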
GAT Model Details
Loading the data
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
adj: the adjacency matrix
features: the feature matrix of all samples
y_*: the label matrices of the three splits
*_mask: boolean masks over the nodes; each split's mask hides the label entries that do not belong to that split (see the toy example below)
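A minimal self-contained illustration of how a mask hides the other splits' labels (toy data, not from Cora):
```python
import numpy as np

# Hypothetical 3-node graph with 2 classes: the mask zeroes out every
# label row that is not in its split
labels = np.array([[1, 0],
                   [0, 1],
                   [1, 0]])
train_mask = np.array([True, False, False])

y_train = np.zeros(labels.shape)
y_train[train_mask, :] = labels[train_mask, :]
print(y_train)  # only row 0 keeps its label; rows 1 and 2 are all zeros
```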
Building the GAT model
```python
class GAT(BaseGAttN):
    def inference(inputs, nb_classes, nb_nodes, training, attn_drop, ffd_drop,
            bias_mat, hid_units, n_heads, activation=tf.nn.elu, residual=False):
        # attns collects the outputs of the first layer's attention heads.
        # The first layer is built from the raw features; every later layer
        # is built on top of the previous layer's output.
        attns = []
        for _ in range(n_heads[0]):
            attns.append(layers.attn_head(inputs, bias_mat=bias_mat,
                out_sz=hid_units[0], activation=activation,
                in_drop=ffd_drop, coef_drop=attn_drop, residual=False))
        # Concatenate the first layer's heads along the feature axis
        h_1 = tf.concat(attns, axis=-1)
        # Build the remaining hidden layers (from the 2nd unit onward),
        # each consuming the previous concatenated output h_1
        for i in range(1, len(hid_units)):
            h_old = h_1  # kept by the repo but never used
            attns = []
            for _ in range(n_heads[i]):
                attns.append(layers.attn_head(h_1, bias_mat=bias_mat,
                    out_sz=hid_units[i], activation=activation,
                    in_drop=ffd_drop, coef_drop=attn_drop, residual=residual))
            h_1 = tf.concat(attns, axis=-1)
        # Output layer: no softmax here; the activation is the identity
        # (lambda x: x) and the softmax is applied later, inside the loss
        out = []
        for i in range(n_heads[-1]):
            out.append(layers.attn_head(h_1, bias_mat=bias_mat,
                out_sz=nb_classes, activation=lambda x: x,
                in_drop=ffd_drop, coef_drop=attn_drop, residual=False))
        # Average over the output heads, i.e. the 1/K factor in the paper
        logits = tf.add_n(out) / n_heads[-1]
        return logits
```
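This is also why `masked_softmax_cross_entropy` takes raw logits. A sketch of such a masked loss (the repository's `BaseGAttN` defines it along these lines; the explicit float cast is added here for clarity):
```python
def masked_softmax_cross_entropy(logits, labels, mask):
    """Softmax cross-entropy over all nodes, zeroed outside the mask and
    rescaled so the mean is taken only over the masked-in nodes."""
    labels = tf.cast(labels, dtype=tf.float32)
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)
    mask = tf.cast(mask, dtype=tf.float32)
    mask /= tf.reduce_mean(mask)  # rescale so masked-out nodes don't dilute the mean
    loss *= mask
    return tf.reduce_mean(loss)
```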
Implementation of the attention computation
```python
def attn_head(seq, out_sz, bias_mat, activation, in_drop=0.0, coef_drop=0.0, residual=False):
    with tf.name_scope('my_attn'):
        # Dropout on the input features
        if in_drop != 0.0:
            seq = tf.nn.dropout(seq, 1.0 - in_drop)

        # Shared linear transform W: a 1x1 convolution projects the input
        # features down to the output dimension out_sz
        seq_fts = tf.layers.conv1d(seq, out_sz, 1, use_bias=False)

        # simplest self-attention possible:
        # two more 1x1 convolutions produce the per-node scores
        # a_1^T (W h_i) and a_2^T (W h_j)
        f_1 = tf.layers.conv1d(seq_fts, 1, 1)
        f_2 = tf.layers.conv1d(seq_fts, 1, 1)

        # Combine the two score vectors into the full attention logits.
        # tf.transpose with perm=[0, 2, 1] swaps the last two axes, so
        # broadcasting f_1 + f_2^T yields a (batch, N, N) matrix with
        # logits[i][j] = f_1[i] + f_2[j]
        logits = f_1 + tf.transpose(f_2, [0, 2, 1])

        # LeakyReLU, add bias_mat (the -1e9 mask over non-neighbors),
        # then softmax-normalize each row over the neighborhood
        coefs = tf.nn.softmax(tf.nn.leaky_relu(logits) + bias_mat)

        # Dropout on the attention coefficients and the transformed features
        if coef_drop != 0.0:
            coefs = tf.nn.dropout(coefs, 1.0 - coef_drop)
        if in_drop != 0.0:
            seq_fts = tf.nn.dropout(seq_fts, 1.0 - in_drop)

        # Weighted sum: attention coefficients times the transformed features
        vals = tf.matmul(coefs, seq_fts)
        ret = tf.contrib.layers.bias_add(vals)

        # residual connection: add the input back in, projecting it first
        # if the dimensions differ; with this, every term of the paper's
        # update formula is in place
        if residual:
            if seq.shape[-1] != ret.shape[-1]:
                ret = ret + tf.layers.conv1d(seq, ret.shape[-1], 1)  # activation
            else:
                ret = ret + seq

        return activation(ret)  # activation
```
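In the notation of the GAT paper, `attn_head` computes

$$
e_{ij} = \mathrm{LeakyReLU}\!\left(\vec{a}^{\top}\,[\mathbf{W}\vec{h}_i \,\Vert\, \mathbf{W}\vec{h}_j]\right),\qquad
\alpha_{ij} = \frac{\exp(e_{ij})}{\sum_{k\in\mathcal{N}_i}\exp(e_{ik})},\qquad
\vec{h}'_i = \sigma\!\Big(\sum_{j\in\mathcal{N}_i}\alpha_{ij}\,\mathbf{W}\vec{h}_j\Big)
$$

The `f_1 + tf.transpose(f_2, [0, 2, 1])` trick works because the attention vector splits as $\vec{a} = [\vec{a}_1; \vec{a}_2]$, so $\vec{a}^{\top}[\mathbf{W}\vec{h}_i \Vert \mathbf{W}\vec{h}_j] = \vec{a}_1^{\top}\mathbf{W}\vec{h}_i + \vec{a}_2^{\top}\mathbf{W}\vec{h}_j$: `f_1` holds the first term per node $i$, `f_2` the second per node $j$, and broadcasting their sum fills the whole $N \times N$ logit matrix. Restricting the softmax to the neighborhood $\mathcal{N}_i$ is exactly what adding `bias_mat` (0 for neighbors, $-10^9$ otherwise) achieves.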