- DataWhale开源学习资料:https://github.com/datawhalechina/team-learning-nlp/tree/master/GNN
6.1 数据完全存于内存的数据集类
- 学习在PyG中如何自定义一个数据完全存于内存的数据集类。
`InMemoryDataset`基类简介
- 根文件夹(`root`)
- 传递的三个函数:`transform`、`pre_transform`、`pre_filter`
- 四个基本方法:`raw_file_names()`、`processed_file_names()`、`download()`、`process()`。
例子:定义一个`InMemoryDataset`子类
import os.path as osp
import torch
from torch_geometric.data import (InMemoryDataset, download_url)
from torch_geometric.io import read_planetoid_data
class PlanetoidPubMed(InMemoryDataset):
    r"""The citation network dataset "PubMed" from the
    `"Revisiting Semi-Supervised Learning with Graph Embeddings"
    <https://arxiv.org/abs/1603.08861>`_ paper.
    Nodes represent documents and edges represent citation links.
    Training, validation and test splits are given by binary masks.

    Args:
        root (string): Root directory where the dataset should be saved.
        split (string): The type of dataset split
            (:obj:`"public"`, :obj:`"full"`, :obj:`"random"`).
            If set to :obj:`"public"`, the split will be the public fixed split
            from the
            `"Revisiting Semi-Supervised Learning with Graph Embeddings"
            <https://arxiv.org/abs/1603.08861>`_ paper.
            If set to :obj:`"full"`, all nodes except those in the validation
            and test sets will be used for training (as in the
            `"FastGCN: Fast Learning with Graph Convolutional Networks via
            Importance Sampling" <https://arxiv.org/abs/1801.10247>`_ paper).
            If set to :obj:`"random"`, train, validation, and test sets will be
            randomly generated, according to :obj:`num_train_per_class`,
            :obj:`num_val` and :obj:`num_test`. (default: :obj:`"public"`)
        num_train_per_class (int, optional): The number of training samples
            per class in case of :obj:`"random"` split. (default: :obj:`20`)
        num_val (int, optional): The number of validation samples in case of
            :obj:`"random"` split. (default: :obj:`500`)
        num_test (int, optional): The number of test samples in case of
            :obj:`"random"` split. (default: :obj:`1000`)
        transform (callable, optional): A function/transform that takes in an
            :obj:`torch_geometric.data.Data` object and returns a transformed
            version. The data object will be transformed before every access.
            (default: :obj:`None`)
        pre_transform (callable, optional): A function/transform that takes in
            an :obj:`torch_geometric.data.Data` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. (default: :obj:`None`)

    Raises:
        AssertionError: If :obj:`split` is not one of the supported values.
    """

    # Base URL the raw Planetoid files are downloaded from.
    url = 'https://github.com/kimiyoung/planetoid/raw/master/data'

    def __init__(self, root, split="public", num_train_per_class=20,
                 num_val=500, num_test=1000, transform=None,
                 pre_transform=None):
        # Reject an unsupported split before the base constructor runs, so a
        # typo does not cost a download/processing pass first.
        assert split in ['public', 'full', 'random'], (
            "split must be 'public', 'full' or 'random', got {!r}".format(split))

        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])
        self.split = split

        if split == 'full':
            # Train on every node that is neither a validation nor a test node.
            data = self.get(0)
            data.train_mask.fill_(True)
            data.train_mask[data.val_mask | data.test_mask] = False
            self.data, self.slices = self.collate([data])

        elif split == 'random':
            data = self.get(0)

            # Pick up to `num_train_per_class` random training nodes per class.
            data.train_mask.fill_(False)
            for c in range(self.num_classes):
                idx = (data.y == c).nonzero(as_tuple=False).view(-1)
                idx = idx[torch.randperm(idx.size(0))[:num_train_per_class]]
                data.train_mask[idx] = True

            # Shuffle the remaining nodes and carve validation/test sets
            # out of them, in that order.
            remaining = (~data.train_mask).nonzero(as_tuple=False).view(-1)
            remaining = remaining[torch.randperm(remaining.size(0))]

            data.val_mask.fill_(False)
            data.val_mask[remaining[:num_val]] = True

            data.test_mask.fill_(False)
            data.test_mask[remaining[num_val:num_val + num_test]] = True

            self.data, self.slices = self.collate([data])

    @property
    def raw_dir(self):
        """Directory under ``root`` that holds the downloaded raw files."""
        return osp.join(self.root, 'raw')

    @property
    def processed_dir(self):
        """Directory under ``root`` that holds the processed dataset file."""
        return osp.join(self.root, 'processed')

    @property
    def raw_file_names(self):
        """Names of the raw files expected inside :obj:`raw_dir`."""
        names = ['x', 'tx', 'allx', 'y', 'ty', 'ally', 'graph', 'test.index']
        return ['ind.pubmed.{}'.format(name) for name in names]

    @property
    def processed_file_names(self):
        """Name of the processed file expected inside :obj:`processed_dir`."""
        return 'data.pt'

    def download(self):
        """Download every raw file from :obj:`url` into :obj:`raw_dir`."""
        for name in self.raw_file_names:
            download_url('{}/{}'.format(self.url, name), self.raw_dir)

    def process(self):
        """Read the raw files, apply ``pre_transform`` and save the result."""
        data = read_planetoid_data(self.raw_dir, 'pubmed')
        data = data if self.pre_transform is None else self.pre_transform(data)
        torch.save(self.collate([data]), self.processed_paths[0])

    def __repr__(self):
        # This class never sets a `name` attribute, so build the
        # representation from the class name instead.
        return '{}()'.format(self.__class__.__name__)
# Build (download + process on first use) the PubMed dataset and report
# its basic statistics.
dataset = PlanetoidPubMed('../dataset/Planetoid/PubMed')
graph = dataset[0]  # the dataset holds a single graph
print(dataset.num_classes)
print(graph.num_nodes)
print(graph.num_edges)
print(graph.num_features)
# 3
# 19717
# 88648
# 500
# The dataset has 3 classes, 19,717 nodes and 88,648 edges, and every node
# carries a 500-dimensional feature vector.
6.2 节点预测与边预测任务实践
- 实践开始~
- 重定义一个GAT神经网络,使其能够通过参数定义`GATConv`的层数,以及每一层`GATConv`的`out_channels`。
class GAT(torch.nn.Module):
    """GAT node classifier with a configurable stack of ``GATConv`` layers.

    Args:
        num_features (int): Dimension of the input node features.
        hidden_channels_list (list[int] or tuple[int]): ``out_channels`` of
            each ``GATConv`` layer; its length sets the number of layers.
            Must not be empty.
        num_classes (int): Number of output classes.
    """

    def __init__(self, num_features, hidden_channels_list, num_classes):
        # Imported here so the class does not depend on imports that the
        # surrounding file never performs.
        from torch_geometric.nn import GATConv, Sequential

        super().__init__()
        torch.manual_seed(12345)

        # Channel sizes seen along the stack: input, then each hidden width.
        # list(...) lets callers pass a tuple as well as a list.
        hns = [num_features] + list(hidden_channels_list)

        conv_list = []
        for in_channels, out_channels in zip(hns[:-1], hns[1:]):
            # Each conv consumes (x, edge_index) and produces the new x.
            conv_list.append(
                (GATConv(in_channels, out_channels), 'x, edge_index -> x'))
            conv_list.append(torch.nn.ReLU(inplace=True))

        self.convseq = Sequential('x, edge_index', conv_list)
        self.linear = torch.nn.Linear(hidden_channels_list[-1], num_classes)

    def forward(self, x, edge_index):
        """Return per-node class scores for features ``x`` on ``edge_index``."""
        x = self.convseq(x, edge_index)
        x = torch.nn.functional.dropout(x, p=0.5, training=self.training)
        x = self.linear(x)
        return x
边预测任务实践
- 边预测任务,是预测两个节点之间是否存在边。
- 为了构建边预测任务,我们需要生成一些负样本,即采样一些不存在边的节点对作为负样本边,正负样本应平衡。
- 将样本分为训练集、验证集和测试集三个集合。
- 框架方法:train_test_split_edges(data, val_ratio=0.05, test_ratio=0.1)
获取数据集并进行分析:
import os.path as osp
from torch_geometric.utils import negative_sampling
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.utils import train_test_split_edges
# Load Cora with row-normalised node features.
dataset = 'Cora'
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset)
dataset = Planetoid(path, dataset, transform=T.NormalizeFeatures())
data = dataset[0]

# Node labels and node masks are not used for link prediction.
data.train_mask = data.val_mask = data.test_mask = data.y = None

# Inspect the full edge set BEFORE splitting: the split replaces
# `edge_index` with the per-split attributes listed below, so it is no
# longer available afterwards.
print(data.edge_index.shape)
# torch.Size([2, 10556])

data = train_test_split_edges(data)

for key in data.keys:
    print(key, getattr(data, key).shape)
# x torch.Size([2708, 1433])
# val_pos_edge_index torch.Size([2, 263])
# test_pos_edge_index torch.Size([2, 527])
# train_pos_edge_index torch.Size([2, 8976])
# train_neg_adj_mask torch.Size([2708, 2708])
# val_neg_edge_index torch.Size([2, 263])
# test_neg_edge_index torch.Size([2, 527])

# The split sizes do not add up to the original edge count:
# 263 + 527 + 8976 = 9766 != 10556
# but they do once the training edges are halved:
# 263 + 527 + 8976/2 = 5278 = 10556/2
# The numbers are consistent with training edges being stored in both
# directions while validation/test edges are stored in one direction only.
构建神经网络模型
import torch
from torch_geometric.nn import GCNConv
class Net(torch.nn.Module):
    """Two-layer GCN encoder with an inner-product decoder for link prediction."""

    def __init__(self, in_channels, out_channels):
        super(Net, self).__init__()
        self.conv1 = GCNConv(in_channels, 128)
        self.conv2 = GCNConv(128, out_channels)

    def encode(self, x, edge_index):
        """Return node embeddings computed from ``x`` over ``edge_index``."""
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

    def decode(self, z, pos_edge_index, neg_edge_index):
        """Score candidate edges, positives first and negatives after.

        The score of an edge is the dot product of the embeddings of its
        two end points.
        """
        candidates = torch.cat([pos_edge_index, neg_edge_index], dim=-1)
        sources = z[candidates[0]]
        targets = z[candidates[1]]
        return (sources * targets).sum(dim=-1)

    def decode_all(self, z):
        """Return every node pair whose score is positive, as a ``[2, E]`` tensor."""
        scores = torch.matmul(z, z.t())
        return torch.nonzero(scores > 0, as_tuple=False).t()
定义单个epoch的训练过程
def get_link_labels(pos_edge_index, neg_edge_index):
    """Build the 0/1 target vector for positive edges followed by negatives.

    Only the second dimension (the edge count) of each argument is used.
    The returned float tensor holds one ``1.`` per positive edge and then
    one ``0.`` per negative edge.
    """
    num_pos = pos_edge_index.size(1)
    num_neg = neg_edge_index.size(1)
    return torch.cat([
        torch.ones(num_pos, dtype=torch.float),
        torch.zeros(num_neg, dtype=torch.float),
    ])
def train(data, model, optimizer):
    """Run one training epoch for link prediction and return the loss.

    Args:
        data: Graph object providing ``x``, ``num_nodes`` and
            ``train_pos_edge_index``.
        model: Model exposing ``encode(x, edge_index)`` and
            ``decode(z, pos_edge_index, neg_edge_index)``.
        optimizer: Optimizer over the model's parameters.

    Returns:
        The binary cross-entropy loss tensor of this epoch.
    """
    model.train()

    pos_edge_index = data.train_pos_edge_index

    # Draw a fresh set of negatives every epoch, as many as there are
    # positive training edges, so the two classes stay balanced.
    neg_edge_index = negative_sampling(
        edge_index=pos_edge_index,
        num_nodes=data.num_nodes,
        num_neg_samples=pos_edge_index.size(1))

    optimizer.zero_grad()

    z = model.encode(data.x, pos_edge_index)
    link_logits = model.decode(z, pos_edge_index, neg_edge_index)
    link_labels = get_link_labels(pos_edge_index, neg_edge_index).to(data.x.device)

    # Referenced through `torch.nn.functional` because this file never
    # imports the usual `F` alias.
    loss = torch.nn.functional.binary_cross_entropy_with_logits(
        link_logits, link_labels)
    loss.backward()
    optimizer.step()

    return loss
- 通常图上存在边的节点对的数量往往少于不存在边的节点对的数量。一方面,为了类平衡,在每一个`epoch`的训练过程中,我们只需要用到与正样本一样数量的负样本;另一方面,每个`epoch`重新采样可以让模型见到不同的负样本。综合以上两点原因,我们在每一个`epoch`的训练过程中都采样与正样本数量一样的负样本,这样我们既做到了类平衡,又增加了训练负样本的丰富性。`get_link_labels`函数用于生成完整训练集的标签。在负样本采样时,我们传递了`train_pos_edge_index`为参数,于是`negative_sampling`函数只会在训练集中不存在边的结点对中采样。
定义单个epoch验证与测试过程
@torch.no_grad()
def test(data, model):
    """Evaluate link prediction and return ``[val_auc, test_auc]``.

    Node embeddings are computed once from the training edges and reused
    to score the validation and test edge sets.
    """
    # NOTE(review): `roc_auc_score` is not imported anywhere in this file;
    # presumably `sklearn.metrics.roc_auc_score` - confirm and import it.
    model.eval()
    z = model.encode(data.x, data.train_pos_edge_index)

    def _auc(split):
        # ROC-AUC of the predicted link probabilities for one split.
        pos = data[f'{split}_pos_edge_index']
        neg = data[f'{split}_neg_edge_index']
        probs = model.decode(z, pos, neg).sigmoid()
        labels = get_link_labels(pos, neg)
        return roc_auc_score(labels.cpu(), probs.cpu())

    return [_auc(split) for split in ('val', 'test')]
运行完整的训练、验证与测试
def main():
    """Train and evaluate the link-prediction model on Cora.

    Trains for 100 epochs, printing the loss, the validation AUC and the
    test AUC measured at the best validation AUC so far.

    Returns:
        The edge index predicted by ``decode_all`` from the final
        embeddings.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    dataset_name = 'Cora'
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                    dataset_name)
    dataset = Planetoid(path, dataset_name, transform=T.NormalizeFeatures())
    data = dataset[0]

    # Node labels and node masks are not used for link prediction.
    data.train_mask = data.val_mask = data.test_mask = data.y = None
    data = train_test_split_edges(data)
    data = data.to(device)

    model = Net(dataset.num_features, 64).to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

    best_val_auc = test_auc = 0
    for epoch in range(1, 101):
        loss = train(data, model, optimizer)
        val_auc, tmp_test_auc = test(data, model)
        # Report the test AUC of the epoch with the best validation AUC.
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            test_auc = tmp_test_auc
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val: {val_auc:.4f}, '
              f'Test: {test_auc:.4f}')

    # Final inference only: no gradients are needed, and the dense
    # node-by-node score matrix would otherwise be kept in the autograd graph.
    model.eval()
    with torch.no_grad():
        z = model.encode(data.x, data.train_pos_edge_index)
        final_edge_index = model.decode_all(z)
    return final_edge_index


if __name__ == "__main__":
    main()
- 重点关注`InMemoryDataset`子类的运行流程与实现四个函数的规范
作业:待补充~~