GNN Learning 4: Node Prediction and Edge Prediction

Dataset classes that hold all data in memory

Overview of the InMemoryDataset base class

Constructor arguments:
root: the root folder, which indicates where the dataset should be saved. Under the root there are at least two subfolders: raw_dir, which stores the unprocessed files (dataset files downloaded from the web are placed here), and processed_dir, where the processed dataset is saved.
transform: may be None. The transform function takes a Data object as input, transforms it, and returns it. It is called on every data access, so it should be used for data augmentation (see the sketch after this list).
pre_transform: may be None. The pre_transform function takes a Data object as input, transforms it, and returns it. It is called before the sample Data objects are saved to disk, so it is best suited to heavy precomputation that only needs to run once.
pre_filter: the pre_filter function can manually filter out data objects before they are saved. One use case is filtering samples by class.
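
For example (a minimal sketch using the built-in Planetoid dataset; the root paths are arbitrary), the practical difference is that a transform runs on every access, while a pre_transform is baked into the processed file on disk:

import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid

# Applied on every access: node features are normalized on the fly.
dataset = Planetoid(root='./dataset', name='Cora',
                    transform=T.NormalizeFeatures())

# Applied once, before processed/data.pt is written to disk.
dataset_pre = Planetoid(root='./dataset_pre', name='Cora',
                        pre_transform=T.NormalizeFeatures())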

To create an InMemoryDataset, we need to implement four basic methods:
raw_file_names(): a property method that returns a list of file names. These files should be found in the raw_dir folder; if any of them is missing, the download() method is called to fetch the files into raw_dir.
processed_file_names(): a property method that returns a list of file names. These files should be found in the processed_dir folder; if any of them is missing, the process() method is called.
download(): downloads the raw data files into the raw_dir folder.
process(): preprocesses the samples and saves them into the processed_dir folder.

import torch
from torch_geometric.data import InMemoryDataset, download_url

class MyOwnDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        super().__init__(root=root, transform=transform,
                         pre_transform=pre_transform, pre_filter=pre_filter)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return ['some_file_1', 'some_file_2', ...]

    @property
    def processed_file_names(self):
        return ['data.pt']

    def download(self):
        # Download to `self.raw_dir`.
        download_url(url, self.raw_dir)

    def process(self):
        # Read data into huge `Data` list.
        data_list = [...]
        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]
        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])
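
Once the placeholders are filled in, the class is used like any other dataset (hypothetical usage; MyOwnDataset above is only a template, so `url` and the file-name lists must be supplied first):

dataset = MyOwnDataset(root='./my_dataset')  # triggers download()/process() on the first run
data = dataset[0]                            # an individual Data object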

Defining an InMemoryDataset subclass

We take the public PubMed dataset as an example. PubMed stores a citation network of scientific articles: articles correspond to nodes of the graph, and if two articles have a citation relationship (regardless of which cites which), an edge connects their nodes.

import os.path as osp
import torch
from torch_geometric.data import InMemoryDataset, download_url
from torch_geometric.io import read_planetoid_data
class PlanetoidPubMed(InMemoryDataset):
    r"""The citation network datasets "PubMed" from
    the
    `"Revisiting Semi-Supervised Learning with Graph
    Embeddings"
    `_ paper.
    Nodes represent documents and edges represent
    citation links.
    Training, validation and test splits are given by
    binary masks.
    Args:
    root (string): Root directory where the
    dataset should be saved.
    split (string): The type of dataset split
    (:obj:`"public"`, :obj:`"full"`,
    :obj:`"random"`).
    If set to :obj:`"public"`, the split will
    be the public fixed split
    from the
    `"Revisiting Semi-Supervised Learning
    with Graph Embeddings"
    `_
    paper.
    If set to :obj:`"full"`, all nodes except
    those in the validation
    and test sets will be used for training
    (as in the
    `"FastGCN: Fast Learning with Graph
    Convolutional Networks via
    Importance Sampling"
    `_ paper).
    If set to :obj:`"random"`, train,
    validation, and test sets will be
    randomly generated, according to
    :obj:`num_train_per_class`,
    :obj:`num_val` and :obj:`num_test`.
    (default: :obj:`"public"`)
    num_train_per_class (int, optional): The
    number of training samples
    per class in case of :obj:`"random"`
    split. (default: :obj:`20`)
    num_val (int, optional): The number of
    validation samples in case of
    :obj:`"random"` split. (default:
    :obj:`500`)
    num_test (int, optional): The number of test
    samples in case of
    :obj:`"random"` split. (default:
    :obj:`1000`)
    transform (callable, optional): A
    function/transform that takes in an
    :obj:`torch_geometric.data.Data` object
    and returns a transformed
    version. The data object will be
    transformed before every access.
    (default: :obj:`None`)
    pre_transform (callable, optional): A
    function/transform that takes in
    an :obj:`torch_geometric.data.Data`
    object and returns a
    transformed version. The data object will
    be transformed before
    being saved to disk. (default:
    :obj:`None`)
    """
    url = 'https://github.com/kimiyoung/planetoid/raw/master/data'

    def __init__(self, root, split="public", num_train_per_class=20,
                 num_val=500, num_test=1000, transform=None, pre_transform=None):
        super(PlanetoidPubMed, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

        self.split = split
        assert self.split in ['public', 'full', 'random']

        if split == 'full':
            data = self.get(0)
            data.train_mask.fill_(True)
            data.train_mask[data.val_mask | data.test_mask] = False
            self.data, self.slices = self.collate([data])
        elif split == 'random':
            data = self.get(0)
            data.train_mask.fill_(False)
            for c in range(self.num_classes):
                idx = (data.y == c).nonzero(as_tuple=False).view(-1)
                idx = idx[torch.randperm(idx.size(0))[:num_train_per_class]]
                data.train_mask[idx] = True

            remaining = (~data.train_mask).nonzero(as_tuple=False).view(-1)
            remaining = remaining[torch.randperm(remaining.size(0))]

            data.val_mask.fill_(False)
            data.val_mask[remaining[:num_val]] = True

            data.test_mask.fill_(False)
            data.test_mask[remaining[num_val:num_val + num_test]] = True

            self.data, self.slices = self.collate([data])
    @property
    def raw_dir(self):
        return osp.join(self.root, 'raw')

    @property
    def processed_dir(self):
        return osp.join(self.root, 'processed')

    @property
    def raw_file_names(self):
        names = ['x', 'tx', 'allx', 'y', 'ty', 'ally', 'graph', 'test.index']
        return ['ind.pubmed.{}'.format(name) for name in names]

    @property
    def processed_file_names(self):
        return 'data.pt'

    def download(self):
        for name in self.raw_file_names:
            download_url('{}/{}'.format(self.url, name), self.raw_dir)

    def process(self):
        data = read_planetoid_data(self.raw_dir, 'pubmed')
        data = data if self.pre_transform is None else self.pre_transform(data)
        torch.save(self.collate([data]), self.processed_paths[0])

    def __repr__(self):
        return '{}()'.format(self.__class__.__name__)
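
With the class in place, instantiating it triggers the whole pipeline (a minimal sketch; the root path is arbitrary):

dataset = PlanetoidPubMed('./dataset/PlanetoidPubMed')
print(dataset.num_classes)  # 3
data = dataset[0]
print(data.num_nodes, data.num_edges, data.num_features)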


Run flow:
1. First, check whether the raw data files have been downloaded:
- check whether every file returned by the raw_file_names() property method exists under self.raw_dir;
- if any file is missing, call download() to fetch the raw files;
- here self.raw_dir is osp.join(self.root, 'raw').
2. Then, check whether the data has been processed:
- first check that the pre_transform recorded from the previous run matches the current one;
- then check that the previously used pre_filter matches the current one;
- finally check whether the processed data file already exists; if all checks pass, the cached file is loaded, otherwise process() runs (as sketched below).
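
In other words, only the first instantiation pays the cost of downloading and processing (a sketch continuing the example above):

# First run: downloads into raw/ and writes processed/data.pt.
dataset = PlanetoidPubMed('./dataset/PlanetoidPubMed')
# Subsequent runs: all checks pass, so processed/data.pt is loaded directly.
dataset = PlanetoidPubMed('./dataset/PlanetoidPubMed')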

Node Prediction and Edge Prediction in Practice

Node prediction in practice

import torch
import torch.nn.functional as F
from torch.nn import Linear, ReLU
from torch_geometric.nn import GATConv, Sequential

class GAT(torch.nn.Module):
    def __init__(self, num_features, hidden_channels_list, num_classes):
        super(GAT, self).__init__()
        torch.manual_seed(12345)
        hns = [num_features] + hidden_channels_list
        conv_list = []
        for idx in range(len(hidden_channels_list)):
            conv_list.append((GATConv(hns[idx], hns[idx + 1]), 'x, edge_index -> x'))
            conv_list.append(ReLU(inplace=True))
        self.convseq = Sequential('x, edge_index', conv_list)
        self.linear = Linear(hidden_channels_list[-1], num_classes)

    def forward(self, x, edge_index):
        x = self.convseq(x, edge_index)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.linear(x)
        return x
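
Training this model for node classification follows the usual pattern (a minimal sketch, assuming the PlanetoidPubMed dataset defined earlier; the hidden sizes and hyperparameters are illustrative):

dataset = PlanetoidPubMed('./dataset/PlanetoidPubMed')
data = dataset[0]
model = GAT(num_features=dataset.num_features,
            hidden_channels_list=[200, 100],
            num_classes=dataset.num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(1, 201):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)                             # logits for every node
    loss = criterion(out[data.train_mask], data.y[data.train_mask])  # loss on training nodes only
    loss.backward()
    optimizer.step()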

Edge prediction in practice

import os.path as osp
from torch_geometric.utils import negative_sampling
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.transforms import NormalizeFeatures
from torch_geometric.utils import train_test_split_edges
dataset = Planetoid(root='./dataset', name='Cora', transform=NormalizeFeatures())
data = dataset[0]
data.train_mask = data.val_mask = data.test_mask = data.y = None
data = train_test_split_edges(data)
for key in data.keys:
    print(key, getattr(data, key).shape)
Output:

x torch.Size([2708, 1433])
val_pos_edge_index torch.Size([2, 263])
test_pos_edge_index torch.Size([2, 527])
train_pos_edge_index torch.Size([2, 8976])
train_neg_adj_mask torch.Size([2708, 2708])
val_neg_edge_index torch.Size([2, 263])
test_neg_edge_index torch.Size([2, 527])

Building the network

import torch
from torch_geometric.nn import GCNConv
class Net(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Net, self).__init__()
        self.conv1 = GCNConv(in_channels, 128)
        self.conv2 = GCNConv(128, out_channels)
    def encode(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = x.relu()
        return self.conv2(x, edge_index)
    def decode(self, z, pos_edge_index, neg_edge_index):
        # Score each candidate edge by the inner product of its endpoint embeddings.
        edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1)
        return (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1)

    def decode_all(self, z):
        # Score all node pairs at once and keep the pairs with positive logits.
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple=False).t()
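
The decoder scores a candidate edge (i, j) by the inner product of the two node embeddings, z_i · z_j; decode_all simply computes this for every pair via z @ z.t(). A toy sketch of the per-edge version:

import torch

z = torch.tensor([[1.0, 0.0],
                  [1.0, 0.0],
                  [0.0, 1.0]])          # embeddings for 3 nodes
edge_index = torch.tensor([[0, 0],
                           [1, 2]])     # candidate edges (0,1) and (0,2)
scores = (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1)
print(scores)  # tensor([1., 0.]): (0,1) scores high, (0,2) scores zero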

Defining the training procedure for a single epoch

import torch.nn.functional as F

def get_link_labels(pos_edge_index, neg_edge_index):
    num_links = pos_edge_index.size(1) + neg_edge_index.size(1)
    link_labels = torch.zeros(num_links, dtype=torch.float)
    link_labels[:pos_edge_index.size(1)] = 1.
    return link_labels

def train(data, model, optimizer):
    model.train()
    neg_edge_index = negative_sampling(
        edge_index=data.train_pos_edge_index,
        num_nodes=data.num_nodes,
        num_neg_samples=data.train_pos_edge_index.size(1))
    optimizer.zero_grad()
    z = model.encode(data.x, data.train_pos_edge_index)
    link_logits = model.decode(z, data.train_pos_edge_index, neg_edge_index)
    link_labels = get_link_labels(data.train_pos_edge_index, neg_edge_index).to(data.x.device)
    loss = F.binary_cross_entropy_with_logits(link_logits, link_labels)
    loss.backward()
    optimizer.step()
    return loss

In most graphs, node pairs connected by an edge are far fewer than unconnected pairs. To keep the classes balanced, in each training epoch we sample as many negative examples as there are positive ones; this achieves class balance while also increasing the diversity of the negative training samples across epochs. The get_link_labels function generates the labels for the full training set. When sampling negatives, we pass train_pos_edge_index as an argument, so negative_sampling only draws from node pairs that have no edge in the training set. During training only the training set should be visible, while the validation and test sets stay hidden; at the same time, we must encode every node at this stage, so we assume the positive training edges touch all nodes, which lets us compute embeddings for all of them.
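
A quick toy illustration of negative_sampling (node and edge counts are arbitrary):

import torch
from torch_geometric.utils import negative_sampling

pos_edge_index = torch.tensor([[0, 1, 2],
                               [1, 2, 3]])   # 3 existing edges among 4 nodes
neg_edge_index = negative_sampling(
    edge_index=pos_edge_index,               # sample only pairs NOT in this set
    num_nodes=4,
    num_neg_samples=pos_edge_index.size(1))  # as many negatives as positives
print(neg_edge_index.shape)  # torch.Size([2, 3])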

Defining the validation and test procedure for a single epoch

from sklearn.metrics import roc_auc_score

@torch.no_grad()
def test(data, model):
    model.eval()
    z = model.encode(data.x, data.train_pos_edge_index)
    results = []
    for prefix in ['val', 'test']:
        pos_edge_index = data[f'{prefix}_pos_edge_index']
        neg_edge_index = data[f'{prefix}_neg_edge_index']
        link_logits = model.decode(z, pos_edge_index, neg_edge_index)
        link_probs = link_logits.sigmoid()
        link_labels = get_link_labels(pos_edge_index, neg_edge_index)
        results.append(roc_auc_score(link_labels.cpu(), link_probs.cpu()))
    return results

Running the full training, validation, and test loop

def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    dataset = 'Cora'
    path = osp.join(osp.dirname(osp.realpath(__file__)), '.', 'dataset', dataset)
    dataset = Planetoid(path, dataset, transform=T.NormalizeFeatures())
    data = dataset[0]
    ground_truth_edge_index = data.edge_index.to(device)
    data.train_mask = data.val_mask = data.test_mask = data.y = None
    data = train_test_split_edges(data)
    data = data.to(device)

    model = Net(dataset.num_features, 64).to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

    best_val_auc = test_auc = 0
    for epoch in range(1, 101):
        loss = train(data, model, optimizer)
        val_auc, tmp_test_auc = test(data, model)
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            test_auc = tmp_test_auc
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, '
              f'Val: {val_auc:.4f}, Test: {test_auc:.4f}')

    z = model.encode(data.x, data.train_pos_edge_index)
    final_edge_index = model.decode_all(z)

if __name__ == '__main__':
    main()

Reference: https://github.com/datawhalechina/team-learning-nlp/tree/master/GNN
