Constructor arguments (a usage sketch follows this list):
root: the root folder, which indicates where the dataset should be stored. Under the root directory there are at least two folders: 1. a raw_dir folder holding the unprocessed files (dataset files downloaded from the web are placed here); 2. a processed_dir folder where the processed dataset is saved.
transform: may be None. The transform function takes a Data object as input and returns a transformed version. It is called on every data access, so it should be used for data augmentation.
pre_transform: may be None. The pre_transform function takes a Data object as input and returns a transformed version. It is called before the Data objects are saved to disk, so it is best used for heavy precomputation that only needs to be done once.
pre_filter: the pre_filter function can filter out data objects before they are saved. One use case is filtering samples by class.
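As a minimal sketch of how these arguments are typically passed (it instantiates the MyOwnDataset class defined below; the root path, the choice of transforms and the pre_filter rule are made up purely for illustration):

import torch_geometric.transforms as T

# Hypothetical instantiation of the MyOwnDataset class defined below;
# the root path and the pre_filter rule are invented for illustration.
dataset = MyOwnDataset(
    root='dataset/my_own',                     # raw/ and processed/ are created under this folder
    transform=T.NormalizeFeatures(),           # applied on every access (e.g. augmentation / normalization)
    pre_transform=T.NormalizeFeatures(),       # applied once per sample, before saving to processed_dir
    pre_filter=lambda data: int(data.y) == 0,  # keep only class-0 samples before saving
)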
To create an InMemoryDataset, we need to implement four basic methods:
raw_file_names(): a property method returning a list of file names that should be found in the raw_dir folder; if any of them is missing, download() is called to download the files into raw_dir.
processed_file_names(): a property method returning a list of file names that should be found in the processed_dir folder; if any of them is missing, process() is called.
download(): downloads the raw data files into raw_dir.
process(): preprocesses the samples and saves the result into processed_dir.
import torch
from torch_geometric.data import InMemoryDataset, download_url


class MyOwnDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        super().__init__(root=root, transform=transform,
                         pre_transform=pre_transform, pre_filter=pre_filter)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return ['some_file_1', 'some_file_2', ...]

    @property
    def processed_file_names(self):
        return ['data.pt']

    def download(self):
        # Download to `self.raw_dir`.
        download_url(url, self.raw_dir)

    def process(self):
        # Read data into huge `Data` list.
        data_list = [...]

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])
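The `...` placeholders above (the raw file names, the download URL, and the Data list) must be filled in for a concrete dataset. Note that `self.processed_paths` is simply `processed_dir` joined with `processed_file_names`, so the template loads `root/processed/data.pt` once `process()` has run. A quick sanity check (the root path here is hypothetical):

import os.path as osp

root = 'dataset/my_own'       # hypothetical root folder
processed_file = 'data.pt'    # as returned by processed_file_names
print(osp.join(root, 'processed', processed_file))  # -> dataset/my_own/processed/data.pt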
Take the public PubMed dataset as an example. PubMed stores a citation network of papers: papers correspond to the nodes of the graph, and if two papers have a citation relationship (regardless of which cites which), there is an edge between the corresponding nodes.
import os.path as osp

import torch
from torch_geometric.data import InMemoryDataset, download_url
from torch_geometric.io import read_planetoid_data


class PlanetoidPubMed(InMemoryDataset):
    r"""The citation network dataset "PubMed" from the
    "Revisiting Semi-Supervised Learning with Graph Embeddings" paper.
    Nodes represent documents and edges represent citation links.
    Training, validation and test splits are given by binary masks.

    Args:
        root (string): Root directory where the dataset should be saved.
        split (string): The type of dataset split (:obj:`"public"`,
            :obj:`"full"`, :obj:`"random"`).
            If set to :obj:`"public"`, the split will be the public fixed
            split from the "Revisiting Semi-Supervised Learning with Graph
            Embeddings" paper.
            If set to :obj:`"full"`, all nodes except those in the
            validation and test sets will be used for training (as in the
            "FastGCN: Fast Learning with Graph Convolutional Networks via
            Importance Sampling" paper).
            If set to :obj:`"random"`, train, validation, and test sets
            will be randomly generated, according to
            :obj:`num_train_per_class`, :obj:`num_val` and :obj:`num_test`.
            (default: :obj:`"public"`)
        num_train_per_class (int, optional): The number of training samples
            per class in case of :obj:`"random"` split. (default: :obj:`20`)
        num_val (int, optional): The number of validation samples in case of
            :obj:`"random"` split. (default: :obj:`500`)
        num_test (int, optional): The number of test samples in case of
            :obj:`"random"` split. (default: :obj:`1000`)
        transform (callable, optional): A function/transform that takes in a
            :obj:`torch_geometric.data.Data` object and returns a transformed
            version. The data object will be transformed before every access.
            (default: :obj:`None`)
        pre_transform (callable, optional): A function/transform that takes
            in a :obj:`torch_geometric.data.Data` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. (default: :obj:`None`)
    """

    url = 'https://github.com/kimiyoung/planetoid/raw/master/data'

    def __init__(self, root, split="public", num_train_per_class=20,
                 num_val=500, num_test=1000, transform=None,
                 pre_transform=None):
        super(PlanetoidPubMed, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

        self.split = split
        assert self.split in ['public', 'full', 'random']

        if split == 'full':
            data = self.get(0)
            data.train_mask.fill_(True)
            data.train_mask[data.val_mask | data.test_mask] = False
            self.data, self.slices = self.collate([data])

        elif split == 'random':
            data = self.get(0)
            data.train_mask.fill_(False)
            for c in range(self.num_classes):
                idx = (data.y == c).nonzero(as_tuple=False).view(-1)
                idx = idx[torch.randperm(idx.size(0))[:num_train_per_class]]
                data.train_mask[idx] = True

            remaining = (~data.train_mask).nonzero(as_tuple=False).view(-1)
            remaining = remaining[torch.randperm(remaining.size(0))]

            data.val_mask.fill_(False)
            data.val_mask[remaining[:num_val]] = True

            data.test_mask.fill_(False)
            data.test_mask[remaining[num_val:num_val + num_test]] = True

            self.data, self.slices = self.collate([data])

    @property
    def raw_dir(self):
        return osp.join(self.root, 'raw')

    @property
    def processed_dir(self):
        return osp.join(self.root, 'processed')

    @property
    def raw_file_names(self):
        names = ['x', 'tx', 'allx', 'y', 'ty', 'ally', 'graph', 'test.index']
        return ['ind.pubmed.{}'.format(name) for name in names]

    @property
    def processed_file_names(self):
        return 'data.pt'

    def download(self):
        for name in self.raw_file_names:
            download_url('{}/{}'.format(self.url, name), self.raw_dir)

    def process(self):
        data = read_planetoid_data(self.raw_dir, 'pubmed')
        data = data if self.pre_transform is None else self.pre_transform(data)
        torch.save(self.collate([data]), self.processed_paths[0])
    def __repr__(self):
        # The original snippet used `self.name`, which this class never sets;
        # use the class name instead.
        return '{}()'.format(self.__class__.__name__)
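A minimal usage sketch (the root path below is an assumption; any writable directory works):

dataset = PlanetoidPubMed(root='dataset/PlanetoidPubMed')
data = dataset[0]
print(dataset.num_classes)   # 3
print(data.num_nodes)        # 19717
print(data.num_edges)        # 88648
print(data.num_features)     # 500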
Execution flow:
1. First, check whether the raw data files have been downloaded:
- Check whether every file returned by the raw_file_names() property exists under self.raw_dir.
- If any file is missing, call download() to fetch the raw files.
- Here self.raw_dir is osp.join(self.root, 'raw').
2. Then, check whether the data has already been processed:
- First, check that the pre_transform recorded from the previous run matches the current one.
- Next, check that the previously recorded pre_filter matches the current one.
- Finally, check whether the processed data file exists; if not, call process() and save the result to self.processed_dir (see the sketch after this list).
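The core of this check is just a file-existence test. A simplified sketch of what torch_geometric.data.Dataset does internally (not the exact library code):

import os.path as osp

def files_exist(paths):
    # Download / process are skipped when all expected files already exist.
    return len(paths) != 0 and all(osp.exists(p) for p in paths)

# Simplified flow inside Dataset.__init__:
# if not files_exist(self.raw_paths):        # raw_dir joined with raw_file_names
#     self.download()
# if not files_exist(self.processed_paths):  # processed_dir joined with processed_file_names
#     self.process()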
import torch
import torch.nn.functional as F
from torch.nn import Linear, ReLU
from torch_geometric.nn import GATConv, Sequential


class GAT(torch.nn.Module):
    def __init__(self, num_features, hidden_channels_list, num_classes):
        super(GAT, self).__init__()
        torch.manual_seed(12345)
        hns = [num_features] + hidden_channels_list
        conv_list = []
        for idx in range(len(hidden_channels_list)):
            conv_list.append((GATConv(hns[idx], hns[idx + 1]), 'x, edge_index -> x'))
            conv_list.append(ReLU(inplace=True))

        self.convseq = Sequential('x, edge_index', conv_list)
        self.linear = Linear(hidden_channels_list[-1], num_classes)

    def forward(self, x, edge_index):
        x = self.convseq(x, edge_index)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.linear(x)
        return x
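A minimal sketch of how this class might be instantiated (the hidden layer sizes are an assumption; the dataset is the PlanetoidPubMed dataset defined above):

dataset = PlanetoidPubMed(root='dataset/PlanetoidPubMed')  # hypothetical root path
model = GAT(num_features=dataset.num_features,
            hidden_channels_list=[200, 100],
            num_classes=dataset.num_classes)
print(model)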
import os.path as osp
from torch_geometric.utils import negative_sampling
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.transforms import NormalizeFeatures
from torch_geometric.utils import train_test_split_edges
dataset = Planetoid(root='./dataset', name='Cora', transform=NormalizeFeatures())
data = dataset[0]
data.train_mask = data.val_mask = data.test_mask = data.y = None
data = train_test_split_edges(data)
for key in data.keys:
    print(key, getattr(data, key).shape)
x torch.Size([2708, 1433])
val_pos_edge_index torch.Size([2, 263])
test_pos_edge_index torch.Size([2, 527])
train_pos_edge_index torch.Size([2, 8976])
train_neg_adj_mask torch.Size([2708, 2708])
val_neg_edge_index torch.Size([2, 263])
test_neg_edge_index torch.Size([2, 527])
import torch
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from torch_geometric.nn import GCNConv


class Net(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Net, self).__init__()
        self.conv1 = GCNConv(in_channels, 128)
        self.conv2 = GCNConv(128, out_channels)

    def encode(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = x.relu()
        return self.conv2(x, edge_index)

    def decode(self, z, pos_edge_index, neg_edge_index):
        edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1)
        return (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1)

    def decode_all(self, z):
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple=False).t()


def get_link_labels(pos_edge_index, neg_edge_index):
    num_links = pos_edge_index.size(1) + neg_edge_index.size(1)
    link_labels = torch.zeros(num_links, dtype=torch.float)
    link_labels[:pos_edge_index.size(1)] = 1.
    return link_labels
def train(data, model, optimizer):
    model.train()

    neg_edge_index = negative_sampling(
        edge_index=data.train_pos_edge_index,
        num_nodes=data.num_nodes,
        num_neg_samples=data.train_pos_edge_index.size(1))

    optimizer.zero_grad()
    z = model.encode(data.x, data.train_pos_edge_index)
    link_logits = model.decode(z, data.train_pos_edge_index, neg_edge_index)
    link_labels = get_link_labels(data.train_pos_edge_index, neg_edge_index).to(data.x.device)
    loss = F.binary_cross_entropy_with_logits(link_logits, link_labels)
    loss.backward()
    optimizer.step()

    return loss
In most graphs, the number of node pairs connected by an edge is far smaller than the number of node pairs without an edge. For class balance, in every training epoch we sample as many negative samples as there are positive samples; this keeps the classes balanced while also increasing the diversity of negative training samples. The get_link_labels function generates the labels for the full training set. When sampling negative edges we pass train_pos_edge_index as an argument, so negative_sampling only samples from node pairs that have no edge in the training set. During training we should only see the training set; the validation and test sets must remain invisible. However, at this stage we still need to compute embeddings for all nodes, so we assume here that the positive training edges cover all nodes, which makes it possible to encode every node.
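As a small illustration of the label layout (a sketch using a hypothetical toy edge index, not part of the original pipeline):

import torch
from torch_geometric.utils import negative_sampling

# Hypothetical toy graph with 4 nodes and 3 positive (training) edges
pos_edge_index = torch.tensor([[0, 1, 2],
                               [1, 2, 3]])
neg_edge_index = negative_sampling(edge_index=pos_edge_index, num_nodes=4,
                                   num_neg_samples=pos_edge_index.size(1))

labels = get_link_labels(pos_edge_index, neg_edge_index)
print(labels)  # first 3 entries are 1 (positive edges), the remaining ones are 0 (negative samples)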
@torch.no_grad()
def test(data, model):
    model.eval()

    z = model.encode(data.x, data.train_pos_edge_index)

    results = []
    for prefix in ['val', 'test']:
        pos_edge_index = data[f'{prefix}_pos_edge_index']
        neg_edge_index = data[f'{prefix}_neg_edge_index']
        link_logits = model.decode(z, pos_edge_index, neg_edge_index)
        link_probs = link_logits.sigmoid()
        link_labels = get_link_labels(pos_edge_index, neg_edge_index)
        results.append(roc_auc_score(link_labels.cpu(), link_probs.cpu()))
    return results
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    dataset = 'Cora'
    path = osp.join(osp.dirname(osp.realpath(__file__)), '.', 'dataset', dataset)
    dataset = Planetoid(path, dataset, transform=T.NormalizeFeatures())
    data = dataset[0]
    ground_truth_edge_index = data.edge_index.to(device)
    data.train_mask = data.val_mask = data.test_mask = data.y = None
    data = train_test_split_edges(data)
    data = data.to(device)

    model = Net(dataset.num_features, 64).to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

    best_val_auc = test_auc = 0
    for epoch in range(1, 101):
        loss = train(data, model, optimizer)
        val_auc, tmp_test_auc = test(data, model)
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            test_auc = tmp_test_auc
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val: {val_auc:.4f}, '
              f'Test: {test_auc:.4f}')

    z = model.encode(data.x, data.train_pos_edge_index)
    final_edge_index = model.decode_all(z)
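To actually run the script, the usual entry-point guard can be appended (a standard Python convention, not part of the original snippet):

if __name__ == '__main__':
    main()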
Reference: https://github.com/datawhalechina/team-learning-nlp/tree/master/GNN