参照官方文档学习:
构建一个三个节点,四条边的图(有两条无向边,用四条有向边表示)
import torch
from torch_geometric.data import Data
# Build a small graph with three nodes and two undirected edges (0-1 and 1-2).
#
# data.edge_index: graph connectivity in COO format with shape [2, num_edges]
# and dtype torch.long. Each column is one directed edge, so every undirected
# edge is stored twice (once per direction), giving four columns here.
# edge_index can also be written in the transposed form (one index pair per
# row); the PyG introduction transposes it with `.t().contiguous()` before
# passing it to Data.
edge_index = torch.tensor(
    [[0, 1, 1, 2],
     [1, 0, 2, 1]],
    dtype=torch.long,
)
# data.x: node feature matrix with shape [num_nodes, num_node_features];
# here three nodes with a single feature each.
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)
data = Data(x=x, edge_index=edge_index)
print(data)
# >>> Data(edge_index=[2, 4], x=[3, 1])

# NOTE(review): `keys` is a property in older PyG releases and a method
# (`data.keys()`) in newer ones -- confirm against the installed version.
print(data.keys)
print(data.x)
print(data.edge_index)

# Iterating over a Data object yields (attribute name, value) pairs.
# The loop body must be indented; at column 0 it is an IndentationError.
for key, item in data:
    print(f'{key} found in data')

# Bare expressions only display a value in an interactive session, so the
# results are printed explicitly to make them visible when run as a script.
print('edge_attr' in data)
print(data.num_nodes)
print(data.num_edges)
print(data.num_node_features)
print(data.has_isolated_nodes())

# Transfer data object to GPU.
#device = torch.device('cuda')
#data = data.to(device)
COO存储格式:
优点:容易转换成其他的稀疏矩阵存储格式(CSR等)
import torch
from torch_geometric.data import Data
from torch_geometric.datasets import TUDataset
# Load the ENZYMES graph-classification dataset (cached under /tmp/ENZYMES).
dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES')
print(dataset)
# Bare expressions only display a value in an interactive session, so the
# summary statistics are printed explicitly.
print(len(dataset))
print(dataset.num_classes)
print(dataset.num_node_features)

# 1. Inspect a single graph from the dataset.
data = dataset[0]  # first graph in the dataset
print(data)
# The note below records the sample output: edge_index has 168 columns
# (edges), x has 37 rows (nodes) with 3 features each, and y holds exactly
# one graph-level target.
"""
>>>Data(edge_index=[2, 168], x=[37, 3], y=[1])
一个图有168条边;37个节点,每个节点有3个特征,
data.y: Target to train against (may have arbitrary shape)
e.g., node-level targets of shape [num_nodes, *] or graph-level targets of shape [1, *]
the data object is holding exactly one graph-level target.
"""
# Example 2 (a node-level task): 10556 edges, 2708 nodes, 1433 features per
# node; test_mask / train_mask / val_mask mark what each node is used for.
# The edge count in the note previously read 10558, contradicting the
# recorded edge_index=[2, 10556].
"""
例2
>>> Data(edge_index=[2, 10556], test_mask=[2708],
train_mask=[2708], val_mask=[2708], x=[2708, 1433], y=[2708])
图有10556条边;2708个节点;每个节点1433个特征;该数据的任务是node-level
test_mask train_mask val_mask 标记了节点的用途
"""

# 2. Shuffle the dataset.
dataset = dataset.shuffle()
# equal to
#perm = torch.randperm(len(dataset))
#dataset = dataset[perm]

# 3. Split the dataset 90/10 into train/test.
# The split point is derived from the dataset length with integer arithmetic
# instead of being hard-coded; it evaluates to 540 for a 600-graph dataset,
# which matches the previous literal.
split = len(dataset) * 9 // 10
train_dataset = dataset[:split]
test_dataset = dataset[split:]
print(train_dataset)
print(test_dataset)
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader
from torch_scatter import scatter_mean
# Reload ENZYMES with use_node_attr=True; the recorded batch output below
# shows 21 features per node instead of the 3 seen without this flag.
dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES', use_node_attr=True)
print(dataset)
data1 = dataset[0]
print(data1)
data2 = dataset[1]
print(data2)

# Each mini-batch holds 32 graphs (the last batch can be smaller when the
# dataset size is not a multiple of 32).
loader = DataLoader(dataset, batch_size=32, shuffle=True)
# The loop body must be indented; at column 0 it is an IndentationError.
for data in loader:
    #print(data)
    # Recorded sample output, kept as a note: the node matrices of the 32
    # graphs are concatenated into x=[980, 21] and their edges into
    # edge_index=[2, 3836]; y=[32] has one target per graph; batch=[980]
    # maps each of the 980 nodes to the graph it belongs to.
    """
    >>> DataBatch(edge_index=[2, 3836], x=[980, 21], y=[32], batch=[980], ptr=[33])
    # 32个图的节点矩阵拼接在一起:x=[980, 21];32个图的边矩阵拼接在一起:edge_index=[2, 3836]
    # y=[32]表示32个图;batch=[980]表示有980个节点参与此batch的训练
    # batch is a column vector which maps each node to its respective graph in the batch:
    """
    #print(data.num_graphs)
    #>>> 32
    print('data.x')
    print(data.x.size())
    # Average the node feature vectors that belong to the same graph:
    # data.batch supplies the graph id of every row of data.x.
    x = scatter_mean(data.x, data.batch, dim=0)
    print(data.batch)
    print('scatter_mean之后:')
    print(x.size())
    print(x)
    #>>> torch.Size([32, 21])
torch_scatter 模块(注意:这里指独立的 torch_scatter 包,而不是 PyTorch 自带的 torch.scatter)
torch_scatter 官方文档
"""
src – The source tensor.
index – The indices of elements to scatter.
dim – The axis along which to index. (default: -1)
out – The destination tensor.
dim_size – If out is not given, automatically create output with size dim_size at dimension dim. If dim_size is not given, a minimal sized output tensor according to index.max() + 1 is returned.
reduce – The reduce operation ("sum", "mul", "mean", "min" or "max"). (default: "sum")
"""
from torch_scatter import scatter
# Demonstrates scatter / scatter_mean on a tiny tensor of shape [1, 6, 1].
#src = torch.randn(10, 6, 64)
src = torch.tensor([[[1], [2], [3], [4], [5], [6]]])
# One target slot per entry along dim 1.
index = torch.tensor([0, 1, 0, 1, 2, 1])
print(src)
# Broadcasting in the first and last dim.
# Reduce along dimension 1 (the y axis): to shrink dim 1 down to m entries,
# every index[i] must satisfy 0 <= index[i] <= m-1.
out = scatter(src, index, reduce="sum", dim=1)
print(out.shape)
print(out)
# Recorded output of the sum reduction (slot 0: 1+3, slot 1: 2+4+6, slot 2: 5).
"""
torch.Size([1, 3, 1])
tensor([[[ 4],
[12],
[ 5]]])
"""
out = scatter_mean(src, index, dim=1)
# Alternative spelling left by the author:
#out = scatter(src, index, dim=1,reduce="mean")
print(out)
# Recorded output of the mean reduction over the same slots.
"""
tensor([[[2],
[4],
[5]]])
"""
transform在将数据输入到神经网络之前修改数据,这一功能可用于实现数据规范化或数据增强;
常见transforms:
下面以 ShapeNet 数据集(包含约 17,000 个 3D 形状点云,每个点都带有标签,共 16 个形状类别)为例使用 transforms。
点云数据(point cloud)
点云数据是指在一个三维坐标系统中的一组向量的集合。这些向量通常以 X、Y、Z 三维坐标的形式表示,一般主要用来代表一个物体的外表面形状。不仅如此,除 (X, Y, Z) 代表的几何位置信息之外,点云数据还可以表示一个点的 RGB 颜色、灰度值、深度、分割结果等。
E.g. Pi={Xi, Yi, Zi,…….}表示空间中的一个点,
则Point Cloud={P1, P2, P3,……Pn}表示一组点云数据。
from torch_geometric.datasets import ShapeNet
import torch_geometric.transforms as T
# Load only the 'Airplane' category of ShapeNet, without any transforms.
dataset = ShapeNet(root='/tmp/ShapeNet', categories=['Airplane'])
# A bare `dataset[0]` only displays a value in an interactive session.
print(dataset[0])
# >>> Data(pos=[2518, 3], y=[2518])
# data.pos: Node position matrix with shape [num_nodes, num_dimensions]
# 2518 points, each three-dimensional; the labels in y are node-level.

# 1. Build a k-nearest-neighbour graph from the node positions (pre_transform);
# 2. randomly translate the positions on every access (transform).
#
# pre_transform is applied once, when the dataset is processed, and the
# processed result is cached on disk under `root`. The dataset above was
# already processed into /tmp/ShapeNet without a pre_transform, so reusing
# that root would load the cached graphs and no edge_index would be created
# (PyG only warns that the pre_transform differs). A separate root is used so
# that the KNN graph is actually built; note that this stores a second copy
# of the dataset.
# NOTE(review): newer PyG releases name this transform RandomJitter and keep
# RandomTranslate as an alias -- confirm against the installed version.
dataset = ShapeNet(root='/tmp/ShapeNet_knn', categories=['Airplane'],
                   pre_transform=T.KNNGraph(k=6),
                   transform=T.RandomTranslate(0.01))
print(dataset[0])
# >>> Data(edge_index=[2, 15108], pos=[2518, 3], y=[2518])