参照官方文档学习:
Cora是一个机器学习论文数据集,共有7个类别(num_classes):基于案例、遗传算法、神经网络、概率方法、强化学习、规则学习、理论。整个数据集共有2708篇论文;在词干提取(stemming)并去除停用词之后,只剩下1433个不同的单词(num_node_features),其中文档频率小于10的单词均已被删除。
from torch_geometric.datasets import Planetoid
# Load the Cora dataset through Planetoid, keeping its files under /tmp/Cora.
dataset = Planetoid(root='/tmp/Cora', name='Cora')
# The bare string below has no runtime effect; it records a graph summary:
# x is [2708, 1433], edge_index is [2, 10556], and y plus the
# train/val/test masks each have 2708 entries.
# NOTE(review): this looks like the repr of the single graph `dataset[0]`
# rather than what `print(dataset)` emits -- confirm which one was intended.
"""
Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708],
train_mask=[2708], val_mask=[2708], test_mask=[2708])
"""
print(dataset)
问题:cora无法下载:
修改文件 “C:\Users\Chloe\Anaconda3\envs\pytorch\Lib\site-packages\torch_geometric\datasets\planetoid.py” (我的文件位置)
把planetoid.py里面第48行的 url = 'https://github.com/kimiyoung/planetoid/raw/master/data'
改成 url='https://gitee.com/jiajiewu/planetoid/raw/master/data'
方案来源及其它解决方案
GCN直观理解
GCN作者的文档
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
'''
两层的GCN
'''
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Layer sizes are taken from the module-level ``dataset``: the first
    convolution maps ``dataset.num_node_features`` input features to 16
    hidden features, and the second maps those 16 features to
    ``dataset.num_classes`` per-class scores.
    """

    def __init__(self):
        super().__init__()
        # GCNConv takes (input channel size, output channel size).
        hidden_size = 16
        self.conv1 = GCNConv(dataset.num_node_features, hidden_size)
        self.conv2 = GCNConv(hidden_size, dataset.num_classes)

    def forward(self, data):
        """Return the log-softmax class scores computed from ``data``.

        Args:
            data: object exposing ``x`` (node features) and ``edge_index``
                (graph connectivity), both passed to each convolution.

        Returns:
            The output of the second convolution after a log-softmax
            taken along dimension 1.
        """
        features = data.x
        edges = data.edge_index

        hidden = F.relu(self.conv1(features, edges))
        # Dropout reduces overfitting; it follows the module's train/eval
        # state through ``self.training``.
        hidden = F.dropout(hidden, training=self.training)

        logits = self.conv2(hidden, edges)
        # dim=1 applies the softmax across the elements of each row
        # (dim=0 would apply it down each column instead).
        return F.log_softmax(logits, dim=1)
# Train the GCN on Cora for 200 epochs, then report accuracy on the test mask.

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
# Take the first entry of the dataset and move it to the same device as the model.
data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
# Switch to training mode so that the dropout in GCN.forward (which reads
# self.training) is active.
model.train()
for epoch in range(200):
    # Clear gradients left over from the previous step.
    optimizer.zero_grad()
    out = model(data)
    # Compute the loss on the training nodes only. `out` is the log-softmax
    # output of the network; data.y provides the target class of each node
    # (nll_loss takes class labels as targets, not probabilities).
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
# Switch to evaluation mode (dropout disabled) before measuring accuracy.
model.eval()
# Pick, for every node, the class with the highest score as its prediction.
pred = model(data).argmax(dim=1)
# Count correct predictions among the test nodes and divide by the number of
# test nodes.
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')
输出结果:
部分代码注释来源
#利用MessagePassing类,重写GCNConv模块
import torch
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree
class GCNConv(MessagePassing):
    """GCN layer re-implemented on top of ``MessagePassing``.

    The forward pass adds self-loops, applies a linear transform to the node
    features, computes a symmetric degree normalisation per edge, and then
    propagates the normalised messages with "add" aggregation. Intermediate
    tensors are printed along the way for inspection.
    """

    def __init__(self, in_channels, out_channels):
        """Create the layer.

        Args:
            in_channels: size of each input node feature vector.
            out_channels: size of each output node feature vector.
        """
        # Messages arriving at a node are summed ("add" aggregation, Step 5).
        super().__init__(aggr='add')
        self.lin = torch.nn.Linear(in_channels, out_channels)

    def forward(self, x, edge_index):
        """Run one round of normalised message passing.

        Args:
            x: node features, shape [N, in_channels].
            edge_index: graph connectivity, shape [2, E].

        Returns:
            The result of ``self.propagate`` on the transformed features.
        """
        num_nodes = x.size(0)

        # Step 1: add self-loops to the adjacency matrix.
        edge_index, _ = add_self_loops(edge_index, num_nodes=num_nodes)

        # Step 2: linearly transform the node feature matrix.
        x = self.lin(x)

        # Step 3: compute the normalisation coefficients.
        # The two rows of edge_index are unpacked into one index tensor each,
        # with one entry per edge.
        source, target = edge_index
        print("col:")
        print(target)
        print("x")
        print(num_nodes)

        # degree(index, num_nodes, dtype) computes the (unweighted) degree of
        # a one-dimensional index tensor: here, how often each node id occurs
        # in `target`.
        deg = degree(target, num_nodes, dtype=x.dtype)
        print("degree:")
        print(deg)

        inv_sqrt_deg = deg.pow(-0.5)
        # Entries that came out as +inf are reset to 0.
        inv_sqrt_deg.masked_fill_(inv_sqrt_deg == float('inf'), 0)
        print("deg_inv_sqrt:")
        print(inv_sqrt_deg)

        # One coefficient per edge: the product of the inverse square-root
        # degrees looked up at its two endpoints.
        norm = inv_sqrt_deg[source] * inv_sqrt_deg[target]
        print("norm")
        print(norm)

        # Steps 4-5: start propagating messages.
        return self.propagate(edge_index, x=x, norm=norm)

    def message(self, x_j, norm):
        """Scale each edge's message by its normalisation coefficient.

        Args:
            x_j: per-edge features, shape [E, out_channels].
            norm: per-edge coefficients computed in ``forward``.
        """
        # Step 4: normalise node features. `norm` is reshaped to a column so
        # it broadcasts across the feature dimension.
        scale = norm.view(-1, 1)
        return scale * x_j