最近参加了百度飞桨的图神经网络七日打卡,收获满满。
之前已经参加过飞桨的目标检测和分割课程,都是干货满满,强烈推荐。
import pgl
import paddle.fluid as fluid
import numpy as np
import time
import pandas as pd
from collections import namedtuple
# Immutable record holding the graph, the class count, and the
# train / validation / test node splits.
Dataset = namedtuple(
    "Dataset",
    (
        "graph",
        "num_classes",
        "train_index",
        "train_label",
        "valid_index",
        "valid_label",
        "test_index",
    ),
)
def load_edges(num_nodes, self_loop=True, add_inverse_edge=True,
               path="work/edges.csv"):
    """Read the edge list from a CSV file and optionally augment it.

    Args:
        num_nodes: Number of nodes; when ``self_loop`` is true, one edge
            ``(i, i)`` is appended for every ``i`` in ``[0, num_nodes)``.
        self_loop: When true, append a self-loop for every node.
        add_inverse_edge: When true, append the reversed copy of every
            edge read from the file.
        path: Header-less two-column (src, dst) CSV file to read.
            Defaults to ``"work/edges.csv"``.

    Returns:
        A 2-D numpy array with one ``[src, dst]`` row per edge, in the
        order: file edges, reversed edges (if any), self-loops (if any).
    """
    edges = pd.read_csv(path, header=None, names=["src", "dst"]).values
    if add_inverse_edge:
        # Reversing the column order swaps src and dst.
        edges = np.vstack([edges, edges[:, ::-1]])
    if self_loop:
        # Use a dedicated name so the boolean flag is not overwritten.
        node_ids = np.arange(0, num_nodes)
        loop_edges = np.column_stack([node_ids, node_ids])
        edges = np.vstack([edges, loop_edges])
    return edges
def load():
    """Load node features, edges and the train/valid/test node splits.

    Reads ``work/feat.npy``, ``work/train.csv`` and ``work/test.csv``;
    edges come from ``load_edges`` with self-loops and inverse edges
    enabled.

    Returns:
        A ``Dataset`` whose graph carries the node features ``"feat"`` and
        ``"norm"``, with ``num_classes`` fixed at 35.
    """
    features = np.load("work/feat.npy")
    node_count = features.shape[0]
    edge_array = load_edges(
        num_nodes=node_count, self_loop=True, add_inverse_edge=True)
    graph = pgl.graph.Graph(
        num_nodes=node_count,
        edges=edge_array,
        node_feat={"feat": features})

    # Per-node normalisation: indegree ** -0.5, with the degree floored
    # at 1 and a trailing axis appended.
    degree = np.maximum(graph.indegree().astype("float32"), 1)
    graph.node_feat["norm"] = np.expand_dims(np.power(degree, -0.5), -1)

    labelled = pd.read_csv("work/train.csv")
    nids = labelled["nid"].values
    labels = labelled["label"].values
    # The first 80% of the labelled rows (in file order) are used for
    # training, the remainder for validation.
    cut = int(len(nids) * 0.8)

    test_nids = pd.read_csv("work/test.csv")["nid"].values

    return Dataset(
        graph=graph,
        num_classes=35,
        train_index=nids[:cut],
        train_label=labels[:cut],
        valid_index=nids[cut:],
        valid_label=labels[cut:],
        test_index=test_nids,
    )
# Load the dataset once, then give every index vector a trailing axis and
# reshape every label vector into a single column.
dataset = load()

train_index = dataset.train_index[..., np.newaxis]
train_label = dataset.train_label.reshape(-1, 1)

val_index = dataset.valid_index[..., np.newaxis]
val_label = dataset.valid_label.reshape(-1, 1)

test_index = dataset.test_index[..., np.newaxis]
# All-zero int64 placeholder labels, one row per test node.
test_label = np.zeros((test_index.shape[0], 1), dtype="int64")
import pgl
import model
import paddle.fluid as fluid
import numpy as np
import time
from build_model import build_model
# Run on CPU.
place = fluid.CPUPlace()
# To run on GPU instead:
# place = fluid.CUDAPlace(0)

train_program = fluid.default_main_program()
startup_program = fluid.default_startup_program()

# Training network, built into the default main program.
# NOTE(review): `config` is not defined in the visible source; it has to be
# provided before this point.
with fluid.program_guard(train_program, startup_program), \
        fluid.unique_name.guard():
    gw, loss, acc, pred = build_model(
        dataset,
        config=config,
        phase="train",
        main_prog=train_program)

# Evaluation network, built with phase="test" into its own Program but
# sharing the same startup program.
test_program = fluid.Program()
with fluid.program_guard(test_program, startup_program), \
        fluid.unique_name.guard():
    _gw, v_loss, v_acc, v_pred = build_model(
        dataset,
        config=config,
        phase="test",
        main_prog=test_program)
test_program = test_program.clone(for_test=True)

exe = fluid.Executor(place)
import pgl
import paddle.fluid.layers as L
import pgl.layers.conv as conv
def get_norm(indegree):
    """Return ``max(indegree, 1) ** -0.5`` built from Paddle layer ops.

    Args:
        indegree: Tensor of node in-degrees; it is cast to float32 first.

    Returns:
        The float32 tensor of normalisation factors.
    """
    clamped = L.clamp(L.cast(indegree, dtype="float32"), min=1.0)
    return L.pow(clamped, factor=-0.5)
class GCN(object):
    """Stack of GCN layers followed by a GCN output layer.

    Hyper-parameters are read from ``config`` with ``config.get``:
    ``num_layers`` (default 1), ``hidden_size`` (default 64),
    ``dropout`` (default 0.5) and ``edge_dropout`` (default 0.0).
    """

    def __init__(self, config, num_class):
        """Store the hyper-parameters.

        Args:
            config: Mapping-like object exposing ``get(key, default)``.
            num_class: Output size of the final GCN layer.
        """
        self.num_class = num_class
        self.num_layers = config.get("num_layers", 1)
        self.hidden_size = config.get("hidden_size", 64)
        self.dropout = config.get("dropout", 0.5)
        self.edge_dropout = config.get("edge_dropout", 0.0)

    def _graph_and_norm(self, graph_wrapper, phase):
        """Return the (graph wrapper, norm) pair to use for one GCN layer.

        In the ``"train"`` phase a fresh edge-dropped wrapper is sampled and
        the norm is recomputed from its in-degree; otherwise the original
        wrapper and its stored ``"norm"`` node feature are used.
        """
        if phase == "train":
            dropped = pgl.sample.edge_drop(graph_wrapper, self.edge_dropout)
            return dropped, get_norm(dropped.indegree())
        return graph_wrapper, graph_wrapper.node_feat["norm"]

    def forward(self, graph_wrapper, feature, phase):
        """Apply the hidden GCN layers and the output layer.

        Args:
            graph_wrapper: PGL graph wrapper providing ``node_feat["norm"]``.
            feature: Input node feature tensor.
            phase: ``"train"`` enables edge dropping; any other value uses
                the graph as-is.

        Returns:
            The output of the final GCN layer (size ``num_class``, no
            activation).
        """
        for i in range(self.num_layers):
            # Edge dropping is re-sampled independently for every layer.
            ngw, norm = self._graph_and_norm(graph_wrapper, phase)
            feature = conv.gcn(ngw,
                               feature,
                               self.hidden_size,
                               activation="relu",
                               norm=norm,
                               name="layer_%s" % i)
            # The dropout op is added regardless of phase.
            feature = L.dropout(
                feature,
                self.dropout,
                dropout_implementation='upscale_in_train')

        ngw, norm = self._graph_and_norm(graph_wrapper, phase)
        feature = conv.gcn(ngw,
                           feature,
                           self.num_class,
                           activation=None,
                           norm=norm,
                           name="output")
        return feature
import pgl
import model
from pgl import data_loader
import paddle.fluid as fluid
import numpy as np
import time
def build_model(dataset, config, phase, main_prog):
    """Build the node-classification network in the current program.

    Args:
        dataset: Object exposing ``graph`` (for ``node_feat_info()``) and
            ``num_classes``.
        config: Object exposing ``model_name``, ``learning_rate`` and
            ``weight_decay`` as attributes; it is also passed to the model
            constructor.
        phase: Forwarded to the model; ``"train"`` additionally attaches the
            Adam optimizer to the loss.
        main_prog: Not used inside this function.

    Returns:
        ``(gw, loss, acc, pred)``: the graph wrapper, the mean
        cross-entropy loss, the top-1 accuracy and the argmax predictions
        for the nodes selected by the ``node_index`` input.
    """
    def _int64_column(name):
        # Input placeholder of shape [None, 1] holding int64 values.
        return fluid.layers.data(
            name,
            shape=[None, 1],
            dtype="int64",
            append_batch_size=False)

    graph_wrapper = pgl.graph_wrapper.GraphWrapper(
        name="graph",
        node_feat=dataset.graph.node_feat_info())

    # The model class is looked up by name in the `model` module.
    model_cls = getattr(model, config.model_name)
    network = model_cls(config=config, num_class=dataset.num_classes)
    all_logits = network.forward(
        graph_wrapper, graph_wrapper.node_feat["feat"], phase)

    node_index = _int64_column("node_index")
    node_label = _int64_column("node_label")

    # Keep only the logits of the requested nodes.
    selected_logits = fluid.layers.gather(all_logits, node_index)
    per_node_loss, probs = fluid.layers.softmax_with_cross_entropy(
        logits=selected_logits, label=node_label, return_softmax=True)
    accuracy = fluid.layers.accuracy(input=probs, label=node_label, k=1)
    predicted = fluid.layers.argmax(probs, -1)
    mean_loss = fluid.layers.mean(per_node_loss)

    if phase == "train":
        l2_decay = fluid.regularizer.L2DecayRegularizer(
            regularization_coeff=config.weight_decay)
        optimizer = fluid.optimizer.Adam(
            learning_rate=config.learning_rate,
            regularization=l2_decay)
        optimizer.minimize(mean_loss)

    return graph_wrapper, mean_loss, accuracy, predicted
# Number of full-batch training epochs. Kept separate from the loop
# variable so the count is not overwritten while iterating.
num_epochs = 200
exe.run(startup_program)

# Convert the graph into a feed dict that can be passed to the Paddle
# executor.
feed_dict = gw.to_feed(dataset.graph)

# The index/label arrays do not change between epochs, so convert them to
# int64 once instead of on every iteration.
train_nodes = np.array(train_index, dtype="int64")
train_targets = np.array(train_label, dtype="int64")
val_nodes = np.array(val_index, dtype="int64")
val_targets = np.array(val_label, dtype="int64")

for epoch in range(num_epochs):
    # Full-batch training step.
    # node_index: node ids of the training nodes
    # node_label: labels of those training nodes
    feed_dict["node_index"] = train_nodes
    feed_dict["node_label"] = train_targets
    train_loss, train_acc = exe.run(train_program,
                                    feed=feed_dict,
                                    fetch_list=[loss, acc],
                                    return_numpy=True)

    # Full-batch validation step.
    # node_index: node ids of the validation nodes
    # node_label: labels of those validation nodes
    feed_dict["node_index"] = val_nodes
    feed_dict["node_label"] = val_targets
    val_loss, val_acc = exe.run(test_program,
                                feed=feed_dict,
                                fetch_list=[v_loss, v_acc],
                                return_numpy=True)
    print("Epoch", epoch, "Train Acc", train_acc[0], "Valid Acc", val_acc[0])

# Predict the test nodes; the labels fed here are placeholders because
# only the predictions are fetched.
feed_dict["node_index"] = np.array(test_index, dtype="int64")
feed_dict["node_label"] = np.array(test_label, dtype="int64")
test_prediction = exe.run(test_program,
                          feed=feed_dict,
                          fetch_list=[v_pred],
                          return_numpy=True)[0]