for i in range(beg, end):
    torch.cuda.manual_seed(0)  # for initialisation of the models
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        print("Using GPU")
    print("---------------Running time:------------", i)
torch.manual_seed(args.seed)       # seed the CPU RNG so that results are deterministic
torch.cuda.manual_seed(args.seed)  # seed the current GPU
# with multiple GPUs, use torch.cuda.manual_seed_all() to seed all of them
In plain terms: a neural network has to initialise its parameters, and if they were re-drawn at random on every run, the results would not be comparable. Using the same random seed guarantees that every run starts from the same initialisation.
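A standalone sketch (not from the repo) of why fixing the seed makes runs comparable: building the same layer twice with the same seed yields identical initial weights.
import torch
import torch.nn as nn

def make_layer(seed):
    torch.manual_seed(seed)          # fix the CPU RNG before the layer is created
    return nn.Linear(60, 200)

a, b, c = make_layer(0), make_layer(0), make_layer(1)
print(torch.equal(a.weight, b.weight))  # True  (same seed)
print(torch.equal(a.weight, c.weight))  # False (different seed)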
class NN1_PCA(nn.Module):
    def __init__(self, input_dim=60, output_dim=10):
        super(NN1_PCA, self).__init__()
        self.fc1 = nn.Linear(input_dim, 200)
        self.fc2 = nn.Linear(200, output_dim)

    def forward(self, x):
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output
Network structure: 60 — 200 — ReLU — 47 — log-softmax (output_dim defaults to 10; 47 classes are used here).
Linear: a purely linear (affine) layer, with no activation function of its own.
forward: its main job is to wire the layers together so that they are applied in a fixed order; when defining a sub-network, an nn.Sequential container guarantees the same ordered flow.
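For illustration only (not repo code), the same 60-200-ReLU-47-log-softmax stack written with an nn.Sequential container, which fixes the layer order without a hand-written forward:
import torch.nn as nn

nn1_pca_seq = nn.Sequential(
    nn.Flatten(),             # plays the role of torch.flatten(x, 1)
    nn.Linear(60, 200),
    nn.ReLU(),
    nn.Linear(200, 47),
    nn.LogSoftmax(dim=1),     # same output as F.log_softmax(x, dim=1)
)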
if model == "NN1_PCA":  # for FEMNIST, MNIST datasets
    # model becomes a (module, name) tuple: the network itself plus its original string name
    model = NN1_PCA(input_dim=dim_pca, output_dim=dim_output), model
    add_hooks(model[0])
A hook is a function slot predefined inside the existing program flow (the flow "exposes a hook").
To customise a step of that flow, we attach, i.e. register, our own implementation to the hook so that the hook function can act on the target.
A hook only does something once it has been registered; if nothing is attached, the original flow executes an empty hook (a no-op).
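add_hooks itself is not shown in this section; as an assumption about what such a helper does, the sketch below registers a forward hook on every Linear layer that stashes its input activation, which is the standard building block for per-sample gradient computation (presumably used later by compute_grad1).
import torch
import torch.nn as nn

def capture_input(module, inputs, output):
    module.last_input = inputs[0].detach()   # stash the activation for later use

model = nn.Sequential(nn.Linear(60, 200), nn.ReLU(), nn.Linear(200, 47))
handles = [m.register_forward_hook(capture_input)
           for m in model.modules() if isinstance(m, nn.Linear)]

_ = model(torch.randn(8, 60))        # the hooks fire during this forward pass
print(model[0].last_input.shape)     # torch.Size([8, 60])

for h in handles:
    h.remove()                       # once unregistered, the hook is a no-op again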
class UserAVG(User):
    def __init__(self, numeric_id, train_data, test_data, model, sample_ratio, learning_rate, L, local_updates,
                 dp, times, use_cuda):
        super().__init__(numeric_id, train_data, test_data, model[0], sample_ratio, learning_rate, L,
                         local_updates, dp, times, use_cuda)

        if model[1] == 'mclr':
            self.loss = nn.NLLLoss()
        else:
            self.loss = nn.CrossEntropyLoss()

        # self.scheduler = StepLR(self.optimizer, step_size=50, gamma=0.1)
        # self.lr_drop_rate = 0.95

        param_groups = [{'params': p, 'lr': self.learning_rate} for p in self.model.parameters()]
        self.optimizer = FedAvgOptimizer(param_groups, lr=self.learning_rate, weight_decay=L)
        self.csi = None

    def set_grads(self, new_grads):
        if isinstance(new_grads, nn.Parameter):
            for model_grad, new_grad in zip(self.model.parameters(), new_grads):
                model_grad.data = new_grad.data
        elif isinstance(new_grads, list):
            for idx, model_grad in enumerate(self.model.parameters()):
                model_grad.data = new_grads[idx]
    def train_no_dp(self, glob_iter):
        """Training phase without differential privacy"""
        for epoch in range(1, self.local_updates + 1):
            self.model.train()
            # new batch (data sampling on every local epoch)
            np.random.seed(500 * (self.times + 1) * (glob_iter + 1) + epoch + 1)
            torch.manual_seed(500 * (self.times + 1) * (glob_iter + 1) + epoch + 1)
            train_idx = np.arange(self.train_samples)
            train_sampler = SubsetRandomSampler(train_idx)
            self.trainloader = DataLoader(self.train_data, self.batch_size, sampler=train_sampler)
            X, y = list(self.trainloader)[0]
            if self.use_cuda:
                X, y = X.cuda(), y.cuda()

            self.optimizer.zero_grad()
            clear_backprops(self.model)
            output = self.model(X)
            loss = self.loss(output, y)
            loss.backward()
            self.optimizer.step()
            if self.scheduler:
                self.scheduler.step()

        # get model difference
        for local, server, delta in zip(self.model.parameters(), self.server_model, self.delta_model):
            delta.data = local.data.detach() - server.data.detach()
        return loss
    def train_dp(self, sigma_g, glob_iter, max_norm):
        """Training phase under differential privacy"""
        for epoch in range(1, self.local_updates + 1):
            self.model.train()
            # new batch (data sampling on every local epoch)
            np.random.seed(500 * (self.times + 1) * (glob_iter + 1) + epoch + 1)
            torch.manual_seed(500 * (self.times + 1) * (glob_iter + 1) + epoch + 1)
            train_idx = np.arange(self.train_samples)
            train_sampler = SubsetRandomSampler(train_idx)
            self.trainloader = DataLoader(self.train_data, self.batch_size, sampler=train_sampler)
            X, y = list(self.trainloader)[0]
            if self.use_cuda:
                X, y = X.cuda(), y.cuda()

            self.optimizer.zero_grad()
            clear_backprops(self.model)
            output = self.model(X)
            loss = self.loss(output, y)
            loss.backward(retain_graph=True)
            compute_grad1(self.model)  # per-sample gradients, stored in p.grad1

            for p in self.model.parameters():
                # clip the per-sample gradients
                # heuristic clipping threshold: the median per-sample norm (otherwise, use the max_norm constant)
                max_norm = np.median([float(grad.data.norm(2)) for grad in p.grad1])
                p.grad1 = torch.stack(
                    [grad / max(1, float(grad.data.norm(2)) / max_norm) for grad in p.grad1])
                p.grad.data = torch.mean(p.grad1, dim=0)
                # DP mechanism
                p.grad.data = GaussianMechanism(p.grad.data, sigma_g, max_norm, self.batch_size, self.use_cuda)

            self.optimizer.step()
            if self.scheduler:
                self.scheduler.step()

        # get model difference
        for local, server, delta in zip(self.model.parameters(), self.server_model, self.delta_model):
            delta.data = local.data.detach() - server.data.detach()
        return 0
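GaussianMechanism is not reproduced in this section. Below is a minimal sketch, assuming the usual DP-SGD scaling in which per-sample gradients clipped to max_norm and averaged over batch_size receive Gaussian noise of standard deviation sigma_g * max_norm / batch_size:
import torch

# Hedged sketch of a Gaussian mechanism for an averaged, clipped gradient.
# Assumption: noise std = sigma_g * max_norm / batch_size (standard DP-SGD scaling);
# the repo's actual GaussianMechanism may differ.
def gaussian_mechanism(grad, sigma_g, max_norm, batch_size, use_cuda=False):
    std = sigma_g * max_norm / batch_size
    # randn_like already matches grad's device and dtype, so use_cuda is kept only
    # to mirror the call signature used above
    return grad + std * torch.randn_like(grad)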
class CrossEntropyLoss(_WeightedLoss):
    __constants__ = ['ignore_index', 'reduction']

    def __init__(self, weight=None, size_average=None, ignore_index=-100,
                 reduce=None, reduction='mean'):
        super(CrossEntropyLoss, self).__init__(weight, size_average, reduce, reduction)
        self.ignore_index = ignore_index

    def forward(self, input, target):
        return F.cross_entropy(input, target, weight=self.weight,
                               ignore_index=self.ignore_index, reduction=self.reduction)
It is useful when training a classification problem with `C` classes.
If provided, the optional argument :attr:`weight` should be a 1D `Tensor` assigning weight to each of the classes.
This is particularly useful when you have an unbalanced training set.
The `input` is expected to contain raw, unnormalized scores for each class.
`input` has to be a Tensor of size either :math:`(minibatch, C)` or
:math:`(minibatch, C, d_1, d_2, ..., d_K)`
with :math:`K \geq 1` for the `K`-dimensional case (described later).
This criterion expects a class index in the range :math:`[0, C-1]` as the
`target` for each value of a 1D tensor of size `minibatch`; if `ignore_index`
is specified, this criterion also accepts this class index (this index may not
necessarily be in the class range).
The loss can be described as:
.. math::
\text{loss}(x, class) = -\log\left(\frac{\exp(x[class])}{\sum_j \exp(x[j])}\right)
= -x[class] + \log\left(\sum_j \exp(x[j])\right)
or in the case of the :attr:`weight` argument being specified:
.. math::
\text{loss}(x, class) = weight[class] \left(-x[class] + \log\left(\sum_j \exp(x[j])\right)\right)
The losses are averaged across observations for each minibatch.
Can also be used for higher dimension inputs, such as 2D images, by providing
an input of size :math:`(minibatch, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`,
where :math:`K` is the number of dimensions, and a target of appropriate shape
(see below).
Args:
weight (Tensor, optional): a manual rescaling weight given to each class.
If given, has to be a Tensor of size `C`
size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
the losses are averaged over each loss element in the batch. Note that for
some losses, there are multiple elements per sample. If the field :attr:`size_average`
is set to ``False``, the losses are instead summed for each minibatch. Ignored
when reduce is ``False``. Default: ``True``
ignore_index (int, optional): Specifies a target value that is ignored
and does not contribute to the input gradient. When :attr:`size_average` is
``True``, the loss is averaged over non-ignored targets.
reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
losses are averaged or summed over observations for each minibatch depending
on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
batch element instead and ignores :attr:`size_average`. Default: ``True``
reduction (string, optional): Specifies the reduction to apply to the output:
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
``'mean'``: the sum of the output will be divided by the number of
elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
and :attr:`reduce` are in the process of being deprecated, and in the meantime,
specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
Shape:
- Input: :math:`(N, C)` where `C = number of classes`, or
:math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`
in the case of `K`-dimensional loss.
- Target: :math:`(N)` where each value is :math:`0 \leq \text{targets}[i] \leq C-1`, or
:math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of
K-dimensional loss.
- Output: scalar.
If :attr:`reduction` is ``'none'``, then the same size as the target:
:math:`(N)`, or
:math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case
of K-dimensional loss.
Examples::
>>> loss = nn.CrossEntropyLoss()
>>> input = torch.randn(3, 5, requires_grad=True)
>>> target = torch.empty(3, dtype=torch.long).random_(5)
>>> output = loss(input, target)
>>> output.backward()
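One point worth keeping in mind for the losses chosen in UserAVG: F.cross_entropy expects raw logits and applies log-softmax internally, so it is equivalent to NLLLoss applied to log-softmax outputs (which is what NN1_PCA produces). A quick standalone check:
import torch
import torch.nn.functional as F

# cross-entropy on raw logits == NLLLoss on log-softmax outputs
logits = torch.randn(3, 5)
target = torch.tensor([0, 2, 4])
ce  = F.cross_entropy(logits, target)
nll = F.nll_loss(F.log_softmax(logits, dim=1), target)
print(torch.allclose(ce, nll))  # True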
class FedAvgOptimizer(FedLOptimizer):
    def __init__(self, params, lr, weight_decay):
        super().__init__(params, lr, weight_decay)

    def step(self, closure=None):
        loss = None
        if closure is not None:
            loss = closure()  # call the closure to re-evaluate the loss, as in standard optimizers
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                # plain gradient-descent update: p <- p - lr * grad
                p.data = p.data - p.grad.data * group['lr']
        return loss
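The step above is vanilla SGD without momentum (the weight_decay argument is stored but not used in the step shown). A standalone sketch verifying that the manual update matches torch.optim.SGD:
import torch
import torch.nn as nn

manual = nn.Linear(4, 2)
ref = nn.Linear(4, 2)
ref.load_state_dict(manual.state_dict())       # identical starting point
opt = torch.optim.SGD(ref.parameters(), lr=0.1)

x, y = torch.randn(8, 4), torch.randn(8, 2)
for model, optimizer in ((manual, None), (ref, opt)):
    loss = ((model(x) - y) ** 2).mean()
    model.zero_grad()
    loss.backward()
    if optimizer is None:                      # FedAvgOptimizer-style step
        with torch.no_grad():
            for p in model.parameters():
                p -= 0.1 * p.grad
    else:
        optimizer.step()

print(all(torch.allclose(a, b) for a, b in zip(manual.parameters(), ref.parameters())))  # True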
def train(self):
    loss = []
    for glob_iter in range(self.num_glob_iters):
        print("-------------Round number: ", glob_iter, " -------------")
        # loss_ = 0
        # Each user gets the global parameters
        self.send_parameters()  # users pull their parameters from the server
        # Evaluate model at each iteration
        self.evaluate()
        # Users are selected
        if self.noise:
            self.selected_users = self.select_transmitting_users()
            print(f"Transmitting {len(self.selected_users)} users")
        else:
            self.selected_users = self.select_users(glob_iter, self.users_per_round)
        # Local updates
        for user in self.selected_users:
            if self.dp == "None":
                user.train_no_dp(glob_iter)
            else:
                user.train_dp(self.sigma_g, glob_iter, self.max_norm)
            user.drop_lr()
        # Aggregation
        self.aggregate_parameters()
        self.get_max_norm()
        if self.noise:
            self.apply_channel_effect()

    self.save_results()  # save loss (train and test), accuracy (train and test), dissimilarity (train) to an h5 file
    self.save_norms()    # save the norms to an h5 file
    self.save_model()
The outer loop runs for num_glob_iters = 250 global rounds (the logs below report rounds 0 and 249).
Within each round, every selected user runs train_no_dp (or train_dp), shown earlier, whose inner loop performs local_updates = 10 local iterations.
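aggregate_parameters is not shown in this excerpt; as a hedged sketch of the standard FedAvg rule it is assumed to implement, the server averages the users' model deltas weighted by their local sample counts and adds the result to the global parameters:
import torch

# Hedged sketch of standard FedAvg aggregation (not the repo's aggregate_parameters):
# average the selected users' deltas, weighted by their sample counts, then apply
# the weighted average to the global parameters.
def fedavg_aggregate(global_params, user_deltas, user_samples):
    total = sum(user_samples)
    for i, g in enumerate(global_params):
        update = sum((n / total) * deltas[i] for deltas, n in zip(user_deltas, user_samples))
        g.data += update

# hypothetical usage, reusing names from the classes above:
# fedavg_aggregate(list(server.model.parameters()),
#                  [u.delta_model for u in server.selected_users],
#                  [u.train_samples for u in server.selected_users])
The logs below compare FedAvg and SCAFFOLD over 250 rounds at data-similarity levels 1.0, 0.1, and 0.0.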
Similarity: 1.0
Algorithm: FedAvg
-------------Round number: 0 -------------
Similarity: 1.0
Average Global Test Accuracy: 0.02385
Average Global Test Loss: 3.84941
Average Global Training Accuracy: 0.02289
Average Global F(x_t)-F(x*): 3.6905
Average Global Training Loss: 3.85023
Average Global Training Gradient Dissimilarity: 0.00117
Average Global Training Gradient Dissimilarity (mean of norms): 0.00421
Average Global Training Gradient Dissimilarity (norm of mean): 0.00304
-------------Round number: 249 -------------
Similarity: 1.0
Average Global Test Accuracy: 0.69255
Average Global Test Loss: 1.07392
Average Global Training Accuracy: 0.69969
Average Global F(x_t)-F(x*): 0.89725
Average Global Training Loss: 1.05698
Average Global Training Gradient Dissimilarity: 0.01182
Average Global Training Gradient Dissimilarity (mean of norms): 0.01604
Average Global Training Gradient Dissimilarity (norm of mean): 0.00421
Std Max Accuracy: 0.00037043517951487363
Mean Max Accuracy: 0.6947666666666666
====================================
Algorithm: SCAFFOLD
-------------Round number: 0 -------------
Similarity: 1.0
Average Global Test Accuracy: 0.0246
Average Global Test Loss: 3.84992
Average Global Training Accuracy: 0.02369
Average Global F(x_t)-F(x*): 3.69058
Average Global Training Loss: 3.85031
Average Global Training Gradient Dissimilarity: 0.00121
Average Global Training Gradient Dissimilarity (mean of norms): 0.00452
Average Global Training Gradient Dissimilarity (norm of mean): 0.00331
-------------Round number: 249 -------------
Similarity: 1.0
Average Global Test Accuracy: 0.6952
Average Global Test Loss: 1.068
Average Global Training Accuracy: 0.70214
Average Global F(x_t)-F(x*): 0.89042
Average Global Training Loss: 1.05015
Average Global Training Gradient Dissimilarity: 0.01158
Average Global Training Gradient Dissimilarity (mean of norms): 0.01338
Average Global Training Gradient Dissimilarity (norm of mean): 0.0018
Std Max Accuracy: 0.00037932688922471
Mean Max Accuracy: 0.6950333333333333
Similarity: 0.1
====================================
Algorithm: FedAvg
-------------Round number: 0 -------------
Similarity: 0.1
Average Global Test Accuracy: 0.02618
Average Global Test Loss: 3.85023
Average Global Training Accuracy: 0.02662
Average Global F(x_t)-F(x*): 3.75271
Average Global Training Loss: 3.84991
Average Global Training Gradient Dissimilarity: 1.54572
Average Global Training Gradient Dissimilarity (mean of norms): 1.55486
Average Global Training Gradient Dissimilarity (norm of mean): 0.00913
-------------Round number: 249 -------------
Similarity: 0.1
Average Global Test Accuracy: 0.57521
Average Global Test Loss: 1.4556
Average Global Training Accuracy: 0.57695
Average Global F(x_t)-F(x*): 1.35535
Average Global Training Loss: 1.45255
Average Global Training Gradient Dissimilarity: 6.6318
Average Global Training Gradient Dissimilarity (mean of norms): 6.72388
Average Global Training Gradient Dissimilarity (norm of mean): 0.09208
Std Max Accuracy: 0.003339922903454331
Mean Max Accuracy: 0.5943915018615296
================================================
Algorithm: SCAFFOLD
-------------Round number: 0 -------------
Similarity: 0.1
Average Global Test Accuracy: 0.3033
Average Global Test Loss: 3.84531
Average Global Training Accuracy: 0.03012
Average Global F(x_t)-F(x*): 3.7487
Average Global Training Loss: 3.8459
Average Global Training Gradient Dissimilarity: 1.51894
Average Global Training Gradient Dissimilarity (mean of norms): 1.52739
Average Global Training Gradient Dissimilarity (norm of mean): 0.00844
-------------Round number: 249 -------------
Similarity: 0.1
Average Global Test Accuracy: 0.62906
Average Global Test Loss: 1.30998
Average Global Training Accuracy: 0.63091
Average Global F(x_t)-F(x*): 1.21077
Average Global Training Loss: 1.30797
Average Global Training Gradient Dissimilarity: 19.33952
Average Global Training Gradient Dissimilarity (mean of norms): 19.44003
Average Global Training Gradient Dissimilarity (norm of mean): 0.10051
Std Max Accuracy: 0.002479936340467419
Mean Max Accuracy: 0.6366772551832497
====================================
Algorithm: FedAvg
-------------Round number: 0 -------------
Similarity: 0.0
Average Global Test Accuracy: 0.3094
Average Global Test Loss: 3.84465
Average Global Training Accuracy: 0.03056
Average Global F(x_t)-F(x*): 3.76351
Average Global Training Loss: 3.84495
Average Global Training Gradient Dissimilarity: 1.88936
Average Global Training Gradient Dissimilarity (mean of norms): 1.8992
Average Global Training Gradient Dissimilarity (norm of mean): 0.00984
-------------Round number: 249 -------------
Similarity: 0.0
Average Global Test Accuracy: 0.36521
Average Global Test Loss: 2.12814
Average Global Training Accuracy: 0.36927
Average Global F(x_t)-F(x*): 2.03785
Average Global Training Loss: 2.11928
Average Global Training Gradient Dissimilarity: 11.32929
Average Global Training Gradient Dissimilarity (mean of norms): 11.61684
Average Global Training Gradient Dissimilarity (norm of mean): 0.28755
========================================================
Algorithm: SCAFFOLD
-------------Round number: 0 -------------
Similarity: 0.0
Average Global Test Accuracy: 0.02005
Average Global Test Loss: 3.84892
Average Global Training Accuracy: 0.02082
Average Global F(x_t)-F(x*): 3.76703
Average Global Training Loss: 3.84847
Average Global Training Gradient Dissimilarity: 1.92658
Average Global Training Gradient Dissimilarity (mean of norms): 1.93722
Average Global Training Gradient Dissimilarity (norm of mean): 0.01065
-------------Round number: 249 -------------
Similarity: 0.0
Average Global Test Accuracy: 0.51411
Average Global Test Loss: 1.8632
Average Global Training Accuracy: 0.51596
Average Global F(x_t)-F(x*): 1.76778
Average Global Training Loss: 1.84922
Average Global Training Gradient Dissimilarity: 53.76546
Average Global Training Gradient Dissimilarity (mean of norms): 54.39637
Average Global Training Gradient Dissimilarity (norm of mean): 0.63091