# sampling.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Python version: 3.6
import numpy as np
from torchvision import datasets, transforms
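'''
datasets: provides loaders for commonly used datasets; by design they all
inherit from torch.utils.data.Dataset and mainly include MNIST,
CIFAR10/100, ImageNet, COCO, etc.
transforms: provides common data preprocessing operations, mainly
operations on Tensor and PIL Image objects.
'''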
def mnist_iid(dataset, num_users):
    """
    Sample I.I.D. client data from the MNIST dataset.

    The indices ``0 .. len(dataset) - 1`` are shuffled once and handed out
    in equal, non-overlapping slices, so every client receives
    ``len(dataset) // num_users`` indices drawn uniformly without
    replacement. When the length is not a multiple of ``num_users`` the
    remainder is left unassigned.

    :param dataset: any sized object; only ``len(dataset)`` is used
    :param num_users: number of clients to split the indices between
    :return: dict mapping client id (``0 .. num_users - 1``) to a set of
        sample indices
    :raises ZeroDivisionError: if ``num_users`` is 0
    """
    num_items = len(dataset) // num_users
    # A single permutation replaces re-sampling from (and rebuilding) the
    # pool of remaining indices once per client; consecutive slices of a
    # random permutation are still a uniform partition without replacement.
    shuffled = np.random.permutation(len(dataset))
    return {
        user: set(shuffled[user * num_items:(user + 1) * num_items])
        for user in range(num_users)
    }
def mnist_noniid(dataset, num_users, num_shards=200, num_imgs=300,
                 shards_per_user=2):
    """
    Sample non-I.I.D. client data from the MNIST dataset.

    Sample indices are sorted by label and cut into ``num_shards``
    contiguous shards of ``num_imgs`` indices each. Every client then
    receives ``shards_per_user`` distinct, randomly chosen shards, so a
    client only sees the labels covered by its own shards. Shards that are
    not handed out stay unused.

    The defaults (200 shards x 300 images, 2 shards per client) match the
    60,000-image MNIST training set.

    :param dataset: object exposing its labels as ``targets`` (or, as a
        fallback, the legacy ``train_labels``); a tensor, array or list
    :param num_users: number of clients
    :param num_shards: number of shards the sorted indices are cut into
    :param num_imgs: number of indices per shard
    :param shards_per_user: number of shards given to each client
    :return: dict mapping client id to a 1-D integer array of sample indices
    :raises ValueError: if the number of labels is not
        ``num_shards * num_imgs``, if ``shards_per_user`` is negative, or
        if there are not enough shards for all clients
    """
    # Newer torchvision releases expose the labels as ``targets``;
    # ``train_labels`` is only kept as a fallback for older dataset objects.
    labels = getattr(dataset, 'targets', None)
    if labels is None:
        labels = dataset.train_labels
    labels = np.asarray(labels)

    if labels.shape != (num_shards * num_imgs,):
        raise ValueError(
            'expected %d labels (num_shards * num_imgs), got shape %s'
            % (num_shards * num_imgs, labels.shape))
    if shards_per_user < 0:
        raise ValueError('shards_per_user must be non-negative')
    if num_users * shards_per_user > num_shards:
        raise ValueError(
            'cannot give %d shards to each of %d users: only %d shards'
            % (shards_per_user, num_users, num_shards))

    # Row r holds the indices of shard r; rows are ordered by label because
    # argsort lists the sample indices in increasing-label order.
    shards = np.argsort(labels).reshape(num_shards, num_imgs)

    # Shuffle the shard ids once and deal them out in consecutive groups;
    # this is a draw without replacement, so no shard is given twice.
    shard_order = np.random.permutation(num_shards)

    dict_users = {}
    for user in range(num_users):
        first = user * shards_per_user
        picked = shard_order[first:first + shards_per_user]
        # Fancy indexing keeps the integer dtype, so the result can be
        # used directly as dataset indices.
        dict_users[user] = shards[picked].ravel()
    return dict_users
def mnist_noniid_unequal(dataset, num_users, num_shards=1200, num_imgs=50,
                         min_shard=1, max_shard=30):
    """
    Sample non-I.I.D. client data from the MNIST dataset such that clients
    hold unequal amounts of data.

    Sample indices are sorted by label and cut into ``num_shards`` shards
    of ``num_imgs`` indices each. A random shard count is drawn for every
    client from ``[min_shard, max_shard]`` and rescaled so the counts sum
    to roughly ``num_shards``:

    * if the rescaled counts exceed ``num_shards``, every client first
      gets one shard and the rest are handed out client by client until
      the shards run out;
    * otherwise every client gets its count and any leftover shards go to
      the client holding the least data (the lowest id on ties).

    Every shard is assigned to exactly one client and every client
    receives at least one shard. The defaults (1200 shards x 50 images)
    match the 60,000-image MNIST training set.

    :param dataset: object exposing its labels as ``targets`` (or, as a
        fallback, the legacy ``train_labels``); a tensor, array or list
    :param num_users: number of clients (``1 .. num_shards``)
    :param num_shards: number of shards the sorted indices are cut into
    :param num_imgs: number of indices per shard
    :param min_shard: smallest shard count drawn per client, before rescaling
    :param max_shard: largest shard count drawn per client, before rescaling
    :returns: dict mapping client id to a 1-D integer array of sample
        indices; array lengths differ between clients
    :raises ValueError: if ``num_users`` is outside ``1 .. num_shards``,
        if ``1 <= min_shard <= max_shard`` does not hold, or if the number
        of labels is not ``num_shards * num_imgs``
    """
    if not 1 <= num_users <= num_shards:
        raise ValueError(
            'num_users must be between 1 and num_shards (%d), got %d'
            % (num_shards, num_users))
    if not 1 <= min_shard <= max_shard:
        raise ValueError('require 1 <= min_shard <= max_shard')

    # Newer torchvision releases expose the labels as ``targets``;
    # ``train_labels`` is only kept as a fallback for older dataset objects.
    labels = getattr(dataset, 'targets', None)
    if labels is None:
        labels = dataset.train_labels
    labels = np.asarray(labels)
    if labels.shape != (num_shards * num_imgs,):
        raise ValueError(
            'expected %d labels (num_shards * num_imgs), got shape %s'
            % (num_shards * num_imgs, labels.shape))

    # Row r holds the indices of shard r; rows are ordered by label.
    shards = np.argsort(labels).reshape(num_shards, num_imgs)

    # Draw a raw size per client, then rescale so the sizes sum to about
    # num_shards (rounding means the sum can land slightly above or below).
    sizes = np.random.randint(min_shard, max_shard + 1, size=num_users)
    sizes = np.around(sizes / sizes.sum() * num_shards).astype(int)
    # Rounding can turn a small draw into 0, which would leave that client
    # without data (or request a negative sample below); keep at least one.
    sizes = np.maximum(sizes, 1)

    if sizes.sum() > num_shards:
        # Too many shards requested: guarantee one per client first, then
        # serve the remaining requests in client order until none are left.
        counts = np.ones(num_users, dtype=int)
        spare = num_shards - num_users
        for user in range(num_users):
            extra = min(int(sizes[user]) - 1, spare)
            counts[user] += extra
            spare -= extra
    else:
        counts = sizes.copy()
        leftover = num_shards - int(counts.sum())
        if leftover > 0:
            # Leftover shards go to the client with the least data;
            # argmin returns the first minimum, i.e. the lowest client id.
            counts[int(np.argmin(counts))] += leftover

    # Deal the shuffled shard ids out in consecutive runs of counts[user];
    # this is a draw without replacement, so no shard is given twice.
    shard_order = np.random.permutation(num_shards)
    stops = np.cumsum(counts)
    starts = stops - counts
    return {
        user: shards[shard_order[starts[user]:stops[user]]].ravel()
        for user in range(num_users)
    }
def cifar_iid(dataset, num_users):
    """
    Sample I.I.D. client data from the CIFAR10 dataset.

    Uses the same scheme as the MNIST I.I.D. split: the indices
    ``0 .. len(dataset) - 1`` are shuffled once and handed out in equal,
    non-overlapping slices of ``len(dataset) // num_users`` indices. When
    the length is not a multiple of ``num_users`` the remainder is left
    unassigned.

    :param dataset: any sized object; only ``len(dataset)`` is used
    :param num_users: number of clients to split the indices between
    :return: dict mapping client id (``0 .. num_users - 1``) to a set of
        sample indices
    :raises ZeroDivisionError: if ``num_users`` is 0
    """
    num_items = len(dataset) // num_users
    # A single permutation replaces re-sampling from (and rebuilding) the
    # pool of remaining indices once per client; consecutive slices of a
    # random permutation are still a uniform partition without replacement.
    shuffled = np.random.permutation(len(dataset))
    return {
        user: set(shuffled[user * num_items:(user + 1) * num_items])
        for user in range(num_users)
    }
def cifar_noniid(dataset, num_users, num_shards=200, num_imgs=250,
                 shards_per_user=2):
    """
    Sample non-I.I.D. client data from the CIFAR10 dataset.

    Sample indices are sorted by label and cut into ``num_shards``
    contiguous shards of ``num_imgs`` indices each. Every client then
    receives ``shards_per_user`` distinct, randomly chosen shards, so a
    client only sees the labels covered by its own shards. Shards that are
    not handed out stay unused.

    The defaults (200 shards x 250 images, 2 shards per client) match the
    50,000-image CIFAR10 training set.

    :param dataset: object exposing its labels as ``targets`` (the CIFAR10
        dataset object has no ``train_labels`` attribute)
    :param num_users: number of clients
    :param num_shards: number of shards the sorted indices are cut into
    :param num_imgs: number of indices per shard
    :param shards_per_user: number of shards given to each client
    :return: dict mapping client id to a 1-D integer array of sample indices
    :raises ValueError: if the number of labels is not
        ``num_shards * num_imgs``, if ``shards_per_user`` is negative, or
        if there are not enough shards for all clients
    """
    labels = np.asarray(dataset.targets)

    if labels.shape != (num_shards * num_imgs,):
        raise ValueError(
            'expected %d labels (num_shards * num_imgs), got shape %s'
            % (num_shards * num_imgs, labels.shape))
    if shards_per_user < 0:
        raise ValueError('shards_per_user must be non-negative')
    if num_users * shards_per_user > num_shards:
        raise ValueError(
            'cannot give %d shards to each of %d users: only %d shards'
            % (shards_per_user, num_users, num_shards))

    # Row r holds the indices of shard r; rows are ordered by label because
    # argsort lists the sample indices in increasing-label order.
    shards = np.argsort(labels).reshape(num_shards, num_imgs)

    # Shuffle the shard ids once and deal them out in consecutive groups;
    # this is a draw without replacement, so no shard is given twice.
    shard_order = np.random.permutation(num_shards)

    dict_users = {}
    for user in range(num_users):
        first = user * shards_per_user
        picked = shard_order[first:first + shards_per_user]
        # Fancy indexing keeps the integer dtype, so the result can be
        # used directly as dataset indices.
        dict_users[user] = shards[picked].ravel()
    return dict_users
if __name__ == '__main__':
    # Manual smoke run: fetch the MNIST training set (downloading it into
    # ./data/mnist/ when missing) and build a non-I.I.D. split for 100
    # clients. The result is not printed or stored anywhere.
    norm_mean, norm_std = (0.1307,), (0.3081,)
    preprocessing = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(norm_mean, norm_std),
    ])
    dataset_train = datasets.MNIST('./data/mnist/', train=True,
                                   download=True, transform=preprocessing)
    num = 100
    d = mnist_noniid(dataset_train, num)