【代码解析(1)】Communication-Efficient Learning of Deep Networks from Decentralized Data

sampling.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Python version: 3.6


import numpy as np
from torchvision import datasets, transforms
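'''
datasets: loaders for commonly used datasets; by design each of them
subclasses torch.utils.data.Dataset.
Includes MNIST, CIFAR10/100, ImageNet, COCO, etc.

transforms: commonly used data preprocessing operations,
mainly operating on Tensor and PIL Image objects.

'''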


def mnist_iid(dataset, num_users):
    """
    Sample I.I.D. client data from the MNIST dataset.

    Each client receives ``int(len(dataset) / num_users)`` distinct sample
    indices drawn uniformly at random without replacement. An index is never
    handed to more than one client; indices left over after the last client
    has been served stay unassigned.

    :param dataset: any object supporting ``len()``; only its length is used
    :param num_users: number of clients to split the indices between
    :return: dict mapping client id (0 .. num_users - 1) to a set of
        sample indices
    """
    total = len(dataset)
    per_client = int(total / num_users)
    remaining = list(range(total))
    partition = {}

    for client in range(num_users):
        # Draw this client's share from the indices that are still free ...
        chosen = set(np.random.choice(remaining, per_client, replace=False))
        partition[client] = chosen
        # ... and retire them so later clients cannot receive them again.
        remaining = list(set(remaining) - chosen)

    return partition


def mnist_noniid(dataset, num_users):
    """
    Sample non-I.I.D. client data from the MNIST training set.

    The 60,000 sample indices are sorted by label and cut into 200
    contiguous shards of 300 images each. Every client then receives two
    shards picked at random without replacement, so the data of a single
    client covers only a few digit classes.

    :param dataset: dataset exposing its labels as ``targets`` (or, for old
        torchvision releases, ``train_labels``); exactly 60,000 labels are
        expected
    :param num_users: number of clients; at most 100 (two shards each)
    :return: dict mapping client id to a 1-D integer array holding the 600
        sample indices assigned to that client
    :raises ValueError: if ``num_users`` needs more shards than exist, or if
        the number of labels differs from 60,000
    """
    # 60,000 training imgs --> 200 shards X 300 imgs/shard
    num_shards, num_imgs = 200, 300
    shards_per_user = 2
    if num_users * shards_per_user > num_shards:
        raise ValueError(
            'mnist_noniid supports at most %d users, got %d'
            % (num_shards // shards_per_user, num_users))

    idx_shard = list(range(num_shards))
    idxs = np.arange(num_shards * num_imgs)

    # ``train_labels`` is a deprecated alias of ``targets`` in torchvision;
    # prefer the current name and only fall back to the legacy one.
    labels = getattr(dataset, 'targets', None)
    if labels is None:
        labels = dataset.train_labels
    labels = labels.numpy() if hasattr(labels, 'numpy') else np.asarray(labels)

    # Sort the sample indices by label so that every shard (a contiguous
    # slice of ``idxs``) is dominated by a single class. Indexing ``idxs``
    # with the sort order keeps its integer dtype whatever the label dtype.
    idxs_labels = np.vstack((idxs, labels))
    idxs = idxs[idxs_labels[1, :].argsort()]

    dict_users = {}
    for i in range(num_users):
        # Pick two unused shards for this client and retire them.
        rand_set = set(np.random.choice(idx_shard, shards_per_user,
                                        replace=False))
        idx_shard = list(set(idx_shard) - rand_set)
        # Concatenating only integer slices keeps the result an integer
        # array (starting from a float ``np.array([])`` would turn every
        # index into a float).
        dict_users[i] = np.concatenate(
            [idxs[rand * num_imgs:(rand + 1) * num_imgs]
             for rand in rand_set], axis=0)
    return dict_users


def mnist_noniid_unequal(dataset, num_users):
    """
    Sample non-I.I.D. client data from the MNIST training set such that
    clients hold unequal amounts of data.

    The 60,000 sample indices are sorted by label and cut into 1200
    contiguous shards of 50 images. A random weight between 1 and 30 is
    drawn for every client; the weights are rescaled so that they sum to
    (approximately) 1200 and rounded to whole shard counts.

    * If rounding over-allocates (sum > 1200), every client first gets one
      shard, then its remaining share, capped by what is still available.
    * Otherwise every client gets its share and any shards left over go to
      the client that currently holds the fewest images.

    :param dataset: dataset exposing its labels as ``targets`` (or, for old
        torchvision releases, ``train_labels``); exactly 60,000 labels are
        expected
    :param num_users: number of clients
    :returns: a dict mapping client id to a 1-D integer array with the
        sample indices assigned to that client
    """
    # 60,000 training imgs --> 50 imgs/shard X 1200 shards
    num_shards, num_imgs = 1200, 50
    idx_shard = list(range(num_shards))
    idxs = np.arange(num_shards * num_imgs)
    # Integer-typed accumulators: a plain ``np.array([])`` is float64 and
    # would turn every concatenated index into a float.
    dict_users = {i: np.array([], dtype=idxs.dtype)
                  for i in range(num_users)}

    # ``train_labels`` is a deprecated alias of ``targets`` in torchvision;
    # prefer the current name and only fall back to the legacy one.
    labels = getattr(dataset, 'targets', None)
    if labels is None:
        labels = dataset.train_labels
    labels = labels.numpy() if hasattr(labels, 'numpy') else np.asarray(labels)

    # Sort the sample indices by label so that each shard (a contiguous
    # slice of ``idxs``) is dominated by a single class.
    idxs_labels = np.vstack((idxs, labels))
    idxs = idxs[idxs_labels[1, :].argsort()]

    def assign_shards(client, count):
        """Move ``count`` random unused shards to ``client``."""
        nonlocal idx_shard
        rand_set = set(np.random.choice(idx_shard, count, replace=False))
        idx_shard = list(set(idx_shard) - rand_set)
        chunks = [dict_users[client]]
        chunks.extend(idxs[rand * num_imgs:(rand + 1) * num_imgs]
                      for rand in rand_set)
        # One concatenation per draw instead of one per shard.
        dict_users[client] = np.concatenate(chunks, axis=0)

    # Minimum and maximum weight drawn per client:
    min_shard = 1
    max_shard = 30

    # Divide the shards into random chunks for every client
    # s.t the sum of these chunks is (up to rounding) num_shards
    random_shard_size = np.random.randint(min_shard, max_shard + 1,
                                          size=num_users)
    random_shard_size = np.around(random_shard_size /
                                  sum(random_shard_size) * num_shards)
    random_shard_size = random_shard_size.astype(int)

    # Assign the shards randomly to each client
    if sum(random_shard_size) > num_shards:
        # Rounding over-allocated. First assign each client 1 shard to
        # ensure every client has at least one shard of data.
        for i in range(num_users):
            assign_shards(i, 1)

        # Account for the shard handed out above. A share that was rounded
        # down to 0 must not become -1 (np.random.choice rejects negative
        # sizes), so clamp at 0.
        random_shard_size = np.maximum(random_shard_size - 1, 0)

        # Next, randomly assign the remaining shards
        for i in range(num_users):
            if len(idx_shard) == 0:
                continue
            # Never ask for more shards than are still available.
            assign_shards(i, min(random_shard_size[i], len(idx_shard)))
    else:
        for i in range(num_users):
            assign_shards(i, random_shard_size[i])

        if len(idx_shard) > 0:
            # Add the leftover shards to the client with minimum images:
            k = min(dict_users, key=lambda x: len(dict_users.get(x)))
            assign_shards(k, len(idx_shard))

    return dict_users


def cifar_iid(dataset, num_users):
    """
    Sample I.I.D. client data from the CIFAR10 dataset.

    The procedure matches the MNIST I.I.D. split: every client is given
    ``int(len(dataset) / num_users)`` sample indices drawn at random without
    replacement, and no index is shared between clients.

    :param dataset: any object supporting ``len()``; only its length is used
    :param num_users: number of clients to split the indices between
    :return: dict mapping client id to a set of sample indices
    """
    share = int(len(dataset) / num_users)
    pool = [idx for idx in range(len(dataset))]
    dict_users = {}
    for user_id in range(num_users):
        draw = np.random.choice(pool, share, replace=False)
        dict_users[user_id] = set(draw)
        # Shrink the pool so the next client only sees unassigned indices.
        pool = list(set(pool) - dict_users[user_id])
    return dict_users


def cifar_noniid(dataset, num_users):
    """
    Sample non-I.I.D. client data from the CIFAR10 training set.

    The 50,000 sample indices are sorted by label and cut into 200
    contiguous shards of 250 images each. Every client then receives two
    shards picked at random without replacement, so the data of a single
    client covers only a few classes.

    Labels are read from ``dataset.targets`` (CIFAR10 objects have no
    ``train_labels`` attribute).

    :param dataset: dataset exposing its labels as ``targets``; exactly
        50,000 labels are expected
    :param num_users: number of clients; at most 100 (two shards each)
    :return: dict mapping client id to a 1-D integer array holding the 500
        sample indices assigned to that client
    :raises ValueError: if ``num_users`` needs more shards than exist, or if
        the number of labels differs from 50,000
    """
    # 50,000 training imgs --> 200 shards X 250 imgs/shard
    num_shards, num_imgs = 200, 250
    shards_per_user = 2
    if num_users * shards_per_user > num_shards:
        raise ValueError(
            'cifar_noniid supports at most %d users, got %d'
            % (num_shards // shards_per_user, num_users))

    idx_shard = list(range(num_shards))
    idxs = np.arange(num_shards * num_imgs)
    labels = np.array(dataset.targets)

    # Sort the sample indices by label so that every shard (a contiguous
    # slice of ``idxs``) is dominated by a single class. Indexing ``idxs``
    # with the sort order keeps its integer dtype whatever the label dtype.
    idxs_labels = np.vstack((idxs, labels))
    idxs = idxs[idxs_labels[1, :].argsort()]

    # divide and assign 2 shards/client
    dict_users = {}
    for i in range(num_users):
        rand_set = set(np.random.choice(idx_shard, shards_per_user,
                                        replace=False))
        idx_shard = list(set(idx_shard) - rand_set)
        # Concatenating only integer slices keeps the result an integer
        # array (starting from a float ``np.array([])`` would turn every
        # index into a float).
        dict_users[i] = np.concatenate(
            [idxs[rand * num_imgs:(rand + 1) * num_imgs]
             for rand in rand_set], axis=0)
    return dict_users


if __name__ == '__main__':
    # Manual smoke run: fetch the MNIST training set (downloaded into
    # ./data/mnist/ if missing) and build a non-I.I.D. split for 100 clients.
    mnist_transform = transforms.Compose([
        transforms.ToTensor(),
        # Normalisation constants passed as (mean,), (std,).
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    dataset_train = datasets.MNIST('./data/mnist/', train=True,
                                   download=True, transform=mnist_transform)
    num = 100
    d = mnist_noniid(dataset_train, num)

你可能感兴趣的:(Xidian科研,python,去中心化,python,深度学习)