Hyperparameter Optimization Tools: microsoft/nni

This blog post covers the basics of model hyperparameter tuning, focusing on two aspects: the tuning tools and the corresponding algorithms.

  • For tools, I recommend Microsoft's NNI framework, as well as Weights & Biases
  • Tuning algorithms currently available include TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner, and others

Installation

Linux and macOS

python3 -m pip install --upgrade nni

Using NNI in Docker:

docker pull msranni/nni:latest

Installation on Windows:

pip install cython wheel
python -m pip install --upgrade nni
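
After installation on any platform, you can check that the NNI command-line tool is available (nnictl is installed together with the nni package):

nnictl --version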

Getting Started Demo

The demo code comes from the official repository: https://github.com/microsoft/nni/blob/master/examples/trials/mnist-pytorch

Step 1: Write a search space file in JSON format

The search_space.json file contains the name and distribution (discrete or continuous) of every hyperparameter to search.

{
    "batch_size": {"_type":"choice", "_value": [16, 32, 64, 128]},
    "hidden_size":{"_type":"choice","_value":[128, 256, 512, 1024]},
    "lr":{"_type":"choice","_value":[0.0001, 0.001, 0.01, 0.1]},
    "momentum":{"_type":"uniform","_value":[0, 1]}
}
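
Besides choice and uniform, the NNI search space supports other distribution types, for example loguniform for parameters that span several orders of magnitude. A small sketch (the parameter names and value ranges below are illustrative, not part of the official example):

{
    "lr": {"_type": "loguniform", "_value": [0.0001, 0.1]},
    "weight_decay": {"_type": "loguniform", "_value": [0.000001, 0.01]}
}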

Step 2: Define a configuration file in YAML format

The config.yml file declares the search space and the path to the trial code. It also provides other settings, such as the tuning algorithm, the maximum number of trials, and the maximum duration.

authorName: pprp                        # author name
experimentName: example_mnist_pytorch   # experiment name
trialConcurrency: 1                     # number of trials to run concurrently
maxExecDuration: 1h                     # maximum duration of the whole experiment
maxTrialNum: 10                         # maximum number of trials to run
#choice: local, remote, pai
trainingServicePlatform: local
searchSpacePath: search_space.json      # JSON file describing the search space
#choice: true, false
useAnnotation: false
tuner:
  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
  #SMAC (SMAC should be installed through nnictl)
  builtinTunerName: TPE                 # tuner algorithm
  classArgs:
    #choice: maximize, minimize
    optimize_mode: maximize
trial:
  command: python3 examples1.py         # command line that launches one trial
  codeDir: .
  gpuNum: 1                             # number of GPUs per trial

# Add the following when trainingServicePlatform is local
localConfig:
    useActiveGpu: true                  # allow NNI to run trials on GPUs that are already in use
    maxTrialNumPerGpu: 3                # maximum number of trials allowed on the same GPU at once

# Add the following when trainingServicePlatform is remote
machineList:
    useActiveGpu: true                  # allow NNI to run trials on GPUs that are already in use
    maxTrialNumPerGpu: 3                # maximum number of trials allowed on the same GPU at once
    # ... followed by the rest of your own remote machine settings

Step 3: Modify the trial code

Import NNI, modify the code to fetch hyperparameters from NNI, and report the final result back to NNI.
The key calls are:
Get the parameter values from the tuner:
RECEIVED_PARAMS = nni.get_next_parameter()
Periodically report intermediate metric data (optional):
nni.report_intermediate_result(metrics)
Report the final performance of this configuration, e.g. accuracy or loss:
nni.report_final_result(metrics)
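
Putting the three calls together, a trial script typically follows the pattern below. This is a minimal sketch: train_one_epoch and evaluate are placeholder stubs standing in for your own training and validation code, not NNI functions and not part of the official example.

import nni

def train_one_epoch(lr):
    # placeholder for your real training loop
    pass

def evaluate():
    # placeholder: return the validation metric (e.g. accuracy)
    return 0.0

def run_trial():
    # 1. Ask the tuner for this trial's hyperparameter values
    params = nni.get_next_parameter()   # e.g. {"lr": 0.01, "batch_size": 64, ...}
    lr = params.get("lr", 0.01)

    acc = 0.0
    for epoch in range(10):
        train_one_epoch(lr)
        acc = evaluate()
        # 2. Report an intermediate metric after each epoch (optional)
        nni.report_intermediate_result(acc)

    # 3. Report the final metric that the tuner optimizes
    nni.report_final_result(acc)

if __name__ == '__main__':
    run_trial()
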
Launch

nnictl create --config ./config.yml

Other Commands

  1. nnictl experiment show: show the information of experiments
  2. nnictl trial ls: list all trial jobs
  3. nnictl top: monitor the status of running experiments
  4. nnictl log stderr: show stderr log content
  5. nnictl log stdout: show stdout log content
  6. nnictl stop: stop an experiment
  7. nnictl trial kill: kill a trial job by id
  8. nnictl --help: get help information about nnictl

Complete Code

import os
import argparse
import logging
import nni
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from nni.utils import merge_parameter
from torchvision import datasets, transforms

logger = logging.getLogger('mnist_AutoML')


class Net(nn.Module):
    def __init__(self, hidden_size):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        self.fc1 = nn.Linear(4*4*50, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        x = x.view(-1, 4*4*50)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        if (args['batch_num'] is not None) and batch_idx >= args['batch_num']:
            break
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args['log_interval'] == 0:
            logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))


def test(args, model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # sum up batch loss
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # get the index of the max log-probability
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    accuracy = 100. * correct / len(test_loader.dataset)

    logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset), accuracy))

    return accuracy


def main(args):
    use_cuda = not args['no_cuda'] and torch.cuda.is_available()

    torch.manual_seed(args['seed'])

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    data_dir = args['data_dir']

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(data_dir, train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args['batch_size'], shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(data_dir, train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])),
        batch_size=1000, shuffle=True, **kwargs)

    hidden_size = args['hidden_size']

    model = Net(hidden_size=hidden_size).to(device)
    optimizer = optim.SGD(model.parameters(), lr=args['lr'],
                          momentum=args['momentum'])

    for epoch in range(1, args['epochs'] + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test_acc = test(args, model, device, test_loader)

        # report intermediate result
        nni.report_intermediate_result(test_acc)  # report the intermediate metric for this epoch
        logger.debug('test accuracy %g', test_acc)
        logger.debug('Pipe send intermediate result done.')

    # report final result
    nni.report_final_result(test_acc)  # report the final metric of this trial
    logger.debug('Final result is %g', test_acc)
    logger.debug('Send final result done.')


def get_params():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument("--data_dir", type=str,  default='./data', help="data directory") ## 数据目录
    parser.add_argument('--batch_size', type=int, default=64, metavar='N',  help='input batch size for training (default: 64)') ## batch_size
    parser.add_argument("--batch_num", type=int, default=None) # 最大的batch数量
    parser.add_argument("--hidden_size", type=int, default=512, metavar='N', help='hidden layer size (default: 512)') ## 中间隐层数量
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',  help='learning rate (default: 0.01)')## 学习率
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',  help='SGD momentum (default: 0.5)')## 学习率动量
    parser.add_argument('--epochs', type=int, default=10, metavar='N', help='number of epochs to train (default: 10)') ## 一共多少个epochs
    parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') #随机种子
    parser.add_argument('--no_cuda', action='store_true', default=False, help='disables CUDA training') # 是否使用cuda
    parser.add_argument('--log_interval', type=int, default=1000, metavar='N', help='how many batches to wait before logging training status') ## 日志间隔


    args, _ = parser.parse_known_args()
    return args


if __name__ == '__main__':
    try:
        # get parameters from the tuner
        tuner_params = nni.get_next_parameter()
        logger.debug(tuner_params)
        params = vars(merge_parameter(get_params(), tuner_params))
        print(params)
        main(params)
    except Exception as exception:
        logger.exception(exception)
        raise
