Ray Tune (6): Tune Examples (Part 2)

III. PyTorch Examples

mnist_pytorch

This example converts the PyTorch MNIST example to use Tune with the function-based API. It also shows how argparse command-line parsing is combined with Tune; a minimal sketch of the API shape follows, and the full source comes after it.
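With the function-based API, a trainable is just a Python function that takes a config dict of sampled hyperparameters and a reporter callback, and reports metrics after each iteration. The following minimal sketch (not part of the original example; the function name and toy objective are made up for illustration) shows the shape Tune expects:

from ray import tune

def my_trainable(config, reporter):
    # config holds the sampled hyperparameters, e.g. config["lr"]
    for step in range(10):
        loss = (config["lr"] - 0.05) ** 2 + 1.0 / (step + 1)  # toy objective
        # each reporter(...) call counts as one training_iteration for the scheduler
        reporter(mean_loss=loss)

# tune.register_trainable() gives the function a name that tune.run() can launch,
# exactly as the full example below does with "TRAIN_FN".
tune.register_trainable("MY_FN", my_trainable)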
Code:

# Original Code here:
# https://github.com/pytorch/examples/blob/master/mnist/main.py
from __future__ import print_function

import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument(
    '--batch-size',
    type=int,
    default=64,
    metavar='N',
    help='input batch size for training (default: 64)')
parser.add_argument(
    '--test-batch-size',
    type=int,
    default=1000,
    metavar='N',
    help='input batch size for testing (default: 1000)')
parser.add_argument(
    '--epochs',
    type=int,
    default=1,
    metavar='N',
    help='number of epochs to train (default: 1)')
parser.add_argument(
    '--lr',
    type=float,
    default=0.01,
    metavar='LR',
    help='learning rate (default: 0.01)')
parser.add_argument(
    '--momentum',
    type=float,
    default=0.5,
    metavar='M',
    help='SGD momentum (default: 0.5)')
parser.add_argument(
    '--no-cuda',
    action='store_true',
    default=False,
    help='disables CUDA training')
parser.add_argument(
    '--seed',
    type=int,
    default=1,
    metavar='S',
    help='random seed (default: 1)')
parser.add_argument(
    '--smoke-test', action="store_true", help="Finish quickly for testing")


def train_mnist(args, config, reporter):
    vars(args).update(config)
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    # Seed the CPU random number generator so results are reproducible
    torch.manual_seed(args.seed)
    # Seed the current GPU as well; with multiple GPUs, use torch.cuda.manual_seed_all() to seed them all
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            '~/data',
            train=True,
            download=False,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ])),
        batch_size=args.batch_size,
        shuffle=True,
        **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            '~/data',
            train=False,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ])),
        batch_size=args.test_batch_size,
        shuffle=True,
        **kwargs)

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
            self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
            self.conv2_drop = nn.Dropout2d()
            self.fc1 = nn.Linear(320, 50)
            self.fc2 = nn.Linear(50, 10)

        def forward(self, x):
            x = F.relu(F.max_pool2d(self.conv1(x), 2))
            x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
            x = x.view(-1, 320)
            x = F.relu(self.fc1(x))
            x = F.dropout(x, training=self.training)
            x = self.fc2(x)
            return F.log_softmax(x, dim=1)

    model = Net()
    if args.cuda:
        model.cuda()

    optimizer = optim.SGD(
        model.parameters(), lr=args.lr, momentum=args.momentum)

    def train(epoch):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()

    def test():
        model.eval()
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in test_loader:
                if args.cuda:
                    data, target = data.cuda(), target.cuda()
                output = model(data)
                # sum up batch loss
                test_loss += F.nll_loss(output, target, reduction='sum').item()
                # get the index of the max log-probability
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(
                    target.data.view_as(pred)).long().cpu().sum()

        test_loss = test_loss / len(test_loader.dataset)
        accuracy = correct.item() / len(test_loader.dataset)
        reporter(mean_loss=test_loss, mean_accuracy=accuracy)

    for epoch in range(1, args.epochs + 1):
        train(epoch)
        test()


if __name__ == "__main__":
    datasets.MNIST('~/data', train=True, download=True)
    args = parser.parse_args()

    import numpy as np
    import ray
    from ray import tune
    from ray.tune.schedulers import AsyncHyperBandScheduler

    ray.init()
    # time_attr: the time/progress unit the scheduler counts in; reward_attr: the metric to optimize
    # grace_period: minimum number of iterations before a trial may be stopped; max_t: maximum iterations per trial
    sched = AsyncHyperBandScheduler(
        time_attr="training_iteration",
        reward_attr="neg_mean_loss",
        max_t=400,
        grace_period=20)
    # Register the training function under a name using the function-based API
    tune.register_trainable(
        "TRAIN_FN",
        lambda config, reporter: train_mnist(args, config, reporter))
    tune.run(
        "TRAIN_FN",
        name="exp",
        scheduler=sched,
        **{
            "stop": {
                "mean_accuracy": 0.98,
                "training_iteration": 1 if args.smoke_test else 20
            },
            "resources_per_trial": {
                "cpu": 3,
                # "gpu": int(not args.no_cuda)
            },
            "num_samples": 1 if args.smoke_test else 10,
            "config": {
                "lr": tune.sample_from(
                    lambda spec: np.random.uniform(0.001, 0.1)),
                "momentum": tune.sample_from(
                    lambda spec: np.random.uniform(0.1, 0.9)),
            }
        })

Output (run on CPU):

/usr/bin/python3.5 /home/kangkang/PycharmProjects/ray/python/ray/tune/examples/mnist_pytorch.py
2019-04-24 19:37:11,892	INFO node.py:423 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-04-24_19-37-11_5077/logs.
2019-04-24 19:37:11,997	INFO services.py:363 -- Waiting for redis server at 127.0.0.1:37534 to respond...
2019-04-24 19:37:12,113	INFO services.py:363 -- Waiting for redis server at 127.0.0.1:59931 to respond...
2019-04-24 19:37:12,114	INFO services.py:760 -- Starting Redis shard with 3.35 GB max memory.
2019-04-24 19:37:12,133	INFO services.py:1384 -- Starting the Plasma object store with 5.03 GB memory using /dev/shm.
2019-04-24 19:37:12,280	INFO tune.py:60 -- Tip: to resume incomplete experiments, pass resume='prompt' or resume=True to run()
2019-04-24 19:37:12,281	INFO tune.py:211 -- Starting a new experiment.
== Status ==
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 180.000: None | Iter 60.000: None | Iter 20.000: None
Bracket: Iter 180.000: None | Iter 60.000: None
Bracket: Iter 180.000: None
Resources requested: 0/8 CPUs, 0/0 GPUs
Memory usage on this node: 5.5/16.8 GB

2019-04-24 19:37:13,646	WARNING util.py:62 -- The `start_trial` operation took 1.2476763725280762 seconds to complete, which may be a performance bottleneck.
== Status ==
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 180.000: None | Iter 60.000: None | Iter 20.000: None
Bracket: Iter 180.000: None | Iter 60.000: None
Bracket: Iter 180.000: None
Resources requested: 3/8 CPUs, 0/0 GPUs
Memory usage on this node: 5.7/16.8 GB
Result logdir: /home/kangkang/ray_results/exp
Number of trials: 10 ({'RUNNING': 1, 'PENDING': 9})
PENDING trials:
 - TRAIN_FN_1_lr=0.022625,momentum=0.34345:	PENDING
 - TRAIN_FN_2_lr=0.065667,momentum=0.17842:	PENDING
 - TRAIN_FN_3_lr=0.0046286,momentum=0.82255:	PENDING
 - TRAIN_FN_4_lr=0.05494,momentum=0.88637:	PENDING
 - TRAIN_FN_5_lr=0.037683,momentum=0.49114:	PENDING
 - TRAIN_FN_6_lr=0.0049163,momentum=0.13329:	PENDING
 - TRAIN_FN_7_lr=0.012478,momentum=0.11843:	PENDING
 - TRAIN_FN_8_lr=0.032357,momentum=0.61504:	PENDING
 - TRAIN_FN_9_lr=0.079978,momentum=0.83846:	PENDING
RUNNING trials:
 - TRAIN_FN_0_lr=0.043764,momentum=0.67148:	RUNNING

Result for TRAIN_FN_0_lr=0.043764,momentum=0.67148:
  date: 2019-04-24_19-37-37
  done: false
  experiment_id: 12e59bae093942da87a144b50257e54f
  hostname: kangkang-1994
  iterations_since_restore: 1
  mean_accuracy: 0.9713
  mean_loss: 0.09200096435546876
  neg_mean_loss: -0.09200096435546876
  node_ip: 192.168.4.102
  pid: 5110
  time_since_restore: 23.326541423797607
  time_this_iter_s: 23.326541423797607
  time_total_s: 23.326541423797607
  timestamp: 1556105857
  timesteps_since_restore: 0
  training_iteration: 1
  ......
  ......
  ......
    
2019-04-24 19:39:12,712	INFO ray_trial_executor.py:178 -- Destroying actor for trial TRAIN_FN_9_lr=0.079978,momentum=0.83846. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2019-04-24 19:39:12,715	INFO ray_trial_executor.py:178 -- Destroying actor for trial TRAIN_FN_8_lr=0.032357,momentum=0.61504. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
== Status ==
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 180.000: None | Iter 60.000: None | Iter 20.000: None
Bracket: Iter 180.000: None | Iter 60.000: None
Bracket: Iter 180.000: None
Resources requested: 0/8 CPUs, 0/0 GPUs
Memory usage on this node: 5.7/16.8 GB
Result logdir: /home/kangkang/ray_results/exp
Number of trials: 10 ({'TERMINATED': 10})
TERMINATED trials:
 - TRAIN_FN_0_lr=0.043764,momentum=0.67148:	TERMINATED, [3 CPUs, 0 GPUs], [pid=5110], 23 s, 1 iter, 0.092 loss, 0.971 acc
 - TRAIN_FN_1_lr=0.022625,momentum=0.34345:	TERMINATED, [3 CPUs, 0 GPUs], [pid=5147], 23 s, 1 iter, 0.152 loss, 0.953 acc
 - TRAIN_FN_2_lr=0.065667,momentum=0.17842:	TERMINATED, [3 CPUs, 0 GPUs], [pid=5151], 24 s, 1 iter, 0.103 loss, 0.969 acc
 - TRAIN_FN_3_lr=0.0046286,momentum=0.82255:	TERMINATED, [3 CPUs, 0 GPUs], [pid=5148], 24 s, 1 iter, 0.181 loss, 0.946 acc
 - TRAIN_FN_4_lr=0.05494,momentum=0.88637:	TERMINATED, [3 CPUs, 0 GPUs], [pid=5152], 23 s, 1 iter, 0.246 loss, 0.929 acc
 - TRAIN_FN_5_lr=0.037683,momentum=0.49114:	TERMINATED, [3 CPUs, 0 GPUs], [pid=5112], 23 s, 1 iter, 0.103 loss, 0.968 acc
 - TRAIN_FN_6_lr=0.0049163,momentum=0.13329:	TERMINATED, [3 CPUs, 0 GPUs], [pid=5144], 22 s, 1 iter, 0.509 loss, 0.868 acc
 - TRAIN_FN_7_lr=0.012478,momentum=0.11843:	TERMINATED, [3 CPUs, 0 GPUs], [pid=5114], 22 s, 1 iter, 0.254 loss, 0.926 acc
 - TRAIN_FN_8_lr=0.032357,momentum=0.61504:	TERMINATED, [3 CPUs, 0 GPUs], [pid=4513], 22 s, 1 iter, 0.1 loss, 0.969 acc
 - TRAIN_FN_9_lr=0.079978,momentum=0.83846:	TERMINATED, [3 CPUs, 0 GPUs], [pid=4531], 22 s, 1 iter, 0.184 loss, 0.947 acc


Process finished with exit code 0

mnist_pytorch_trainable

This example converts the PyTorch MNIST example to use Tune with the Trainable (class-based) API. It also uses the HyperBandScheduler and checkpoints the model at the end; a bare-bones sketch of the interface follows, then the full source.
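In the Trainable API, _setup builds the model from the config, _train runs one training iteration and returns a metrics dict, and _save/_restore handle checkpointing. A bare-bones sketch (the class name and toy model are placeholders, not the example's code):

import os
import torch
from ray.tune import Trainable

class MyTrainable(Trainable):
    def _setup(self, config):
        # build the model/optimizer from the sampled hyperparameters in config
        self.model = torch.nn.Linear(1, 1)

    def _train(self):
        # one unit of training; the returned dict becomes this iteration's result
        return {"mean_loss": 0.0, "mean_accuracy": 1.0}

    def _save(self, checkpoint_dir):
        path = os.path.join(checkpoint_dir, "model.pth")
        torch.save(self.model.state_dict(), path)
        return path

    def _restore(self, checkpoint_path):
        self.model.load_state_dict(torch.load(checkpoint_path))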
Source code:

# Original Code here:
# https://github.com/pytorch/examples/blob/master/mnist/main.py
from __future__ import print_function

import argparse
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

from ray.tune import Trainable

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument(
    '--batch-size',
    type=int,
    default=64,
    metavar='N',
    help='input batch size for training (default: 64)')
parser.add_argument(
    '--test-batch-size',
    type=int,
    default=1000,
    metavar='N',
    help='input batch size for testing (default: 1000)')
parser.add_argument(
    '--epochs',
    type=int,
    default=1,
    metavar='N',
    help='number of epochs to train (default: 1)')
parser.add_argument(
    '--lr',
    type=float,
    default=0.01,
    metavar='LR',
    help='learning rate (default: 0.01)')
parser.add_argument(
    '--momentum',
    type=float,
    default=0.5,
    metavar='M',
    help='SGD momentum (default: 0.5)')
parser.add_argument(
    '--no-cuda',
    action='store_true',
    default=False,
    help='disables CUDA training')
parser.add_argument(
    '--seed',
    type=int,
    default=1,
    metavar='S',
    help='random seed (default: 1)')
parser.add_argument(
    '--smoke-test', action="store_true", help="Finish quickly for testing")


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

# Trainable (class-based) API
class TrainMNIST(Trainable):
    def _setup(self, config):
        args = config.pop("args")
        vars(args).update(config)
        args.cuda = not args.no_cuda and torch.cuda.is_available()

        torch.manual_seed(args.seed)
        if args.cuda:
            torch.cuda.manual_seed(args.seed)

        kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
        self.train_loader = torch.utils.data.DataLoader(
            datasets.MNIST(
                '~/data',
                train=True,
                download=False,
                transform=transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307, ), (0.3081, ))
                ])),
            batch_size=args.batch_size,
            shuffle=True,
            **kwargs)
        self.test_loader = torch.utils.data.DataLoader(
            datasets.MNIST(
                '~/data',
                train=False,
                transform=transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307, ), (0.3081, ))
                ])),
            batch_size=args.test_batch_size,
            shuffle=True,
            **kwargs)

        self.model = Net()
        if args.cuda:
            self.model.cuda()

        self.optimizer = optim.SGD(
            self.model.parameters(), lr=args.lr, momentum=args.momentum)
        self.args = args

    def _train_iteration(self):
        self.model.train()
        for batch_idx, (data, target) in enumerate(self.train_loader):
            if self.args.cuda:
                data, target = data.cuda(), target.cuda()
            self.optimizer.zero_grad()
            output = self.model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            self.optimizer.step()

    def _test(self):
        self.model.eval()
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in self.test_loader:
                if self.args.cuda:
                    data, target = data.cuda(), target.cuda()
                output = self.model(data)
                # sum up batch loss
                test_loss += F.nll_loss(output, target, reduction='sum').item()
                # get the index of the max log-probability
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(
                    target.data.view_as(pred)).long().cpu().sum()

        test_loss = test_loss / len(self.test_loader.dataset)
        accuracy = correct.item() / len(self.test_loader.dataset)
        return {"mean_loss": test_loss, "mean_accuracy": accuracy}

    def _train(self):
        self._train_iteration()
        return self._test()

    def _save(self, checkpoint_dir):
        checkpoint_path = os.path.join(checkpoint_dir, "model.pth")
        torch.save(self.model.state_dict(), checkpoint_path)
        return checkpoint_path

    def _restore(self, checkpoint_path):
        self.model.load_state_dict(torch.load(checkpoint_path))


if __name__ == "__main__":
    datasets.MNIST('~/data', train=True, download=True)
    args = parser.parse_args()

    import numpy as np
    import ray
    from ray import tune
    from ray.tune.schedulers import HyperBandScheduler

    ray.init()
    sched = HyperBandScheduler(
        time_attr="training_iteration", reward_attr="neg_mean_loss")
    tune.run(
        TrainMNIST,
        scheduler=sched,
        **{
            "stop": {
                "mean_accuracy": 0.95,
                "training_iteration": 1 if args.smoke_test else 20,
            },
            "resources_per_trial": {
                "cpu": 3,
                # "gpu": int(not args.no_cuda)
            },
            "num_samples": 1 if args.smoke_test else 20,
            "checkpoint_at_end": True,
            "config": {
                "args": args,
                "lr": tune.sample_from(
                    lambda spec: np.random.uniform(0.001, 0.1)),
                "momentum": tune.sample_from(
                    lambda spec: np.random.uniform(0.1, 0.9)),
            }
        })

Output:

/usr/bin/python3.5 /home/kangkang/PycharmProjects/ray/python/ray/tune/examples/mnist_pytorch_trainable.py
2019-04-24 20:36:49,395	INFO node.py:423 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-04-24_20-36-49_26310/logs.
2019-04-24 20:36:49,500	INFO services.py:363 -- Waiting for redis server at 127.0.0.1:51648 to respond...
2019-04-24 20:36:49,616	INFO services.py:363 -- Waiting for redis server at 127.0.0.1:15287 to respond...
2019-04-24 20:36:49,617	INFO services.py:760 -- Starting Redis shard with 3.35 GB max memory.
2019-04-24 20:36:49,633	INFO services.py:1384 -- Starting the Plasma object store with 5.03 GB memory using /dev/shm.
2019-04-24 20:36:49,741	INFO tune.py:64 -- Did not find checkpoint file in /home/kangkang/ray_results/TrainMNIST.
2019-04-24 20:36:49,741	INFO tune.py:211 -- Starting a new experiment.
== Status ==
Using HyperBand: num_stopped=0 total_brackets=0
Round #0:
Resources requested: 0/8 CPUs, 0/0 GPUs
Memory usage on this node: 5.2/16.8 GB

== Status ==
Using HyperBand: num_stopped=0 total_brackets=3
Round #0:
  Bracket(Max Size (n)=5, Milestone (r)=81, completed=0.0%): {PENDING: 4, RUNNING: 1} 
  Bracket(Max Size (n)=8, Milestone (r)=27, completed=0.0%): {PENDING: 8} 
  Bracket(Max Size (n)=15, Milestone (r)=9, completed=0.0%): {PENDING: 7} 
Resources requested: 3/8 CPUs, 0/0 GPUs
Memory usage on this node: 5.4/16.8 GB
Result logdir: /home/kangkang/ray_results/TrainMNIST
Number of trials: 20 ({'RUNNING': 1, 'PENDING': 19})
PENDING trials:
 - TrainMNIST_0_lr=0.011026,momentum=0.79764:	PENDING
 - TrainMNIST_1_lr=0.039852,momentum=0.23715:	PENDING
 - TrainMNIST_3_lr=0.017233,momentum=0.20601:	PENDING
 - TrainMNIST_4_lr=0.076475,momentum=0.31586:	PENDING
 - TrainMNIST_5_lr=0.020496,momentum=0.10799:	PENDING
 - TrainMNIST_6_lr=0.081365,momentum=0.76826:	PENDING
 - TrainMNIST_7_lr=0.091581,momentum=0.51798:	PENDING
 - TrainMNIST_8_lr=0.013477,momentum=0.27381:	PENDING
 - TrainMNIST_9_lr=0.046978,momentum=0.45581:	PENDING
 - TrainMNIST_10_lr=0.024256,momentum=0.65206:	PENDING
 - TrainMNIST_11_lr=0.032366,momentum=0.67866:	PENDING
 - TrainMNIST_12_lr=0.056358,momentum=0.71433:	PENDING
 - TrainMNIST_13_lr=0.072232,momentum=0.30116:	PENDING
 - TrainMNIST_14_lr=0.01279,momentum=0.22866:	PENDING
 - TrainMNIST_15_lr=0.071809,momentum=0.66429:	PENDING
 - TrainMNIST_16_lr=0.086842,momentum=0.53263:	PENDING
2019-04-24 20:36:51,069	WARNING util.py:62 -- The `start_trial` operation took 1.1444847583770752 seconds to complete, which may be a performance bottleneck.
 - TrainMNIST_17_lr=0.054844,momentum=0.41178:	PENDING
 - TrainMNIST_18_lr=0.064185,momentum=0.72111:	PENDING
 - TrainMNIST_19_lr=0.08816,momentum=0.69577:	PENDING
RUNNING trials:
 - TrainMNIST_2_lr=0.019918,momentum=0.85528:	RUNNING

Result for TrainMNIST_4_lr=0.076475,momentum=0.31586:
  date: 2019-04-24_20-37-14
  done: true
  experiment_id: fc8592228c364db3b08483f9edd63782
  hostname: kangkang-1994
  iterations_since_restore: 1
  mean_accuracy: 0.9701
  mean_loss: 0.09734744338989258
  neg_mean_loss: -0.09734744338989258
  node_ip: 192.168.4.102
  pid: 26357
  time_since_restore: 23.453298568725586
  time_this_iter_s: 23.453298568725586
  time_total_s: 23.453298568725586
  timestamp: 1556109434
  timesteps_since_restore: 0
  training_iteration: 1
  ......
  ......
  ......
  == Status ==
Using HyperBand: num_stopped=0 total_brackets=3
Round #0:
  Bracket(Max Size (n)=5, Milestone (r)=81, completed=100.0%): {TERMINATED: 5} 
  Bracket(Max Size (n)=8, Milestone (r)=27, completed=0.5%): {RUNNING: 1, TERMINATED: 7} 
  Bracket(Max Size (n)=5, Milestone (r)=36, completed=0.2%): {TERMINATED: 7} 
Resources requested: 3/8 CPUs, 0/0 GPUs
Memory usage on this node: 5.4/16.8 GB
Result logdir: /home/kangkang/ray_results/TrainMNIST
Number of trials: 20 ({'RUNNING': 1, 'TERMINATED': 19})
RUNNING trials:
 - TrainMNIST_7_lr=0.091581,momentum=0.51798:	RUNNING
TERMINATED trials:
 - TrainMNIST_0_lr=0.011026,momentum=0.79764:	TERMINATED, [3 CPUs, 0 GPUs], [pid=26360], 22 s, 1 iter, 0.124 loss, 0.964 acc
 - TrainMNIST_1_lr=0.039852,momentum=0.23715:	TERMINATED, [3 CPUs, 0 GPUs], [pid=26358], 22 s, 1 iter, 0.122 loss, 0.962 acc
 - TrainMNIST_2_lr=0.019918,momentum=0.85528:	TERMINATED, [3 CPUs, 0 GPUs], [pid=26353], 23 s, 1 iter, 0.0872 loss, 0.973 acc
 - TrainMNIST_3_lr=0.017233,momentum=0.20601:	TERMINATED, [3 CPUs, 0 GPUs], [pid=26356], 44 s, 2 iter, 0.126 loss, 0.962 acc
 - TrainMNIST_4_lr=0.076475,momentum=0.31586:	TERMINATED, [3 CPUs, 0 GPUs], [pid=26357], 23 s, 1 iter, 0.0973 loss, 0.97 acc
 - TrainMNIST_5_lr=0.020496,momentum=0.10799:	TERMINATED, [3 CPUs, 0 GPUs], [pid=26354], 46 s, 2 iter, 0.125 loss, 0.962 acc
 - TrainMNIST_6_lr=0.081365,momentum=0.76826:	TERMINATED, [3 CPUs, 0 GPUs], [pid=762], 24 s, 1 iter, 0.121 loss, 0.965 acc
 - TrainMNIST_8_lr=0.013477,momentum=0.27381:	TERMINATED, [3 CPUs, 0 GPUs], [pid=787], 46 s, 2 iter, 0.137 loss, 0.958 acc
 - TrainMNIST_9_lr=0.046978,momentum=0.45581:	TERMINATED, [3 CPUs, 0 GPUs], [pid=704], 24 s, 1 iter, 0.0964 loss, 0.972 acc
 - TrainMNIST_10_lr=0.024256,momentum=0.65206:	TERMINATED, [3 CPUs, 0 GPUs], [pid=26355], 23 s, 1 iter, 0.107 loss, 0.968 acc
 - TrainMNIST_11_lr=0.032366,momentum=0.67866:	TERMINATED, [3 CPUs, 0 GPUs], [pid=734], 24 s, 1 iter, 0.0966 loss, 0.971 acc
 - TrainMNIST_12_lr=0.056358,momentum=0.71433:	TERMINATED, [3 CPUs, 0 GPUs], [pid=26359], 23 s, 1 iter, 0.102 loss, 0.97 acc
 - TrainMNIST_13_lr=0.072232,momentum=0.30116:	TERMINATED, [3 CPUs, 0 GPUs], [pid=1320], 21 s, 1 iter, 0.0965 loss, 0.969 acc
 - TrainMNIST_14_lr=0.01279,momentum=0.22866:	TERMINATED, [3 CPUs, 0 GPUs], [pid=1254], 50 s, 2 iter, 0.147 loss, 0.956 acc
 - TrainMNIST_15_lr=0.071809,momentum=0.66429:	TERMINATED, [3 CPUs, 0 GPUs], [pid=1312], 21 s, 1 iter, 0.0873 loss, 0.973 acc
 - TrainMNIST_16_lr=0.086842,momentum=0.53263:	TERMINATED, [3 CPUs, 0 GPUs], [pid=1344], 21 s, 1 iter, 0.097 loss, 0.971 acc
 - TrainMNIST_17_lr=0.054844,momentum=0.41178:	TERMINATED, [3 CPUs, 0 GPUs], [pid=25339], 22 s, 1 iter, 0.094 loss, 0.971 acc
 - TrainMNIST_18_lr=0.064185,momentum=0.72111:	TERMINATED, [3 CPUs, 0 GPUs], [pid=1292], 28 s, 1 iter, 0.0931 loss, 0.973 acc
 - TrainMNIST_19_lr=0.08816,momentum=0.69577:	TERMINATED, [3 CPUs, 0 GPUs], [pid=25365], 23 s, 1 iter, 0.0913 loss, 0.973 acc

Result for TrainMNIST_7_lr=0.091581,momentum=0.51798:
  date: 2019-04-24_20-41-35
  done: true
  experiment_id: 3bfd303ffb3a4b81b5917919c2ad388b
  hostname: kangkang-1994
  iterations_since_restore: 1
  mean_accuracy: 0.9655
  mean_loss: 0.11476650772094726
  neg_mean_loss: -0.11476650772094726
  node_ip: 192.168.4.102
  pid: 754
  time_since_restore: 22.205914735794067
  time_this_iter_s: 22.205914735794067
  time_total_s: 22.205914735794067
  timestamp: 1556109695
  timesteps_since_restore: 0
  training_iteration: 1
  
2019-04-24 20:41:35,979	INFO ray_trial_executor.py:178 -- Destroying actor for trial TrainMNIST_7_lr=0.091581,momentum=0.51798. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
== Status ==
Using HyperBand: num_stopped=0 total_brackets=3
Round #0:
  Bracket(Max Size (n)=5, Milestone (r)=81, completed=100.0%): {TERMINATED: 5} 
  Bracket(Max Size (n)=3, Milestone (r)=81, completed=100.0%): {TERMINATED: 8} 
  Bracket(Max Size (n)=5, Milestone (r)=36, completed=0.2%): {TERMINATED: 7} 
Resources requested: 0/8 CPUs, 0/0 GPUs
Memory usage on this node: 5.2/16.8 GB
Result logdir: /home/kangkang/ray_results/TrainMNIST
Number of trials: 20 ({'TERMINATED': 20})
TERMINATED trials:
 - TrainMNIST_0_lr=0.011026,momentum=0.79764:	TERMINATED, [3 CPUs, 0 GPUs], [pid=26360], 22 s, 1 iter, 0.124 loss, 0.964 acc
 - TrainMNIST_1_lr=0.039852,momentum=0.23715:	TERMINATED, [3 CPUs, 0 GPUs], [pid=26358], 22 s, 1 iter, 0.122 loss, 0.962 acc
 - TrainMNIST_2_lr=0.019918,momentum=0.85528:	TERMINATED, [3 CPUs, 0 GPUs], [pid=26353], 23 s, 1 iter, 0.0872 loss, 0.973 acc
 - TrainMNIST_3_lr=0.017233,momentum=0.20601:	TERMINATED, [3 CPUs, 0 GPUs], [pid=26356], 44 s, 2 iter, 0.126 loss, 0.962 acc
 - TrainMNIST_4_lr=0.076475,momentum=0.31586:	TERMINATED, [3 CPUs, 0 GPUs], [pid=26357], 23 s, 1 iter, 0.0973 loss, 0.97 acc
 - TrainMNIST_5_lr=0.020496,momentum=0.10799:	TERMINATED, [3 CPUs, 0 GPUs], [pid=26354], 46 s, 2 iter, 0.125 loss, 0.962 acc
 - TrainMNIST_6_lr=0.081365,momentum=0.76826:	TERMINATED, [3 CPUs, 0 GPUs], [pid=762], 24 s, 1 iter, 0.121 loss, 0.965 acc
 - TrainMNIST_7_lr=0.091581,momentum=0.51798:	TERMINATED, [3 CPUs, 0 GPUs], [pid=754], 22 s, 1 iter, 0.115 loss, 0.966 acc
 - TrainMNIST_8_lr=0.013477,momentum=0.27381:	TERMINATED, [3 CPUs, 0 GPUs], [pid=787], 46 s, 2 iter, 0.137 loss, 0.958 acc
 - TrainMNIST_9_lr=0.046978,momentum=0.45581:	TERMINATED, [3 CPUs, 0 GPUs], [pid=704], 24 s, 1 iter, 0.0964 loss, 0.972 acc
 - TrainMNIST_10_lr=0.024256,momentum=0.65206:	TERMINATED, [3 CPUs, 0 GPUs], [pid=26355], 23 s, 1 iter, 0.107 loss, 0.968 acc
 - TrainMNIST_11_lr=0.032366,momentum=0.67866:	TERMINATED, [3 CPUs, 0 GPUs], [pid=734], 24 s, 1 iter, 0.0966 loss, 0.971 acc
 - TrainMNIST_12_lr=0.056358,momentum=0.71433:	TERMINATED, [3 CPUs, 0 GPUs], [pid=26359], 23 s, 1 iter, 0.102 loss, 0.97 acc
 - TrainMNIST_13_lr=0.072232,momentum=0.30116:	TERMINATED, [3 CPUs, 0 GPUs], [pid=1320], 21 s, 1 iter, 0.0965 loss, 0.969 acc
 - TrainMNIST_14_lr=0.01279,momentum=0.22866:	TERMINATED, [3 CPUs, 0 GPUs], [pid=1254], 50 s, 2 iter, 0.147 loss, 0.956 acc
 - TrainMNIST_15_lr=0.071809,momentum=0.66429:	TERMINATED, [3 CPUs, 0 GPUs], [pid=1312], 21 s, 1 iter, 0.0873 loss, 0.973 acc
 - TrainMNIST_16_lr=0.086842,momentum=0.53263:	TERMINATED, [3 CPUs, 0 GPUs], [pid=1344], 21 s, 1 iter, 0.097 loss, 0.971 acc
 - TrainMNIST_17_lr=0.054844,momentum=0.41178:	TERMINATED, [3 CPUs, 0 GPUs], [pid=25339], 22 s, 1 iter, 0.094 loss, 0.971 acc
 - TrainMNIST_18_lr=0.064185,momentum=0.72111:	TERMINATED, [3 CPUs, 0 GPUs], [pid=1292], 28 s, 1 iter, 0.0931 loss, 0.973 acc
 - TrainMNIST_19_lr=0.08816,momentum=0.69577:	TERMINATED, [3 CPUs, 0 GPUs], [pid=25365], 23 s, 1 iter, 0.0913 loss, 0.973 acc


Process finished with exit code 0

IV. TensorFlow Examples

tune_mnist_ray

A basic example of tuning a TensorFlow model on MNIST using the Trainable class; the sketch below illustrates the pattern.
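As a rough illustration only (this is not the original tune_mnist_ray code; the class name, network, and hyperparameter names are assumptions), a TensorFlow/Keras MNIST model can be wrapped in the same Trainable pattern used in the PyTorch example above:

import os

from ray.tune import Trainable
from tensorflow import keras


class TuneMNISTKeras(Trainable):
    def _setup(self, config):
        (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
        self.x_train = x_train.reshape(-1, 784).astype("float32") / 255.0
        self.y_train = y_train
        self.x_test = x_test.reshape(-1, 784).astype("float32") / 255.0
        self.y_test = y_test
        self.model = keras.Sequential([
            keras.layers.Dense(config.get("hidden", 64), activation="relu",
                               input_shape=(784,)),
            keras.layers.Dense(10, activation="softmax"),
        ])
        self.model.compile(
            optimizer=keras.optimizers.SGD(config.get("lr", 0.01)),
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy"])

    def _train(self):
        # one epoch of training, then evaluate on the test split
        self.model.fit(self.x_train, self.y_train, epochs=1, verbose=0)
        loss, acc = self.model.evaluate(self.x_test, self.y_test, verbose=0)
        return {"mean_loss": loss, "mean_accuracy": acc}

    def _save(self, checkpoint_dir):
        path = os.path.join(checkpoint_dir, "model.h5")
        self.model.save_weights(path)
        return path

    def _restore(self, checkpoint_path):
        self.model.load_weights(checkpoint_path)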

V. Distributed Examples

genetic_example

Source code:
