Distrib #635
Merged: 29 commits, Oct 24, 2019
Changes from 1 commit (of 29)

Commits
2afc205  [WIP] Added cifar10 distributed example (vfdev-5, Aug 1, 2019)
7b8eac9  [WIP] Metric with all reduce decorator and tests (vfdev-5, Aug 1, 2019)
c7d2337  [WIP] Added tests for accumulation metric (vfdev-5, Aug 1, 2019)
69ced1e  [WIP] Updated with reinit_is_reduced (vfdev-5, Aug 1, 2019)
f2f923b  [WIP] Distrib adaptation for other metrics (vfdev-5, Aug 2, 2019)
d13b985  [WIP] Warnings for EpochMetric and Precision/Recall when distrib (vfdev-5, Aug 2, 2019)
e7d12d0  Updated metrics and tests to run on distributed configuration (vfdev-5, Aug 3, 2019)
0a5f582  Minor fixes and cosmetics (vfdev-5, Aug 3, 2019)
954269c  Merge branch 'master' into distrib (vfdev-5, Aug 3, 2019)
206f2e1  Fixed bugs and improved contrib/cifar10 example (vfdev-5, Aug 3, 2019)
99a6b4a  Updated docs (vfdev-5, Aug 3, 2019)
3eff370  Update metrics.rst (vfdev-5, Aug 6, 2019)
ad8375c  Updated docs and set device as "cuda" in distributed instead of raisi… (vfdev-5, Aug 6, 2019)
0bcc287  [WIP] Fix missing _is_reduced in precision/recall with tests (vfdev-5, Aug 7, 2019)
1bda698  Merge remote-tracking branch 'origin' into distrib (vfdev-5, Aug 7, 2019)
7dd6937  Updated other tests (vfdev-5, Aug 7, 2019)
27324dc  Merge branch 'master' into distrib (vfdev-5, Aug 29, 2019)
f4a3d4b  Updated travis and renamed tbptt test gpu -> cuda (vfdev-5, Aug 29, 2019)
2036075  Distrib (#573) (vfdev-5, Aug 30, 2019)
69502fc  Merge branch 'distrib' of https://github.com/pytorch/ignite into distrib (vfdev-5, Sep 9, 2019)
d52c36d  Merge branch 'master' into distrib (vfdev-5, Sep 9, 2019)
ecb00a5  Merge branch 'master' into distrib (vfdev-5, Sep 13, 2019)
71836aa  Merge branch 'master' into distrib (vfdev-5, Sep 25, 2019)
46cdd86  Compute IoU, Precision, Recall based on CM on CPU (vfdev-5, Sep 26, 2019)
fd14d4d  Fixes incomplete merge with 1856c8e0f1be102d4530592bcb7caac690f198c4 (vfdev-5, Sep 26, 2019)
59b894c  Merge branch 'master' into distrib (vfdev-5, Oct 17, 2019)
80ad40a  Update distrib branch and CIFAR10 example (#647) (vfdev-5, Oct 22, 2019)
8288831  Finalized Cifar10 example (#649) (vfdev-5, Oct 24, 2019)
25db95b  Merge branch 'master' into distrib (vfdev-5, Oct 24, 2019)
Updated metrics and tests to run on distributed configuration
- Tested on 2 GPUs, single node
- Added a command in .travis.yml to show how to test locally
- Updated Travis to run tests in 4 processes
vfdev-5 committed Aug 3, 2019
commit e7d12d038359ef9fd26a6533486e920e1e651835
3 changes: 1 addition & 2 deletions .travis.yml
@@ -43,7 +43,7 @@ install:
- pip install gym==0.10.11

script:
- py.test --cov ignite --cov-report term-missing
- CUDA_VISIBLE_DEVICES="" py.test --tx 4*popen//python=python$TRAVIS_PYTHON_VERSION --cov ignite --cov-report term-missing -vvv tests/

# Smoke tests for the examples
# Mnist
@@ -73,7 +73,6 @@ script:

# tests for distributed ops
# As no GPUs on travis -> all tests will be skipped
- pip install pytest-xdist
# 2 is the number of processes <-> number of available GPUs
- export WORLD_SIZE=2
- pytest --dist=each --tx $WORLD_SIZE*popen//python=python$TRAVIS_PYTHON_VERSION tests -m distributed -vvv
1 change: 1 addition & 0 deletions ignite/metrics/accumulation.py
@@ -6,6 +6,7 @@

import torch


class VariableAccumulation(Metric):
"""Single variable accumulator helper to compute (arithmetic, geometric, harmonic) average of a single variable.

4 changes: 2 additions & 2 deletions ignite/metrics/accuracy.py
@@ -118,8 +118,8 @@ def thresholded_output_transform(output):
def __init__(self, output_transform=lambda x: x, is_multilabel=False, device=None):
self._num_correct = None
self._num_examples = None
super(Accuracy, self).__init__(output_transform=output_transform,
is_multilabel=is_multilabel,
super(Accuracy, self).__init__(output_transform=output_transform,
is_multilabel=is_multilabel,
device=device)

@reinit_is_reduced
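For orientation, a minimal sketch of how the new device argument is meant to be used when Accuracy runs under torch.distributed; the launcher details and the hard-coded local_rank are assumptions for illustration, not part of this diff.

from ignite.metrics import Accuracy

# Assumed setup: one process per GPU (e.g. launched with torch.distributed.launch)
# with a process group already initialized; local_rank is hard-coded here only
# for illustration and would normally come from the launcher.
local_rank = 0
metric = Accuracy(device="cuda:{}".format(local_rank))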
6 changes: 3 additions & 3 deletions ignite/metrics/confusion_matrix.py
@@ -45,8 +45,8 @@ def __init__(self, num_classes, average=None, output_transform=lambda x: x, devi

@reinit_is_reduced
def reset(self):
self.confusion_matrix = torch.zeros(self.num_classes, self.num_classes,
dtype=torch.float,
self.confusion_matrix = torch.zeros(self.num_classes, self.num_classes,
dtype=torch.float,
device=self._device)
self._num_examples = 0

@@ -77,7 +77,7 @@ def _check_shape(self, output):

return y_pred, y

@reinit_is_reduced
@reinit_is_reduced
def update(self, output):
y_pred, y = self._check_shape(output)

6 changes: 2 additions & 4 deletions ignite/metrics/epoch_metric.py
@@ -36,14 +36,12 @@ class EpochMetric(Metric):
def __init__(self, compute_fn, output_transform=lambda x: x):

if torch.distributed.is_available() and torch.distributed.is_initialized():
raise warnings.warn("EpochMetric class does not work in distributed setting. "
"Metric's results are not reduced across the GPUs. Computed result "
"corresponds to the local rank's (single GPU) result.")
warnings.warn("EpochMetric class does not work in distributed setting.", RuntimeWarning)

if not callable(compute_fn):
raise TypeError("Argument compute_fn should be callable.")

super(EpochMetric, self).__init__(output_transform=output_transform)
super(EpochMetric, self).__init__(output_transform=output_transform, device='cpu')
self.compute_fn = compute_fn

def reset(self):
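As a reference point for the warning above, a small hedged sketch of typical EpochMetric usage; accuracy_fn is an illustrative compute_fn, not something defined in this PR.

from ignite.metrics import EpochMetric

def accuracy_fn(y_preds, y_targets):
    # compute_fn receives the whole epoch's accumulated predictions and targets as tensors
    return (y_preds.round() == y_targets.float()).float().mean().item()

metric = EpochMetric(accuracy_fn)

Note that the previous code path, raise warnings.warn(...), would have raised a TypeError because warnings.warn() returns None; the new call emits a RuntimeWarning and continues.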
6 changes: 3 additions & 3 deletions ignite/metrics/loss.py
@@ -34,12 +34,12 @@ def __init__(self, loss_fn, output_transform=lambda x: x,
self._loss_fn = loss_fn
self._batch_size = batch_size

@reinit_is_reduced
@reinit_is_reduced
def reset(self):
self._sum = 0
self._num_examples = 0

@reinit_is_reduced
@reinit_is_reduced
def update(self, output):
if len(output) == 2:
y_pred, y = output
@@ -54,7 +54,7 @@ def update(self, output):
N = self._batch_size(y)
self._sum += average_loss.item() * N
self._num_examples += N

@sync_all_reduce("_sum", "_num_examples")
def compute(self):
if self._num_examples == 0:
6 changes: 3 additions & 3 deletions ignite/metrics/mean_pairwise_distance.py
@@ -10,12 +10,12 @@

class MeanPairwiseDistance(Metric):
"""
Calculates the mean pairwise distance.
Calculates the mean pairwise distance: average of pairwise distances computed on provided batches.

- `update` must receive output of the form `(y_pred, y)`.
"""
def __init__(self, p=2, eps=1e-6, output_transform=lambda x: x):
super(MeanPairwiseDistance, self).__init__(output_transform)
def __init__(self, p=2, eps=1e-6, output_transform=lambda x: x, device=None):
super(MeanPairwiseDistance, self).__init__(output_transform, device=device)
self._p = p
self._eps = eps

9 changes: 4 additions & 5 deletions ignite/metrics/metric.py
@@ -96,7 +96,7 @@ def _sync_all_reduce(self, tensor):
# synchronize and reduce
torch.distributed.barrier()
torch.distributed.all_reduce(tensor)

if tensor_to_number:
return tensor.item()
return tensor
@@ -217,11 +217,10 @@ def another_wrapper(self, *args, **kwargs):


def reinit_is_reduced(func):

@wraps(func)
def wrapper(self, *args, **kwargs):
def wrapper(self, *args, **kwargs):
func(self, *args, **kwargs)
self._is_reduced = False

return wrapper

return wrapper
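To show how these decorators are intended to compose, here is a hedged sketch of a custom metric following the same pattern as the Loss and Accuracy changes in this commit; the MeanValue class is hypothetical, and the import paths are assumed from the files touched here.

from ignite.exceptions import NotComputableError
from ignite.metrics.metric import Metric, reinit_is_reduced, sync_all_reduce

class MeanValue(Metric):
    """Hypothetical metric averaging a scalar taken from each engine output."""

    @reinit_is_reduced  # clears _is_reduced so the next compute() re-syncs
    def reset(self):
        self._sum = 0
        self._num_examples = 0

    @reinit_is_reduced
    def update(self, output):
        self._sum += float(output)
        self._num_examples += 1

    @sync_all_reduce("_sum", "_num_examples")  # all_reduce the named attributes across workers first
    def compute(self):
        if self._num_examples == 0:
            raise NotComputableError("MeanValue must have at least one example before it can be computed.")
        return self._sum / self._num_examples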
2 changes: 1 addition & 1 deletion ignite/metrics/metrics_lambda.py
@@ -39,7 +39,7 @@ def __init__(self, f, *args, **kwargs):
self.function = f
self.args = args
self.kwargs = kwargs
super(MetricsLambda, self).__init__()
super(MetricsLambda, self).__init__(device='cpu')

def reset(self):
for i in itertools.chain(self.args, self.kwargs.values()):
8 changes: 4 additions & 4 deletions ignite/metrics/precision.py
@@ -15,9 +15,9 @@ class _BasePrecisionRecall(_BaseClassification):
def __init__(self, output_transform=lambda x: x, average=False, is_multilabel=False, device=None):
if torch.distributed.is_available() and torch.distributed.is_initialized():
if (not average) and is_multilabel:
raise warnings.warn("Precision/Recall metrics do not work in distributed setting when average=False "
"and is_multilabel=True. Results are not reduced across the GPUs. Computed result "
"corresponds to the local rank's (single GPU) result.")
warnings.warn("Precision/Recall metrics do not work in distributed setting when average=False "
"and is_multilabel=True. Results are not reduced across the GPUs. Computed result "
"corresponds to the local rank's (single GPU) result.", RuntimeWarning)

self._average = average
self._true_positives = None
@@ -38,7 +38,7 @@ def compute(self):
raise NotComputableError("{} must have at least one example before"
" it can be computed.".format(self.__class__.__name__))

if self._average:
if not (self._type == "multilabel" and not self._average):
self._true_positives = self._sync_all_reduce(self._true_positives)
self._positives = self._sync_all_reduce(self._positives)

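For clarity on which configurations the new condition in compute() covers, a short hedged sketch; the behavior noted in the comments is inferred from the warning and the changed if-statement above.

from ignite.metrics import Precision

# In a distributed run, results are all_reduce'd across workers for every
# configuration except multilabel input with average=False.
p_avg = Precision(average=True)                              # reduced
p_per_class = Precision(average=False)                       # reduced for binary/multiclass input
p_multilabel = Precision(average=False, is_multilabel=True)  # not reduced; warns at init when distributed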
7 changes: 5 additions & 2 deletions ignite/metrics/recall.py
@@ -52,11 +52,14 @@ def thresholded_output_transform(output):
in multiclass case), otherwise, returns a tensor with the precision (for each class in multiclass case).
is_multilabel (bool, optional) flag to use in multilabel case. By default, value is False. If True, average
parameter should be True and the average is computed across samples, instead of classes.
device (str or torch.device): device specification in case of distributed computation usage.
In most cases, it should be defined as "cuda:local_rank".

"""

def __init__(self, output_transform=lambda x: x, average=False, is_multilabel=False):
def __init__(self, output_transform=lambda x: x, average=False, is_multilabel=False, device=None):
super(Recall, self).__init__(output_transform=output_transform,
average=average, is_multilabel=is_multilabel)
average=average, is_multilabel=is_multilabel, device=device)

@reinit_is_reduced
def update(self, output):
4 changes: 2 additions & 2 deletions ignite/metrics/top_k_categorical_accuracy.py
@@ -13,8 +13,8 @@ class TopKCategoricalAccuracy(Metric):

- `update` must receive output of the form `(y_pred, y)`.
"""
def __init__(self, k=5, output_transform=lambda x: x):
super(TopKCategoricalAccuracy, self).__init__(output_transform)
def __init__(self, k=5, output_transform=lambda x: x, device=None):
super(TopKCategoricalAccuracy, self).__init__(output_transform, device=device)
self._k = k

@reinit_is_reduced
7 changes: 6 additions & 1 deletion tests/ignite/conftest.py
@@ -7,11 +7,16 @@
@pytest.fixture()
def local_rank(worker_id):
""" use a different account in each xdist worker """
return int(worker_id.replace("gw", ""))
if "gw" in worker_id:
return int(worker_id.replace("gw", ""))
return worker_id


@pytest.fixture()
def distributed_context_single_node(local_rank):



import os
if "WORLD_SIZE" not in os.environ:
os.environ["WORLD_SIZE"] = "{}".format(torch.cuda.device_count())
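To tie these fixtures to the `-m distributed` run added in .travis.yml, a hedged sketch of what a consuming test could look like; the test body, the marker usage, and the assumption that distributed_context_single_node initializes a CPU-capable process group (e.g. gloo) are illustrative, not taken from this diff.

import pytest
import torch
import torch.distributed as dist

@pytest.mark.distributed
def test_all_reduce_smoke(local_rank, distributed_context_single_node):
    # Each pytest-xdist worker plays one rank; the context fixture is assumed
    # to have called dist.init_process_group() for this process.
    t = torch.tensor([local_rank + 1.0])
    dist.all_reduce(t)
    world_size = dist.get_world_size()
    # Sum of (rank + 1) over all ranks: 1 + 2 + ... + world_size
    assert t.item() == world_size * (world_size + 1) / 2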