[CI] Fix CI issues (pytorch#2084)
vmoens authored Apr 17, 2024
1 parent 8570bd3 commit 730dd45
Showing 8 changed files with 44 additions and 26 deletions.
8 changes: 5 additions & 3 deletions .github/unittest/linux_libs/scripts_habitat/setup_env.sh
@@ -39,9 +39,11 @@ if [ ! -d "${env_dir}" ]; then
conda create --prefix "${env_dir}" -y python="$PYTHON_VERSION"
fi
conda activate "${env_dir}"
#pip3 uninstall cython -y
#pip uninstall cython -y
#conda uninstall cython -y

# set debug variables
conda env config vars set MAGNUM_LOG=debug HABITAT_SIM_LOG=debug
conda deactivate && conda activate "${env_dir}"

pip3 install "cython<3"
conda install -c anaconda cython="<3.0.0" -y

6 changes: 3 additions & 3 deletions .github/workflows/test-linux-habitat.yml
@@ -19,14 +19,14 @@ jobs:
tests:
strategy:
matrix:
python_version: ["3.9"] # "3.8", "3.9", "3.10", "3.11"
cuda_arch_version: ["11.6"] # "11.6", "11.7"
python_version: ["3.9"]
cuda_arch_version: ["12.1"]
fail-fast: false
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
runner: linux.g5.4xlarge.nvidia.gpu
repository: pytorch/rl
docker-image: "nvidia/cuda:12.2.0-devel-ubuntu22.04"
docker-image: "nvidia/cuda:12.1.1-devel-ubuntu22.04"
gpu-arch-type: cuda
gpu-arch-version: ${{ matrix.cuda_arch_version }}
timeout: 90
16 changes: 13 additions & 3 deletions .github/workflows/test-linux-libs.yml
@@ -53,14 +53,16 @@ jobs:
unittests-brax:
strategy:
matrix:
python_version: ["3.9"]
python_version: ["3.11"]
cuda_arch_version: ["12.1"]
if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }}
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
repository: pytorch/rl
runner: "linux.g5.4xlarge.nvidia.gpu"
gpu-arch-type: cuda
gpu-arch-version: "11.7"
docker-image: "pytorch/manylinux-cuda124"
timeout: 120
script: |
if [[ "${{ github.ref }}" =~ release/* ]]; then
@@ -73,7 +75,7 @@
set -euo pipefail
export PYTHON_VERSION="3.9"
export PYTHON_VERSION="3.11"
export CU_VERSION="12.1"
export TAR_OPTIONS="--no-same-owner"
export UPLOAD_CHANNEL="nightly"
@@ -123,7 +125,7 @@ jobs:
matrix:
python_version: ["3.9"]
cuda_arch_version: ["12.1"]
if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Data') }}
if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }}
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
repository: pytorch/rl
@@ -224,12 +226,14 @@ jobs:
matrix:
python_version: ["3.9"]
cuda_arch_version: ["12.1"]
if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }}
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
repository: pytorch/rl
runner: "linux.g5.4xlarge.nvidia.gpu"
gpu-arch-type: cuda
gpu-arch-version: "11.7"
docker-image: "pytorch/manylinux-cuda124"
timeout: 120
script: |
if [[ "${{ github.ref }}" =~ release/* ]]; then
@@ -324,12 +328,14 @@ jobs:
bash .github/unittest/linux_libs/scripts_openx/post_process.sh
unittests-pettingzoo:
if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }}
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
repository: pytorch/rl
runner: "linux.g5.4xlarge.nvidia.gpu"
gpu-arch-type: cuda
gpu-arch-version: "11.7"
docker-image: "pytorch/manylinux-cuda124"
timeout: 120
script: |
if [[ "${{ github.ref }}" =~ release/* ]]; then
@@ -360,6 +366,7 @@ jobs:
matrix:
python_version: ["3.9"]
cuda_arch_version: ["12.1"]
if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }}
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
repository: pytorch/rl
@@ -468,6 +475,7 @@ jobs:
runner: "linux.g5.4xlarge.nvidia.gpu"
gpu-arch-type: cuda
gpu-arch-version: "11.7"
docker-image: "pytorch/manylinux-cuda124"
timeout: 120
script: |
if [[ "${{ github.ref }}" =~ release/* ]]; then
@@ -532,12 +540,14 @@ jobs:
matrix:
python_version: ["3.9"]
cuda_arch_version: ["12.1"]
if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }}
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
repository: pytorch/rl
runner: "linux.g5.4xlarge.nvidia.gpu"
gpu-arch-type: cuda
gpu-arch-version: "11.7"
docker-image: "pytorch/manylinux-cuda124"
timeout: 120
script: |
if [[ "${{ github.ref }}" =~ release/* ]]; then
10 changes: 7 additions & 3 deletions test/test_env.py
@@ -115,6 +115,10 @@

IS_OSX = platform == "darwin"
IS_WIN = platform == "win32"
if IS_WIN:
mp_ctx = "spawn"
else:
mp_ctx = "fork"

## TO BE FIXED: DiscreteActionProjection queries a randint on each worker, which leads to divergent results between
## the serial and parallel batched envs
@@ -463,7 +467,7 @@ def test_parallel_devices(
env.shared_tensordict_parent.device.type == torch.device(edevice).type
)

@pytest.mark.parametrize("start_method", [None, "fork"])
@pytest.mark.parametrize("start_method", [None, mp_ctx])
def test_serial_for_single(self, maybe_fork_ParallelEnv, start_method):
env = ParallelEnv(
1,
@@ -2959,7 +2963,7 @@ def test_auto_reset_parallel(self):
env = ParallelEnv(
2,
functools.partial(AutoResettingCountingEnv, 4, auto_reset=True),
mp_start_method="fork",
mp_start_method=mp_ctx,
)
r = env.rollout(20, policy, break_when_any_done=False)
assert r.shape == torch.Size([2, 20])
@@ -2982,7 +2986,7 @@ def test_auto_reset_parallel_hetero(self):
functools.partial(AutoResettingCountingEnv, 4, auto_reset=True),
functools.partial(AutoResettingCountingEnv, 5, auto_reset=True),
],
mp_start_method="fork",
mp_start_method=mp_ctx,
)
r = env.rollout(20, policy, break_when_any_done=False)
assert r.shape == torch.Size([2, 20])
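
Note on the mp_ctx change above: "fork" is not available on Windows, so the tests now pick the multiprocessing start method per platform instead of hard-coding "fork". A minimal sketch of the selection and its intended use (the ParallelEnv call is illustrative only, mirroring the parametrized tests in this diff):

    from sys import platform

    # "fork" only exists on POSIX systems; Windows provides "spawn" only.
    mp_ctx = "spawn" if platform == "win32" else "fork"

    # Illustrative usage, as in the tests above:
    # env = ParallelEnv(2, make_env, mp_start_method=mp_ctx)
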
4 changes: 3 additions & 1 deletion test/test_modules.py
@@ -9,7 +9,8 @@
import numpy as np
import pytest
import torch
from _utils_internal import get_default_devices

from _utils_internal import get_default_devices, retry
from mocking_classes import MockBatchedUnLockedEnv
from packaging import version
from tensordict import TensorDict
@@ -890,6 +891,7 @@ def _get_mock_input_td(
)
return td

@retry(AssertionError, 3)
@pytest.mark.parametrize("n_agents", [1, 3])
@pytest.mark.parametrize("share_params", [True, False])
@pytest.mark.parametrize("centralised", [True, False])
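
The new @retry(AssertionError, 3) marker comes from the test helpers in _utils_internal; its implementation is not part of this diff. From the usage it presumably re-runs a flaky test a few times before failing. A hedged sketch of such a decorator (hypothetical, for illustration only; the real helper may differ):

    import functools

    def retry(exc_type, num_tries):
        # Re-run the wrapped test up to `num_tries` times if `exc_type` is raised,
        # re-raising the exception on the final failed attempt.
        def decorator(fn):
            @functools.wraps(fn)
            def wrapper(*args, **kwargs):
                for attempt in range(num_tries):
                    try:
                        return fn(*args, **kwargs)
                    except exc_type:
                        if attempt == num_tries - 1:
                            raise
            return wrapper
        return decorator
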
2 changes: 0 additions & 2 deletions test/test_rb.py
@@ -2794,8 +2794,6 @@ def test_rb_multidim_collector(
if transform is not None:
assert s.ndim == 2
except Exception:
print(f"Failing at iter {i}") # noqa: T201
print(f"rb {rb}") # noqa: T201
raise

@pytest.mark.parametrize("strict_length", [True, False])
15 changes: 11 additions & 4 deletions test/test_transforms.py
@@ -13,6 +13,7 @@
import sys
from copy import copy
from functools import partial
from sys import platform

import numpy as np
import pytest
@@ -125,6 +126,12 @@
from torchrl.envs.utils import check_env_specs, step_mdp
from torchrl.modules import GRUModule, LSTMModule, MLP, ProbabilisticActor, TanhNormal

IS_WIN = platform == "win32"
if IS_WIN:
mp_ctx = "spawn"
else:
mp_ctx = "fork"

TIMEOUT = 100.0

_has_gymnasium = importlib.util.find_spec("gymnasium") is not None
@@ -9404,7 +9411,7 @@ def make_env():
env = ParallelEnv(
2,
make_env,
mp_start_method="fork" if not torch.cuda.is_available() else "spawn",
mp_start_method=mp_ctx if not torch.cuda.is_available() else "spawn",
)
assert env.device is None
try:
@@ -9447,7 +9454,7 @@ def make_env():
ParallelEnv(
2,
make_env,
mp_start_method="fork" if not torch.cuda.is_available() else "spawn",
mp_start_method=mp_ctx if not torch.cuda.is_available() else "spawn",
),
DeviceCastTransform(
"cpu:1",
@@ -10696,7 +10703,7 @@ def make_env(stateless=stateless, reshape_fn=reshape_fn):
assert env.batch_size == expected_batch_size
return env

env = ParallelEnv(2, make_env, mp_start_method="fork")
env = ParallelEnv(2, make_env, mp_start_method=mp_ctx)
assert env.batch_size == (2, *make_env().batch_size)
check_env_specs(env)

@@ -10751,7 +10758,7 @@ def make_env(stateless=stateless, reshape_fn=reshape_fn):
assert transform.batch_size is None

env = TransformedEnv(
ParallelEnv(2, make_env, mp_start_method="fork"), transform
ParallelEnv(2, make_env, mp_start_method=mp_ctx), transform
)
assert env.batch_size == expected_batch_size
check_env_specs(env)
9 changes: 2 additions & 7 deletions torchrl/data/replay_buffers/samplers.py
@@ -1572,10 +1572,6 @@ def __init__(
)

def __repr__(self):
if self._sample_list is not None:
perc = len(self._sample_list) / self.len_storage * 100
else:
perc = 0.0
return (
f"{self.__class__.__name__}("
f"num_slices={self.num_slices}, "
@@ -1586,8 +1582,7 @@ def __repr__(self):
f"strict_length={self.strict_length},"
f"alpha={self._alpha}, "
f"beta={self._beta}, "
f"eps={self._eps},"
f"{perc: 4.4f}% filled)"
f"eps={self._eps}"
)

def __getstate__(self):
@@ -1726,7 +1721,7 @@ def sample(self, storage: Storage, batch_size: int) -> Tuple[torch.Tensor, dict]
terminated = torch.zeros_like(truncated)
if traj_terminated.any():
if isinstance(seq_length, int):
terminated.view(num_slices, -1)[:, traj_terminated] = 1
terminated.view(num_slices, -1)[traj_terminated, -1] = 1
else:
terminated[(seq_length.cumsum(0) - 1)[traj_terminated]] = 1
truncated = truncated & ~terminated
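
The sample() fix above changes which entries of the flat terminated mask are set: viewed as (num_slices, seq_length), the flag belongs on the final step of each slice whose source trajectory ended, whereas the previous indexing applied the per-slice mask along the wrong (time) axis. A toy reproduction of the corrected indexing (shapes and names are assumptions for illustration, not the sampler's real internals):

    import torch

    num_slices, seq_length = 3, 4
    terminated = torch.zeros(num_slices * seq_length, 1, dtype=torch.bool)
    traj_terminated = torch.tensor([True, False, True])  # slices whose trajectory ended

    # Mark only the last step of each terminated slice.
    terminated.view(num_slices, -1)[traj_terminated, -1] = 1
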
