[CI] Fix CI issues (pytorch#2084)
vmoens authored Apr 17, 2024
1 parent 8570bd3 commit 730dd45
Showing 8 changed files with 44 additions and 26 deletions.
8 changes: 5 additions & 3 deletions .github/unittest/linux_libs/scripts_habitat/setup_env.sh
@@ -39,9 +39,11 @@ if [ ! -d "${env_dir}" ]; then
conda create --prefix "${env_dir}" -y python="$PYTHON_VERSION"
fi
conda activate "${env_dir}"
#pip3 uninstall cython -y
#pip uninstall cython -y
#conda uninstall cython -y

# set debug variables
conda env config vars set MAGNUM_LOG=debug HABITAT_SIM_LOG=debug
conda deactivate && conda activate "${env_dir}"

pip3 install "cython<3"
conda install -c anaconda cython="<3.0.0" -y

6 changes: 3 additions & 3 deletions .github/workflows/test-linux-habitat.yml
@@ -19,14 +19,14 @@ jobs:
tests:
strategy:
matrix:
python_version: ["3.9"] # "3.8", "3.9", "3.10", "3.11"
cuda_arch_version: ["11.6"] # "11.6", "11.7"
python_version: ["3.9"]
cuda_arch_version: ["12.1"]
fail-fast: false
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
runner: linux.g5.4xlarge.nvidia.gpu
repository: pytorch/rl
docker-image: "nvidia/cuda:12.2.0-devel-ubuntu22.04"
docker-image: "nvidia/cuda:12.1.1-devel-ubuntu22.04"
gpu-arch-type: cuda
gpu-arch-version: ${{ matrix.cuda_arch_version }}
timeout: 90
16 changes: 13 additions & 3 deletions .github/workflows/test-linux-libs.yml
@@ -53,14 +53,16 @@ jobs:
unittests-brax:
strategy:
matrix:
python_version: ["3.9"]
python_version: ["3.11"]
cuda_arch_version: ["12.1"]
if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }}
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
repository: pytorch/rl
runner: "linux.g5.4xlarge.nvidia.gpu"
gpu-arch-type: cuda
gpu-arch-version: "11.7"
docker-image: "pytorch/manylinux-cuda124"
timeout: 120
script: |
if [[ "${{ github.ref }}" =~ release/* ]]; then
@@ -73,7 +75,7 @@
set -euo pipefail
export PYTHON_VERSION="3.9"
export PYTHON_VERSION="3.11"
export CU_VERSION="12.1"
export TAR_OPTIONS="--no-same-owner"
export UPLOAD_CHANNEL="nightly"
@@ -123,7 +125,7 @@ jobs:
matrix:
python_version: ["3.9"]
cuda_arch_version: ["12.1"]
if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Data') }}
if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }}
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
repository: pytorch/rl
@@ -224,12 +226,14 @@ jobs:
matrix:
python_version: ["3.9"]
cuda_arch_version: ["12.1"]
if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }}
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
repository: pytorch/rl
runner: "linux.g5.4xlarge.nvidia.gpu"
gpu-arch-type: cuda
gpu-arch-version: "11.7"
docker-image: "pytorch/manylinux-cuda124"
timeout: 120
script: |
if [[ "${{ github.ref }}" =~ release/* ]]; then
@@ -324,12 +328,14 @@ jobs:
bash .github/unittest/linux_libs/scripts_openx/post_process.sh
unittests-pettingzoo:
if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }}
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
repository: pytorch/rl
runner: "linux.g5.4xlarge.nvidia.gpu"
gpu-arch-type: cuda
gpu-arch-version: "11.7"
docker-image: "pytorch/manylinux-cuda124"
timeout: 120
script: |
if [[ "${{ github.ref }}" =~ release/* ]]; then
@@ -360,6 +366,7 @@ jobs:
matrix:
python_version: ["3.9"]
cuda_arch_version: ["12.1"]
if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }}
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
repository: pytorch/rl
@@ -468,6 +475,7 @@ jobs:
runner: "linux.g5.4xlarge.nvidia.gpu"
gpu-arch-type: cuda
gpu-arch-version: "11.7"
docker-image: "pytorch/manylinux-cuda124"
timeout: 120
script: |
if [[ "${{ github.ref }}" =~ release/* ]]; then
@@ -532,12 +540,14 @@ jobs:
matrix:
python_version: ["3.9"]
cuda_arch_version: ["12.1"]
if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }}
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
repository: pytorch/rl
runner: "linux.g5.4xlarge.nvidia.gpu"
gpu-arch-type: cuda
gpu-arch-version: "11.7"
docker-image: "pytorch/manylinux-cuda124"
timeout: 120
script: |
if [[ "${{ github.ref }}" =~ release/* ]]; then
10 changes: 7 additions & 3 deletions test/test_env.py
@@ -115,6 +115,10 @@

IS_OSX = platform == "darwin"
IS_WIN = platform == "win32"
if IS_WIN:
mp_ctx = "spawn"
else:
mp_ctx = "fork"

## TO BE FIXED: DiscreteActionProjection queries a randint on each worker, which leads to divergent results between
## the serial and parallel batched envs
@@ -463,7 +467,7 @@ def test_parallel_devices(
env.shared_tensordict_parent.device.type == torch.device(edevice).type
)

@pytest.mark.parametrize("start_method", [None, "fork"])
@pytest.mark.parametrize("start_method", [None, mp_ctx])
def test_serial_for_single(self, maybe_fork_ParallelEnv, start_method):
env = ParallelEnv(
1,
@@ -2959,7 +2963,7 @@ def test_auto_reset_parallel(self):
env = ParallelEnv(
2,
functools.partial(AutoResettingCountingEnv, 4, auto_reset=True),
mp_start_method="fork",
mp_start_method=mp_ctx,
)
r = env.rollout(20, policy, break_when_any_done=False)
assert r.shape == torch.Size([2, 20])
@@ -2982,7 +2986,7 @@ def test_auto_reset_parallel_hetero(self):
functools.partial(AutoResettingCountingEnv, 4, auto_reset=True),
functools.partial(AutoResettingCountingEnv, 5, auto_reset=True),
],
mp_start_method="fork",
mp_start_method=mp_ctx,
)
r = env.rollout(20, policy, break_when_any_done=False)
assert r.shape == torch.Size([2, 20])
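
Note on the mp_ctx change above: "fork" is not available on Windows, so the tests now pick the multiprocessing start method per platform instead of hard-coding "fork". A minimal sketch of the selection and its intended use (the ParallelEnv call is illustrative only, mirroring the parametrized tests in this diff):

    from sys import platform

    # "fork" only exists on POSIX systems; Windows provides "spawn" only.
    mp_ctx = "spawn" if platform == "win32" else "fork"

    # Illustrative usage, as in the tests above:
    # env = ParallelEnv(2, make_env, mp_start_method=mp_ctx)
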
4 changes: 3 additions & 1 deletion test/test_modules.py
@@ -9,7 +9,8 @@
import numpy as np
import pytest
import torch
from _utils_internal import get_default_devices

from _utils_internal import get_default_devices, retry
from mocking_classes import MockBatchedUnLockedEnv
from packaging import version
from tensordict import TensorDict
@@ -890,6 +891,7 @@ def _get_mock_input_td(
)
return td

@retry(AssertionError, 3)
@pytest.mark.parametrize("n_agents", [1, 3])
@pytest.mark.parametrize("share_params", [True, False])
@pytest.mark.parametrize("centralised", [True, False])
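
The new @retry(AssertionError, 3) marker comes from the test helpers in _utils_internal; its implementation is not part of this diff. From the usage it presumably re-runs a flaky test a few times before failing. A hedged sketch of such a decorator (hypothetical, for illustration only; the real helper may differ):

    import functools

    def retry(exc_type, num_tries):
        # Re-run the wrapped test up to `num_tries` times if `exc_type` is raised,
        # re-raising the exception on the final failed attempt.
        def decorator(fn):
            @functools.wraps(fn)
            def wrapper(*args, **kwargs):
                for attempt in range(num_tries):
                    try:
                        return fn(*args, **kwargs)
                    except exc_type:
                        if attempt == num_tries - 1:
                            raise
            return wrapper
        return decorator
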
2 changes: 0 additions & 2 deletions test/test_rb.py
@@ -2794,8 +2794,6 @@ def test_rb_multidim_collector(
if transform is not None:
assert s.ndim == 2
except Exception:
print(f"Failing at iter {i}") # noqa: T201
print(f"rb {rb}") # noqa: T201
raise

@pytest.mark.parametrize("strict_length", [True, False])
15 changes: 11 additions & 4 deletions test/test_transforms.py
@@ -13,6 +13,7 @@
import sys
from copy import copy
from functools import partial
from sys import platform

import numpy as np
import pytest
@@ -125,6 +126,12 @@
from torchrl.envs.utils import check_env_specs, step_mdp
from torchrl.modules import GRUModule, LSTMModule, MLP, ProbabilisticActor, TanhNormal

IS_WIN = platform == "win32"
if IS_WIN:
mp_ctx = "spawn"
else:
mp_ctx = "fork"

TIMEOUT = 100.0

_has_gymnasium = importlib.util.find_spec("gymnasium") is not None
@@ -9404,7 +9411,7 @@ def make_env():
env = ParallelEnv(
2,
make_env,
mp_start_method="fork" if not torch.cuda.is_available() else "spawn",
mp_start_method=mp_ctx if not torch.cuda.is_available() else "spawn",
)
assert env.device is None
try:
@@ -9447,7 +9454,7 @@ def make_env():
ParallelEnv(
2,
make_env,
mp_start_method="fork" if not torch.cuda.is_available() else "spawn",
mp_start_method=mp_ctx if not torch.cuda.is_available() else "spawn",
),
DeviceCastTransform(
"cpu:1",
@@ -10696,7 +10703,7 @@ def make_env(stateless=stateless, reshape_fn=reshape_fn):
assert env.batch_size == expected_batch_size
return env

env = ParallelEnv(2, make_env, mp_start_method="fork")
env = ParallelEnv(2, make_env, mp_start_method=mp_ctx)
assert env.batch_size == (2, *make_env().batch_size)
check_env_specs(env)

@@ -10751,7 +10758,7 @@ def make_env(stateless=stateless, reshape_fn=reshape_fn):
assert transform.batch_size is None

env = TransformedEnv(
ParallelEnv(2, make_env, mp_start_method="fork"), transform
ParallelEnv(2, make_env, mp_start_method=mp_ctx), transform
)
assert env.batch_size == expected_batch_size
check_env_specs(env)
9 changes: 2 additions & 7 deletions torchrl/data/replay_buffers/samplers.py
@@ -1572,10 +1572,6 @@ def __init__(
)

def __repr__(self):
if self._sample_list is not None:
perc = len(self._sample_list) / self.len_storage * 100
else:
perc = 0.0
return (
f"{self.__class__.__name__}("
f"num_slices={self.num_slices}, "
@@ -1586,8 +1582,7 @@ def __repr__(self):
f"strict_length={self.strict_length},"
f"alpha={self._alpha}, "
f"beta={self._beta}, "
f"eps={self._eps},"
f"{perc: 4.4f}% filled)"
f"eps={self._eps}"
)

def __getstate__(self):
@@ -1726,7 +1721,7 @@ def sample(self, storage: Storage, batch_size: int) -> Tuple[torch.Tensor, dict]
terminated = torch.zeros_like(truncated)
if traj_terminated.any():
if isinstance(seq_length, int):
terminated.view(num_slices, -1)[:, traj_terminated] = 1
terminated.view(num_slices, -1)[traj_terminated, -1] = 1
else:
terminated[(seq_length.cumsum(0) - 1)[traj_terminated]] = 1
truncated = truncated & ~terminated
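
The sample() fix above changes which entries of the flat terminated mask are set: viewed as (num_slices, seq_length), the flag belongs on the final step of each slice whose source trajectory ended, whereas the previous indexing applied the per-slice mask along the wrong (time) axis. A toy reproduction of the corrected indexing (shapes and names are assumptions for illustration, not the sampler's real internals):

    import torch

    num_slices, seq_length = 3, 4
    terminated = torch.zeros(num_slices * seq_length, 1, dtype=torch.bool)
    traj_terminated = torch.tensor([True, False, True])  # slices whose trajectory ended

    # Mark only the last step of each terminated slice.
    terminated.view(num_slices, -1)[traj_terminated, -1] = 1
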
