[CI] Reduce CI time (pytorch#1226)

Co-authored-by: BY571 <sebastian.dittert@gmx.de>
ai4co · Jun 7, 2023 · commit e1d1874
1 parent 76bb23a

Showing 34 changed files with 420 additions and 325 deletions.
diff --git a/.circleci/unittest/linux/scripts/run_test.sh b/.circleci/unittest/linux/scripts/run_test.sh
@@ -19,8 +19,8 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$lib_dir
 export MKL_THREADING_LAYER=GNU
 export CKPT_BACKEND=torch
 
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test.py -v --durations 20
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test_deps.py -v --durations 20 -k 'test_gym or test_dm_control_pixels or test_dm_control or test_tb'
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest --instafail -v --durations 20 --ignore test/test_distributed.py
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test.py -v --durations 200
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test_deps.py -v --durations 200 -k 'test_gym or test_dm_control_pixels or test_dm_control or test_tb'
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest --instafail -v --durations 200 --ignore test/test_distributed.py
 coverage combine
 coverage xml -i
diff --git a/.circleci/unittest/linux_distributed/scripts/run_test.sh b/.circleci/unittest/linux_distributed/scripts/run_test.sh
@@ -19,8 +19,8 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$lib_dir
 export MKL_THREADING_LAYER=GNU
 export CKPT_BACKEND=torch
 
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test.py -v --durations 20
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test_deps.py -v --durations 20 -k 'test_gym or test_dm_control_pixels or test_dm_control or test_tb'
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/test_distributed.py --instafail -v --durations 20
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test.py -v --durations 200
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test_deps.py -v --durations 200 -k 'test_gym or test_dm_control_pixels or test_dm_control or test_tb'
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/test_distributed.py --instafail -v --durations 200
 coverage combine
 coverage xml -i
diff --git a/.circleci/unittest/linux_examples/scripts/run_test.sh b/.circleci/unittest/linux_examples/scripts/run_test.sh
@@ -25,8 +25,8 @@ lib_dir="${env_dir}/lib"
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$lib_dir
 export MKL_THREADING_LAYER=GNU
 
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test.py -v --durations 20
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test_deps.py -v --durations 20
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test.py -v --durations 200
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test_deps.py -v --durations 200
 
 # With batched environments
 python .circleci/unittest/helpers/coverage_run_parallel.py examples/ppo/ppo.py \

diff --git a/.circleci/unittest/linux_libs/scripts_brax/run_test.sh b/.circleci/unittest/linux_libs/scripts_brax/run_test.sh
@@ -28,6 +28,6 @@ export MAGNUM_LOG=verbose MAGNUM_GPU_VALIDATION=ON
 # this workflow only tests the libs
 python -c "import brax"
 
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/test_libs.py --instafail -v --durations 20 --capture no -k TestBrax --error-for-skips
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/test_libs.py --instafail -v --durations 200 --capture no -k TestBrax --error-for-skips
 coverage combine
 coverage xml -i
diff --git a/.circleci/unittest/linux_libs/scripts_d4rl/run_test.sh b/.circleci/unittest/linux_libs/scripts_d4rl/run_test.sh
@@ -37,6 +37,6 @@ conda deactivate && conda activate ./env
 # this workflow only tests the libs
 python -c "import gym, d4rl"
 
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/test_libs.py --instafail -v --durations 20 --capture no -k TestD4RL --error-for-skips
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/test_libs.py --instafail -v --durations 200 --capture no -k TestD4RL --error-for-skips
 coverage combine
 coverage xml -i
diff --git a/.circleci/unittest/linux_libs/scripts_envpool/run_test.sh b/.circleci/unittest/linux_libs/scripts_envpool/run_test.sh
@@ -27,6 +27,6 @@ export MKL_THREADING_LAYER=GNU
 # this workflow only tests the libs
 python -c "import envpool"
 
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/test_libs.py --instafail -v --durations 20 --capture no -k TestEnvPool --error-for-skips
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/test_libs.py --instafail -v --durations 200 --capture no -k TestEnvPool --error-for-skips
 coverage combine
 coverage xml -i
diff --git a/.circleci/unittest/linux_libs/scripts_gym/run_test.sh b/.circleci/unittest/linux_libs/scripts_gym/run_test.sh
@@ -17,11 +17,11 @@ lib_dir="${env_dir}/lib"
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$lib_dir
 export MKL_THREADING_LAYER=GNU
 
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test.py -v --durations 20
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test_deps.py -v --durations 20 -k 'test_gym'
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test.py -v --durations 200
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test_deps.py -v --durations 200 -k 'test_gym'
 
 export DISPLAY=':99.0'
 Xvfb :99 -screen 0 1400x900x24 > /dev/null 2>&1 &
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/test_libs.py --instafail -v --durations 20 -k "gym" --error-for-skips
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/test_libs.py --instafail -v --durations 200 -k "gym" --error-for-skips
 coverage combine
 coverage xml -i
diff --git a/.circleci/unittest/linux_libs/scripts_habitat/run_test.sh b/.circleci/unittest/linux_libs/scripts_habitat/run_test.sh
@@ -41,6 +41,6 @@ env = HabitatEnv('HabitatRenderPick-v0')
 env.reset()
 """
 
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/test_libs.py --instafail -v --durations 20 --capture no -k TestHabitat --error-for-skips
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/test_libs.py --instafail -v --durations 200 --capture no -k TestHabitat --error-for-skips
 coverage combine
 coverage xml -i
diff --git a/.circleci/unittest/linux_libs/scripts_jumanji/run_test.sh b/.circleci/unittest/linux_libs/scripts_jumanji/run_test.sh
@@ -28,6 +28,6 @@ export MAGNUM_LOG=verbose MAGNUM_GPU_VALIDATION=ON
 # this workflow only tests the libs
 python -c "import jumanji"
 
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/test_libs.py --instafail -v --durations 20 --capture no -k TestJumanji --error-for-skips
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/test_libs.py --instafail -v --durations 200 --capture no -k TestJumanji --error-for-skips
 coverage combine
 coverage xml -i
diff --git a/.circleci/unittest/linux_libs/scripts_sklearn/run_test.sh b/.circleci/unittest/linux_libs/scripts_sklearn/run_test.sh
@@ -22,6 +22,6 @@ conda deactivate && conda activate ./env
 # this workflow only tests the libs
 python -c "import sklearn, pandas"
 
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/test_libs.py --instafail -v --durations 20 --capture no -k TestOpenML --error-for-skips
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/test_libs.py --instafail -v --durations 200 --capture no -k TestOpenML --error-for-skips
 coverage combine
 coverage xml -i
diff --git a/.circleci/unittest/linux_libs/scripts_vmas/run_test.sh b/.circleci/unittest/linux_libs/scripts_vmas/run_test.sh
@@ -25,6 +25,6 @@ export MAGNUM_LOG=verbose MAGNUM_GPU_VALIDATION=ON
 # this workflow only tests the libs
 python -c "import vmas"
 
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/test_libs.py --instafail -v --durations 20 --capture no -k TestVmas --error-for-skips
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/test_libs.py --instafail -v --durations 200 --capture no -k TestVmas --error-for-skips
 coverage combine
 coverage xml -i
diff --git a/.circleci/unittest/linux_olddeps/scripts_gym_0_13/run_test.sh b/.circleci/unittest/linux_olddeps/scripts_gym_0_13/run_test.sh
@@ -17,13 +17,13 @@ lib_dir="${env_dir}/lib"
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$lib_dir
 export MKL_THREADING_LAYER=GNU
 
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test.py -v --durations 20
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test_deps.py -v --durations 20 -k 'test_gym or test_dm_control_pixels or test_dm_control'
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test.py -v --durations 200
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test_deps.py -v --durations 200 -k 'test_gym or test_dm_control_pixels or test_dm_control'
 
 export DISPLAY=':99.0'
 Xvfb :99 -screen 0 1400x900x24 > /dev/null 2>&1 &
-CKPT_BACKEND=torch MUJOCO_GL=egl python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest --instafail -v --durations 20 --ignore test/test_distributed.py
-#pytest --instafail -v --durations 20
+CKPT_BACKEND=torch MUJOCO_GL=egl python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest --instafail -v --durations 200 --ignore test/test_distributed.py
+#pytest --instafail -v --durations 200
 #python test/test_libs.py
 coverage combine
 coverage xml -i
diff --git a/.circleci/unittest/linux_optdeps/scripts/run_test.sh b/.circleci/unittest/linux_optdeps/scripts/run_test.sh
@@ -17,7 +17,7 @@ root_dir="$(git rev-parse --show-toplevel)"
 export MKL_THREADING_LAYER=GNU
 export CKPT_BACKEND=torch
 
-#MUJOCO_GL=glfw pytest --cov=torchrl --junitxml=test-results/junit.xml -v --durations 20
-MUJOCO_GL=egl python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest --instafail -v --durations 20 --ignore test/test_distributed.py
+#MUJOCO_GL=glfw pytest --cov=torchrl --junitxml=test-results/junit.xml -v --durations 200
+MUJOCO_GL=egl python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest --instafail -v --durations 200 --ignore test/test_distributed.py
 coverage combine
 coverage xml -i
diff --git a/.circleci/unittest/linux_stable/scripts/run_test.sh b/.circleci/unittest/linux_stable/scripts/run_test.sh
@@ -19,8 +19,8 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$lib_dir
 export MKL_THREADING_LAYER=GNU
 export CKPT_BACKEND=torch
 
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test.py -v --durations 20
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test_deps.py -v --durations 20 -k 'test_gym or test_dm_control_pixels or test_dm_control or test_tb'
-python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest --instafail -v --durations 20 --ignore test/test_distributed.py
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test.py -v --durations 200
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest test/smoke_test_deps.py -v --durations 200 -k 'test_gym or test_dm_control_pixels or test_dm_control or test_tb'
+python .circleci/unittest/helpers/coverage_run_parallel.py -m pytest --instafail -v --durations 200 --ignore test/test_distributed.py
 coverage combine
 coverage xml -i
diff --git a/.circleci/unittest/windows_optdepts/scripts/run_test.sh b/.circleci/unittest/windows_optdepts/scripts/run_test.sh
@@ -9,4 +9,4 @@ this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 source "$this_dir/set_cuda_envs.sh"
 
 python -m torch.utils.collect_env
-pytest --junitxml=test-results/junit.xml -v --durations 20  --ignore test/test_distributed.py
+pytest --junitxml=test-results/junit.xml -v --durations 200  --ignore test/test_distributed.py
diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml
@@ -148,7 +148,7 @@ jobs:
           python -m torch.utils.collect_env
           python -c "import torchrl; print(torchrl.__version__);from torchrl.data import ReplayBuffer"
           EXIT_STATUS=0
-          pytest test/smoke_test.py -v --durations 20
+          pytest test/smoke_test.py -v --durations 200
           exit $EXIT_STATUS
 
   upload-wheel-linux:
@@ -270,7 +270,7 @@ jobs:
           python3 -m torch.utils.collect_env
           python3 -c "import torchrl; print(torchrl.__version__);from torchrl.data import ReplayBuffer"
           EXIT_STATUS=0
-          pytest test/smoke_test.py -v --durations 20
+          pytest test/smoke_test.py -v --durations 200
           exit $EXIT_STATUS
 
   build-wheel-windows:
@@ -359,7 +359,7 @@ jobs:
           python -m torch.utils.collect_env
           python -c "import torchrl; print(torchrl.__version__);from torchrl.data import ReplayBuffer"
           EXIT_STATUS=0
-          pytest test/smoke_test.py -v --durations 20
+          pytest test/smoke_test.py -v --durations 200
           exit $EXIT_STATUS
 
   upload-wheel-windows:

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
@@ -165,7 +165,7 @@ jobs:
           python -m torch.utils.collect_env
           python -c "import torchrl; print(torchrl.__version__)"
           EXIT_STATUS=0
-          pytest test/smoke_test.py -v --durations 20
+          pytest test/smoke_test.py -v --durations 200
           exit $EXIT_STATUS
 
   test-wheel-windows:
@@ -221,5 +221,5 @@ jobs:
           python -m torch.utils.collect_env
           python -c "import torchrl; print(torchrl.__version__)"
           EXIT_STATUS=0
-          pytest test/smoke_test.py -v --durations 20
+          pytest test/smoke_test.py -v --durations 200
           exit $EXIT_STATUS
diff --git a/test/_utils_internal.py b/test/_utils_internal.py
@@ -87,6 +87,17 @@ def get_available_devices():
     return devices
 
 
+def get_default_devices():
+    num_cuda = torch.cuda.device_count()
+    if num_cuda == 0:
+        return [torch.device("cpu")]
+    elif num_cuda == 1:
+        return [torch.device("cuda:0")]
+    else:
+        # more than one CUDA device: defer to get_available_devices()
+        return get_available_devices()
+
+
 def generate_seeds(seed, repeat):
     seeds = [seed]
     for _ in range(repeat - 1):

diff --git a/test/test_actors.py b/test/test_actors.py
@@ -7,7 +7,7 @@
 import pytest
 import torch
 
-from _utils_internal import get_available_devices
+from _utils_internal import get_default_devices
 from tensordict import TensorDict
 from tensordict.nn import TensorDictModule
 from torch import nn
@@ -408,7 +408,7 @@ def test_qvalue_hook_categorical_1_dim_batch(self, action_space, expected_action
         assert (values == in_values).all()
 
 
-@pytest.mark.parametrize("device", get_available_devices())
+@pytest.mark.parametrize("device", get_default_devices())
 def test_value_based_policy(device):
     torch.manual_seed(0)
     obs_dim = 4
@@ -481,7 +481,7 @@ def test_qvalactor_construct(
     QValueActor(**kwargs)
 
 
-@pytest.mark.parametrize("device", get_available_devices())
+@pytest.mark.parametrize("device", get_default_devices())
 def test_value_based_policy_categorical(device):
     torch.manual_seed(0)
     obs_dim = 4
@@ -512,7 +512,7 @@ def make_net():
     assert (0 <= action).all() and (action < action_dim).all()
 
 
-@pytest.mark.parametrize("device", get_available_devices())
+@pytest.mark.parametrize("device", get_default_devices())
 def test_actorcritic(device):
     common_module = SafeModule(
         module=nn.Linear(3, 4), in_keys=["obs"], out_keys=["hidden"], spec=None

diff --git a/test/test_collector.py b/test/test_collector.py
@@ -153,7 +153,7 @@ def _is_consistent_device_type(
     _os_is_windows and _python_is_3_10,
     reason="Windows Access Violation in torch.multiprocessing / BrokenPipeError in multiprocessing.connection",
 )
-@pytest.mark.parametrize("num_env", [1, 2])
+@pytest.mark.parametrize("num_env", [2])
 @pytest.mark.parametrize("device", ["cuda", "cpu", None])
 @pytest.mark.parametrize("policy_device", ["cuda", "cpu", None])
 @pytest.mark.parametrize("storing_device", ["cuda", "cpu", None])
@@ -185,7 +185,9 @@ def env_fn(seed):
     else:
 
         def env_fn(seed):
-            env = ParallelEnv(
+            # 1226: SerialEnv replaces ParallelEnv here
+            # for faster execution
+            env = SerialEnv(
                 num_workers=num_env,
                 create_env_fn=make_make_env("vec"),
                 create_env_kwargs=[{"seed": i} for i in range(seed, seed + num_env)],
@@ -476,8 +478,16 @@ def make_env(seed):
 
 
 @pytest.mark.parametrize("num_env", [1, 2])
-@pytest.mark.parametrize("env_name", ["vec", "conv"])
-def test_collector_batch_size(num_env, env_name, seed=100):
+@pytest.mark.parametrize(
+    "env_name",
+    [
+        "vec",
+    ],
+)  # 1226: for efficiency, we just test vec, not "conv"
+def test_collector_batch_size(
+    num_env, env_name, seed=100, num_workers=2, frames_per_batch=20
+):
+    """Tests that there are 'frames_per_batch' frames in each batch of a collection."""
     if num_env == 3 and _os_is_windows:
         pytest.skip("Test timeout (> 10 min) on CI pipeline Windows machine with GPU")
     if num_env == 1:
@@ -489,17 +499,16 @@ def env_fn():
     else:
 
         def env_fn():
-            env = ParallelEnv(
-                num_workers=num_env, create_env_fn=make_make_env(env_name)
-            )
+            # 1226: For efficiency, we don't use Parallel but Serial
+            # env = ParallelEnv(
+            env = SerialEnv(num_workers=num_env, create_env_fn=make_make_env(env_name))
             return env
 
     policy = make_policy(env_name)
 
     torch.manual_seed(0)
     np.random.seed(0)
-    num_workers = 2
-    frames_per_batch = 20
+
     ccollector = MultiaSyncDataCollector(
         create_env_fn=[env_fn for _ in range(num_workers)],
         policy=policy,
@@ -644,8 +653,13 @@ def env_fn(seed):
 
 
 @pytest.mark.parametrize("num_env", [1, 2])
-@pytest.mark.parametrize("collector_class", [SyncDataCollector, aSyncDataCollector])
-@pytest.mark.parametrize("env_name", ["conv", "vec"])
+@pytest.mark.parametrize(
+    "collector_class",
+    [
+        SyncDataCollector,
+    ],
+)  # 1226: removing aSyncDataCollector for efficiency
+@pytest.mark.parametrize("env_name", ["vec"])  # 1226: removing "conv" for efficiency
 def test_traj_len_consistency(num_env, env_name, collector_class, seed=100):
     """Tests that various frames_per_batch lead to the same results."""
 
@@ -669,9 +683,6 @@ def env_fn(seed):
 
     policy = make_policy(env_name)
 
-    def make_frames_per_batch(frames_per_batch):
-        return -(-frames_per_batch // num_env) * num_env
-
     collector1 = collector_class(
         create_env_fn=env_fn,
         create_env_kwargs={"seed": seed},
@@ -925,9 +936,10 @@ def make_env():
         MultiSyncDataCollector,
     ],
 )
-@pytest.mark.parametrize("init_random_frames", [0, 50])
-@pytest.mark.parametrize("explicit_spec", [False, True])
-@pytest.mark.parametrize("split_trajs", [True, False])
+@pytest.mark.parametrize("init_random_frames", [50])  # 1226: faster execution
+@pytest.mark.parametrize(
+    "explicit_spec,split_trajs", [[True, True], [False, False]]
+)  # 1226: faster execution
 def test_collector_output_keys(
     collector_class, init_random_frames, explicit_spec, split_trajs
 ):
@@ -1265,7 +1277,9 @@ def env_fn(seed):
             assert batch["collector"]["traj_ids"][0] != -1
             assert batch["collector"]["traj_ids"][1] == -1
 
-    @pytest.mark.parametrize("env_name", ["conv", "vec"])
+    @pytest.mark.parametrize(
+        "env_name", ["vec"]
+    )  # 1226: removing "conv" for efficiency
     def test_multisync_collector_interruptor_mechanism(self, env_name, seed=100):
 
         frames_per_batch = 800
@@ -1393,10 +1407,10 @@ def forward(self, td):
         [
             ["cpu", "cuda"],
             ["cuda", "cpu"],
-            ["cpu", "cuda:0"],
-            ["cuda:0", "cpu"],
-            ["cuda", "cuda:0"],
-            ["cuda:0", "cuda"],
+            # ["cpu", "cuda:0"],  # 1226: faster execution
+            # ["cuda:0", "cpu"],
+            # ["cuda", "cuda:0"],
+            # ["cuda:0", "cuda"],
         ],
     )
     def test_param_sync(self, give_weights, collector, policy_device, env_device):