Running jobs on SLURM now uses experiment configuration to allocate the right amount of resources (facebookresearch#184)

Summary:
Improve the submission of distributed training on SLURM (rebase of previous PR facebookresearch#144):

- use the configuration of the experiment to deduce the number of nodes and GPUs to allocate on SLURM: the user no longer has to specify them manually, which avoids potential mistakes

- move the SLURM config from bash to the standard VISSL YAML config, allowing SLURM options (like any other VISSL option) to be configured with hydra overrides

- use the Python submitit library instead of bash to start SLURM jobs: this is a prerequisite for the two points above, since moving to Python is what allows reading the hydra VISSL configuration before submitting the SLURM jobs (a hedged sketch of the approach follows below)
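
As a rough illustration of that last point (a hedged sketch, not the exact code landed in this PR), a submitit-based launcher can read the loaded VISSL/hydra configuration and derive the SLURM allocation from it. The cfg field names below mirror the VISSL YAML config documented in this diff; train_main stands in for a hypothetical training entry point:

import submitit

def launch_on_slurm(cfg, train_main):
    # Allocation is deduced from the experiment configuration itself,
    # instead of being passed manually through NODES / NUM_GPU env variables.
    executor = submitit.AutoExecutor(folder=cfg.SLURM.LOG_FOLDER)
    executor.update_parameters(
        name=cfg.SLURM.NAME,
        slurm_comment=cfg.SLURM.COMMENT,
        slurm_partition=cfg.SLURM.PARTITION,
        slurm_constraint=cfg.SLURM.CONSTRAINT,
        timeout_min=cfg.SLURM.TIME_HOURS * 60,
        mem_gb=cfg.SLURM.MEM_GB,
        nodes=cfg.DISTRIBUTED.NUM_NODES,
        gpus_per_node=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
        tasks_per_node=1,
    )
    # The submitted job runs the training entry point with the full config.
    job = executor.submit(train_main, cfg)
    print(f"Submitted SLURM job: {job.job_id}")
    return job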

Pull Request resolved: facebookresearch#184

Reviewed By: prigoyal

Differential Revision: D26353975

Pulled By: QuentinDuval

fbshipit-source-id: e38657ed825607d7c1b87a3933ccb3b88710cb47
QuentinDuval authored and facebook-github-bot committed Feb 10, 2021
1 parent 4345df7 commit 8e20679
Showing 8 changed files with 367 additions and 392 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -69,6 +69,8 @@ docs/_build/
# PyBuilder
target/

# PyCharm IDE
.idea/

# Docusaurus site
website/yarn.lock
144 changes: 28 additions & 116 deletions dev/launch_slurm.sh
100755 → 100644
@@ -1,134 +1,46 @@
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

######################### EXAMPLE USAGE #################################
#
# ./dev/launch_slurm.sh
# config=benchmark/linear_image_classification/imagenet1k/eval_resnet_8gpu_transfer_in1k_linear
# config.MODEL.WEIGHTS_INIT.PARAMS_FILE=/checkpoint/user/checkpoint.torch
#
# Configuration for SLURM can be provided as additional hydra overrides:
#
# ./dev/launch_slurm.sh
# config=benchmark/linear_image_classification/imagenet1k/eval_resnet_8gpu_transfer_in1k_linear
# config.MODEL.WEIGHTS_INIT.PARAMS_FILE=/checkpoint/user/checkpoint.torch
# config.SLURM.NAME=linear_evaluation
# config.SLURM.COMMENT=linear_evaluation_on_simclr
# config.SLURM.PARTITION=learnfair

######################### INPUT PARAMS ##################################
# number of machines to distribute training on
NODES=${NODES-1}
# number of gpus per machine to use for training
NUM_GPU=${NUM_GPU-8}
# gpus type: P100 | V100 | V100_32G etc. User should set this based on their machine
GPU_TYPE=${GPU_TYPE-V100}
# name of the training. for example: simclr_2node_resnet50_in1k. This is helpful to clearly recognize the training
EXPT_NAME=${EXPT_NAME}
# how much CPU memory to use
MEM=${MEM-250g}
# number of CPUs used for each trainer (i.e. each gpu)
CPU=${CPU-8}
# directory where all the training artifacts like checkpoints etc will be written
OUTPUT_DIR=${OUTPUT_DIR}
# partition of the cluster on which training should run. User should determine this parameter for their cluster
PARTITION=${PARTITION-learnfair}
# any helpful comment that slurm dashboard can display
COMMENT=${COMMENT-vissl_training}
GITHUB_REPO=${GITHUB_REPO-vissl}
# what branch of VISSL should be used. specify your custom branch
BRANCH=${BRANCH-master}
# automatically determined and used for distributed training.
# each training run must have a unique id and vissl defaults to date
RUN_ID=$(date +'%Y%m%d')
# number of dataloader workers to use per gpu
NUM_DATA_WORKERS=${NUM_DATA_WORKERS-8}
# multi-processing method to use in PyTorch. Options: forkserver | fork | spawn
MULTI_PROCESSING_METHOD=${MULTI_PROCESSING_METHOD-forkserver}

# specify the training configuration to run. For example: to train swav for 100epochs
# config=pretrain/swav/swav_8node_resnet config.OPTIMIZER.num_epochs=100
CFG=( "$@" )


if [ "$NODES" = "1" ]; then
SLURM_START_IDX=9
else
SLURM_START_IDX=10
fi

EXP_ROOT_DIR="/checkpoint/$USER/${GITHUB_REPO}/${RUN_ID}_${BRANCH}/$EXPT_NAME/"

echo $SLURM_START_IDX
####################### SBATCH settings ####################################
URL="git@github.com:fairinternal/ssl_scaling.git"
HEADER="/private/home/$USER/temp_header"
cat > ${HEADER} <<- EOM
#!/bin/bash
#SBATCH --nodes=$NODES
#SBATCH --gres=gpu:$NUM_GPU
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=$CPU
#SBATCH --partition=$PARTITION
#SBATCH --comment="$COMMENT"
#SBATCH --time=72:00:00
#SBATCH --signal=USR1@600
#SBATCH --open-mode=append
#SBATCH --mem=$MEM
#SBATCH --output=$EXP_ROOT_DIR/%j.out
#SBATCH --err=$EXP_ROOT_DIR/%j.err
master_node=\${SLURM_NODELIST:0:9}\${SLURM_NODELIST:$SLURM_START_IDX:4}
echo \$master_node
dist_run_id=\$master_node
EOM

echo "HEADER: $HEADER"

####################### setup experiment dir ###################################
# create the experiments folder
RUN_SCRIPT="$EXP_ROOT_DIR/$GITHUB_REPO/tools/run_distributed_engines.py"

# create a temporary experiment folder to run the SLURM job in isolation
RUN_ID=$(date +'%Y-%m-%d-%H-%M-%S')
EXP_ROOT_DIR="/checkpoint/$USER/vissl/$RUN_ID"
CHECKPOINT_DIR="$EXP_ROOT_DIR/checkpoints/"

echo "EXP_ROOT_DIR: $EXP_ROOT_DIR"
echo "RUN_SCRIPT: $RUN_SCRIPT"
echo "CHECKPOINT_DIR: $CHECKPOINT_DIR"

# make the exp_dir and clone github code and relevant branch
rm -rf $EXP_ROOT_DIR
mkdir -p "$EXP_ROOT_DIR"
mkdir -p "$CHECKPOINT_DIR"
cd "$EXP_ROOT_DIR" || exit
git clone "$URL" -b "$BRANCH" --single-branch
cd "$GITHUB_REPO" || exit
git submodule update --init
SHA1=$(git rev-parse HEAD)
echo "$SHA1">"$EXP_ROOT_DIR"/git

####################### prepare launch script ##################################
dist_port=40050
((dist_port++))

SCRIPT_PATH="$EXP_ROOT_DIR/launcher.sh"
cp "$HEADER" "$SCRIPT_PATH"

echo "export PYTHONPATH="$EXP_ROOT_DIR/$GITHUB_REPO/:$PYTHONPATH"
dist_run_id+=":$dist_port"
echo \$dist_run_id
srun --label python -u $RUN_SCRIPT \
hydra.run.dir=$CHECKPOINT_DIR \
${CFG[*]} \
config.CHECKPOINT.DIR=$CHECKPOINT_DIR \
config.DATA.NUM_DATALOADER_WORKERS=$NUM_DATA_WORKERS \
config.MULTI_PROCESSING_METHOD=$MULTI_PROCESSING_METHOD \
config.DISTRIBUTED.INIT_METHOD=tcp \
config.DISTRIBUTED.RUN_ID=\$dist_run_id " >> "$SCRIPT_PATH"
chmod +x "$SCRIPT_PATH"
((dist_port++))

########################### setup trap handler ##################################
cp -r . $EXP_ROOT_DIR

# Install signal handler for automatic requeue
trap_handler () {
echo "Caught signal: $1"
# SIGTERM must be bypassed
if [ "$1" = "TERM" ]; then
echo "bypass sigterm"
else
# Submit a new job to the queue
echo "Requeuing ${SLURM_ARRAY_JOB_ID} ${SLURM_ARRAY_TASK_ID}"
# SLURM_JOB_ID is a unique representation of the job, equivalent
# to above
scontrol requeue "${SLURM_JOB_ID}"
fi
}
trap 'trap_handler USR1' USR1
trap 'trap_handler TERM' TERM
####################### launch script #########################################

########################### launch experiment ##################################
sbatch --job-name="$EXPT_NAME" "$SCRIPT_PATH"
export PYTHONPATH="$EXP_ROOT_DIR/:$PYTHONPATH"
python -u "$EXP_ROOT_DIR/tools/run_distributed_engines.py" \
"${CFG[@]}" \
hydra.run.dir="$EXP_ROOT_DIR" \
config.SLURM.ENABLED=true \
config.SLURM.LOG_FOLDER="$EXP_ROOT_DIR" \
config.CHECKPOINT.DIR="$CHECKPOINT_DIR"
62 changes: 2 additions & 60 deletions docs/source/large_scale/distributed_training.rst
@@ -107,63 +107,5 @@ The list of all the options exposed by VISSL:
Using SLURM
=============

VISSL supports SLURM by default for training models. VISSL code automatically detects if the training environment is SLURM based on SLURM environment variables like :code:`SLURM_NODEID`, :code:`SLURMD_NODENAME`, :code:`SLURM_STEP_NODELIST`.

VISSL also provides a helper bash script `dev/launch_slurm.sh <https://github.com/facebookresearch/vissl/blob/master/dev/launch_slurm.sh>`_ that allows launching a given training on SLURM. Users can modify this script to meet their needs.

The bash script takes the following inputs:


.. code-block:: bash
# number of machines to distribute training on
NODES=${NODES-1}
# number of gpus per machine to use for training
NUM_GPU=${NUM_GPU-8}
# gpus type: P100 | V100 | V100_32G etc. User should set this based on their machine
GPU_TYPE=${GPU_TYPE-V100}
# name of the training. for example: simclr_2node_resnet50_in1k. This is helpful to clearly recognize the training
EXPT_NAME=${EXPT_NAME}
# how much CPU memory to use
MEM=${MEM-250g}
# number of CPUs used for each trainer (i.e. each gpu)
CPU=${CPU-8}
# directory where all the training artifacts like checkpoints etc will be written
OUTPUT_DIR=${OUTPUT_DIR}
# partition of the cluster on which training should run. User should determine this parameter for their cluster
PARTITION=${PARTITION-learnfair}
# any helpful comment that slurm dashboard can display
COMMENT=${COMMENT-vissl_training}
GITHUB_REPO=${GITHUB_REPO-vissl}
# what branch of VISSL should be used. specify your custom branch
BRANCH=${BRANCH-master}
# automatically determined and used for distributed training.
# each training run must have a unique id and vissl defaults to date
RUN_ID=$(date +'%Y%m%d')
# number of dataloader workers to use per gpu
NUM_DATA_WORKERS=${NUM_DATA_WORKERS-8}
# multi-processing method to use in PyTorch. Options: forkserver | fork | spawn
MULTI_PROCESSING_METHOD=${MULTI_PROCESSING_METHOD-forkserver}
# specify the training configuration to run. For example: to train swav for 100epochs
# config=pretrain/swav/swav_8node_resnet config.OPTIMIZER.num_epochs=100
CFG=( "$@" )
To run the script for training SwAV for 100 epochs on 8 machines with 8 GPUs each, the script can be run as:


.. code-block:: bash
cd $HOME/vissl && NODES=8 \
NUM_GPU=8 \
GPU_TYPE=V100 \
MEM=200g \
CPU=8 \
EXPT_NAME=swav_100ep_rn50_in1k \
OUTPUT_DIR=/tmp/swav/ \
PARTITION=learnfair \
BRANCH=master \
NUM_DATA_WORKERS=4 \
MULTI_PROCESSING_METHOD=forkserver \
./dev/launch_slurm.sh \
config=pretrain/swav/swav_8node_resnet config.OPTIMIZER.num_epochs=100
Please follow the documentation
`here <https://github.com/facebookresearch/vissl/blob/master/docs/source/train_resource_setup.rst#train-on-slurm-cluster>`_
97 changes: 47 additions & 50 deletions docs/source/train_resource_setup.rst
@@ -8,7 +8,7 @@ VISSL supports training any model on CPUs. Typically, this involves correctly se
MACHINE:
DEVICE: cpu
DISTRIBUTED:
BACKEND: gloo # set to "gloo" for cpu only trianing
BACKEND: gloo # set to "gloo" for cpu only training
NUM_NODES: 1 # no change needed
NUM_PROC_PER_NODE: 2 # user sets this to number of gpus to use
INIT_METHOD: tcp # set to "file" if desired
@@ -33,61 +33,58 @@ Train on SLURM cluster

VISSL supports SLURM by default for training models. VISSL code automatically detects if the training environment is SLURM based on SLURM environment variables like :code:`SLURM_NODEID`, :code:`SLURMD_NODENAME`, :code:`SLURM_STEP_NODELIST`.
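
A minimal sketch (an assumption for illustration, not VISSL's actual detection code) of how these environment variables can be checked:

.. code-block:: python

    import os

    def running_under_slurm() -> bool:
        # Any of these variables being set indicates the process runs inside a SLURM job
        slurm_variables = ("SLURM_NODEID", "SLURMD_NODENAME", "SLURM_STEP_NODELIST")
        return any(name in os.environ for name in slurm_variables)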

VISSL also provides a helper bash script `dev/launch_slurm.sh <https://github.com/facebookresearch/vissl/blob/master/dev/launch_slurm.sh>`_ that allows launching a given training on SLURM. Users can modify this script to meet their needs.
VISSL also provides a helper bash script `dev/launch_slurm.sh <https://github.com/facebookresearch/vissl/blob/master/dev/launch_slurm.sh>`_ that allows launching a given training on SLURM.
This script uses the content of the configuration to allocate the right number of nodes and GPUs on SLURM.

The bash script takes the following inputs:
More precisely, the number of nodes and GPUs per node to allocate is driven by the usual DISTRIBUTED training configuration:

.. code-block:: yaml
DISTRIBUTED:
NUM_NODES: 1 # no change needed
NUM_PROC_PER_NODE: 2 # user sets this to number of gpus to use
The more SLURM-specific options are located in the "SLURM" configuration block:

.. code-block:: yaml
SLURM:
# Name of the job on SLURM
NAME: "vissl"
# Comment of the job on SLURM
COMMENT: "vissl job"
# Partition of SLURM on which to run the job
PARTITION: "learnfair"
# Where the logs produced by the SLURM jobs will be output
LOG_FOLDER: "."
# Maximum number of hours needed by the job to complete. Above this limit, the job might be pre-empted.
TIME_HOURS: 72
# Additional constraints on the hardware of the nodes to allocate (example 'volta' to select a volta GPU)
CONSTRAINT: ""
# GB of RAM memory to allocate for each node
MEM_GB: 250
# TCP port on which the workers will synchronize themselves with torch distributed
PORT_ID: 40050
Users can customize these values by using the standard hydra override syntax (same as for any other item in the configuration), or can modify the script to fit their needs.

**Examples:**

To run a linear evaluation benchmark on a chosen checkpoint, on the SLURM partition named "dev", with the name "lin_eval":

.. code-block:: bash
# number of machines to distribute training on
NODES=${NODES-1}
# number of gpus per machine to use for training
NUM_GPU=${NUM_GPU-8}
# gpus type: P100 | V100 | V100_32G etc. User should set this based on their machine
GPU_TYPE=${GPU_TYPE-V100}
# name of the training. for example: simclr_2node_resnet50_in1k. This is helpful to clearly recognize the training
EXPT_NAME=${EXPT_NAME}
# how much CPU memory to use
MEM=${MEM-250g}
# number of CPUs used for each trainer (i.e. each gpu)
CPU=${CPU-8}
# directory where all the training artifacts like checkpoints etc will be written
OUTPUT_DIR=${OUTPUT_DIR}
# partition of the cluster on which training should run. User should determine this parameter for their cluster
PARTITION=${PARTITION-learnfair}
# any helpful comment that slurm dashboard can display
COMMENT=${COMMENT-vissl_training}
GITHUB_REPO=${GITHUB_REPO-vissl}
# what branch of VISSL should be used. specify your custom branch
BRANCH=${BRANCH-master}
# automatically determined and used for distributed training.
# each training run must have a unique id and vissl defaults to date
RUN_ID=$(date +'%Y%m%d')
# number of dataloader workers to use per gpu
NUM_DATA_WORKERS=${NUM_DATA_WORKERS-8}
# multi-processing method to use in PyTorch. Options: forkserver | fork | spawn
MULTI_PROCESSING_METHOD=${MULTI_PROCESSING_METHOD-forkserver}
# specify the training configuration to run. For example: to train swav for 100epochs
# config=pretrain/swav/swav_8node_resnet config.OPTIMIZER.num_epochs=100
CFG=( "$@" )
To run the script for training SwAV for 100 epochs on 8 machines with 8 GPUs each, the script can be run as:
./dev/launch_slurm.sh \
config=benchmark/linear_image_classification/imagenet1k/eval_resnet_8gpu_transfer_in1k_linear \
config.MODEL.WEIGHTS_INIT.PARAMS_FILE=/path/to/my/checkpoint.torch \
config.SLURM.NAME=lin_eval \
config.SLURM.PARTITION=dev
To run a distributed training of SwAV for 100 epochs on 8 nodes with 8 GPUs each, on the default partition, with the name "swav_100ep_rn50_in1k":

.. code-block:: bash
cd $HOME/vissl && NODES=8 \
NUM_GPU=8 \
GPU_TYPE=V100 \
MEM=200g \
CPU=8 \
EXPT_NAME=swav_100ep_rn50_in1k \
OUTPUT_DIR=/tmp/swav/ \
PARTITION=learnfair \
BRANCH=master \
NUM_DATA_WORKERS=4 \
MULTI_PROCESSING_METHOD=forkserver \
./dev/launch_slurm.sh \
config=pretrain/swav/swav_8node_resnet config.OPTIMIZER.num_epochs=100
./dev/launch_slurm.sh \
config=pretrain/swav/swav_8node_resnet \
config.OPTIMIZER.num_epochs=100 \
config.SLURM.NAME=swav_100ep_rn50_in1k
1 change: 1 addition & 0 deletions requirements.txt
@@ -5,6 +5,7 @@ cython
scikit-learn
parameterized==0.7.4
numpy>=1.15
submitit>=1.1.5
tabulate
pycocotools>=2.0.1
fvcore