Skip to content

Commit

Permalink
[mnist_pytorch] Optimise build and switch backend from MPI to GLOO (k…
Browse files Browse the repository at this point in the history
…ubeflow#480)

* Refactor Python module:
- Replace MPI with GLOO as backend to avoid having to recompile PyTorch
- Replace DistributedDataParallel() class with official version when using GPUs
- Remove unnecessary method to disable logs in workers
- Refactor run()

* Simplify Dockerfile by using the official PyTorch 0.4 image with CUDA and remove the mpirun call
  • Loading branch information
dsdinter authored and k8s-ci-robot committed Jan 16, 2019
1 parent 1ed08b9 commit 152c38b
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 119 deletions.
42 changes: 3 additions & 39 deletions pytorch_mnist/training/ddp/mnist/Dockerfile.traincpu
Original file line number Diff line number Diff line change
@@ -1,44 +1,8 @@
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
ARG PYTHON_VERSION=3.6
FROM pytorch/pytorch:0.4_cuda9_cudnn7

RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cmake \
git \
curl \
vim \
ca-certificates \
openssh-client \
libjpeg-dev \
libpng-dev &&\
rm -rf /var/lib/apt/lists/*

RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda install -y python=$PYTHON_VERSION numpy pyyaml scipy ipython mkl mkl-include cython typing && \
/opt/conda/bin/conda install -y -c pytorch magma-cuda90 && \
/opt/conda/bin/conda install -c conda-forge openmpi && \
/opt/conda/bin/conda clean -ya
ENV PATH /opt/conda/bin:$PATH
# This must be done before pip so that requirements.txt is available
WORKDIR /opt/pytorch
#COPY . .
RUN git clone --recursive https://github.com/pytorch/pytorch
#RUN git submodule update --init --recursive
# Checkout 1.0rc1 release as latest master seems to have MPI backend detection broken
RUN TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
cd pytorch/ && git checkout tags/v1.0rc1 && git submodule update --init --recursive && \
pip install -v .

RUN /opt/conda/bin/conda config --set ssl_verify False
RUN pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org
RUN pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org torchvision
WORKDIR /workspace
RUN chmod -R a+w /workspace

ADD ./mnist_DDP.py /opt/pytorch_dist_mnist/
ENTRYPOINT ["mpirun", "-n", "4", "--allow-run-as-root", "python", "-u", "/opt/pytorch_dist_mnist/mnist_DDP.py", "--modelpath", "/mnt/kubeflow-gcfs/pytorch/model"]

ENTRYPOINT ["python", "-u", "/opt/pytorch_dist_mnist/mnist_DDP.py", "--modelpath", "/mnt/kubeflow-gcfs/pytorch/model"]

50 changes: 2 additions & 48 deletions pytorch_mnist/training/ddp/mnist/Dockerfile.traingpu
Original file line number Diff line number Diff line change
@@ -1,55 +1,9 @@
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
ARG PYTHON_VERSION=3.6
FROM pytorch/pytorch:0.4_cuda9_cudnn7

RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cmake \
git \
curl \
vim \
wget \
ca-certificates \
openssh-client \
libjpeg-dev \
libpng-dev &&\
rm -rf /var/lib/apt/lists/*

RUN wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \
gunzip -c openmpi-3.0.0.tar.gz | tar xf - && \
cd openmpi-3.0.0 && \
./configure --prefix=/home/.openmpi --with-cuda && \
make all install

ENV PATH="$PATH:/home/.openmpi/bin"
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"

RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda update conda && \
/opt/conda/bin/conda install -y python=$PYTHON_VERSION numpy pyyaml scipy ipython mkl mkl-include cython typing && \
/opt/conda/bin/conda clean -ya
ENV PATH /opt/conda/bin:$PATH
# This must be done before pip so that requirements.txt is available
WORKDIR /opt/pytorch

RUN git clone --recursive https://github.com/pytorch/pytorch

# Checkout 1.0rc1 release as latest master seems to have MPI backend detection broken
RUN TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" BUILD_ATEN_MOBILE=ON USE_OPENMP=OFF \
cd pytorch/ && git checkout tags/v1.0rc1 && git submodule update --init --recursive && \
pip install -v .

RUN /opt/conda/bin/conda config --set ssl_verify False
RUN pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org
RUN pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org torchvision
WORKDIR /workspace
RUN chmod -R a+w /workspace
ADD ./mnist_DDP.py /opt/pytorch_dist_mnist/

ENTRYPOINT ["mpirun", "-n", "4", "--allow-run-as-root", "python", "-u", "/opt/pytorch_dist_mnist/mnist_DDP.py", "--gpu", "--modelpath", "/mnt/kubeflow-gcfs/pytorch/model"]
ENTRYPOINT ["python", "-u", "/opt/pytorch_dist_mnist/mnist_DDP.py", "--gpu", "--modelpath", "/mnt/kubeflow-gcfs/pytorch/model"]


40 changes: 8 additions & 32 deletions pytorch_mnist/training/ddp/mnist/mnist_DDP.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,14 +166,17 @@ def average_gradients(model):
param.grad.data /= size


def run(rank, size, modelpath, gpu):
def run(modelpath, gpu):
""" Distributed Synchronous SGD Example """
rank = dist.get_rank()
torch.manual_seed(1234)
train_set, bsz = partition_dataset(rank)
model = Net()
if gpu:
model = model.cuda()
model = DistributedDataParallel(model)
model = torch.nn.parallel.DistributedDataParallel(model)
else:
model = DistributedDataParallel(model)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
model_dir = modelpath

Expand Down Expand Up @@ -213,31 +216,6 @@ def run(rank, size, modelpath, gpu):
logging.info("CPU training time= {}".format(str(datetime.datetime.now() - time_start)))


def init_print(rank, size, debug_print=True):
    """Configure per-process stdout/stderr for a distributed run.

    Args:
        rank: this process's rank within the distributed group.
        size: total number of processes (world size).
        debug_print: when False, silence stdout and stderr on every
            non-master rank (rank > 0); when True (the default), wrap
            stdout so each printed line is prefixed with "[rank/size]",
            letting interleaved worker output be told apart.
    """
    if not debug_print:
        # On large clusters you may want to mute all nodes except the master.
        if rank > 0:
            sys.stdout = open(os.devnull, 'w')
            sys.stderr = open(os.devnull, 'w')
    else:
        # Keep a handle on the real stream; the wrapper below delegates to it.
        old_out = sys.stdout

        class LabeledStdout:
            def __init__(self, rank, size):
                self._r = rank
                self._s = size
                # Delegate flush straight to the original stream.
                self.flush = sys.stdout.flush

            def write(self, x):
                # A bare newline passes through unlabelled so that print()
                # output keeps its normal line structure.
                if x == '\n':
                    old_out.write(x)
                else:
                    old_out.write('[%d/%d] %s' % (self._r, self._s, x))

        sys.stdout = LabeledStdout(rank, size)


if __name__ == "__main__":
import argparse
logging.basicConfig(level=logging.INFO,
Expand All @@ -261,8 +239,6 @@ def write(self, x):
logging.info("CUDA Device Name:", torch.cuda.get_device_name(0))
logging.info("CUDA Version:", torch.version.cuda)
logging.info("=========================\n")
dist.init_process_group(backend='mpi')
size = dist.get_world_size()
rank = dist.get_rank()
init_print(rank, size)
run(rank=rank, size=size, modelpath=args.modelpath, gpu=args.gpu)
dist.init_process_group(backend='gloo')
run(modelpath=args.modelpath, gpu=args.gpu)
dist.destroy_process_group()

0 comments on commit 152c38b

Please sign in to comment.