forked from kubeflow/examples
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[mnist_pytorch] Optimise build and switch backend from MPI to GLOO (k…
…ubeflow#480) * Refactor Python module: - Replace MPI with GLOO as the backend to avoid having to recompile PyTorch - Replace DistributedDataParallel() class with the official version when using GPUs - Remove unnecessary method to disable logs in workers - Refactor run() * Simplify Dockerfile by using the official PyTorch 0.4 image with CUDA and remove the mpirun call
- Loading branch information
1 parent
1ed08b9
commit 152c38b
Showing
3 changed files
with
13 additions
and
119 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,44 +1,8 @@ | ||
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 | ||
ARG PYTHON_VERSION=3.6 | ||
FROM pytorch/pytorch:0.4_cuda9_cudnn7 | ||
|
||
RUN apt-get update && apt-get install -y --no-install-recommends \ | ||
build-essential \ | ||
cmake \ | ||
git \ | ||
curl \ | ||
vim \ | ||
ca-certificates \ | ||
openssh-client \ | ||
libjpeg-dev \ | ||
libpng-dev &&\ | ||
rm -rf /var/lib/apt/lists/* | ||
|
||
RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ | ||
chmod +x ~/miniconda.sh && \ | ||
~/miniconda.sh -b -p /opt/conda && \ | ||
rm ~/miniconda.sh && \ | ||
/opt/conda/bin/conda install -y python=$PYTHON_VERSION numpy pyyaml scipy ipython mkl mkl-include cython typing && \ | ||
/opt/conda/bin/conda install -y -c pytorch magma-cuda90 && \ | ||
/opt/conda/bin/conda install -c conda-forge openmpi && \ | ||
/opt/conda/bin/conda clean -ya | ||
ENV PATH /opt/conda/bin:$PATH | ||
# This must be done before pip so that requirements.txt is available | ||
WORKDIR /opt/pytorch | ||
#COPY . . | ||
RUN git clone --recursive https://github.com/pytorch/pytorch | ||
#RUN git submodule update --init --recursive | ||
# Checkout 1.0rc1 release as latest master seems to have MPI backend detection broken | ||
RUN TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ | ||
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \ | ||
cd pytorch/ && git checkout tags/v1.0rc1 && git submodule update --init --recursive && \ | ||
pip install -v . | ||
|
||
RUN /opt/conda/bin/conda config --set ssl_verify False | ||
RUN pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org | ||
RUN pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org torchvision | ||
WORKDIR /workspace | ||
RUN chmod -R a+w /workspace | ||
|
||
ADD ./mnist_DDP.py /opt/pytorch_dist_mnist/ | ||
ENTRYPOINT ["mpirun", "-n", "4", "--allow-run-as-root", "python", "-u", "/opt/pytorch_dist_mnist/mnist_DDP.py", "--modelpath", "/mnt/kubeflow-gcfs/pytorch/model"] | ||
|
||
ENTRYPOINT ["python", "-u", "/opt/pytorch_dist_mnist/mnist_DDP.py", "--modelpath", "/mnt/kubeflow-gcfs/pytorch/model"] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,55 +1,9 @@ | ||
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 | ||
ARG PYTHON_VERSION=3.6 | ||
FROM pytorch/pytorch:0.4_cuda9_cudnn7 | ||
|
||
RUN apt-get update && apt-get install -y --no-install-recommends \ | ||
build-essential \ | ||
cmake \ | ||
git \ | ||
curl \ | ||
vim \ | ||
wget \ | ||
ca-certificates \ | ||
openssh-client \ | ||
libjpeg-dev \ | ||
libpng-dev &&\ | ||
rm -rf /var/lib/apt/lists/* | ||
|
||
RUN wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \ | ||
gunzip -c openmpi-3.0.0.tar.gz | tar xf - && \ | ||
cd openmpi-3.0.0 && \ | ||
./configure --prefix=/home/.openmpi --with-cuda && \ | ||
make all install | ||
|
||
ENV PATH="$PATH:/home/.openmpi/bin" | ||
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/" | ||
|
||
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value | ||
RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ | ||
chmod +x ~/miniconda.sh && \ | ||
~/miniconda.sh -b -p /opt/conda && \ | ||
rm ~/miniconda.sh && \ | ||
/opt/conda/bin/conda update conda && \ | ||
/opt/conda/bin/conda install -y python=$PYTHON_VERSION numpy pyyaml scipy ipython mkl mkl-include cython typing && \ | ||
/opt/conda/bin/conda clean -ya | ||
ENV PATH /opt/conda/bin:$PATH | ||
# This must be done before pip so that requirements.txt is available | ||
WORKDIR /opt/pytorch | ||
|
||
RUN git clone --recursive https://github.com/pytorch/pytorch | ||
|
||
# Checkout 1.0rc1 release as latest master seems to have MPI backend detection broken | ||
RUN TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ | ||
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" BUILD_ATEN_MOBILE=ON USE_OPENMP=OFF \ | ||
cd pytorch/ && git checkout tags/v1.0rc1 && git submodule update --init --recursive && \ | ||
pip install -v . | ||
|
||
RUN /opt/conda/bin/conda config --set ssl_verify False | ||
RUN pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org | ||
RUN pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org torchvision | ||
WORKDIR /workspace | ||
RUN chmod -R a+w /workspace | ||
ADD ./mnist_DDP.py /opt/pytorch_dist_mnist/ | ||
|
||
ENTRYPOINT ["mpirun", "-n", "4", "--allow-run-as-root", "python", "-u", "/opt/pytorch_dist_mnist/mnist_DDP.py", "--gpu", "--modelpath", "/mnt/kubeflow-gcfs/pytorch/model"] | ||
ENTRYPOINT ["python", "-u", "/opt/pytorch_dist_mnist/mnist_DDP.py", "--gpu", "--modelpath", "/mnt/kubeflow-gcfs/pytorch/model"] | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters