[docker] Support non-root container (ray-project#11407)
Leemoonsoo authored Nov 12, 2020
1 parent 62c7ab5 commit 9920933
Showing 10 changed files with 71 additions and 45 deletions.
39 changes: 27 additions & 12 deletions docker/base-deps/Dockerfile
@@ -10,8 +10,23 @@ ENV TZ=America/Los_Angeles
ENV PATH "/root/anaconda3/bin:$PATH"
ARG DEBIAN_FRONTEND=noninteractive
ARG PYTHON_VERSION=3.7.7
RUN apt-get update -y && apt-get upgrade -y \
&& apt-get install -y \

ARG RAY_UID=1000
ARG RAY_GID=100

RUN apt-get update -y \
&& apt-get install -y sudo tzdata \
&& useradd -ms /bin/bash -d /home/ray ray --uid $RAY_UID --gid $RAY_GID \
&& usermod -aG sudo ray \
&& echo 'ray ALL=NOPASSWD: ALL' >> /etc/sudoers \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean

USER $RAY_UID
ENV HOME=/home/ray

RUN sudo apt-get update -y && sudo apt-get upgrade -y \
&& sudo apt-get install -y \
git \
wget \
cmake \
@@ -26,7 +41,7 @@ RUN apt-get update -y && apt-get upgrade -y \
-O /tmp/miniconda.sh \
&& /bin/bash /tmp/miniconda.sh -b -u -p $HOME/anaconda3 \
&& $HOME/anaconda3/bin/conda init \
&& echo 'export PATH=$HOME/anaconda3/bin:$PATH' > /etc/profile.d/conda.sh \
&& echo 'export PATH=$HOME/anaconda3/bin:$PATH' >> /home/ray/.bashrc \
&& rm /tmp/miniconda.sh \
&& $HOME/anaconda3/bin/conda install -y \
libgcc python=$PYTHON_VERSION \
@@ -42,15 +57,15 @@ RUN apt-get update -y && apt-get upgrade -y \
# AttributeError: 'numpy.ufunc' object has no attribute '__module__'
&& $HOME/anaconda3/bin/pip uninstall -y dask \
# We install cmake temporarily to get psutil
&& apt-get autoremove -y cmake \
&& sudo apt-get autoremove -y cmake \
# Either install kubectl or remove wget
&& (if [ "$AUTOSCALER" = "autoscaler" ]; \
then wget -O - -q https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - \
&& touch /etc/apt/sources.list.d/kubernetes.list \
&& echo "deb http://apt.kubernetes.io/ kubernetes-xenial main" | tee -a /etc/apt/sources.list.d/kubernetes.list \
&& apt-get update \
&& apt-get install kubectl; \
else apt-get autoremove -y wget; \
then wget -O - -q https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - \
&& sudo touch /etc/apt/sources.list.d/kubernetes.list \
&& echo "deb http://apt.kubernetes.io/ kubernetes-xenial main" | sudo tee -a /etc/apt/sources.list.d/kubernetes.list \
&& sudo apt-get update \
&& sudo apt-get install kubectl; \
else sudo apt-get autoremove -y wget; \
fi;) \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
&& sudo rm -rf /var/lib/apt/lists/* \
&& sudo apt-get clean
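
A minimal usage sketch (not part of this commit): the new RAY_UID and RAY_GID build arguments default to 1000 and 100 but can be overridden with standard docker build arguments; the tag name below is hypothetical.

docker build --build-arg RAY_UID=2000 --build-arg RAY_GID=100 \
    -t base-deps-nonroot docker/base-deps
# The Dockerfile now switches to USER $RAY_UID, so a container started from
# this image should report the non-root "ray" uid instead of root:
docker run --rm base-deps-nonroot id
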
2 changes: 1 addition & 1 deletion docker/ray-deps/Dockerfile
@@ -17,4 +17,4 @@ RUN $HOME/anaconda3/bin/pip --no-cache-dir install $(basename $WHEEL_PATH)[all]
"azure-mgmt-compute==12.0.0" \
"azure-mgmt-msi==1.0.0" \
"azure-mgmt-network==10.1.0"; fi) \
&& $HOME/anaconda3/bin/pip uninstall ray -y && rm $(basename $WHEEL_PATH)
&& $HOME/anaconda3/bin/pip uninstall ray -y && sudo rm $(basename $WHEEL_PATH)
10 changes: 5 additions & 5 deletions docker/ray-ml/Dockerfile
@@ -7,15 +7,15 @@ COPY requirements_ml_docker.txt ./
COPY requirements_rllib.txt ./
COPY requirements_tune.txt ./

RUN apt-get update \
&& apt-get install -y gcc \
RUN sudo apt-get update \
&& sudo apt-get install -y gcc \
cmake \
libgtk2.0-dev \
zlib1g-dev \
libgl1-mesa-dev \
&& $HOME/anaconda3/bin/pip --no-cache-dir install -r requirements.txt \
&& $HOME/anaconda3/bin/pip --no-cache-dir install -r requirements_ml_docker.txt \
&& rm requirements.txt && rm requirements_ml_docker.txt \
&& apt-get remove cmake gcc -y \
&& apt-get clean
&& sudo rm requirements.txt && sudo rm requirements_ml_docker.txt \
&& sudo apt-get remove cmake gcc -y \
&& sudo apt-get clean

2 changes: 1 addition & 1 deletion docker/ray/Dockerfile
@@ -6,4 +6,4 @@ ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8
COPY $WHEEL_PATH .
RUN $HOME/anaconda3/bin/pip --no-cache-dir install `basename $WHEEL_PATH`[all] \
&& rm `basename $WHEEL_PATH`
&& sudo rm `basename $WHEEL_PATH`
22 changes: 17 additions & 5 deletions python/ray/autoscaler/_private/command_runner.py
@@ -25,6 +25,8 @@
from ray.autoscaler._private.cli_logger import cli_logger, cf
from ray.util.debug import log_once

from ray.autoscaler._private.constants import RAY_HOME

logger = logging.getLogger(__name__)

# How long to wait for a node to start, in seconds
@@ -190,7 +192,7 @@ def run_rsync_up(self, source, target, options=None):
logger.warning("'rsync_filter' detected but is currently "
"unsupported for k8s.")
if target.startswith("~"):
target = "/root" + target[1:]
target = RAY_HOME + target[1:]

try:
flags = "-aqz" if is_rsync_silent() else "-avz"
@@ -206,7 +208,7 @@ def run_rsync_up(self, source, target, options=None):
"rsync failed: '{}'. Falling back to 'kubectl cp'".format(e),
UserWarning)
if target.startswith("~"):
target = "/root" + target[1:]
target = RAY_HOME + target[1:]

self.process_runner.check_call(self.kubectl + [
"cp", source, "{}/{}:{}".format(self.namespace, self.node_id,
@@ -215,7 +217,7 @@

def run_rsync_down(self, source, target, options=None):
if target.startswith("~"):
target = "/root" + target[1:]
target = RAY_HOME + target[1:]

try:
flags = "-aqz" if is_rsync_silent() else "-avz"
@@ -231,7 +233,7 @@ def run_rsync_down(self, source, target, options=None):
"rsync failed: '{}'. Falling back to 'kubectl cp'".format(e),
UserWarning)
if target.startswith("~"):
target = "/root" + target[1:]
target = RAY_HOME + target[1:]

self.process_runner.check_call(self.kubectl + [
"cp", "{}/{}:{}".format(self.namespace, self.node_id, source),
@@ -699,14 +701,24 @@ def run_init(self, *, as_head, file_mounts):
cleaned_bind_mounts.pop(mnt, None)

if not self._check_container_status():
# Get home directory
image_env = self.ssh_command_runner.run(
"docker inspect -f '{{json .Config.Env}}' " + image,
with_output=True).decode().strip()
home_directory = "/root"
for env_var in json.loads(image_env):
if env_var.startswith("HOME="):
home_directory = env_var.split("HOME=")[1]
break

start_command = docker_start_cmds(
self.ssh_command_runner.ssh_user, image, cleaned_bind_mounts,
self.container_name,
self.docker_config.get(
"run_options", []) + self.docker_config.get(
f"{'head' if as_head else 'worker'}_run_options",
[]) + self._configure_runtime(),
self.ssh_command_runner.cluster_name)
self.ssh_command_runner.cluster_name, home_directory)
self.run(start_command, run_env="host")
else:
running_image = self.run(
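
For reference, a rough shell equivalent of the home-directory probe added to run_init above; the image name is an assumption, any locally available image works. The command runner parses the printed JSON list, takes the value after "HOME=", and falls back to /root when no HOME entry is present.

docker inspect -f '{{json .Config.Env}}' rayproject/ray
# prints a JSON array such as [..., "HOME=/home/ray", ...]
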
3 changes: 3 additions & 0 deletions python/ray/autoscaler/_private/constants.py
@@ -41,3 +41,6 @@ def env_integer(key, default):
BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 12)
# Max number of retries to create an EC2 node (retry different subnet)
BOTO_CREATE_MAX_RETRIES = env_integer("BOTO_CREATE_MAX_RETRIES", 5)

# ray home path in the container image
RAY_HOME = "/home/ray"
6 changes: 3 additions & 3 deletions python/ray/autoscaler/_private/docker.py
@@ -65,15 +65,15 @@ def check_docker_image(cname):


def docker_start_cmds(user, image, mount_dict, container_name, user_options,
cluster_name):
cluster_name, home_directory):
# Imported here due to circular dependency.
from ray.autoscaler.sdk import get_docker_host_mount_location
docker_mount_prefix = get_docker_host_mount_location(cluster_name)
mount = {f"{docker_mount_prefix}/{dst}": dst for dst in mount_dict}

# TODO(ilr) Move away from defaulting to /root/
mount_flags = " ".join([
"-v {src}:{dest}".format(src=k, dest=v.replace("~/", "/root/"))
"-v {src}:{dest}".format(
src=k, dest=v.replace("~/", home_directory + "/"))
for k, v in mount.items()
])

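
Illustration only (host path and image name are hypothetical): with home_directory detected as /home/ray, a file mount declared as "~/data" is now bound under /home/ray/ inside the container rather than the previously hard-coded /root/.

docker run --rm \
    -v /tmp/host_mount/data:/home/ray/data \
    rayproject/ray ls /home/ray/data
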
18 changes: 0 additions & 18 deletions python/ray/autoscaler/_private/staroid/command_runner.py
@@ -17,21 +17,3 @@ def __init__(self,
if kube_api_server is not None:
self.kubectl.extend(["--server", kube_api_server])
os.environ["KUBE_API_SERVER"] = kube_api_server

def _rewrite_target_home_dir(self, target):
# Staroid forces containers to run non-root permission. Ray docker
# image does not have a support for non-root user at the moment.
# Use /tmp/ray as a home directory until docker image supports
# non-root user.

if target.startswith("~/"):
return "/home/ray" + target[1:]
return target

def run_rsync_up(self, source, target, options=None):
target = self._rewrite_target_home_dir(target)
super().run_rsync_up(source, target, options)

def run_rsync_down(self, source, target, options=None):
target = self._rewrite_target_home_dir(target)
super().run_rsync_down(source, target, options)
10 changes: 10 additions & 0 deletions python/ray/tests/test_autoscaler.py
@@ -390,6 +390,7 @@ def testGetOrCreateHeadNode(self):
# Two initial calls to docker cp, one before run, two final calls to cp
runner.respond_to_call(".State.Running",
["false", "false", "false", "true", "true"])
runner.respond_to_call("json .Config.Env", ["[]"])
commands.get_or_create_head_node(
SMALL_CLUSTER,
config_path,
@@ -968,6 +969,7 @@ def testConfiguresNewNodes(self):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
autoscaler = StandardAutoscaler(
config_path,
LoadMetrics(),
@@ -1006,6 +1008,7 @@ def testConfiguresOutdatedNodes(self):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
autoscaler = StandardAutoscaler(
config_path,
LoadMetrics(),
@@ -1159,6 +1162,7 @@ def testRecoverUnhealthyWorkers(self):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
lm = LoadMetrics()
autoscaler = StandardAutoscaler(
config_path,
@@ -1227,6 +1231,7 @@ def testSetupCommandsWithNoNodeCaching(self):
config_path = self.write_config(config)
self.provider = MockProvider(cache_stopped=False)
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
lm = LoadMetrics()
autoscaler = StandardAutoscaler(
config_path,
@@ -1269,6 +1274,7 @@ def testSetupCommandsWithStoppedNodeCaching(self):
config_path = self.write_config(config)
self.provider = MockProvider(cache_stopped=True)
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(3)])
lm = LoadMetrics()
autoscaler = StandardAutoscaler(
config_path,
@@ -1335,6 +1341,7 @@ def testMultiNodeReuse(self):
config_path = self.write_config(config)
self.provider = MockProvider(cache_stopped=True)
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(13)])
lm = LoadMetrics()
autoscaler = StandardAutoscaler(
config_path,
@@ -1385,6 +1392,7 @@ def testContinuousFileMounts(self):
config["max_workers"] = 2
config_path = self.write_config(config)
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(4)])
lm = LoadMetrics()
autoscaler = StandardAutoscaler(
config_path,
@@ -1438,6 +1446,7 @@ def testFileMountsNonContinuous(self):
config["max_workers"] = 2
config_path = self.write_config(config)
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
lm = LoadMetrics()
autoscaler = StandardAutoscaler(
config_path,
@@ -1484,6 +1493,7 @@ def testFileMountsNonContinuous(self):
from ray.autoscaler._private import util
util._hash_cache = {}
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
lm = LoadMetrics()
autoscaler = StandardAutoscaler(
config_path,
4 changes: 4 additions & 0 deletions python/ray/tests/test_resource_demand_scheduler.py
@@ -1002,6 +1002,7 @@ def testGetOrCreateMultiNodeType(self):
config_path = self.write_config(MULTI_WORKER_CLUSTER)
self.provider = MockProvider()
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]"])
get_or_create_head_node(
MULTI_WORKER_CLUSTER,
config_path,
@@ -1272,6 +1273,7 @@ def testResourcePassing(self):
config_path = self.write_config(config)
self.provider = MockProvider()
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
autoscaler = StandardAutoscaler(
config_path,
LoadMetrics(),
@@ -1353,6 +1355,7 @@ def testCommandPassing(self):
config_path = self.write_config(config)
self.provider = MockProvider()
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
autoscaler = StandardAutoscaler(
config_path,
LoadMetrics(),
@@ -1405,6 +1408,7 @@ def testDockerWorkers(self):
config_path = self.write_config(config)
self.provider = MockProvider()
runner = MockProcessRunner()
runner.respond_to_call("json .Config.Env", ["[]" for i in range(4)])
autoscaler = StandardAutoscaler(
config_path,
LoadMetrics(),
