From 9920933e318986cfe6cd8a8fad12b1d8b4490917 Mon Sep 17 00:00:00 2001 From: Lee moon soo Date: Thu, 12 Nov 2020 08:41:50 -0800 Subject: [PATCH] [docker] Support non-root container (#11407) --- docker/base-deps/Dockerfile | 39 +++++++++++++------ docker/ray-deps/Dockerfile | 2 +- docker/ray-ml/Dockerfile | 10 ++--- docker/ray/Dockerfile | 2 +- .../ray/autoscaler/_private/command_runner.py | 22 ++++++++--- python/ray/autoscaler/_private/constants.py | 3 ++ python/ray/autoscaler/_private/docker.py | 6 +-- .../_private/staroid/command_runner.py | 18 --------- python/ray/tests/test_autoscaler.py | 10 +++++ .../tests/test_resource_demand_scheduler.py | 4 ++ 10 files changed, 71 insertions(+), 45 deletions(-) diff --git a/docker/base-deps/Dockerfile b/docker/base-deps/Dockerfile index f3d453d20017..ccb74645c47c 100644 --- a/docker/base-deps/Dockerfile +++ b/docker/base-deps/Dockerfile @@ -10,8 +10,23 @@ ENV TZ=America/Los_Angeles ENV PATH "/root/anaconda3/bin:$PATH" ARG DEBIAN_FRONTEND=noninteractive ARG PYTHON_VERSION=3.7.7 -RUN apt-get update -y && apt-get upgrade -y \ - && apt-get install -y \ + +ARG RAY_UID=1000 +ARG RAY_GID=100 + +RUN apt-get update -y \ + && apt-get install -y sudo tzdata \ + && useradd -ms /bin/bash -d /home/ray ray --uid $RAY_UID --gid $RAY_GID \ + && usermod -aG sudo ray \ + && echo 'ray ALL=NOPASSWD: ALL' >> /etc/sudoers \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +USER $RAY_UID +ENV HOME=/home/ray + +RUN sudo apt-get update -y && sudo apt-get upgrade -y \ + && sudo apt-get install -y \ git \ wget \ cmake \ @@ -26,7 +41,7 @@ RUN apt-get update -y && apt-get upgrade -y \ -O /tmp/miniconda.sh \ && /bin/bash /tmp/miniconda.sh -b -u -p $HOME/anaconda3 \ && $HOME/anaconda3/bin/conda init \ - && echo 'export PATH=$HOME/anaconda3/bin:$PATH' > /etc/profile.d/conda.sh \ + && echo 'export PATH=$HOME/anaconda3/bin:$PATH' >> /home/ray/.bashrc \ && rm /tmp/miniconda.sh \ && $HOME/anaconda3/bin/conda install -y \ libgcc python=$PYTHON_VERSION \ @@ -42,15 +57,15 @@ RUN apt-get update -y && apt-get upgrade -y \ # AttributeError: 'numpy.ufunc' object has no attribute '__module__' && $HOME/anaconda3/bin/pip uninstall -y dask \ # We install cmake temporarily to get psutil - && apt-get autoremove -y cmake \ + && sudo apt-get autoremove -y cmake \ # Either install kubectl or remove wget && (if [ "$AUTOSCALER" = "autoscaler" ]; \ - then wget -O - -q https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - \ - && touch /etc/apt/sources.list.d/kubernetes.list \ - && echo "deb http://apt.kubernetes.io/ kubernetes-xenial main" | tee -a /etc/apt/sources.list.d/kubernetes.list \ - && apt-get update \ - && apt-get install kubectl; \ - else apt-get autoremove -y wget; \ + then wget -O - -q https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - \ + && sudo touch /etc/apt/sources.list.d/kubernetes.list \ + && echo "deb http://apt.kubernetes.io/ kubernetes-xenial main" | sudo tee -a /etc/apt/sources.list.d/kubernetes.list \ + && sudo apt-get update \ + && sudo apt-get install kubectl; \ + else sudo apt-get autoremove -y wget; \ fi;) \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get clean + && sudo rm -rf /var/lib/apt/lists/* \ + && sudo apt-get clean diff --git a/docker/ray-deps/Dockerfile b/docker/ray-deps/Dockerfile index 13333c063e60..4dc812b608b0 100644 --- a/docker/ray-deps/Dockerfile +++ b/docker/ray-deps/Dockerfile @@ -17,4 +17,4 @@ RUN $HOME/anaconda3/bin/pip --no-cache-dir install $(basename $WHEEL_PATH)[all] "azure-mgmt-compute==12.0.0" \ 
"azure-mgmt-msi==1.0.0" \ "azure-mgmt-network==10.1.0"; fi) \ - && $HOME/anaconda3/bin/pip uninstall ray -y && rm $(basename $WHEEL_PATH) + && $HOME/anaconda3/bin/pip uninstall ray -y && sudo rm $(basename $WHEEL_PATH) diff --git a/docker/ray-ml/Dockerfile b/docker/ray-ml/Dockerfile index 9d091bdbbb28..cbe73ddd8e3e 100644 --- a/docker/ray-ml/Dockerfile +++ b/docker/ray-ml/Dockerfile @@ -7,15 +7,15 @@ COPY requirements_ml_docker.txt ./ COPY requirements_rllib.txt ./ COPY requirements_tune.txt ./ -RUN apt-get update \ - && apt-get install -y gcc \ +RUN sudo apt-get update \ + && sudo apt-get install -y gcc \ cmake \ libgtk2.0-dev \ zlib1g-dev \ libgl1-mesa-dev \ && $HOME/anaconda3/bin/pip --no-cache-dir install -r requirements.txt \ && $HOME/anaconda3/bin/pip --no-cache-dir install -r requirements_ml_docker.txt \ - && rm requirements.txt && rm requirements_ml_docker.txt \ - && apt-get remove cmake gcc -y \ - && apt-get clean + && sudo rm requirements.txt && sudo rm requirements_ml_docker.txt \ + && sudo apt-get remove cmake gcc -y \ + && sudo apt-get clean diff --git a/docker/ray/Dockerfile b/docker/ray/Dockerfile index 0b031d0ec768..6657fc45d4b4 100644 --- a/docker/ray/Dockerfile +++ b/docker/ray/Dockerfile @@ -6,4 +6,4 @@ ENV LC_ALL=C.UTF-8 ENV LANG=C.UTF-8 COPY $WHEEL_PATH . RUN $HOME/anaconda3/bin/pip --no-cache-dir install `basename $WHEEL_PATH`[all] \ -&& rm `basename $WHEEL_PATH` +&& sudo rm `basename $WHEEL_PATH` diff --git a/python/ray/autoscaler/_private/command_runner.py b/python/ray/autoscaler/_private/command_runner.py index d1f1850aaf51..546ff8c7b2f3 100644 --- a/python/ray/autoscaler/_private/command_runner.py +++ b/python/ray/autoscaler/_private/command_runner.py @@ -25,6 +25,8 @@ from ray.autoscaler._private.cli_logger import cli_logger, cf from ray.util.debug import log_once +from ray.autoscaler._private.constants import RAY_HOME + logger = logging.getLogger(__name__) # How long to wait for a node to start, in seconds @@ -190,7 +192,7 @@ def run_rsync_up(self, source, target, options=None): logger.warning("'rsync_filter' detected but is currently " "unsupported for k8s.") if target.startswith("~"): - target = "/root" + target[1:] + target = RAY_HOME + target[1:] try: flags = "-aqz" if is_rsync_silent() else "-avz" @@ -206,7 +208,7 @@ def run_rsync_up(self, source, target, options=None): "rsync failed: '{}'. Falling back to 'kubectl cp'".format(e), UserWarning) if target.startswith("~"): - target = "/root" + target[1:] + target = RAY_HOME + target[1:] self.process_runner.check_call(self.kubectl + [ "cp", source, "{}/{}:{}".format(self.namespace, self.node_id, @@ -215,7 +217,7 @@ def run_rsync_up(self, source, target, options=None): def run_rsync_down(self, source, target, options=None): if target.startswith("~"): - target = "/root" + target[1:] + target = RAY_HOME + target[1:] try: flags = "-aqz" if is_rsync_silent() else "-avz" @@ -231,7 +233,7 @@ def run_rsync_down(self, source, target, options=None): "rsync failed: '{}'. 
Falling back to 'kubectl cp'".format(e), UserWarning) if target.startswith("~"): - target = "/root" + target[1:] + target = RAY_HOME + target[1:] self.process_runner.check_call(self.kubectl + [ "cp", "{}/{}:{}".format(self.namespace, self.node_id, source), @@ -699,6 +701,16 @@ def run_init(self, *, as_head, file_mounts): cleaned_bind_mounts.pop(mnt, None) if not self._check_container_status(): + # Get home directory + image_env = self.ssh_command_runner.run( + "docker inspect -f '{{json .Config.Env}}' " + image, + with_output=True).decode().strip() + home_directory = "/root" + for env_var in json.loads(image_env): + if env_var.startswith("HOME="): + home_directory = env_var.split("HOME=")[1] + break + start_command = docker_start_cmds( self.ssh_command_runner.ssh_user, image, cleaned_bind_mounts, self.container_name, @@ -706,7 +718,7 @@ def run_init(self, *, as_head, file_mounts): "run_options", []) + self.docker_config.get( f"{'head' if as_head else 'worker'}_run_options", []) + self._configure_runtime(), - self.ssh_command_runner.cluster_name) + self.ssh_command_runner.cluster_name, home_directory) self.run(start_command, run_env="host") else: running_image = self.run( diff --git a/python/ray/autoscaler/_private/constants.py b/python/ray/autoscaler/_private/constants.py index de438197e1b8..06c01ddd9f64 100644 --- a/python/ray/autoscaler/_private/constants.py +++ b/python/ray/autoscaler/_private/constants.py @@ -41,3 +41,6 @@ def env_integer(key, default): BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 12) # Max number of retries to create an EC2 node (retry different subnet) BOTO_CREATE_MAX_RETRIES = env_integer("BOTO_CREATE_MAX_RETRIES", 5) + +# ray home path in the container image +RAY_HOME = "/home/ray" diff --git a/python/ray/autoscaler/_private/docker.py b/python/ray/autoscaler/_private/docker.py index 62d49e590ca1..46bb20a3feca 100644 --- a/python/ray/autoscaler/_private/docker.py +++ b/python/ray/autoscaler/_private/docker.py @@ -65,15 +65,15 @@ def check_docker_image(cname): def docker_start_cmds(user, image, mount_dict, container_name, user_options, - cluster_name): + cluster_name, home_directory): # Imported here due to circular dependency. from ray.autoscaler.sdk import get_docker_host_mount_location docker_mount_prefix = get_docker_host_mount_location(cluster_name) mount = {f"{docker_mount_prefix}/{dst}": dst for dst in mount_dict} - # TODO(ilr) Move away from defaulting to /root/ mount_flags = " ".join([ - "-v {src}:{dest}".format(src=k, dest=v.replace("~/", "/root/")) + "-v {src}:{dest}".format( + src=k, dest=v.replace("~/", home_directory + "/")) for k, v in mount.items() ]) diff --git a/python/ray/autoscaler/_private/staroid/command_runner.py b/python/ray/autoscaler/_private/staroid/command_runner.py index 28f17b2a4e46..69f54b14d0dd 100644 --- a/python/ray/autoscaler/_private/staroid/command_runner.py +++ b/python/ray/autoscaler/_private/staroid/command_runner.py @@ -17,21 +17,3 @@ def __init__(self, if kube_api_server is not None: self.kubectl.extend(["--server", kube_api_server]) os.environ["KUBE_API_SERVER"] = kube_api_server - - def _rewrite_target_home_dir(self, target): - # Staroid forces containers to run non-root permission. Ray docker - # image does not have a support for non-root user at the moment. - # Use /tmp/ray as a home directory until docker image supports - # non-root user. 
- - if target.startswith("~/"): - return "/home/ray" + target[1:] - return target - - def run_rsync_up(self, source, target, options=None): - target = self._rewrite_target_home_dir(target) - super().run_rsync_up(source, target, options) - - def run_rsync_down(self, source, target, options=None): - target = self._rewrite_target_home_dir(target) - super().run_rsync_down(source, target, options) diff --git a/python/ray/tests/test_autoscaler.py b/python/ray/tests/test_autoscaler.py index 3cbd4b2028d6..87b4c8add8bd 100644 --- a/python/ray/tests/test_autoscaler.py +++ b/python/ray/tests/test_autoscaler.py @@ -390,6 +390,7 @@ def testGetOrCreateHeadNode(self): # Two initial calls to docker cp, one before run, two final calls to cp runner.respond_to_call(".State.Running", ["false", "false", "false", "true", "true"]) + runner.respond_to_call("json .Config.Env", ["[]"]) commands.get_or_create_head_node( SMALL_CLUSTER, config_path, @@ -968,6 +969,7 @@ def testConfiguresNewNodes(self): config_path = self.write_config(SMALL_CLUSTER) self.provider = MockProvider() runner = MockProcessRunner() + runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)]) autoscaler = StandardAutoscaler( config_path, LoadMetrics(), @@ -1006,6 +1008,7 @@ def testConfiguresOutdatedNodes(self): config_path = self.write_config(SMALL_CLUSTER) self.provider = MockProvider() runner = MockProcessRunner() + runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)]) autoscaler = StandardAutoscaler( config_path, LoadMetrics(), @@ -1159,6 +1162,7 @@ def testRecoverUnhealthyWorkers(self): config_path = self.write_config(SMALL_CLUSTER) self.provider = MockProvider() runner = MockProcessRunner() + runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)]) lm = LoadMetrics() autoscaler = StandardAutoscaler( config_path, @@ -1227,6 +1231,7 @@ def testSetupCommandsWithNoNodeCaching(self): config_path = self.write_config(config) self.provider = MockProvider(cache_stopped=False) runner = MockProcessRunner() + runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)]) lm = LoadMetrics() autoscaler = StandardAutoscaler( config_path, @@ -1269,6 +1274,7 @@ def testSetupCommandsWithStoppedNodeCaching(self): config_path = self.write_config(config) self.provider = MockProvider(cache_stopped=True) runner = MockProcessRunner() + runner.respond_to_call("json .Config.Env", ["[]" for i in range(3)]) lm = LoadMetrics() autoscaler = StandardAutoscaler( config_path, @@ -1335,6 +1341,7 @@ def testMultiNodeReuse(self): config_path = self.write_config(config) self.provider = MockProvider(cache_stopped=True) runner = MockProcessRunner() + runner.respond_to_call("json .Config.Env", ["[]" for i in range(13)]) lm = LoadMetrics() autoscaler = StandardAutoscaler( config_path, @@ -1385,6 +1392,7 @@ def testContinuousFileMounts(self): config["max_workers"] = 2 config_path = self.write_config(config) runner = MockProcessRunner() + runner.respond_to_call("json .Config.Env", ["[]" for i in range(4)]) lm = LoadMetrics() autoscaler = StandardAutoscaler( config_path, @@ -1438,6 +1446,7 @@ def testFileMountsNonContinuous(self): config["max_workers"] = 2 config_path = self.write_config(config) runner = MockProcessRunner() + runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)]) lm = LoadMetrics() autoscaler = StandardAutoscaler( config_path, @@ -1484,6 +1493,7 @@ def testFileMountsNonContinuous(self): from ray.autoscaler._private import util util._hash_cache = {} runner = MockProcessRunner() + runner.respond_to_call("json 
.Config.Env", ["[]" for i in range(2)]) lm = LoadMetrics() autoscaler = StandardAutoscaler( config_path, diff --git a/python/ray/tests/test_resource_demand_scheduler.py b/python/ray/tests/test_resource_demand_scheduler.py index ccaa57022f5c..8be955c11ab9 100644 --- a/python/ray/tests/test_resource_demand_scheduler.py +++ b/python/ray/tests/test_resource_demand_scheduler.py @@ -1002,6 +1002,7 @@ def testGetOrCreateMultiNodeType(self): config_path = self.write_config(MULTI_WORKER_CLUSTER) self.provider = MockProvider() runner = MockProcessRunner() + runner.respond_to_call("json .Config.Env", ["[]"]) get_or_create_head_node( MULTI_WORKER_CLUSTER, config_path, @@ -1272,6 +1273,7 @@ def testResourcePassing(self): config_path = self.write_config(config) self.provider = MockProvider() runner = MockProcessRunner() + runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)]) autoscaler = StandardAutoscaler( config_path, LoadMetrics(), @@ -1353,6 +1355,7 @@ def testCommandPassing(self): config_path = self.write_config(config) self.provider = MockProvider() runner = MockProcessRunner() + runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)]) autoscaler = StandardAutoscaler( config_path, LoadMetrics(), @@ -1405,6 +1408,7 @@ def testDockerWorkers(self): config_path = self.write_config(config) self.provider = MockProvider() runner = MockProcessRunner() + runner.respond_to_call("json .Config.Env", ["[]" for i in range(4)]) autoscaler = StandardAutoscaler( config_path, LoadMetrics(),