python/ray/cluster_utils.py

import copy
import json
import logging
import os
import subprocess
import tempfile
import time
from typing import Dict, Optional

import yaml

import ray
import ray._private.services
from ray._private import ray_constants
from ray._private.client_mode_hook import disable_client_hook
from ray._raylet import GcsClientOptions
from ray.autoscaler._private.fake_multi_node.node_provider import FAKE_HEAD_NODE_ID
from ray.util.annotations import DeveloperAPI

logger = logging.getLogger(__name__)

cluster_not_supported = os.name == "nt"


@DeveloperAPI
class AutoscalingCluster:
    """Create a local autoscaling cluster for testing.

    See test_autoscaler_fake_multinode.py for an end-to-end example.
    """

    def __init__(
        self,
        head_resources: dict,
        worker_node_types: dict,
        autoscaler_v2: bool = False,
        **config_kwargs,
    ):
        """Create the cluster.

        Args:
            head_resources: resources of the head node, including CPU.
            worker_node_types: autoscaler node types config for worker nodes.
        """
        self._head_resources = head_resources
        self._config = self._generate_config(
            head_resources,
            worker_node_types,
            autoscaler_v2=autoscaler_v2,
            **config_kwargs,
        )
        self._autoscaler_v2 = autoscaler_v2

    def _generate_config(
        self, head_resources, worker_node_types, autoscaler_v2=False, **config_kwargs
    ):
        base_config = yaml.safe_load(
            open(
                os.path.join(
                    os.path.dirname(ray.__file__),
                    "autoscaler/_private/fake_multi_node/example.yaml",
                )
            )
        )
        custom_config = copy.deepcopy(base_config)
        custom_config["available_node_types"] = worker_node_types
        custom_config["available_node_types"]["ray.head.default"] = {
            "resources": head_resources,
            "node_config": {},
            "max_workers": 0,
        }

        # Autoscaler v2 specific configs
        if autoscaler_v2:
            custom_config["provider"]["launch_multiple"] = True
            custom_config["provider"]["head_node_id"] = FAKE_HEAD_NODE_ID
        custom_config.update(config_kwargs)
        return custom_config

    def start(self, _system_config=None, override_env: Optional[Dict] = None):
        """Start the cluster.

        After this call returns, you can connect to the cluster with
        ray.init("auto").
        """
        subprocess.check_call(["ray", "stop", "--force"])
        _, fake_config = tempfile.mkstemp()
        with open(fake_config, "w") as f:
            f.write(json.dumps(self._config))
        cmd = [
            "ray",
            "start",
            "--autoscaling-config={}".format(fake_config),
            "--head",
        ]
        if "CPU" in self._head_resources:
            cmd.append("--num-cpus={}".format(self._head_resources.pop("CPU")))
        if "GPU" in self._head_resources:
            cmd.append("--num-gpus={}".format(self._head_resources.pop("GPU")))
        if "object_store_memory" in self._head_resources:
            cmd.append(
                "--object-store-memory={}".format(
                    self._head_resources.pop("object_store_memory")
                )
            )
        if self._head_resources:
            cmd.append("--resources='{}'".format(json.dumps(self._head_resources)))
        if _system_config is not None:
            cmd.append(
                "--system-config={}".format(
                    json.dumps(_system_config, separators=(",", ":"))
                )
            )
        env = os.environ.copy()
        env.update({"AUTOSCALER_UPDATE_INTERVAL_S": "1", "RAY_FAKE_CLUSTER": "1"})
        if self._autoscaler_v2:
            # Set the necessary environment variables for autoscaler v2.
            env.update(
                {
                    "RAY_enable_autoscaler_v2": "1",
                    "RAY_CLOUD_INSTANCE_ID": FAKE_HEAD_NODE_ID,
                    "RAY_OVERRIDE_NODE_ID_FOR_TESTING": FAKE_HEAD_NODE_ID,
                }
            )
        if override_env:
            env.update(override_env)
        subprocess.check_call(cmd, env=env)

    def shutdown(self):
        """Terminate the cluster."""
        subprocess.check_call(["ray", "stop", "--force"])


@DeveloperAPI
class Cluster:
    def __init__(
        self,
        initialize_head: bool = False,
        connect: bool = False,
        head_node_args: dict = None,
        shutdown_at_exit: bool = True,
    ):
        """Initializes all services of a Ray cluster.

        Args:
            initialize_head: Automatically start a Ray cluster
                by initializing the head node. Defaults to False.
            connect: If `initialize_head=True` and `connect=True`,
                ray.init will be called with the address of this cluster
                passed in.
            head_node_args: Arguments to be passed into
                `start_ray_head` via `self.add_node`.
            shutdown_at_exit: If True, registers an exit hook
                for shutting down all started processes.
        """
        if cluster_not_supported:
            logger.warning(
                "Ray cluster mode is currently experimental and untested on "
                "Windows. If you are using it and running into issues please "
                "file a report at https://github.com/ray-project/ray/issues."
            )
        self.head_node = None
        self.worker_nodes = set()
        self.redis_address = None
        self.connected = False
        # Create a new global state accessor for fetching GCS table.
        self.global_state = ray._private.state.GlobalState()
        self._shutdown_at_exit = shutdown_at_exit
        if not initialize_head and connect:
            raise RuntimeError("Cannot connect to uninitialized cluster.")

        if initialize_head:
            head_node_args = head_node_args or {}
            self.add_node(**head_node_args)
            if connect:
                self.connect()

    @property
    def gcs_address(self):
        if self.head_node is None:
            return None
        return self.head_node.gcs_address

    @property
    def address(self):
        return self.gcs_address

    def connect(self, namespace=None):
        """Connect the driver to the cluster."""
        assert self.address is not None
        assert not self.connected
        output_info = ray.init(
            namespace=namespace,
            ignore_reinit_error=True,
            address=self.address,
            _redis_username=self.redis_username,
            _redis_password=self.redis_password,
        )
        logger.info(output_info)
        self.connected = True

    def add_node(self, wait: bool = True, **node_args):
        """Adds a node to the local Ray Cluster.

        All nodes are by default started with the following settings:
            cleanup=True,
            num_cpus=1,
            object_store_memory=150 * 1024 * 1024  # 150 MiB

        Args:
            wait: Whether to wait until the node is alive.
            node_args: Keyword arguments used in `start_ray_head` and
                `start_ray_node`. Overrides defaults.

        Returns:
            Node object of the added Ray node.
        """
        default_kwargs = {
            "num_cpus": 1,
            "num_gpus": 0,
            "object_store_memory": 150 * 1024 * 1024,  # 150 MiB
            "min_worker_port": 0,
            "max_worker_port": 0,
        }
        ray_params = ray._private.parameter.RayParams(**node_args)
        ray_params.update_if_absent(**default_kwargs)
        with disable_client_hook():
            if self.head_node is None:
                node = ray._private.node.Node(
                    ray_params,
                    head=True,
                    shutdown_at_exit=self._shutdown_at_exit,
                    spawn_reaper=self._shutdown_at_exit,
                )
                self.head_node = node
                self.redis_address = self.head_node.redis_address
                self.redis_username = node_args.get(
                    "redis_username", ray_constants.REDIS_DEFAULT_USERNAME
                )
                self.redis_password = node_args.get(
                    "redis_password", ray_constants.REDIS_DEFAULT_PASSWORD
                )
                self.webui_url = self.head_node.webui_url
                # Init global state accessor when creating head node.
                gcs_options = GcsClientOptions.create(
                    node.gcs_address,
                    None,
                    allow_cluster_id_nil=True,
                    fetch_cluster_id_if_nil=False,
                )
                self.global_state._initialize_global_state(gcs_options)
                # Write the Ray cluster address for convenience in unit
                # testing. ray.init() and ray.init(address="auto") will connect
                # to the local cluster.
                ray._private.utils.write_ray_address(self.head_node.gcs_address)
            else:
                ray_params.update_if_absent(redis_address=self.redis_address)
                ray_params.update_if_absent(gcs_address=self.gcs_address)
                # We only need one log monitor per physical node.
                ray_params.update_if_absent(include_log_monitor=False)
                # Let grpc pick a port.
                ray_params.update_if_absent(node_manager_port=0)
                if "dashboard_agent_listen_port" not in node_args:
                    # Pick a random one to not conflict
                    # with the head node dashboard agent
                    ray_params.dashboard_agent_listen_port = None

                node = ray._private.node.Node(
                    ray_params,
                    head=False,
                    shutdown_at_exit=self._shutdown_at_exit,
                    spawn_reaper=self._shutdown_at_exit,
                )
                self.worker_nodes.add(node)

            if wait:
                # Wait for the node to appear in the client table. We do this
                # so that the nodes appears in the client table in the order
                # that the corresponding calls to add_node were made. We do
                # this because in the tests we assume that the driver is
                # connected to the first node that is added.
                self._wait_for_node(node)

        return node

    def remove_node(self, node, allow_graceful=True):
        """Kills all processes associated with worker node.

        Args:
            node: Worker node of which all associated processes
                will be removed.
        """
        global_node = ray._private.worker._global_node
        if global_node is not None:
            if node._raylet_socket_name == global_node._raylet_socket_name:
                ray.shutdown()
                raise ValueError(
                    "Removing a node that is connected to this Ray client "
                    "is not allowed because it will break the driver."
                    "You can use the get_other_node utility to avoid removing"
                    "a node that the Ray client is connected."
                )

        node.destroy_external_storage()
        if self.head_node == node:
            # We have to wait to prevent the raylet becomes a zombie which will prevent
            # worker from exiting
            self.head_node.kill_all_processes(
                check_alive=False, allow_graceful=allow_graceful, wait=True
            )
            self.head_node = None
            # TODO(rliaw): Do we need to kill all worker processes?
        else:
            # We have to wait to prevent the raylet becomes a zombie which will prevent
            # worker from exiting
            node.kill_all_processes(
                check_alive=False, allow_graceful=allow_graceful, wait=True
            )
            self.worker_nodes.remove(node)

        assert (
            not node.any_processes_alive()
        ), "There are zombie processes left over after killing."

    def _wait_for_node(self, node, timeout: float = 30):
        """Wait until this node has appeared in the client table.

        Args:
            node (ray._private.node.Node): The node to wait for.
            timeout: The amount of time in seconds to wait before raising an
                exception.

        Raises:
            TimeoutError: An exception is raised if the timeout expires before
                the node appears in the client table.
        """
        ray._private.services.wait_for_node(
            node.gcs_address,
            node.plasma_store_socket_name,
            timeout,
        )

    def wait_for_nodes(self, timeout: float = 30):
        """Waits for correct number of nodes to be registered.

        This will wait until the number of live nodes in the client table
        exactly matches the number of "add_node" calls minus the number of
        "remove_node" calls that have been made on this cluster. This means
        that if a node dies without "remove_node" having been called, this will
        raise an exception.

        Args:
            timeout: The number of seconds to wait for nodes to join
                before failing.

        Raises:
            TimeoutError: An exception is raised if we time out while waiting
                for nodes to join.
        """
        start_time = time.time()
        while time.time() - start_time < timeout:
            live_clients = self.global_state._live_node_ids()

            expected = len(self.list_all_nodes())
            if len(live_clients) == expected:
                logger.debug("All nodes registered as expected.")
                return
            else:
                logger.debug(
                    f"{len(live_clients)} nodes are currently registered, "
                    f"but we are expecting {expected}"
                )
                time.sleep(0.1)
        raise TimeoutError("Timed out while waiting for nodes to join.")

    def list_all_nodes(self):
        """Lists all nodes.

        TODO(rliaw): What is the desired behavior if a head node
        dies before worker nodes die?

        Returns:
            List of all nodes, including the head node.
        """
        nodes = list(self.worker_nodes)
        if self.head_node:
            nodes = [self.head_node] + nodes
        return nodes

    def remaining_processes_alive(self):
        """Returns a bool indicating whether all processes are alive or not.

        Note that this ignores processes that have been explicitly killed,
        e.g., via a command like node.kill_raylet().

        Returns:
            True if all processes are alive and false otherwise.
        """
        return all(node.remaining_processes_alive() for node in self.list_all_nodes())

    def shutdown(self):
        """Removes all nodes."""

        # We create a list here as a copy because `remove_node`
        # modifies `self.worker_nodes`.
        all_nodes = list(self.worker_nodes)
        for node in all_nodes:
            self.remove_node(node)

        if self.head_node is not None:
            self.remove_node(self.head_node)
        # need to reset internal kv since gcs is down
        ray.experimental.internal_kv._internal_kv_reset()
        # Delete the cluster address.
        ray._private.utils.reset_ray_address()