Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[core][compiled-graphs] Support wait-and-get to round-robin the acquisition of mutable objects allowing for fast failure #49444

Closed
wants to merge 30 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
e631160
pass tests
kevin85421 Dec 26, 2024
bf585fa
pass tests, refactor
kevin85421 Dec 26, 2024
bf7a26e
pass tests, waitables
kevin85421 Dec 26, 2024
93bff72
update
kevin85421 Dec 26, 2024
d68f8db
pass tests
kevin85421 Dec 27, 2024
aa7b831
pass tests, retrieve obj one by one in sync reader
kevin85421 Dec 27, 2024
a22bbaa
update
kevin85421 Dec 30, 2024
d1aac6c
update comment and move import to top-level
kevin85421 Dec 30, 2024
a84e561
remove logs and update comments for WaitAndGetExperimentalMutableObjects
kevin85421 Dec 30, 2024
9e44591
update comments
kevin85421 Dec 30, 2024
3ddad3a
add some utils
kevin85421 Dec 30, 2024
4ca28e9
fix test_channel tests
kevin85421 Dec 30, 2024
d8dadb4
update
kevin85421 Dec 31, 2024
5d18be7
Merge remote-tracking branch 'upstream/master' into 20241224-2
kevin85421 Dec 31, 2024
cddaef7
fix test_channel
kevin85421 Dec 31, 2024
abd4d28
update type hint
kevin85421 Dec 31, 2024
a684329
remove c++ log
kevin85421 Dec 31, 2024
2a1b3fc
update _read_list
kevin85421 Dec 31, 2024
9a3ae27
refactor
kevin85421 Dec 31, 2024
9f2f1d1
remove comment for visualize tests
kevin85421 Dec 31, 2024
ba489c3
add tests
kevin85421 Dec 31, 2024
ebbee6f
address comments
kevin85421 Dec 31, 2024
8ebe78c
move retrieve_obj_refs to util
kevin85421 Dec 31, 2024
d958b24
fix lint error
kevin85421 Dec 31, 2024
551bacb
fix nccl channel tests
kevin85421 Jan 1, 2025
1ae976b
fix test
kevin85421 Jan 1, 2025
1cc8210
fix typo
kevin85421 Jan 1, 2025
8c4e61d
refactor
kevin85421 Jan 1, 2025
6d22187
Merge remote-tracking branch 'upstream/master' into 20241224-2
kevin85421 Jan 6, 2025
38c8652
fix lint
kevin85421 Jan 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
update
Signed-off-by: Kai-Hsun Chen <kaihsun@anyscale.com>
  • Loading branch information
kevin85421 committed Dec 31, 2024
commit d8dadb47e95d518e2b62fce3327b611d1303c275
2 changes: 1 addition & 1 deletion python/ray/_private/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -965,7 +965,7 @@ def experimental_wait_and_get_mutable_objects(
)

if skip_deserialization:
return None, set()
return data_metadata_pairs, set()

non_complete_object_refs_set = set()
for i, (data, _) in enumerate(data_metadata_pairs):
Expand Down
110 changes: 86 additions & 24 deletions python/ray/experimental/channel/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,9 +256,12 @@ def read(self, timeout: Optional[float] = None) -> Any:
"""
raise NotImplementedError

def get_ray_waitables(self) -> List[ObjectRef]:
def get_ray_waitables(self) -> List[Tuple[ObjectRef, bool]]:
"""
Get the ObjectRefs that will be read in the next read() call.
Get a list of tuples containing an ObjectRef and a boolean flag.
The flag indicates whether the ObjectRef should skip deserialization
in `experimental_wait_and_get_mutable_objects` and instead be
deserialized in the channel's `read()` method.
"""
raise NotImplementedError

Expand Down Expand Up @@ -308,11 +311,20 @@ def _read_list(self, timeout: Optional[float] = None) -> List[Any]:

def _get_all_waitables_to_num_consumers(self) -> Dict[ObjectRef, int]:
waitable_to_num_consumers = {}
skip_deserialization_waitables_to_num_consumers = {}
for c in self._input_channels:
waitables = c.get_ray_waitables()
for w in waitables:
waitable_to_num_consumers[w] = waitable_to_num_consumers.get(w, 0) + 1
return waitable_to_num_consumers
for waitable, skip_deserialization in waitables:
target_dict = (
skip_deserialization_waitables_to_num_consumers
if skip_deserialization
else waitable_to_num_consumers
)
target_dict[waitable] = target_dict.get(waitable, 0) + 1
return (
waitable_to_num_consumers,
skip_deserialization_waitables_to_num_consumers,
)

def read(self, timeout: Optional[float] = None) -> List[Any]:
"""
Expand Down Expand Up @@ -377,30 +389,48 @@ def _read_list(self, timeout: Optional[float] = None) -> List[Any]:
timeout = 1e6 if timeout is None or timeout == -1 else timeout
self._consume_non_complete_object_refs_if_needed(timeout)

waitable_to_num_consumers = self._get_all_waitables_to_num_consumers()
all_waitables = list(waitable_to_num_consumers.keys())
(
waitables_to_num_consumers,
skip_deserialization_waitables_to_num_consumers,
) = self._get_all_waitables_to_num_consumers()
normal_waitables = list(waitables_to_num_consumers.keys())
skip_deserialization_waitables = list(
skip_deserialization_waitables_to_num_consumers.keys()
)
Comment on lines +454 to +461
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

looks like these are static? should we do it at init time?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

move to ReaderInterface constructor.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After giving it a second thought, I realized it is not static. For example, the get_ray_waitables method of BufferedSharedMemoryChannel should return the buffer that will be read in the current read operation. Therefore, the return value of get_ray_waitables is not always the same.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

    def get_ray_waitables(self) -> List[Tuple[ObjectRef, bool]]:
        """Return the (ObjectRef, skip_deserialization) pairs for the next read.

        Delegates to the buffer that the upcoming read() call will consume,
        so the returned waitables track the channel's current read position.
        """
        self.ensure_registered_as_reader()
        active_buffer = self._buffers[self._next_read_index]
        return active_buffer.get_ray_waitables()


timeout_point = time.monotonic() + timeout
worker = ray._private.worker.global_worker
while len(all_waitables) > 0:
while len(normal_waitables) > 0 or len(skip_deserialization_waitables) > 0:
# Retrieve at most one object each time.
use_normal_waitables = len(normal_waitables) > 0
target_waitable_group = (
normal_waitables
if use_normal_waitables
else skip_deserialization_waitables
)
target_waitable_group_num_consumers = (
waitables_to_num_consumers
if use_normal_waitables
else skip_deserialization_waitables_to_num_consumers
)
(
values,
non_complete_object_refs_set,
) = worker.experimental_wait_and_get_mutable_objects(
all_waitables,
target_waitable_group,
num_returns=1,
timeout_ms=max(0, (timeout_point - time.monotonic()) * 1000),
return_exceptions=True,
skip_deserialization=not use_normal_waitables,
suppress_timeout_errors=True,
)
ctx = ChannelContext.get_current().serialization_context
for i, value in enumerate(values):
if all_waitables[i] in non_complete_object_refs_set:
if target_waitable_group[i] in non_complete_object_refs_set:
continue
if isinstance(value, ray.exceptions.RayTaskError):
self._non_complete_object_refs = list(non_complete_object_refs_set)
for w in all_waitables:
for w in target_waitable_group:
ctx.reset_data(w)
# If we raise an exception immediately, it will be considered
# as a system error which will cause the execution loop to
Expand All @@ -411,17 +441,21 @@ def _read_list(self, timeout: Optional[float] = None) -> List[Any]:
# get an undefined partial result.
kevin85421 marked this conversation as resolved.
Show resolved Hide resolved
return [value for _ in range(len(self._input_channels))]
ctx.set_data(
all_waitables[i],
target_waitable_group[i],
value,
waitable_to_num_consumers[all_waitables[i]],
target_waitable_group_num_consumers[target_waitable_group[i]],
)
all_waitables = list(non_complete_object_refs_set)
if time.monotonic() > timeout_point and len(all_waitables) != 0:
target_waitable_group = list(non_complete_object_refs_set)
if time.monotonic() > timeout_point and len(target_waitable_group) != 0:
# This ensures that the reader attempts to retrieve
# data once even when the `timeout` is 0.
raise ray.exceptions.RayChannelTimeoutError(
"Timed out waiting for channel data."
)
if use_normal_waitables:
normal_waitables = target_waitable_group
else:
skip_deserialization_waitables = target_waitable_group

results = []
for c in self._input_channels:
Expand Down Expand Up @@ -467,32 +501,60 @@ def start(self):
self._background_task = asyncio.ensure_future(self.run())

def _run(self):
# TODO(kevin85421): Consume non-complete object refs.
# TODO(kevin85421): Consume waitable one by one.
(
waitables_to_num_consumers,
skip_deserialization_waitables_to_num_consumers,
) = self._get_all_waitables_to_num_consumers()
normal_waitables = list(waitables_to_num_consumers.keys())
skip_deserialization_waitables = list(
skip_deserialization_waitables_to_num_consumers.keys()
)

results = []
waitable_to_num_consumers = self._get_all_waitables_to_num_consumers()
all_waitables = list(waitable_to_num_consumers.keys())

worker = ray._private.worker.global_worker
while len(all_waitables) > 0:
while len(normal_waitables) > 0 or len(skip_deserialization_waitables) > 0:
use_normal_waitables = len(normal_waitables) > 0
target_waitable_group = (
normal_waitables
if use_normal_waitables
else skip_deserialization_waitables
)
target_waitable_group_num_consumers = (
waitables_to_num_consumers
if use_normal_waitables
else skip_deserialization_waitables_to_num_consumers
)

(
values,
non_complete_object_refs_set,
) = worker.experimental_wait_and_get_mutable_objects(
all_waitables,
len(all_waitables),
target_waitable_group,
num_returns=len(target_waitable_group),
timeout_ms=1000,
return_exceptions=True,
skip_deserialization=not use_normal_waitables,
suppress_timeout_errors=True,
)

ctx = ChannelContext.get_current().serialization_context
for i, value in enumerate(values):
if all_waitables[i] in non_complete_object_refs_set:
if target_waitable_group[i] in non_complete_object_refs_set:
continue
ctx.set_data(
all_waitables[i],
target_waitable_group[i],
value,
waitable_to_num_consumers[all_waitables[i]],
target_waitable_group_num_consumers[target_waitable_group[i]],
)
all_waitables = list(non_complete_object_refs_set)

target_waitable_group = list(non_complete_object_refs_set)
if use_normal_waitables:
normal_waitables = target_waitable_group
else:
skip_deserialization_waitables = target_waitable_group
if sys.is_finalizing():
return results

Expand Down
4 changes: 2 additions & 2 deletions python/ray/experimental/channel/shared_memory_channel.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,9 +500,9 @@ def read(self, timeout: Optional[float] = None) -> Any:
ret = rets[0]
return ret

def get_ray_waitables(self) -> List[ObjectRef]:
def get_ray_waitables(self) -> List[Tuple[ObjectRef, bool]]:
self.ensure_registered_as_reader()
return [self._local_reader_ref]
return [(self._local_reader_ref, False)]

def release_buffer(self, timeout: Optional[float] = None) -> None:
assert (
Expand Down
56 changes: 50 additions & 6 deletions python/ray/experimental/channel/torch_tensor_nccl_channel.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,10 @@ def ensure_registered_as_writer(self):
self._cpu_data_channel.ensure_registered_as_writer()

def ensure_registered_as_reader(self):
reader = utils.get_self_actor()
if reader == self._writer:
self._local_channel.ensure_registered_as_reader()
return
self._gpu_data_channel.ensure_registered_as_reader()
if self._cpu_data_channel is not None:
self._cpu_data_channel.ensure_registered_as_reader()
Expand Down Expand Up @@ -194,12 +198,31 @@ def _send_cpu_and_gpu_data(self, value: Any, timeout: Optional[float]):
# normally.
self.serialization_ctx.set_use_external_transport(False)

# First send the extracted tensors through a GPU-specific channel.
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NCCL write -> NCCL read -> all mutable objects are ready -> _cpu_data_channel.write -> NCCL write

self._gpu_data_channel.write(gpu_tensors)
# Next send the non-tensor data through a CPU-specific channel.
# The `write` operation of the shared memory channel must be called
# before the `write` operation of the GPU channel. This is because in
# `_read_list`, the channel's `read` operation waits for all underlying
# mutable objects for all input channels to be consumed.
#
# Step 1: `_cpu_data_channel.write` is called to write data into the
# mutable object.
# Step 2: `_read_list` consumes the mutable object.
# Step 3: After all underlying mutable objects of all input channels are
# consumed, `read` is called in the receiver of the NCCL channel.
#
# If we call NCCL write before the CPU channel write, then the shared
# memory channel's `write` operation will block because the NCCL write
# operation blocks forever until the NCCL read operation is called. However,
# the `read` operation of the NCCL channel will never be called because
# `_read_list` will never consume the mutable object that hasn't been
# written yet.

# First send the non-tensor data through a CPU-specific channel. The
# data contains placeholders for the extracted tensors.
self._cpu_data_channel.write(cpu_data)

# Next send the extracted tensors through a GPU-specific channel.
self._gpu_data_channel.write(gpu_tensors)

def write(self, value: Any, timeout: Optional[float] = None) -> None:
"""
Send a value that may contain torch.Tensors that should be sent via
Expand Down Expand Up @@ -275,17 +298,29 @@ def _recv_cpu_and_gpu_data(
# Next, read and deserialize the non-tensor data. The registered custom
# deserializer will replace the found tensor placeholders with
# `tensors`.
data = self._cpu_data_channel.read(
#
# We need to deserialize the CPU data channel first in `read` instead of
# `_read_list` because the deserialization of the CPU data channel relies
# on the out-of-band tensors in the serialization context. Therefore, the
# `read` method of the NCCL channel must be called first to ensure that
# the out-of-band tensors are ready.
serialized_data, metadata = self._cpu_data_channel.read(
timeout=timeout,
)
rets = self._worker.deserialize_objects(
[(serialized_data, metadata)], self._cpu_data_channel.get_ray_waitables()
)
assert len(rets) == 1
ret = rets[0]

# Check that all placeholders had a corresponding tensor.
(
_,
deserialized_tensor_placeholders,
) = self.serialization_ctx.reset_out_of_band_tensors([])
assert deserialized_tensor_placeholders == set(range(len(tensors)))

return data
return ret

def read(self, timeout: Optional[float] = None) -> Any:
"""
Expand Down Expand Up @@ -327,10 +362,19 @@ def read(self, timeout: Optional[float] = None) -> Any:

def get_ray_waitables(self) -> List[ObjectRef]:
kevin85421 marked this conversation as resolved.
Show resolved Hide resolved
self.ensure_registered_as_reader()
reader = utils.get_self_actor()
if reader == self._writer:
return self._local_channel.get_ray_waitables()
waitables = []
waitables.extend(self._gpu_data_channel.get_ray_waitables())
if self._cpu_data_channel is not None:
waitables.extend(self._cpu_data_channel.get_ray_waitables())
cpu_waitables = self._cpu_data_channel.get_ray_waitables()
assert len(cpu_waitables) == 1
# Skip deserialization of the CPU data in `_read_list` and
# handle the deserialization in the channel's `read()` method
# after the out-of-band tensors are ready in the serialization
# context instead.
waitables.append((cpu_waitables[0][0], True))
return waitables

def close(self) -> None:
Expand Down
Loading