diff --git a/gpustack/detectors/fastfetch/fastfetch.py b/gpustack/detectors/fastfetch/fastfetch.py index 0d50f1b1..c87f1fbb 100644 --- a/gpustack/detectors/fastfetch/fastfetch.py +++ b/gpustack/detectors/fastfetch/fastfetch.py @@ -222,23 +222,24 @@ def _decode_gpu_devices(self, result: str) -> GPUDevicesInfo: return devices def _run_command(self, command, parse_output=True): + result = None try: result = subprocess.run( command, capture_output=True, text=True, check=True, encoding="utf-8" ) - output = result.stdout if result.returncode != 0: raise Exception(f"Unexpected return code: {result.returncode}") + output = result.stdout if output == "" or output is None: raise Exception(f"Output is empty, return code: {result.returncode}") except Exception as e: - raise Exception( - f"Failed to execute {command}: {e}," - f" stdout: {result.stdout}, stderr: {result.stderr}" - ) + error_message = f"Failed to execute {command}: {e}" + if result: + error_message += f", stdout: {result.stdout}, stderr: {result.stderr}" + raise Exception(error_message) if not parse_output: return output diff --git a/gpustack/detectors/npu_smi/npu_smi.py b/gpustack/detectors/npu_smi/npu_smi.py index 5d554dd9..f3be690a 100644 --- a/gpustack/detectors/npu_smi/npu_smi.py +++ b/gpustack/detectors/npu_smi/npu_smi.py @@ -185,24 +185,25 @@ def decode_gpu_device_mapping(self, result: str) -> Dict[tuple[int], int]: return mapping def _run_command(self, command): + result = None try: result = subprocess.run( command, capture_output=True, text=True, check=True, encoding="utf-8" ) - output = result.stdout if result.returncode != 0: raise Exception(f"Unexpected return code: {result.returncode}") + output = result.stdout if output == "" or output is None: raise Exception(f"Output is empty, return code: {result.returncode}") return output except Exception as e: - raise Exception( - f"Failed to execute {command}: {e}," - f" stdout: {result.stdout}, stderr: {result.stderr}" - ) + error_message = f"Failed to execute {command}: {e}" + if result: + error_message += f", stdout: {result.stdout}, stderr: {result.stderr}" + raise Exception(error_message) def _command_gather_gpu(self): executable_command = [ diff --git a/gpustack/detectors/nvidia_smi/nvidia_smi.py b/gpustack/detectors/nvidia_smi/nvidia_smi.py index 9b268c36..1ac79a65 100644 --- a/gpustack/detectors/nvidia_smi/nvidia_smi.py +++ b/gpustack/detectors/nvidia_smi/nvidia_smi.py @@ -65,24 +65,25 @@ def decode_gpu_devices(self, result) -> GPUDevicesInfo: # noqa: C901 return devices def _run_command(self, command): + result = None try: result = subprocess.run( command, capture_output=True, text=True, check=True, encoding="utf-8" ) - output = result.stdout if result.returncode != 0: raise Exception(f"Unexpected return code: {result.returncode}") + output = result.stdout if output == "" or output is None: raise Exception(f"Output is empty, return code: {result.returncode}") return output except Exception as e: - raise Exception( - f"Failed to execute {command}: {e}," - f" stdout: {result.stdout}, stderr: {result.stderr}" - ) + error_message = f"Failed to execute {command}: {e}" + if result: + error_message += f", stdout: {result.stdout}, stderr: {result.stderr}" + raise Exception(error_message) def _command_gather_gpu(self): executable_command = [ diff --git a/gpustack/worker/collector.py b/gpustack/worker/collector.py index d02d102a..5e702ac8 100755 --- a/gpustack/worker/collector.py +++ b/gpustack/worker/collector.py @@ -43,20 +43,25 @@ def __init__( def collect(self) -> Worker: # noqa: C901 """Collect worker status information.""" + status = WorkerStatus() - system_info = self._detector_factory.detect_system_info() - gpu_devices = self._detector_factory.detect_gpus() - - status = WorkerStatus( - gpu_devices=gpu_devices, - cpu=system_info.cpu, - memory=system_info.memory, - swap=system_info.swap, - filesystem=system_info.filesystem, - os=system_info.os, - kernel=system_info.kernel, - uptime=system_info.uptime, - ) + try: + system_info = self._detector_factory.detect_system_info() + status.cpu = system_info.cpu + status.memory = system_info.memory + status.swap = system_info.swap + status.filesystem = system_info.filesystem + status.os = system_info.os + status.kernel = system_info.kernel + status.uptime = system_info.uptime + except Exception as e: + logger.error(f"Failed to detect system info: {e}") + + try: + gpu_devices = self._detector_factory.detect_gpus() + status.gpu_devices = gpu_devices + except Exception as e: + logger.error(f"Failed to detect GPU devices: {e}") self._inject_unified_memory(status) self._inject_computed_filesystem_usage(status) @@ -84,7 +89,8 @@ def _inject_unified_memory(self, status: WorkerStatus): if status.gpu_devices is not None and len(status.gpu_devices) != 0: is_unified_memory = status.gpu_devices[0].memory.is_unified_memory - status.memory.is_unified_memory = is_unified_memory + if status.memory is not None: + status.memory.is_unified_memory = is_unified_memory def _inject_computed_filesystem_usage(self, status: WorkerStatus): if ( @@ -114,7 +120,9 @@ def _inject_computed_filesystem_usage(self, status: WorkerStatus): except Exception as e: logger.error(f"Failed to inject filesystem usage: {e}") - def _inject_allocated_resource(self, status: WorkerStatus) -> Allocated: + def _inject_allocated_resource( # noqa: C901 + self, status: WorkerStatus + ) -> Allocated: allocated = Allocated(ram=0, vram={}) try: model_instances = self._clientset.model_instances.list() @@ -136,8 +144,10 @@ def _inject_allocated_resource(self, status: WorkerStatus) -> Allocated: ) + (vram.get(gpu_index) or 0) # inject allocated resources - status.memory.allocated = allocated.ram - for ag, agv in allocated.vram.items(): - status.gpu_devices[ag].memory.allocated = agv + if status.memory is not None: + status.memory.allocated = allocated.ram + if status.gpu_devices is not None: + for ag, agv in allocated.vram.items(): + status.gpu_devices[ag].memory.allocated = agv except Exception as e: logger.error(f"Failed to inject allocated resources: {e}")