Skip to content

Commit

Permalink
[CoreEngine] optimize the policy to check job status on the slave agent.
Browse files Browse the repository at this point in the history
  • Loading branch information
fedml-alex committed Dec 7, 2023
1 parent 5d5dc12 commit 70b4429
Show file tree
Hide file tree
Showing 7 changed files with 41 additions and 22 deletions.
3 changes: 1 addition & 2 deletions devops/scripts/build-fedml-docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,7 @@ if [ "$build_arm_arch_images" != "" ]; then
cd $pwd
fi

cd ./installation/build_fedml_docker
docker build -f light/Dockerfile \
docker build -f ./installation/build_fedml_docker/light/Dockerfile \
--network=host \
-t ${DOCKER_REGISTRY}/fedml/fedml:light .
cd $pwd
8 changes: 7 additions & 1 deletion installation/build_fedml_docker/light/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,14 @@ ENV HOME /home/$USER
##############################################################################
# Add docker location file
##############################################################################
ADD ./light/docker-location.yml /home/fedml/fedml-client/fedml/data/docker-location.yml
ADD ./installation/build_fedml_docker/light/docker-location.yml /home/fedml/fedml-client/fedml/data/docker-location.yml

RUN pip3 install fedml

COPY ./python ./fedml/fedml-pip
WORKDIR ./fedml/fedml-pip
RUN pip3 install -e ./




2 changes: 1 addition & 1 deletion python/fedml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
_global_training_type = None
_global_comm_backend = None

__version__ = "0.8.12a58"
__version__ = "0.8.12a59"


# This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release
Expand Down
43 changes: 29 additions & 14 deletions python/fedml/computing/scheduler/comm_utils/job_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,20 +466,34 @@ def monitor_slave_run_process_status(self):
count += 1
if count >= 1000:
break
all_run_processes_exited = True

# Calc the timeout
started_time = int(float(job.started_time))
timeout = time.time() - started_time

# Check if all processes of the specific run are exited
run_process_list = client_constants.ClientConstants.get_learning_process_list(job.job_id)
for run_process_id in run_process_list:
try:
process = psutil.Process(int(run_process_id))
except Exception as e:
process = None
pass
if process is not None:
all_run_processes_exited = False

# If the run processes have exited but run status is not completed,
# then release gpu ids and report failed status to the master agent.
if all_run_processes_exited and not SchedulerConstants.is_run_completed(job.status):
all_run_processes_exited = True if len(run_process_list) <= 0 else False

# Get the timeout threshold
timeout_threshold = None
if job.status == client_constants.ClientConstants.MSG_MLOPS_CLIENT_STATUS_PROVISIONING or \
job.status == client_constants.ClientConstants.MSG_MLOPS_CLIENT_STATUS_QUEUED:
timeout_threshold = SchedulerConstants.TRAIN_PROVISIONING_TIMEOUT
elif job.status == client_constants.ClientConstants.MSG_MLOPS_CLIENT_STATUS_INITIALIZING or \
job.status == client_constants.ClientConstants.MSG_MLOPS_RUN_STATUS_STARTING or \
job.status == client_constants.ClientConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING:
timeout_threshold = SchedulerConstants.TRAIN_STARTING_TIMEOUT
elif job.status == client_constants.ClientConstants.MSG_MLOPS_CLIENT_STATUS_TRAINING or \
job.status == client_constants.ClientConstants.MSG_MLOPS_RUN_STATUS_RUNNING:
timeout_threshold = SchedulerConstants.TRAIN_RUNNING_TIMEOUT
elif job.status == client_constants.ClientConstants.MSG_MLOPS_RUN_STATUS_STOPPING:
timeout_threshold = SchedulerConstants.TRAIN_STOPPING_TIMEOUT

# If the run processes have exited but run status is not completed and
# timeout is out of the range, then release gpu ids and report failed status to the master agent.
if all_run_processes_exited and not SchedulerConstants.is_run_completed(job.status) and \
timeout_threshold is not None and timeout > timeout_threshold:
# Release the gpu ids
self.release_gpu_ids(job.job_id, job.edge_id)

Expand Down Expand Up @@ -509,7 +523,8 @@ def monitor_master_run_process_status(self, server_id, device_info_reporter=None
timeout_threshold = None
if job.status == server_constants.ServerConstants.MSG_MLOPS_SERVER_STATUS_PROVISIONING:
timeout_threshold = SchedulerConstants.TRAIN_PROVISIONING_TIMEOUT
elif job.status == server_constants.ServerConstants.MSG_MLOPS_SERVER_STATUS_STARTING:
elif job.status == server_constants.ServerConstants.MSG_MLOPS_SERVER_STATUS_STARTING or \
job.status == server_constants.ServerConstants.MSG_MLOPS_SERVER_STATUS_UPGRADING:
timeout_threshold = SchedulerConstants.TRAIN_STARTING_TIMEOUT
elif job.status == server_constants.ServerConstants.MSG_MLOPS_SERVER_STATUS_RUNNING:
timeout_threshold = SchedulerConstants.TRAIN_RUNNING_TIMEOUT
Expand Down
4 changes: 1 addition & 3 deletions python/fedml/computing/scheduler/master/server_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2500,9 +2500,7 @@ def setup_agent_mqtt_connection(self, service_config):
if not self.run_as_cloud_server:
self.recover_start_train_msg_after_upgrading()

JobRunnerUtils.get_instance().sync_run_process_gpu()
JobRunnerUtils.get_instance().sync_endpoint_process_gpu()
JobRunnerUtils.get_instance().reset_available_gpu_id_list(self.edge_id)
JobRunnerUtils.get_instance().sync_data_on_startup(self.edge_id)

self.master_api_daemon = MasterApiDaemon()
self.master_api_process = Process(target=self.master_api_daemon.run)
Expand Down
1 change: 1 addition & 0 deletions python/fedml/computing/scheduler/slave/client_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

class ClientConstants(object):
MSG_MLOPS_CLIENT_STATUS_OFFLINE = "OFFLINE"
MSG_MLOPS_CLIENT_STATUS_PROVISIONING = "PROVISIONING"
MSG_MLOPS_CLIENT_STATUS_IDLE = "IDLE"
MSG_MLOPS_CLIENT_STATUS_UPGRADING = "UPGRADING"
MSG_MLOPS_CLIENT_STATUS_QUEUED = "QUEUED"
Expand Down
2 changes: 1 addition & 1 deletion python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def finalize_options(self):

setup(
name="fedml",
version="0.8.12a58",
version="0.8.12a59",
author="FedML Team",
author_email="ch@fedml.ai",
description="A research and production integrated edge-cloud library for "
Expand Down

0 comments on commit 70b4429

Please sign in to comment.