Skip to content

Commit

Permalink
[CoreEngine] 1. use the raw python as the entry command of devops doc…
Browse files Browse the repository at this point in the history
…kers.

2. fix the issue for launch apis.
3. change the wording of login & logout.
  • Loading branch information
fedml-alex committed Oct 24, 2023
1 parent 3326572 commit e19b1c3
Show file tree
Hide file tree
Showing 20 changed files with 49 additions and 38 deletions.
2 changes: 1 addition & 1 deletion devops/dockerfile/client-agent/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@ WORKDIR /fedml

ENV ACCOUNT_ID=0 FEDML_VERSION=${VERSION} CLIENT_DEVICE_ID=0 CLIENT_OS_NAME=linux

CMD fedml login ${ACCOUNT_ID} -v ${FEDML_VERSION} -c -id ${CLIENT_DEVICE_ID} -os ${CLIENT_OS_NAME}; ./runner.sh
CMD python3 ./fedml-pip/fedml/computing/scheduler/slave/client_daemon.py -t login -r client -u ${ACCOUNT_ID} -v ${FEDML_VERSION} -id ${CLIENT_DEVICE_ID} -os ${CLIENT_OS_NAME}; ./runner.sh
2 changes: 1 addition & 1 deletion devops/dockerfile/device-image/Dockerfile-Dev
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ ENV MODE=normal FEDML_VERSION=${VERSION} ACCOUNT_ID=0 SERVER_DEVICE_ID=0 \
FEDML_PACKAGE_NAME=package FEDML_PACKAGE_URL=s3_url \
FEDML_RUNNER_CMD=3dsad

CMD fedml login ${ACCOUNT_ID} -v ${FEDML_VERSION} -s -r cloud_server -rc ${FEDML_RUNNER_CMD} -id ${SERVER_DEVICE_ID}; ./runner.sh
CMD python3 ./fedml-pip/fedml/computing/scheduler/master/server_daemon.py -t login -u ${ACCOUNT_ID} -v ${FEDML_VERSION} -r cloud_server -rc ${FEDML_RUNNER_CMD} -id ${SERVER_DEVICE_ID}; ./runner.sh
2 changes: 1 addition & 1 deletion devops/dockerfile/device-image/Dockerfile-Release
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ ENV MODE=normal FEDML_VERSION=${VERSION} ACCOUNT_ID=0 SERVER_DEVICE_ID=0 \
FEDML_PACKAGE_NAME=package FEDML_PACKAGE_URL=s3_url \
FEDML_RUNNER_CMD=3dsad

CMD fedml login ${ACCOUNT_ID} -v ${FEDML_VERSION} -s -r cloud_server -rc ${FEDML_RUNNER_CMD} -id ${SERVER_DEVICE_ID}; ./runner.sh
CMD python3 ./fedml-pip/fedml/computing/scheduler/master/server_daemon.py -t login -u ${ACCOUNT_ID} -v ${FEDML_VERSION} -r cloud_server -rc ${FEDML_RUNNER_CMD} -id ${SERVER_DEVICE_ID}; ./runner.sh
2 changes: 1 addition & 1 deletion devops/dockerfile/device-image/Dockerfile-Test
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ ENV MODE=normal FEDML_VERSION=${VERSION} ACCOUNT_ID=0 SERVER_DEVICE_ID=0 \
FEDML_PACKAGE_NAME=package FEDML_PACKAGE_URL=s3_url \
FEDML_RUNNER_CMD=3dsad

CMD fedml login ${ACCOUNT_ID} -v ${FEDML_VERSION} -s -r cloud_server -rc ${FEDML_RUNNER_CMD} -id ${SERVER_DEVICE_ID}; ./runner.sh
CMD python3 ./fedml-pip/fedml/computing/scheduler/master/server_daemon.py -t login -u ${ACCOUNT_ID} -v ${FEDML_VERSION} -r cloud_server -rc ${FEDML_RUNNER_CMD} -id ${SERVER_DEVICE_ID}; ./runner.sh
2 changes: 1 addition & 1 deletion devops/dockerfile/edge-server/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ WORKDIR /fedml

ENV ACCOUNT_ID=0 FEDML_VERSION=${VERSION} SERVER_DEVICE_ID=0 SERVER_OS_NAME=linux

CMD fedml login ${ACCOUNT_ID} -v ${FEDML_VERSION} -s -id ${SERVER_DEVICE_ID} -os ${SERVER_OS_NAME};./runner.sh
CMD python3 ./fedml-pip/fedml/computing/scheduler/master/server_daemon.py -t login -u ${ACCOUNT_ID} -v ${FEDML_VERSION} -r edge_server -id ${SERVER_DEVICE_ID} -os ${SERVER_OS_NAME};./runner.sh
2 changes: 1 addition & 1 deletion devops/dockerfile/model-premise-master/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,6 @@ WORKDIR /fedml
ENV ACCOUNT_ID=0 FEDML_VERSION=${VERSION} SERVER_DEVICE_ID=0 SERVER_OS_NAME=linux INFER_HOST="127.0.0.1" \
FEDML_REDIS_ADDR="127.0.0.1" FEDML_REDIS_PORT=6379 FEDML_REDIS_PASSWORD="fedml_default"

CMD fedml model device login ${ACCOUNT_ID} -v ${FEDML_VERSION} -p -m \
CMD python3 ./fedml-pip/fedml/computing/scheduler/model_scheduler/device_server_daemon.py -t login -u ${ACCOUNT_ID} -v ${FEDML_VERSION} -r md.on_premise_device.master \
-ih ${INFER_HOST} -id ${SERVER_DEVICE_ID} -os ${SERVER_OS_NAME} \
-ra ${FEDML_REDIS_ADDR} -rp ${FEDML_REDIS_PORT} -rpw ${FEDML_REDIS_PASSWORD};./runner.sh
2 changes: 1 addition & 1 deletion devops/dockerfile/model-premise-slave/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,5 @@ WORKDIR /fedml

ENV ACCOUNT_ID=0 FEDML_VERSION=${VERSION} CLIENT_DEVICE_ID=0 CLIENT_OS_NAME=linux INFER_HOST="127.0.0.1"

CMD fedml model device login ${ACCOUNT_ID} -v ${FEDML_VERSION} -p \
CMD python3 ./fedml-pip/fedml/computing/scheduler/model_scheduler/device_client_daemon.py -t login -u ${ACCOUNT_ID} -v ${FEDML_VERSION} -r client \
-id ${CLIENT_DEVICE_ID} -os ${CLIENT_OS_NAME} -ih ${INFER_HOST}; ./runner.sh
2 changes: 1 addition & 1 deletion devops/dockerfile/multi-stages-build/Dockerfile-Traing-Dev
Original file line number Diff line number Diff line change
Expand Up @@ -114,4 +114,4 @@ COPY --from=fedml-image-base /fedml/fedml-pip/python/fedml ${FEDML_PIP_HOME}

ENV ACCOUNT_ID=0 FEDML_VERSION=${VERSION} SERVER_DEVICE_ID=0 SERVER_OS_NAME=linux

CMD fedml login ${ACCOUNT_ID} -v ${FEDML_VERSION} -s -id ${SERVER_DEVICE_ID} -os ${SERVER_OS_NAME};./runner.sh
CMD python3 ./fedml-pip/fedml/computing/scheduler/master/server_daemon.py -t login -u ${ACCOUNT_ID} -v ${FEDML_VERSION} -r edge_server -id ${SERVER_DEVICE_ID} -os ${SERVER_OS_NAME};./runner.sh
2 changes: 1 addition & 1 deletion devops/dockerfile/server-agent/Dockerfile-Dev
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ ENV MODE=normal FEDML_VERSION=${VERSION} ACCOUNT_ID=0 SERVER_AGENT_ID=0 \
AWS_IAM_ACCESS_KEY=0 \
AWS_REGION=0

CMD ./set-aws-credentials.sh ${AWS_IAM_ACCESS_ID} ${AWS_IAM_ACCESS_KEY} ${AWS_REGION};fedml login ${ACCOUNT_ID} -v ${FEDML_VERSION} -s -r cloud_agent -id ${SERVER_AGENT_ID};./runner.sh
CMD ./set-aws-credentials.sh ${AWS_IAM_ACCESS_ID} ${AWS_IAM_ACCESS_KEY} ${AWS_REGION};python3 ./fedml-pip/fedml/computing/scheduler/master/server_daemon.py -t login -u ${ACCOUNT_ID} -v ${FEDML_VERSION} -r cloud_agent -id ${SERVER_AGENT_ID};./runner.sh
2 changes: 1 addition & 1 deletion devops/dockerfile/server-agent/Dockerfile-Release
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ ENV MODE=normal FEDML_VERSION=${VERSION} ACCOUNT_ID=0 SERVER_AGENT_ID=0 \
AWS_IAM_ACCESS_KEY=0 \
AWS_REGION=0

CMD ./set-aws-credentials.sh ${AWS_IAM_ACCESS_ID} ${AWS_IAM_ACCESS_KEY} ${AWS_REGION};fedml login ${ACCOUNT_ID} -v ${FEDML_VERSION} -s -r cloud_agent -id ${SERVER_AGENT_ID};./runner.sh
CMD ./set-aws-credentials.sh ${AWS_IAM_ACCESS_ID} ${AWS_IAM_ACCESS_KEY} ${AWS_REGION};python3 ./fedml-pip/fedml/computing/scheduler/master/server_daemon.py -t login -u ${ACCOUNT_ID} -v ${FEDML_VERSION} -r cloud_agent -id ${SERVER_AGENT_ID};./runner.sh
2 changes: 1 addition & 1 deletion devops/dockerfile/server-agent/Dockerfile-Test
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ ENV MODE=normal FEDML_VERSION=${VERSION} ACCOUNT_ID=0 SERVER_AGENT_ID=0 \
AWS_IAM_ACCESS_KEY=0 \
AWS_REGION=0

CMD ./set-aws-credentials.sh ${AWS_IAM_ACCESS_ID} ${AWS_IAM_ACCESS_KEY} ${AWS_REGION};fedml login ${ACCOUNT_ID} -v ${FEDML_VERSION} -s -r cloud_agent -id ${SERVER_AGENT_ID};./runner.sh
CMD ./set-aws-credentials.sh ${AWS_IAM_ACCESS_ID} ${AWS_IAM_ACCESS_KEY} ${AWS_REGION};python3 ./fedml-pip/fedml/computing/scheduler/master/server_daemon.py -t login -u ${ACCOUNT_ID} -v ${FEDML_VERSION} -r cloud_agent -id ${SERVER_AGENT_ID};./runner.sh
4 changes: 2 additions & 2 deletions devops/scripts/edge-client-server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ FEDML_DEVICE_ID=0
FEDML_OS_NAME=linux

if [ "${ROLE}" == "client" ]; then
fedml login ${ACCOUNT_ID} -v ${FEDML_VERSION} -c -id ${FEDML_DEVICE_ID} -os ${FEDML_OS_NAME}
python3 ./fedml-pip/fedml/computing/scheduler/master/server_daemon.py -t login -u ${ACCOUNT_ID} -v ${FEDML_VERSION} -r client -id ${FEDML_DEVICE_ID} -os ${FEDML_OS_NAME}
else
fedml login ${ACCOUNT_ID} -v ${FEDML_VERSION} -s -id ${FEDML_DEVICE_ID} -os ${FEDML_OS_NAME}
python3 ./fedml-pip/fedml/computing/scheduler/master/server_daemon.py -t login -u ${ACCOUNT_ID} -v ${FEDML_VERSION} -r edge_server -id ${FEDML_DEVICE_ID} -os ${FEDML_OS_NAME}
fi

cur_loop=1
Expand Down
16 changes: 8 additions & 8 deletions python/fedml/api/api_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,17 @@
exit(1)

# Get job status
log_result = fedml.api.run_logs(job_id, 1, 100)
if log_result.run_status is None:
print(f"Failed to get job status. Reason: {error_msg}")
log_result = fedml.api.run_logs(launch_result.run_id, 1, 100)
if log_result is None or log_result.run_status is None:
print(f"Failed to get job status.")
exit(1)
print(f"Run status {run_status}")
print(f"Run status {log_result.run_status}")

# Get job logs
time.sleep(30)
log_result = fedml.api.run_logs(job_id, 1, 100)
if log_result.run_status is None:
print(f"Failed to get run logs. Reason: {error_msg}")
log_result = fedml.api.run_logs(launch_result.run_id, 1, 100)
if log_result is None or log_result.run_status is None:
print(f"Failed to get run logs.")
exit(1)
print(f"Run logs {log_line_list}")
print(f"Run logs {log_result.log_line_list}")

6 changes: 4 additions & 2 deletions python/fedml/api/modules/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,12 +125,14 @@ def job(yaml_file, api_key: str, resource_id: str = None, device_server: str = N
result_message= ApiConstants.LAUNCH_JOB_STATUS_REQUEST_FAILED, run_id=run_id,
project_id=project_id, inner_id=inner_id)

if run_result.job_url == "":
if run_result.run_url == "":
return LaunchResult(result_code=ApiConstants.ERROR_CODE[ApiConstants.LAUNCH_JOB_STATUS_JOB_URL_ERROR],
result_message=ApiConstants.LAUNCH_JOB_STATUS_JOB_URL_ERROR, run_id=run_id,
project_id=project_id, inner_id=inner_id)

return run_id, project_id, inner_id, 0, ""
return LaunchResult(result_code=0,
result_message="", run_id=run_id,
project_id=project_id, inner_id=inner_id)


def job_on_cluster(yaml_file, cluster: str, api_key: str, resource_id: str, device_server: str,
Expand Down
14 changes: 7 additions & 7 deletions python/fedml/cli/modules/device.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,19 @@ def fedml_device():
)
@click.option(
"--compute_node", "-c", default=None, is_flag=True,
help="Login as the general compute node in FEDML Nexus AI compute network. This is enabled by default. "
"After login, you can view and manage the device in the FEDML® Nexus AI Platform: https://nexus.fedml.ai/compute. "
help="Bind as the general compute node in FEDML Nexus AI compute network. This is enabled by default. "
"After binding, you can view and manage the device in the FEDML® Nexus AI Platform: https://nexus.fedml.ai/compute. "
"It can be grouped as a cluster and then you can use FEDML®Launch to schedule any job (training, deployment, federated learning) to it. "
"You can not specify the option -c and -s simultaneously.",
)
@click.option(
"--server", "-s", default=None, is_flag=True,
help="Login as the FedML on-premise parameter server (PS). It can be used for PS-based training paradigms, such as distributed training, cross-cloud training, and federated-learning. "
help="Bind as the FedML on-premise parameter server (PS). It can be used for PS-based training paradigms, such as distributed training, cross-cloud training, and federated-learning. "
"You can not specify the option -c and -s simultaneously for a single environment.",
)
@click.option(
"--provider", "-p", default=None, is_flag=True,
help="Login as the FedML compute node (GPU) provider (supplier). This is used by Nexus AI Platform - Share and Earn: https://nexus.fedml.ai/gpu-supplier. You can share your GPUs in this way and earn money. "
help="Bind as the FedML compute node (GPU) provider (supplier). This is used by Nexus AI Platform - Share and Earn: https://nexus.fedml.ai/gpu-supplier. You can share your GPUs in this way and earn money. "
"You can specify the option -p and -c simultaneously (can be used as provider for others as well compute node for your own jobs), but you can not specify -p and -s simultaneously.",
)
def fedml_device_bind(api_key, version, compute_node, server, provider):
Expand All @@ -54,13 +54,13 @@ def fedml_device_bind(api_key, version, compute_node, server, provider):
"-v",
type=str,
default="release",
help="Unbind from which version of FedML® Launch platform. It should be dev, test or release",
help="Unbind which backend environment version of FedML® Nexus AI Platform. It should be dev, test, or release.",
)
@click.option(
"--computing", "-c", default=None, is_flag=True, help="Unbind from the FedML general computing device.",
"--compute_node", "-c", default=None, is_flag=True, help="Unbind from the FedML general compute node.",
)
@click.option(
"--server", "-s", default=None, is_flag=True, help="Unbind from the FedML on-premise federated-learning server.",
"--server", "-s", default=None, is_flag=True, help="Unbind from the FedML on-premise parameter server (PS).",
)
def fedml_device_unbind(version, computing, server):
fedml.set_env_version(version)
Expand Down
6 changes: 3 additions & 3 deletions python/fedml/cli/modules/logout.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,17 @@
@click.command("logout", help="Logout from the FedML® Nexus AI Platform")
@click.help_option("--help", "-h")
@click.option(
"--computing", "-c", default=None, is_flag=True, help="Logout from the FedML general computing device.",
"--computing", "-c", default=None, is_flag=True, help="Logout from the FedML general compute node.",
)
@click.option(
"--server", "-s", default=None, is_flag=True, help="Logout from the FedML on-premise federated-learning server.",
"--server", "-s", default=None, is_flag=True, help="Logout from the FedML on-premise parameter server (PS).",
)
@click.option(
"--version",
"-v",
type=str,
default="release",
help="Logout from which version of FedML® Nexus AI Platform. It should be dev, test or release.",
help="Logout which backend environment version of FedML® Nexus AI Platform. It should be dev, test, or release.",
)
def fedml_logout(computing, server, version):
fedml.set_env_version(version)
Expand Down
3 changes: 3 additions & 0 deletions python/fedml/computing/scheduler/comm_utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,6 @@ class SchedulerConstants:

LAUNCH_JOB_DEFAULT_ENTRY_NAME = "fedml_job_entry_pack.sh"
LAUNCH_SERVER_JOB_DEFAULT_ENTRY_NAME = "fedml_server_job_entry_pack.sh"

CLIENT_SHELL_BASH = "bash"
CLIENT_SHELL_PS = "powershell"
9 changes: 7 additions & 2 deletions python/fedml/computing/scheduler/comm_utils/job_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,13 @@ def generate_job_execute_commands(run_id, edge_id, version,
# If the job type is not launch, we need to generate an entry script wrapping with entry commands
if package_type != SchedulerConstants.JOB_PACKAGE_TYPE_LAUNCH and \
os.path.basename(entry_file_full_path) != SchedulerConstants.LAUNCH_JOB_DEFAULT_ENTRY_NAME:
python_program = get_python_program()
entry_commands_filled.append(f"{python_program} {entry_file_full_path} {entry_args}\n")
if str(entry_file_full_path).endswith(".sh"):
shell_program = SchedulerConstants.CLIENT_SHELL_BASH
elif str(entry_file_full_path).endswith(".py"):
shell_program = get_python_program()
elif str(entry_file_full_path).endswith(".bat"):
shell_program = SchedulerConstants.CLIENT_SHELL_PS
entry_commands_filled.append(f"{shell_program} {entry_file_full_path} {entry_args}\n")
entry_file_full_path = os.path.join(
os.path.dirname(entry_file_full_path), os.path.basename(entry_file_full_path) + ".sh")

Expand Down
2 changes: 1 addition & 1 deletion python/fedml/computing/scheduler/comm_utils/sys_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def get_gpu_list():

def get_available_gpu_id_list(limit=1):
if enable_simulation_gpu:
return [1, 2, 3, 4, 5, 6, 7]
return [0, 1, 2, 3, 4, 5, 6, 7]

gpu_available_list = GPUtil.getAvailable(order='memory', limit=limit, maxLoad=0.01, maxMemory=0.01)
return gpu_available_list
Expand Down
5 changes: 3 additions & 2 deletions python/fedml/computing/scheduler/slave/client_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from fedml.computing.scheduler.comm_utils.run_process_utils import RunProcessUtils

from ..comm_utils.yaml_utils import load_yaml_config
from ..comm_utils.constants import SchedulerConstants


class ClientConstants(object):
Expand Down Expand Up @@ -62,8 +63,8 @@ class ClientConstants(object):
CLIENT_BOOTSTRAP_LINUX_PROGRAM = "bootstrap.sh"
CLIENT_BOOTSTRAP_WIN_PROGRAM = "bootstrap.bat"

CLIENT_SHELL_BASH = "bash"
CLIENT_SHELL_PS = "powershell"
CLIENT_SHELL_BASH = SchedulerConstants.CLIENT_SHELL_BASH
CLIENT_SHELL_PS = SchedulerConstants.CLIENT_SHELL_PS
PLATFORM_WINDOWS = "Windows"

FEDML_OTA_CMD_UPGRADE = "upgrade"
Expand Down

0 comments on commit e19b1c3

Please sign in to comment.