Skip to content

Commit

Permalink
[CoreEngine] add on-premise deployment files.
Browse files Browse the repository at this point in the history
  • Loading branch information
fedml-alex committed Jan 31, 2024
1 parent 2aedf33 commit 17d1452
Show file tree
Hide file tree
Showing 10 changed files with 126 additions and 13 deletions.
7 changes: 7 additions & 0 deletions devops/scripts/build-push-fedml-cloud-image.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@

version=$1
image_version=$2

if [ ${version} == "dev" ]; then
docker build --network=host -f ./devops/dockerfile/device-image/Dockerfile-Dev -t fedml/fedml-device-image:${version} .
Expand All @@ -24,5 +25,11 @@ elif [ ${version} == "release" ]; then

docker build --network=host -f ./devops/dockerfile/server-agent/Dockerfile-Release -t fedml/fedml-server-agent:${version} .
docker push fedml/fedml-server-agent:${version}
elif [ ${version} == "local" ]; then
docker build --network=host -f ./devops/dockerfile/device-image/Dockerfile-Local -t fedml/fedml-device-image:${version} .
docker push fedml/fedml-device-image:${version}

docker build --network=host -f ./devops/dockerfile/server-agent/Dockerfile-Local -t fedml/fedml-server-agent:${image_version} .
docker push fedml/fedml-server-agent:${image_version}
fi

3 changes: 3 additions & 0 deletions python/examples/launch/hello_job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ job_type: train # options: train, deploy, federate
# deploy subtype: none
job_subtype: generate_training

# containerize
containerize: false

# Bootstrap shell commands which will be executed before running entry commands.
# Supports multiple lines; it may be left empty.
bootstrap: |
Expand Down
78 changes: 78 additions & 0 deletions python/examples/launch/hello_job_with_container.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Local directory where your source code resides.
# It should be a path relative to this job YAML file, or an absolute path.
# If your job doesn't contain any source code, it can be left empty.
workspace: hello_world

# Running entry commands which will be executed as the job entry point.
# If an error occurs, you should exit with a non-zero code, e.g. exit 1.
# Otherwise, you should exit with a zero code, e.g. exit 0.
# Supports multiple lines; it must not be empty.
job: |
echo "current job id: $FEDML_CURRENT_RUN_ID"
echo "current edge id: $FEDML_CURRENT_EDGE_ID"
echo "Hello, Here is the launch platform."
echo "Current directory is as follows."
pwd
python3 hello_world.py
#sleep 20
#exit 1
#echo "Current GPU information is as follows."
#nvidia-smi # Print GPU information
#gpustat
#echo "Download the file from http://212.183.159.230/200MB.zip ..."
#wget http://212.183.159.230/200MB.zip
#rm ./200MB.zip*
#echo "The downloading task has finished."
# echo "Training the vision transformer model using PyTorch..."
# python vision_transformer.py --epochs 1
# If you want to use a job created by the MLOps platform,
# just uncomment the following three lines, then set job_id and config_id to your desired job id and related config.
#job_args:
# job_id: 2070
# config_id: 111

# If you want to create the job with a specific name, just uncomment the following line and set job_name to your desired job name
#job_name: cv_job

job_type: train # options: train, deploy, federate

# train subtype: general_training, single_machine_training, cluster_distributed_training, cross_cloud_training
# federate subtype: cross_silo, simulation, web, smart_phone
# deploy subtype: none
job_subtype: generate_training

docker:
image: fedml/fedml-default-launch:cu12.1-u22.04
#registry: docker.io
#username: my_hub_user
#password: my_hub_password
#ports: [30001,3002,3003]


# Bootstrap shell commands which will be executed before running entry commands.
# Support multiple lines, which can be empty.
bootstrap: |
# pip install -r requirements.txt
echo "Bootstrap finished."
computing:
minimum_num_gpus: 1 # minimum # of GPUs to provision
maximum_cost_per_hour: $3000 # max cost per hour for your job per gpu card
#allow_cross_cloud_resources: true # true, false
#device_type: CPU # options: GPU, CPU, hybrid
resource_type: A100-80G # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type

data_args:
dataset_name: mnist
dataset_path: ./dataset
dataset_type: csv

model_args:
input_dim: '784'
model_cache_path: /Users/alexliang/fedml_models
model_name: lr
output_dim: '10'

training_params:
learning_rate: 0.004
2 changes: 1 addition & 1 deletion python/examples/launch/hello_world/hello_world.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
for iter_count in range(10):
acc += 0.01
loss -= 0.02
fedml.log_endpoint({"acc": acc, "loss": loss})
#fedml.log_endpoint({"acc": acc, "loss": loss})
time.sleep(0.1)

artifact = fedml.mlops.Artifact(name=f"general-file@{run_id}-{edge_id}", type=fedml.mlops.ARTIFACT_TYPE_NAME_GENERAL)
Expand Down
12 changes: 10 additions & 2 deletions python/fedml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
_global_training_type = None
_global_comm_backend = None

__version__ = "0.8.18.dev7"
__version__ = "0.8.18.dev8"


# This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release
Expand Down Expand Up @@ -458,7 +458,7 @@ def _get_backend_service():
# caller = getframeinfo(stack()[1][0])
# print(f"{caller.filename}:{caller.lineno} - _get_backend_service. version = {version}")
if version == "local":
return FEDML_BACKEND_SERVICE_URL_LOCAL
return f"http://{get_local_on_premise_platform_host()}:18080"
elif version == "dev":
return FEDML_BACKEND_SERVICE_URL_DEV
elif version == "test":
Expand All @@ -482,6 +482,14 @@ def _get_mqtt_service():
return FEDML_MQTT_DOMAIN_RELEASE


def set_local_on_premise_platform_host(local_on_premise_platform_host):
    """Record the on-premise platform host address in this process's environment.

    The value is stored under ``FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST`` so
    that it can later be read back by ``get_local_on_premise_platform_host``.
    """
    env_key = "FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST"
    os.environ[env_key] = local_on_premise_platform_host


def get_local_on_premise_platform_host():
    """Return the configured on-premise platform host address.

    Reads the FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST environment variable
    (written by set_local_on_premise_platform_host). Falls back to
    "127.0.0.1" — the same default the CLI login option uses — when the
    variable has not been set in this process, instead of raising KeyError.
    """
    return os.environ.get('FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST', '127.0.0.1')


def _get_local_s3_like_service_url():
return FEDML_S3_DOMAIN_LOCAL

Expand Down
10 changes: 9 additions & 1 deletion python/fedml/cli/modules/login.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,16 @@
"--deploy_worker_num", "-dpn", default=1, type=int,
help="Deploy worker number will be started when logged in successfully.",
)
def fedml_login(api_key, version, compute_node, server, provider, deploy_worker_num):
@click.option(
"--local_on_premise_platform",
"-lp",
type=str,
default="127.0.0.1",
help="The IP address for local on-premise Nexus AI Platform.",
)
def fedml_login(api_key, version, compute_node, server, provider, deploy_worker_num, local_on_premise_platform):
fedml.set_env_version(version)
fedml.set_local_on_premise_platform_host(local_on_premise_platform)

api_key = api_key[0] if len(api_key) > 0 else None
try:
Expand Down
6 changes: 5 additions & 1 deletion python/fedml/computing/scheduler/comm_utils/job_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,13 @@
from fedml.computing.scheduler.scheduler_core.compute_cache_manager import ComputeCacheManager
from dataclasses import dataclass, field, fields
from fedml.core.common.singleton import Singleton
from fedml.computing.scheduler.comm_utils.container_utils import ContainerUtils
from typing import List
import threading
import json

run_docker_without_gpu = False


@dataclass
class DockerArgs:
Expand Down Expand Up @@ -407,7 +410,8 @@ def generate_launch_docker_command(docker_args: DockerArgs, run_id: int, edge_id
JobRunnerUtils.remove_cuda_visible_devices_lines(entry_file_full_path)
# docker command expects device ids in such format: '"device=0,2,3"'
device_str = f'"device={cuda_visible_gpu_ids_str}"'
docker_command.extend(["--gpus", f"'{device_str}'"])
if not run_docker_without_gpu:
docker_command.extend(["--gpus", f"'{device_str}'"])

# Add Port Mapping
for port in docker_args.ports:
Expand Down
6 changes: 4 additions & 2 deletions python/fedml/computing/scheduler/master/server_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2123,8 +2123,10 @@ def callback_runner_id_status(self, topic, payload):
# Stop log processor for current run
MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id)
if self.use_local_process_as_cloud_server:
RunProcessUtils.kill_process(os.getpid())
raise Exception("Killed")
#RunProcessUtils.kill_process(os.getpid())
cloud_server_process = self.run_process_map.get(run_id_str, None)
if cloud_server_process is not None:
RunProcessUtils.kill_process(cloud_server_process.pid)
else:
self.stop_cloud_server()

Expand Down
13 changes: 8 additions & 5 deletions python/fedml/computing/scheduler/slave/client_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -1109,10 +1109,13 @@ def callback_runner_id_status(self, topic, payload):
RunProcessUtils.kill_process(run_process.pid)

# Terminate the run docker container if exists
container_name = JobRunnerUtils.get_run_container_name(run_id)
docker_client = JobRunnerUtils.get_docker_client(DockerArgs())
logging.info(f"Terminating the run docker container {container_name} if exists...")
JobRunnerUtils.remove_run_container_if_exists(container_name, docker_client)
try:
container_name = JobRunnerUtils.get_run_container_name(run_id)
docker_client = JobRunnerUtils.get_docker_client(DockerArgs())
logging.info(f"Terminating the run docker container {container_name} if exists...")
JobRunnerUtils.remove_run_container_if_exists(container_name, docker_client)
except Exception as e:
logging.info(f"Exception when terminating docker container {traceback.format_exc()}.")

self.run_process_map.pop(run_id_str)

Expand Down Expand Up @@ -1318,7 +1321,7 @@ def bind_account_and_device_id(self, url, account_id, device_id, os_name, api_ke
"accountid": account_id,
"deviceid": device_id,
"type": os_name,
"status": ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE,
"state": ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE,
"processor": cpu_info,
"core_type": cpu_info,
"network": "",
Expand Down
2 changes: 1 addition & 1 deletion python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def finalize_options(self):

setup(
name="fedml",
version="0.8.18.dev7",
version="0.8.18.dev8",
author="FedML Team",
author_email="ch@fedml.ai",
description="A research and production integrated edge-cloud library for "
Expand Down

0 comments on commit 17d1452

Please sign in to comment.