Skip to content

Commit

Permalink
[CoreEngine] add on-premise deployment files.
Browse files Browse the repository at this point in the history
  • Loading branch information
fedml-alex committed Jan 31, 2024
1 parent 2aedf33 commit 17d1452
Show file tree
Hide file tree
Showing 10 changed files with 126 additions and 13 deletions.
7 changes: 7 additions & 0 deletions devops/scripts/build-push-fedml-cloud-image.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@

version=$1
image_version=$2

if [ ${version} == "dev" ]; then
docker build --network=host -f ./devops/dockerfile/device-image/Dockerfile-Dev -t fedml/fedml-device-image:${version} .
Expand All @@ -24,5 +25,11 @@ elif [ ${version} == "release" ]; then

docker build --network=host -f ./devops/dockerfile/server-agent/Dockerfile-Release -t fedml/fedml-server-agent:${version} .
docker push fedml/fedml-server-agent:${version}
elif [ ${version} == "local" ]; then
docker build --network=host -f ./devops/dockerfile/device-image/Dockerfile-Local -t fedml/fedml-device-image:${version} .
docker push fedml/fedml-device-image:${version}

docker build --network=host -f ./devops/dockerfile/server-agent/Dockerfile-Local -t fedml/fedml-server-agent:${image_version} .
docker push fedml/fedml-server-agent:${image_version}
fi

3 changes: 3 additions & 0 deletions python/examples/launch/hello_job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ job_type: train # options: train, deploy, federate
# deploy subtype: none
job_subtype: generate_training

# containerize
containerize: false

# Bootstrap shell commands which will be executed before running entry commands.
# Supports multiple lines; it may be left empty.
bootstrap: |
Expand Down
78 changes: 78 additions & 0 deletions python/examples/launch/hello_job_with_container.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Local directory where your source code resides.
# It should be a path relative to this job YAML file, or an absolute path.
# If your job doesn't contain any source code, it can be left empty.
workspace: hello_world

# Running entry commands which will be executed as the job entry point.
# If an error occurs, you should exit with a non-zero code, e.g. exit 1.
# Otherwise, you should exit with a zero code, e.g. exit 0.
# Supports multiple lines; it must not be empty.
job: |
echo "current job id: $FEDML_CURRENT_RUN_ID"
echo "current edge id: $FEDML_CURRENT_EDGE_ID"
echo "Hello, Here is the launch platform."
echo "Current directory is as follows."
pwd
python3 hello_world.py
#sleep 20
#exit 1
#echo "Current GPU information is as follows."
#nvidia-smi # Print GPU information
#gpustat
#echo "Download the file from http://212.183.159.230/200MB.zip ..."
#wget http://212.183.159.230/200MB.zip
#rm ./200MB.zip*
#echo "The downloading task has finished."
# echo "Training the vision transformer model using PyTorch..."
# python vision_transformer.py --epochs 1
# If you want to use a job created by the MLOps platform,
# just uncomment the following three lines, then set job_id and config_id to your desired job id and related config.
#job_args:
# job_id: 2070
# config_id: 111

# If you want to create the job with a specific name, just uncomment the following line and set job_name to your desired job name
#job_name: cv_job

job_type: train # options: train, deploy, federate

# train subtype: general_training, single_machine_training, cluster_distributed_training, cross_cloud_training
# federate subtype: cross_silo, simulation, web, smart_phone
# deploy subtype: none
job_subtype: generate_training

docker:
image: fedml/fedml-default-launch:cu12.1-u22.04
#registry: docker.io
#username: my_hub_user
#password: my_hub_password
#ports: [30001,3002,3003]


# Bootstrap shell commands which will be executed before running entry commands.
# Support multiple lines, which can be empty.
bootstrap: |
# pip install -r requirements.txt
echo "Bootstrap finished."
computing:
minimum_num_gpus: 1 # minimum # of GPUs to provision
maximum_cost_per_hour: $3000 # max cost per hour for your job per gpu card
#allow_cross_cloud_resources: true # true, false
#device_type: CPU # options: GPU, CPU, hybrid
resource_type: A100-80G # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type

data_args:
dataset_name: mnist
dataset_path: ./dataset
dataset_type: csv

model_args:
input_dim: '784'
model_cache_path: /Users/alexliang/fedml_models
model_name: lr
output_dim: '10'

training_params:
learning_rate: 0.004
2 changes: 1 addition & 1 deletion python/examples/launch/hello_world/hello_world.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
for iter_count in range(10):
acc += 0.01
loss -= 0.02
fedml.log_endpoint({"acc": acc, "loss": loss})
#fedml.log_endpoint({"acc": acc, "loss": loss})
time.sleep(0.1)

artifact = fedml.mlops.Artifact(name=f"general-file@{run_id}-{edge_id}", type=fedml.mlops.ARTIFACT_TYPE_NAME_GENERAL)
Expand Down
12 changes: 10 additions & 2 deletions python/fedml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
_global_training_type = None
_global_comm_backend = None

__version__ = "0.8.18.dev7"
__version__ = "0.8.18.dev8"


# This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release
Expand Down Expand Up @@ -458,7 +458,7 @@ def _get_backend_service():
# caller = getframeinfo(stack()[1][0])
# print(f"{caller.filename}:{caller.lineno} - _get_backend_service. version = {version}")
if version == "local":
return FEDML_BACKEND_SERVICE_URL_LOCAL
return f"http://{get_local_on_premise_platform_host()}:18080"
elif version == "dev":
return FEDML_BACKEND_SERVICE_URL_DEV
elif version == "test":
Expand All @@ -482,6 +482,14 @@ def _get_mqtt_service():
return FEDML_MQTT_DOMAIN_RELEASE


def set_local_on_premise_platform_host(local_on_premise_platform_host):
    """Record the on-premise platform host address in this process's environment.

    The value is stored under ``FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST`` so
    that it can later be read back by ``get_local_on_premise_platform_host``.
    """
    env_key = "FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST"
    os.environ[env_key] = local_on_premise_platform_host


def get_local_on_premise_platform_host():
    """Return the configured on-premise platform host address.

    Reads the FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST environment variable
    (written by set_local_on_premise_platform_host). Falls back to
    "127.0.0.1" — the same default the CLI login option uses — when the
    variable has not been set in this process, instead of raising KeyError.
    """
    return os.environ.get('FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST', '127.0.0.1')


def _get_local_s3_like_service_url():
return FEDML_S3_DOMAIN_LOCAL

Expand Down
10 changes: 9 additions & 1 deletion python/fedml/cli/modules/login.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,16 @@
"--deploy_worker_num", "-dpn", default=1, type=int,
help="Deploy worker number will be started when logged in successfully.",
)
def fedml_login(api_key, version, compute_node, server, provider, deploy_worker_num):
@click.option(
"--local_on_premise_platform",
"-lp",
type=str,
default="127.0.0.1",
help="The IP address for local on-premise Nexus AI Platform.",
)
def fedml_login(api_key, version, compute_node, server, provider, deploy_worker_num, local_on_premise_platform):
fedml.set_env_version(version)
fedml.set_local_on_premise_platform_host(local_on_premise_platform)

api_key = api_key[0] if len(api_key) > 0 else None
try:
Expand Down
6 changes: 5 additions & 1 deletion python/fedml/computing/scheduler/comm_utils/job_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,13 @@
from fedml.computing.scheduler.scheduler_core.compute_cache_manager import ComputeCacheManager
from dataclasses import dataclass, field, fields
from fedml.core.common.singleton import Singleton
from fedml.computing.scheduler.comm_utils.container_utils import ContainerUtils
from typing import List
import threading
import json

run_docker_without_gpu = False


@dataclass
class DockerArgs:
Expand Down Expand Up @@ -407,7 +410,8 @@ def generate_launch_docker_command(docker_args: DockerArgs, run_id: int, edge_id
JobRunnerUtils.remove_cuda_visible_devices_lines(entry_file_full_path)
# docker command expects device ids in such format: '"device=0,2,3"'
device_str = f'"device={cuda_visible_gpu_ids_str}"'
docker_command.extend(["--gpus", f"'{device_str}'"])
if not run_docker_without_gpu:
docker_command.extend(["--gpus", f"'{device_str}'"])

# Add Port Mapping
for port in docker_args.ports:
Expand Down
6 changes: 4 additions & 2 deletions python/fedml/computing/scheduler/master/server_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2123,8 +2123,10 @@ def callback_runner_id_status(self, topic, payload):
# Stop log processor for current run
MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id)
if self.use_local_process_as_cloud_server:
RunProcessUtils.kill_process(os.getpid())
raise Exception("Killed")
#RunProcessUtils.kill_process(os.getpid())
cloud_server_process = self.run_process_map.get(run_id_str, None)
if cloud_server_process is not None:
RunProcessUtils.kill_process(cloud_server_process.pid)
else:
self.stop_cloud_server()

Expand Down
13 changes: 8 additions & 5 deletions python/fedml/computing/scheduler/slave/client_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -1109,10 +1109,13 @@ def callback_runner_id_status(self, topic, payload):
RunProcessUtils.kill_process(run_process.pid)

# Terminate the run docker container if exists
container_name = JobRunnerUtils.get_run_container_name(run_id)
docker_client = JobRunnerUtils.get_docker_client(DockerArgs())
logging.info(f"Terminating the run docker container {container_name} if exists...")
JobRunnerUtils.remove_run_container_if_exists(container_name, docker_client)
try:
container_name = JobRunnerUtils.get_run_container_name(run_id)
docker_client = JobRunnerUtils.get_docker_client(DockerArgs())
logging.info(f"Terminating the run docker container {container_name} if exists...")
JobRunnerUtils.remove_run_container_if_exists(container_name, docker_client)
except Exception as e:
logging.info(f"Exception when terminating docker container {traceback.format_exc()}.")

self.run_process_map.pop(run_id_str)

Expand Down Expand Up @@ -1318,7 +1321,7 @@ def bind_account_and_device_id(self, url, account_id, device_id, os_name, api_ke
"accountid": account_id,
"deviceid": device_id,
"type": os_name,
"status": ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE,
"state": ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE,
"processor": cpu_info,
"core_type": cpu_info,
"network": "",
Expand Down
2 changes: 1 addition & 1 deletion python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def finalize_options(self):

setup(
name="fedml",
version="0.8.18.dev7",
version="0.8.18.dev8",
author="FedML Team",
author_email="ch@fedml.ai",
description="A research and production integrated edge-cloud library for "
Expand Down

0 comments on commit 17d1452

Please sign in to comment.