From f30f2edebb015dd533ef3d4c2e389ad8e5c65219 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Mon, 1 May 2023 17:28:08 -0700 Subject: [PATCH] [Templates] Reintroduce requirements.txt + temporary patch fixes (#34903) Signed-off-by: Justin Yu --- .github/CODEOWNERS | 1 + doc/BUILD | 13 ++- .../01_batch_inference/batch_inference.ipynb | 62 +++++++----- .../many_model_training.ipynb | 34 ++++--- .../02_many_model_training/requirements.txt | 1 + .../requirements.txt | 10 ++ .../serving_stable_diffusion.ipynb | 71 +++++++------- .../batch_inference.ipynb | 66 ++++++++----- .../many_model_training.ipynb | 63 +++++++++---- .../02_many_model_training/requirements.txt | 1 + .../requirements.txt | 1 + .../serving_stable_diffusion.ipynb | 94 +++++++++++-------- release/release_tests.yaml | 36 +++---- 13 files changed, 284 insertions(+), 169 deletions(-) create mode 100644 doc/source/templates/02_many_model_training/requirements.txt create mode 100644 doc/source/templates/03_serving_stable_diffusion/requirements.txt rename doc/source/templates/tests/{ => 01_batch_inference}/batch_inference.ipynb (89%) rename doc/source/templates/tests/{ => 02_many_model_training}/many_model_training.ipynb (87%) create mode 120000 doc/source/templates/tests/02_many_model_training/requirements.txt create mode 120000 doc/source/templates/tests/03_serving_stable_diffusion/requirements.txt rename doc/source/templates/tests/{ => 03_serving_stable_diffusion}/serving_stable_diffusion.ipynb (91%) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 52bf373935c91..066ff295d9911 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -11,6 +11,7 @@ # NOTE: Add @ray-project/ray-docs to all following docs subdirs. 
/doc/ @ray-project/ray-docs /doc/source/use-cases.rst @ericl @pcmoritz +/doc/source/templates @justinvyu @sofianhnaide # ==== Ray core ==== diff --git a/doc/BUILD b/doc/BUILD index 7cd534c82a650..a2bb7a8bd79d3 100644 --- a/doc/BUILD +++ b/doc/BUILD @@ -236,7 +236,10 @@ py_test_run_all_subdirectory( filegroup( name = "workspace_templates", - srcs = glob(["source/templates/tests/*.ipynb"]), + srcs = glob([ + "source/templates/tests/**/*.ipynb", + "source/templates/tests/**/requirements.txt" + ]), visibility = ["//doc:__subpackages__"] ) @@ -255,7 +258,8 @@ py_test( py_test_run_all_notebooks( size = "large", - include = ["source/templates/tests/many_model_training.ipynb"], + # TODO(justinvyu): Merge tests/ with the regular versions of the templates. + include = ["source/templates/tests/02_many_model_training/many_model_training.ipynb"], exclude = [], data = ["//doc:workspace_templates"], tags = ["exclusive", "team:ml", "ray_air"], @@ -267,8 +271,9 @@ py_test_run_all_notebooks( py_test_run_all_notebooks( size = "large", include = [ - "source/templates/tests/batch_inference.ipynb", - "source/templates/tests/serving_stable_diffusion.ipynb" + # TODO(justinvyu): Merge tests/ with the regular versions of the templates. + "source/templates/tests/01_batch_inference/batch_inference.ipynb", + "source/templates/tests/03_serving_stable_diffusion/serving_stable_diffusion.ipynb" ], exclude = [], data = ["//doc:workspace_templates"], diff --git a/doc/source/templates/01_batch_inference/batch_inference.ipynb b/doc/source/templates/01_batch_inference/batch_inference.ipynb index ce2f2ced7d065..14b109020e871 100644 --- a/doc/source/templates/01_batch_inference/batch_inference.ipynb +++ b/doc/source/templates/01_batch_inference/batch_inference.ipynb @@ -8,14 +8,14 @@ "source": [ "# Scaling Batch Inference with Ray Data\n", "\n", - "This template is a quickstart to using [Ray Data](https://docs.ray.io/en/latest/data/data.html) for batch inference. 
Ray Data is one of many libraries under the [Ray AI Runtime](https://docs.ray.io/en/latest/ray-air/getting-started.html). See [this blog post](https://www.anyscale.com/blog/model-batch-inference-in-ray-actors-actorpool-and-datasets) for more information on why and how you should perform batch inference with Ray!\n", + "This template is a quickstart to using [Ray Data](https://docs.ray.io/en/latest/data/dataset.html) for batch inference. Ray Data is one of many libraries under the [Ray AI Runtime](https://docs.ray.io/en/latest/ray-air/getting-started.html). See [this blog post](https://www.anyscale.com/blog/model-batch-inference-in-ray-actors-actorpool-and-datasets) for more information on why and how you should perform batch inference with Ray!\n", "\n", "This template walks through GPU batch prediction on an image dataset using a PyTorch model, but the framework and data format are there just to help you build your own application!\n", "\n", "At a high level, this template will:\n", - "1. [Load your dataset using Ray Data.](https://docs.ray.io/en/latest/data/creating-datastreams.html)\n", - "2. [Preprocess your dataset before feeding it to your model.](https://docs.ray.io/en/latest/data/transforming-datastreams.html)\n", - "3. [Initialize your model and perform inference on a shard of your dataset with a remote actor.](https://docs.ray.io/en/latest/data/transforming-datastreams.html#callable-class-udfs)\n", + "1. [Load your dataset using Ray Data.](https://docs.ray.io/en/latest/data/creating-datasets.html)\n", + "2. [Preprocess your dataset before feeding it to your model.](https://docs.ray.io/en/latest/data/transforming-datasets.html)\n", + "3. [Initialize your model and perform inference on a shard of your dataset with a remote actor.](https://docs.ray.io/en/latest/data/transforming-datasets.html#writing-user-defined-functions-udfs)\n", "4. 
[Save your prediction results.](https://docs.ray.io/en/latest/data/api/input_output.html)\n", "\n", "> Slot in your code below wherever you see the ✂️ icon to build a many model training Ray application off of this template!" @@ -52,42 +52,46 @@ { "cell_type": "code", "execution_count": null, - "id": "770bbdc7", - "metadata": {}, + "id": "9d49681f-baf0-4ed8-9740-5c4e38744311", + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "!ray status" + "NUM_WORKERS: int = 4\n", + "NUM_GPUS_PER_WORKER: float = 1\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "9d49681f-baf0-4ed8-9740-5c4e38744311", - "metadata": { - "tags": [] - }, + "id": "770bbdc7", + "metadata": {}, "outputs": [], "source": [ - "NUM_WORKERS: int = 4\n", - "NUM_GPUS_PER_WORKER: float = 1\n" + "!ray status" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "23321ba8", "metadata": {}, "source": [ "```{tip}\n", - "Try setting `NUM_GPUS_PER_WORKER` to a fractional amount! This will leverage Ray's fractional resource allocation, which means you can schedule multiple batch inference workers to happen on the same GPU.\n", + "Try setting `NUM_GPUS_PER_WORKER` to a fractional amount! This will leverage Ray's fractional resource allocation, which means you can schedule multiple batch inference workers to use the same GPU.\n", "```" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "3b6f2352", "metadata": {}, "source": [ - "> ✂️ Replace this function with logic to load your own data with Ray Data." + "> ✂️ Replace this function with logic to load your own data with Ray Data.\n", + ">\n", + "> See [the Ray Data guide on creating datasets](https://docs.ray.io/en/latest/data/creating-datasets.html) to learn how to create a dataset based on the data type and file storage format." 
] }, { @@ -97,7 +101,7 @@ "metadata": {}, "outputs": [], "source": [ - "def load_ray_dataset() -> ray.data.Datastream:\n", + "def load_ray_dataset():\n", " from ray.data.datasource.partitioning import Partitioning\n", "\n", " s3_uri = \"s3://anonymous@air-example-data-2/imagenette2/val/\"\n", @@ -163,7 +167,9 @@ "outputs": [], "source": [ "ds = ds.map_batches(preprocess, batch_format=\"numpy\")\n", - "ds.schema()\n" + "\n", + "print(\"Dataset schema:\\n\", ds.schema())\n", + "print(\"Number of images:\", ds.count())\n" ] }, { @@ -194,9 +200,9 @@ " def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:\n", " # \n", " input_data = torch.as_tensor(batch[\"image\"], device=self.device)\n", - " with torch.no_grad():\n", - " result = self.model(input_data)\n", - " return {\"predictions\": result.cpu().numpy()}\n" + " with torch.inference_mode():\n", + " pred = self.model(input_data)\n", + " return {\"predicted_class_index\": pred.argmax(dim=1).detach().cpu().numpy()}\n" ] }, { @@ -218,8 +224,9 @@ " PredictCallable,\n", " batch_size=128,\n", " compute=ray.data.ActorPoolStrategy(\n", - " # Fix the number of batch inference workers to a specified value.\n", - " size=NUM_WORKERS,\n", + " # Fix the number of batch inference workers to `NUM_WORKERS`.\n", + " min_size=NUM_WORKERS,\n", + " max_size=NUM_WORKERS,\n", " ),\n", " num_gpus=NUM_GPUS_PER_WORKER,\n", " batch_format=\"numpy\",\n", @@ -237,6 +244,15 @@ "preds.schema()\n" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2565ba08", + "metadata": {}, + "source": [ + "Show the first few predictions!" 
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -244,7 +260,7 @@ "metadata": {}, "outputs": [], "source": [ - "preds.take(1)\n" + "preds.take(5)\n" ] }, { diff --git a/doc/source/templates/02_many_model_training/many_model_training.ipynb b/doc/source/templates/02_many_model_training/many_model_training.ipynb index 1f9613c0b56c7..0645706d75cd0 100644 --- a/doc/source/templates/02_many_model_training/many_model_training.ipynb +++ b/doc/source/templates/02_many_model_training/many_model_training.ipynb @@ -37,8 +37,7 @@ "\n", "This template requires certain Python packages to be available to every node in the cluster.\n", "\n", - "> ✂️ Add your own package dependencies! You can specify bounds for package versions\n", - "> in the same format as a `requirements.txt` file.\n" + "> ✂️ Add your own package dependencies in the `requirements.txt` file!\n" ] }, { @@ -50,9 +49,21 @@ }, "outputs": [], "source": [ - "requirements = [\n", - " \"statsforecast==1.5.0\",\n", - "]\n" + "requirements_path = \"./requirements.txt\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92161434", + "metadata": {}, + "outputs": [], + "source": [ + "with open(requirements_path, \"r\") as f:\n", + " requirements = f.read().strip().splitlines()\n", + "\n", + "print(\"Requirements:\")\n", + "print(\"\\n\".join(requirements))\n" ] }, { @@ -64,7 +75,9 @@ "First, we may want to use these modules right here in our script, which is running on the head node.\n", "Install the Python packages on the head node using `pip install`.\n", "\n", - "You may need to restart this notebook kernel to access the installed packages.\n" + "```{note}\n", + "You may need to restart this notebook kernel to access the installed packages.\n", + "```\n" ] }, { @@ -74,9 +87,7 @@ "metadata": {}, "outputs": [], "source": [ - "all_requirements = \" \".join(requirements)\n", - "\n", - "%pip install {all_requirements}\n" + "%pip install -r {requirements_path} --upgrade" ] }, { @@ -118,11 
+129,12 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "b8fc83d0", "metadata": {}, "source": [ - "> ✂️ Replace this value to change the number of data partitions you will use. This will be total the number of Tune trials you will run!\n", + "> ✂️ Replace this value to change the number of data partitions you will use (<= 5000 for this dataset). This will be the total number of Tune trials you will run!\n", ">\n", "> Note that this template fits two models per data partition and reports the best performing one." ] @@ -136,7 +148,7 @@ }, "outputs": [], "source": [ - "NUM_DATA_PARTITIONS: int = 1000\n" + "NUM_DATA_PARTITIONS: int = 500\n" ] }, { diff --git a/doc/source/templates/02_many_model_training/requirements.txt b/doc/source/templates/02_many_model_training/requirements.txt new file mode 100644 index 0000000000000..25eaf54289234 --- /dev/null +++ b/doc/source/templates/02_many_model_training/requirements.txt @@ -0,0 +1 @@ +statsforecast==1.5.0 diff --git a/doc/source/templates/03_serving_stable_diffusion/requirements.txt b/doc/source/templates/03_serving_stable_diffusion/requirements.txt new file mode 100644 index 0000000000000..eac6df67b25ed --- /dev/null +++ b/doc/source/templates/03_serving_stable_diffusion/requirements.txt @@ -0,0 +1,10 @@ +accelerate==0.14.0 +diffusers==0.15.1 +matplotlib>=3.5.3,<=3.7.1 +numpy>=1.21.6,<=1.23.5 +Pillow==9.3.0 +scipy>=1.7.3,<=1.9.3 +tensorboard>=2.11.2,<=2.12.0 +torch==1.13.0 +torchvision==0.14.0 +transformers==4.28.1 diff --git a/doc/source/templates/03_serving_stable_diffusion/serving_stable_diffusion.ipynb b/doc/source/templates/03_serving_stable_diffusion/serving_stable_diffusion.ipynb index e3b7aad594333..9c79e32010f19 100644 --- a/doc/source/templates/03_serving_stable_diffusion/serving_stable_diffusion.ipynb +++ b/doc/source/templates/03_serving_stable_diffusion/serving_stable_diffusion.ipynb @@ -16,37 +16,39 @@ ] }, { - "attachments": {}, "cell_type": "markdown", - "id": "25364e8e", + "id": "2ea9629f", 
"metadata": {}, "source": [ "## Handling Dependencies\n", "\n", "This template requires certain Python packages to be available to every node in the cluster.\n", "\n", - "> ✂️ Add your own package dependencies! You can specify bounds for package versions\n", - "> in the same format as a `requirements.txt` file.\n" + "> ✂️ Add your own package dependencies in the `requirements.txt` file!\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "1b79bfb9", + "id": "e43b49fc", "metadata": {}, "outputs": [], "source": [ - "requirements = [\n", - " \"accelerate==0.14.0\",\n", - " \"diffusers==0.15.1\",\n", - " \"numpy>=1.21.6,<=1.23.5\",\n", - " \"Pillow==9.3.0\",\n", - " \"scipy>=1.7.3,<=1.9.3\",\n", - " \"tensorboard>=2.11.2,<=2.12.0\",\n", - " \"torch==1.13.0\",\n", - " \"torchvision==0.14.0\",\n", - " \"transformers==4.28.1\",\n", - "]\n" + "requirements_path = \"./requirements.txt\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19504900", + "metadata": {}, + "outputs": [], + "source": [ + "with open(requirements_path, \"r\") as f:\n", + " requirements = f.read().strip().splitlines()\n", + "\n", + "print(\"Requirements:\")\n", + "print(\"\\n\".join(requirements))\n" ] }, { @@ -70,9 +72,7 @@ "metadata": {}, "outputs": [], "source": [ - "all_requirements = \" \".join(requirements)\n", - "\n", - "%pip install {all_requirements}\n" + "%pip install -r {requirements_path} --upgrade" ] }, { @@ -83,11 +83,7 @@ "source": [ "Next, we need to make sure all worker nodes also have access to the dependencies.\n", "For this, use a [Ray Runtime Environment](https://docs.ray.io/en/latest/ray-core/handling-dependencies.html#runtime-environments)\n", - "to dynamically set up dependencies throughout the cluster.\n", - "\n", - "```{note}\n", - "This will be used later when setting up the Ray Serve deployment.\n", - "```\n" + "to dynamically set up dependencies throughout the cluster.\n" ] }, { @@ -97,7 +93,9 @@ "metadata": {}, "outputs": [], "source": 
[ - "runtime_env = {\"pip\": requirements}\n" + "import ray\n", + "\n", + "ray.init(runtime_env={\"pip\": requirements})\n" ] }, { @@ -158,7 +156,7 @@ "NUM_GPUS_PER_REPLICA: float = 1\n", "\n", "# Control the output size: (IMAGE_SIZE, IMAGE_SIZE)\n", - "# NOTE: Generated image quality degrades rapidly if you reduce size too much.\n", + "# NOTE: Generated image quality degrades rapidly if you reduce the size too much.\n", "IMAGE_SIZE: int = 776\n" ] }, @@ -178,11 +176,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Configure each model replica to:\n", - "# 1. Setup the dependencies listed earlier.\n", - "# 2. Use the specified resources.\n", + "# Configure each model replica to use the specified resources.\n", "ray_actor_options = {\n", - " \"runtime_env\": runtime_env,\n", " \"num_gpus\": NUM_GPUS_PER_REPLICA,\n", "}\n" ] @@ -403,7 +398,7 @@ " plt.show()\n", "\n", "\n", - "def main() -> float:\n", + "def main():\n", " try:\n", " requests.get(endpoint, timeout=0.1)\n", " except Exception as e:\n", @@ -467,8 +462,7 @@ "metadata": {}, "outputs": [], "source": [ - "mean_generation_time = main()\n", - "serve.shutdown()\n" + "mean_generation_time = main()\n" ] }, { @@ -481,6 +475,17 @@ "You can modify this template and iterate your model deployment directly on your cluster within your Anyscale Workspace,\n", "testing with the local endpoint." 
] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e360cf9", + "metadata": {}, + "outputs": [], + "source": [ + "# Shut down the model replicas once you're done!\n", + "serve.shutdown()\n" + ] } ], "metadata": { diff --git a/doc/source/templates/tests/batch_inference.ipynb b/doc/source/templates/tests/01_batch_inference/batch_inference.ipynb similarity index 89% rename from doc/source/templates/tests/batch_inference.ipynb rename to doc/source/templates/tests/01_batch_inference/batch_inference.ipynb index 6fd5bf32bc564..0dcfa9cbc4a95 100644 --- a/doc/source/templates/tests/batch_inference.ipynb +++ b/doc/source/templates/tests/01_batch_inference/batch_inference.ipynb @@ -6,7 +6,7 @@ "id": "cfababd6", "metadata": { "tags": [ - "test" + "remove-cell" ] }, "outputs": [], @@ -22,9 +22,8 @@ ] }, { - "attachments": {}, "cell_type": "markdown", - "id": "6fbc3e3c", + "id": "02ff59ce", "metadata": {}, "source": [ "# Scaling Batch Inference with Ray Data\n", @@ -36,7 +35,7 @@ "At a high level, this template will:\n", "1. [Load your dataset using Ray Data.](https://docs.ray.io/en/latest/data/creating-datasets.html)\n", "2. [Preprocess your dataset before feeding it to your model.](https://docs.ray.io/en/latest/data/transforming-datasets.html)\n", - "3. [Initialize your model and perform inference on a shard of your dataset with a remote actor.](https://docs.ray.io/en/latest/data/transforming-datasets.html#callable-class-udfs)\n", + "3. [Initialize your model and perform inference on a shard of your dataset with a remote actor.](https://docs.ray.io/en/latest/data/transforming-datasets.html#writing-user-defined-functions-udfs)\n", "4. [Save your prediction results.](https://docs.ray.io/en/latest/data/api/input_output.html)\n", "\n", "> Slot in your code below wherever you see the ✂️ icon to build a many model training Ray application off of this template!" 
@@ -73,24 +72,40 @@ { "cell_type": "code", "execution_count": null, - "id": "770bbdc7", - "metadata": {}, + "id": "9d49681f-baf0-4ed8-9740-5c4e38744311", + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "!ray status" + "NUM_WORKERS: int = 4\n", + "NUM_GPUS_PER_WORKER: float = 1\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "9d49681f-baf0-4ed8-9740-5c4e38744311", + "id": "20e9e07c", "metadata": { - "tags": [] + "tags": [ + "remove-cell" + ] }, "outputs": [], "source": [ - "NUM_WORKERS: int = 4\n", - "NUM_GPUS_PER_WORKER: float = 1\n" + "if SMOKE_TEST:\n", + " NUM_WORKERS = 4\n", + " NUM_GPUS_PER_WORKER = 0.25\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "770bbdc7", + "metadata": {}, + "outputs": [], + "source": [ + "!ray status" ] }, { @@ -105,10 +120,12 @@ }, { "cell_type": "markdown", - "id": "3b6f2352", + "id": "245f37c9", "metadata": {}, "source": [ - "> ✂️ Replace this function with logic to load your own data with Ray Data." + "> ✂️ Replace this function with logic to load your own data with Ray Data.\n", + ">\n", + "> See [the Ray Data guide on creating datasets](https://docs.ray.io/en/latest/data/creating-datasets.html) to learn how to create a dataset based on the data type and file storage format." 
] }, { @@ -118,7 +135,7 @@ "metadata": {}, "outputs": [], "source": [ - "def load_ray_dataset() -> ray.data.Dataset:\n", + "def load_ray_dataset():\n", " from ray.data.datasource.partitioning import Partitioning\n", "\n", " s3_uri = \"s3://anonymous@air-example-data-2/imagenette2/val/\"\n", @@ -146,7 +163,7 @@ "id": "965db5e8", "metadata": { "tags": [ - "test" + "remove-cell" ] }, "outputs": [], @@ -199,7 +216,9 @@ "outputs": [], "source": [ "ds = ds.map_batches(preprocess, batch_format=\"numpy\")\n", - "ds.schema()\n" + "\n", + "print(\"Dataset schema:\\n\", ds.schema())\n", + "print(\"Number of images:\", ds.count())\n" ] }, { @@ -230,9 +249,9 @@ " def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:\n", " # \n", " input_data = torch.as_tensor(batch[\"image\"], device=self.device)\n", - " with torch.no_grad():\n", - " result = self.model(input_data)\n", - " return {\"predictions\": result.cpu().numpy()}\n" + " with torch.inference_mode():\n", + " pred = self.model(input_data)\n", + " return {\"predicted_class_index\": pred.argmax(dim=1).detach().cpu().numpy()}\n" ] }, { @@ -254,8 +273,9 @@ " PredictCallable,\n", " batch_size=128,\n", " compute=ray.data.ActorPoolStrategy(\n", - " # Fix the number of batch inference workers to a specified value.\n", - " size=NUM_WORKERS,\n", + " # Fix the number of batch inference workers to `NUM_WORKERS`.\n", + " min_size=NUM_WORKERS,\n", + " max_size=NUM_WORKERS,\n", " ),\n", " num_gpus=NUM_GPUS_PER_WORKER,\n", " batch_format=\"numpy\",\n", @@ -280,7 +300,7 @@ "metadata": {}, "outputs": [], "source": [ - "preds.take(1)\n" + "preds.take(5)\n" ] }, { @@ -326,7 +346,7 @@ "id": "1e88a268", "metadata": { "tags": [ - "test" + "remove-cell" ] }, "outputs": [], diff --git a/doc/source/templates/tests/many_model_training.ipynb b/doc/source/templates/tests/02_many_model_training/many_model_training.ipynb similarity index 87% rename from doc/source/templates/tests/many_model_training.ipynb rename to 
doc/source/templates/tests/02_many_model_training/many_model_training.ipynb index f6082a27826b6..dbc4aa42bfcd4 100644 --- a/doc/source/templates/tests/many_model_training.ipynb +++ b/doc/source/templates/tests/02_many_model_training/many_model_training.ipynb @@ -26,7 +26,7 @@ "source": [ "# Scaling Many Model Training with Ray Tune\n", "\n", - "This template is a quickstart to using [Ray Tune](https://docs.ray.io/en/latest/tune/index.html) for batch inference. Ray Tune is one of many libraries under the [Ray AI Runtime](https://docs.ray.io/en/latest/ray-air/getting-started.html). See [this blog post](https://www.anyscale.com/blog/training-one-million-machine-learning-models-in-record-time-with-ray) for more information on the benefits of performing many model training with Ray!\n", + "This template is a quickstart to using [Ray Tune](https://docs.ray.io/en/latest/tune/index.html) for training many models in parallel. Ray Tune is one of many libraries in the [Ray AI Runtime](https://docs.ray.io/en/latest/ray-air/getting-started.html). See [this blog post](https://www.anyscale.com/blog/training-one-million-machine-learning-models-in-record-time-with-ray) for more information on the benefits of performing many model training with Ray!\n", "\n", "This template walks through time-series forecasting using `statsforecast`, but the framework and data format can be swapped out easily -- they are there just to help you build your own application!\n", "\n", @@ -46,61 +46,84 @@ ] }, { - "attachments": {}, "cell_type": "markdown", - "id": "c56bb4d0", + "id": "182f65ea", "metadata": {}, "source": [ "## Handling Dependencies\n", "\n", "This template requires certain Python packages to be available to every node in the cluster.\n", "\n", - "> ✂️ Add your own package dependencies! 
You can specify bounds for package versions\n", - "> in the same format as a `requirements.txt` file.\n" + "> ✂️ Add your own package dependencies in the `requirements.txt` file!\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "0c9b3dec", + "id": "511f1722", + "metadata": {}, + "outputs": [], + "source": [ + "requirements_path = \"./requirements.txt\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9a44498", "metadata": { - "tags": [] + "tags": [ + "remove-cell" + ] }, "outputs": [], "source": [ - "requirements = [\n", - " \"statsforecast==1.5.0\",\n", - "]\n" + "if not os.path.exists(requirements_path):\n", + " # CWD is at the ray root in CI\n", + " requirements_path = \"doc/source/templates/tests/02_many_model_training/requirements.txt\"\n", + " assert os.path.exists(requirements_path), (requirements_path, os.getcwd())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cd9da7f", + "metadata": {}, + "outputs": [], + "source": [ + "with open(requirements_path, \"r\") as f:\n", + " requirements = f.read().strip().splitlines()\n", + "\n", + "print(\"Requirements:\")\n", + "print(\"\\n\".join(requirements))\n" ] }, { - "attachments": {}, "cell_type": "markdown", - "id": "eff9369f", + "id": "90a96c5b", "metadata": {}, "source": [ "First, we may want to use these modules right here in our script, which is running on the head node.\n", "Install the Python packages on the head node using `pip install`.\n", "\n", - "You may need to restart this notebook kernel to access the installed packages.\n" + "```{note}\n", + "You may need to restart this notebook kernel to access the installed packages.\n", + "```\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "5cba940c", + "id": "18069827", "metadata": {}, "outputs": [], "source": [ - "all_requirements = \" \".join(requirements)\n", - "\n", - "%pip install {all_requirements}\n" + "%pip install -r {requirements_path} --upgrade" ] }, { - 
"attachments": {}, "cell_type": "markdown", - "id": "1dcaea58", + "id": "3e17a4da", "metadata": {}, "source": [ "Next, we need to make sure all worker nodes also have access to the dependencies.\n", @@ -154,7 +177,7 @@ }, "outputs": [], "source": [ - "NUM_DATA_PARTITIONS: int = 1000\n" + "NUM_DATA_PARTITIONS: int = 500\n" ] }, { diff --git a/doc/source/templates/tests/02_many_model_training/requirements.txt b/doc/source/templates/tests/02_many_model_training/requirements.txt new file mode 120000 index 0000000000000..2b363f05fc09b --- /dev/null +++ b/doc/source/templates/tests/02_many_model_training/requirements.txt @@ -0,0 +1 @@ +../../02_many_model_training/requirements.txt \ No newline at end of file diff --git a/doc/source/templates/tests/03_serving_stable_diffusion/requirements.txt b/doc/source/templates/tests/03_serving_stable_diffusion/requirements.txt new file mode 120000 index 0000000000000..bb4db21916ff5 --- /dev/null +++ b/doc/source/templates/tests/03_serving_stable_diffusion/requirements.txt @@ -0,0 +1 @@ +../../03_serving_stable_diffusion/requirements.txt \ No newline at end of file diff --git a/doc/source/templates/tests/serving_stable_diffusion.ipynb b/doc/source/templates/tests/03_serving_stable_diffusion/serving_stable_diffusion.ipynb similarity index 91% rename from doc/source/templates/tests/serving_stable_diffusion.ipynb rename to doc/source/templates/tests/03_serving_stable_diffusion/serving_stable_diffusion.ipynb index 086b29335d79e..68f8a0989ffd6 100644 --- a/doc/source/templates/tests/serving_stable_diffusion.ipynb +++ b/doc/source/templates/tests/03_serving_stable_diffusion/serving_stable_diffusion.ipynb @@ -34,43 +34,61 @@ ] }, { - "attachments": {}, "cell_type": "markdown", - "id": "25364e8e", + "id": "3c8c02eb", "metadata": {}, "source": [ "## Handling Dependencies\n", "\n", "This template requires certain Python packages to be available to every node in the cluster.\n", "\n", - "> ✂️ Add your own package dependencies! 
You can specify bounds for package versions\n", - "> in the same format as a `requirements.txt` file.\n" + "> ✂️ Add your own package dependencies in the `requirements.txt` file!\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "1b79bfb9", + "id": "814d966b", "metadata": {}, "outputs": [], "source": [ - "requirements = [\n", - " \"accelerate==0.14.0\",\n", - " \"diffusers==0.15.1\",\n", - " \"numpy>=1.21.6,<=1.23.5\",\n", - " \"Pillow==9.3.0\",\n", - " \"scipy>=1.7.3,<=1.9.3\",\n", - " \"tensorboard>=2.11.2,<=2.12.0\",\n", - " \"torch==1.13.0\",\n", - " \"torchvision==0.14.0\",\n", - " \"transformers==4.28.1\",\n", - "]\n" + "requirements_path = \"./requirements.txt\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dec4a7bb", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "if not os.path.exists(requirements_path):\n", + " # CWD is at the ray root in CI\n", + " requirements_path = \"doc/source/templates/tests/03_serving_stable_diffusion/requirements.txt\"\n", + " assert os.path.exists(requirements_path), (requirements_path, os.getcwd())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0d78e94", + "metadata": {}, + "outputs": [], + "source": [ + "with open(requirements_path, \"r\") as f:\n", + " requirements = f.read().strip().splitlines()\n", + "\n", + "print(\"Requirements:\")\n", + "print(\"\\n\".join(requirements))\n" ] }, { - "attachments": {}, "cell_type": "markdown", - "id": "33419c37", + "id": "6b73761e", "metadata": {}, "source": [ "First, we may want to use these modules right here in our script, which is running on the head node.\n", @@ -84,38 +102,33 @@ { "cell_type": "code", "execution_count": null, - "id": "9aadf0c5", + "id": "2f6eaf2b", "metadata": {}, "outputs": [], "source": [ - "all_requirements = \" \".join(requirements)\n", - "\n", - "%pip install {all_requirements}\n" + "%pip install -r {requirements_path} --upgrade" ] }, { - "attachments": 
{}, "cell_type": "markdown", - "id": "4ba5feba", + "id": "4b14415f", "metadata": {}, "source": [ "Next, we need to make sure all worker nodes also have access to the dependencies.\n", "For this, use a [Ray Runtime Environment](https://docs.ray.io/en/latest/ray-core/handling-dependencies.html#runtime-environments)\n", - "to dynamically set up dependencies throughout the cluster.\n", - "\n", - "```{note}\n", - "This will be used later when setting up the Ray Serve deployment.\n", - "```\n" + "to dynamically set up dependencies throughout the cluster.\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "ca638dbb", + "id": "d8b21822", "metadata": {}, "outputs": [], "source": [ - "runtime_env = {\"pip\": requirements}\n" + "import ray\n", + "\n", + "ray.init(runtime_env={\"pip\": requirements})\n" ] }, { @@ -213,11 +226,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Configure each model replica to:\n", - "# 1. Setup the dependencies listed earlier.\n", - "# 2. Use the specified resources.\n", + "# Configure each model replica to use the specified resources.\n", "ray_actor_options = {\n", - " \"runtime_env\": runtime_env,\n", " \"num_gpus\": NUM_GPUS_PER_REPLICA,\n", "}\n" ] @@ -502,8 +512,7 @@ "metadata": {}, "outputs": [], "source": [ - "mean_generation_time = main()\n", - "serve.shutdown()\n" + "mean_generation_time = main()\n" ] }, { @@ -517,6 +526,17 @@ "testing with the local endpoint." 
] }, + { + "cell_type": "code", + "execution_count": null, + "id": "3660120b", + "metadata": {}, + "outputs": [], + "source": [ + "# Shut down the model replicas once you're done!\n", + "serve.shutdown()\n" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 2daad40a8c000..e7ad60a468c26 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -835,13 +835,13 @@ - name: workspace_template_batch_inference group: Workspace templates - working_dir: workspace_templates/tests + working_dir: workspace_templates/tests/01_batch_inference python: "3.9" frequency: nightly-3x team: ml cluster: - cluster_env: ../configs/release_test_cluster_env.yaml - cluster_compute: ../configs/compute/gpu/aws_release_test.yaml + cluster_env: ../../configs/release_test_cluster_env.yaml + cluster_compute: ../../configs/compute/gpu/aws_release_test.yaml run: timeout: 600 @@ -853,23 +853,23 @@ env: gce frequency: manual cluster: - cluster_env: ../configs/release_test_cluster_env.yaml - cluster_compute: ../configs/compute/gpu/gce_release_test.yaml + cluster_env: ../../configs/release_test_cluster_env.yaml + cluster_compute: ../../configs/compute/gpu/gce_release_test.yaml - name: workspace_template_many_model_training group: Workspace templates - working_dir: workspace_templates/tests + working_dir: workspace_templates/tests/02_many_model_training python: "3.9" frequency: nightly-3x team: ml cluster: - cluster_env: ../configs/release_test_cluster_env.yaml - cluster_compute: ../configs/compute/cpu/aws_release_test.yaml + cluster_env: ../../configs/release_test_cluster_env.yaml + cluster_compute: ../../configs/compute/cpu/aws_release_test.yaml run: timeout: 600 - script: jupyter nbconvert --to script --output _test many_model_training.ipynb && ipython _test.py + script: pip install -U -r requirements.txt && jupyter nbconvert --to script --output _test many_model_training.ipynb && ipython _test.py 
variations: - __suffix__: aws @@ -877,23 +877,23 @@ env: gce frequency: manual cluster: - cluster_env: ../configs/release_test_cluster_env.yaml - cluster_compute: ../configs/compute/cpu/gce_release_test.yaml + cluster_env: ../../configs/release_test_cluster_env.yaml + cluster_compute: ../../configs/compute/cpu/gce_release_test.yaml - name: workspace_template_serving_stable_diffusion group: Workspace templates - working_dir: workspace_templates/tests + working_dir: workspace_templates/tests/03_serving_stable_diffusion python: "3.9" frequency: nightly-3x team: ml cluster: - cluster_env: ../configs/release_test_cluster_env.yaml - cluster_compute: ../configs/compute/gpu/aws_release_test.yaml + cluster_env: ../../configs/release_test_cluster_env.yaml + cluster_compute: ../../configs/compute/gpu/aws_release_test.yaml run: - timeout: 900 - script: jupyter nbconvert --to script --output _test serving_stable_diffusion.ipynb && ipython _test.py + timeout: 600 + script: pip install -U -r requirements.txt && jupyter nbconvert --to script --output _test serving_stable_diffusion.ipynb && ipython _test.py variations: - __suffix__: aws @@ -901,8 +901,8 @@ env: gce frequency: manual cluster: - cluster_env: ../configs/release_test_cluster_env.yaml - cluster_compute: ../configs/compute/gpu/gce_release_test.yaml + cluster_env: ../../configs/release_test_cluster_env.yaml + cluster_compute: ../../configs/compute/gpu/gce_release_test.yaml #######################