The "Training and Deploying a TensorFlow Model in Vertex AI" lab's (lab_exercise_long.ipynb) CustomContainerTrainingJob errors out with CPU quota violation
#2457 · Open
MrCsabaToth opened this issue on Oct 4, 2023 · 2 comments
This kicks off the job (53rd code block):
job = aiplatform.CustomContainerTrainingJob(
display_name="online-retail-clv-3M-dnn-regressor",
container_uri=IMAGE_URI,
# https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers
# gcr.io/cloud-aiplatform/prediction/tf2-cpu.2-3:latest
model_serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-3:latest",
)
model = job.run(
dataset=tabular_dataset,
model_display_name=MODEL_NAME,
# GCS custom job output dir.
base_output_dir=BASE_OUTPUT_DIR,
# the BQ Tabular dataset splits will be written out to their own BQ dataset for reproducibility.
bigquery_destination=f"bq://{PROJECT_ID}",
# this corresponds to the BigQuery data split column.
predefined_split_column_name="data_split",
# the model training command line arguments defined in trainer.task.
args=CMD_ARGS,
# Custom job WorkerPool arguments.
replica_count=1,
machine_type="c2-standard-4",
# Provide your Tensorboard resource name to write Tensorboard logs during training.
tensorboard=TENSORBOARD_RESOURCE_NAME,
# Provide your Vertex custom training service account created during lab setup.
service_account=f"vertex-custom-training-sa@{PROJECT_ID}.iam.gserviceaccount.com"
)
The error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[53], line 9
1 job = aiplatform.CustomContainerTrainingJob(
2 display_name="online-retail-clv-3M-dnn-regressor",
3 container_uri=IMAGE_URI,
(...)
6 model_serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-3:latest",
7 )
----> 9 model = job.run(
10 dataset=tabular_dataset,
11 model_display_name=MODEL_NAME,
12 # GCS custom job output dir.
13 base_output_dir=BASE_OUTPUT_DIR,
14 # the BQ Tabular dataset splits will be written out to their own BQ dataset for reproducibility.
15 bigquery_destination=f"bq://{PROJECT_ID}",
16 # this corresponds to the BigQuery data split column.
17 predefined_split_column_name="data_split",
18 # the model training command line arguments defined in trainer.task.
19 args=CMD_ARGS,
20 # Custom job WorkerPool arguments.
21 replica_count=1,
22 machine_type="c2-standard-4",
23 # Provide your Tensorboard resource name to write Tensorboard logs during training.
24 tensorboard=TENSORBOARD_RESOURCE_NAME,
25 # Provide your Vertex custom training service account created during lab setup.
26 service_account=f"vertex-custom-training-sa@{PROJECT_ID}.iam.gserviceaccount.com"
27 )
File /opt/conda/lib/python3.9/site-packages/google/cloud/aiplatform/training_jobs.py:4596, in CustomContainerTrainingJob.run(self, dataset, annotation_schema_uri, model_display_name, model_labels, model_id, parent_model, is_default_version, model_version_aliases, model_version_description, base_output_dir, service_account, network, bigquery_destination, args, environment_variables, replica_count, machine_type, accelerator_type, accelerator_count, boot_disk_type, boot_disk_size_gb, reduction_server_replica_count, reduction_server_machine_type, reduction_server_container_uri, training_fraction_split, validation_fraction_split, test_fraction_split, training_filter_split, validation_filter_split, test_filter_split, predefined_split_column_name, timestamp_split_column_name, timeout, restart_job_on_worker_restart, enable_web_access, enable_dashboard_access, tensorboard, sync, create_request_timeout, disable_retries)
4581 network = network or initializer.global_config.network
4583 worker_pool_specs, managed_model = self._prepare_and_validate_run(
4584 model_display_name=model_display_name,
4585 model_labels=model_labels,
(...)
4593 reduction_server_machine_type=reduction_server_machine_type,
4594 )
-> 4596 return self._run(
4597 dataset=dataset,
4598 annotation_schema_uri=annotation_schema_uri,
4599 worker_pool_specs=worker_pool_specs,
4600 managed_model=managed_model,
4601 model_id=model_id,
4602 parent_model=parent_model,
4603 is_default_version=is_default_version,
4604 model_version_aliases=model_version_aliases,
4605 model_version_description=model_version_description,
4606 args=args,
4607 environment_variables=environment_variables,
4608 base_output_dir=base_output_dir,
4609 service_account=service_account,
4610 network=network,
4611 bigquery_destination=bigquery_destination,
4612 training_fraction_split=training_fraction_split,
4613 validation_fraction_split=validation_fraction_split,
4614 test_fraction_split=test_fraction_split,
4615 training_filter_split=training_filter_split,
4616 validation_filter_split=validation_filter_split,
4617 test_filter_split=test_filter_split,
4618 predefined_split_column_name=predefined_split_column_name,
4619 timestamp_split_column_name=timestamp_split_column_name,
4620 timeout=timeout,
4621 restart_job_on_worker_restart=restart_job_on_worker_restart,
4622 enable_web_access=enable_web_access,
4623 enable_dashboard_access=enable_dashboard_access,
4624 tensorboard=tensorboard,
4625 reduction_server_container_uri=reduction_server_container_uri
4626 if reduction_server_replica_count > 0
4627 else None,
4628 sync=sync,
4629 create_request_timeout=create_request_timeout,
4630 disable_retries=disable_retries,
4631 )
File /opt/conda/lib/python3.9/site-packages/google/cloud/aiplatform/base.py:809, in optional_sync.<locals>.optional_run_in_thread.<locals>.wrapper(*args, **kwargs)
807 if self:
808 VertexAiResourceNounWithFutureManager.wait(self)
--> 809 return method(*args, **kwargs)
811 # callbacks to call within the Future (in same Thread)
812 internal_callbacks = []
File /opt/conda/lib/python3.9/site-packages/google/cloud/aiplatform/training_jobs.py:5302, in CustomContainerTrainingJob._run(self, dataset, annotation_schema_uri, worker_pool_specs, managed_model, model_id, parent_model, is_default_version, model_version_aliases, model_version_description, args, environment_variables, base_output_dir, service_account, network, bigquery_destination, training_fraction_split, validation_fraction_split, test_fraction_split, training_filter_split, validation_filter_split, test_filter_split, predefined_split_column_name, timestamp_split_column_name, timeout, restart_job_on_worker_restart, enable_web_access, enable_dashboard_access, tensorboard, reduction_server_container_uri, sync, create_request_timeout, block, disable_retries)
5281 spec["containerSpec"]["env"] = [
5282 {"name": key, "value": value}
5283 for key, value in environment_variables.items()
5284 ]
5286 (
5287 training_task_inputs,
5288 base_output_dir,
(...)
5299 disable_retries=disable_retries,
5300 )
-> 5302 model = self._run_job(
5303 training_task_definition=schema.training_job.definition.custom_task,
5304 training_task_inputs=training_task_inputs,
5305 dataset=dataset,
5306 annotation_schema_uri=annotation_schema_uri,
5307 training_fraction_split=training_fraction_split,
5308 validation_fraction_split=validation_fraction_split,
5309 test_fraction_split=test_fraction_split,
5310 training_filter_split=training_filter_split,
5311 validation_filter_split=validation_filter_split,
5312 test_filter_split=test_filter_split,
5313 predefined_split_column_name=predefined_split_column_name,
5314 timestamp_split_column_name=timestamp_split_column_name,
5315 model=managed_model,
5316 model_id=model_id,
5317 parent_model=parent_model,
5318 is_default_version=is_default_version,
5319 model_version_aliases=model_version_aliases,
5320 model_version_description=model_version_description,
5321 gcs_destination_uri_prefix=base_output_dir,
5322 bigquery_destination=bigquery_destination,
5323 create_request_timeout=create_request_timeout,
5324 block=block,
5325 )
5327 return model
File /opt/conda/lib/python3.9/site-packages/google/cloud/aiplatform/training_jobs.py:830, in _TrainingJob._run_job(self, training_task_definition, training_task_inputs, dataset, training_fraction_split, validation_fraction_split, test_fraction_split, training_filter_split, validation_filter_split, test_filter_split, predefined_split_column_name, timestamp_split_column_name, annotation_schema_uri, model, model_id, parent_model, is_default_version, model_version_aliases, model_version_description, gcs_destination_uri_prefix, bigquery_destination, create_request_timeout, block)
826 self._gca_resource = training_pipeline
828 _LOGGER.info("View Training:\n%s" % self._dashboard_uri())
--> 830 model = self._get_model(block=block)
832 if model is None:
833 _LOGGER.warning(
834 "Training did not produce a Managed Model returning None. "
835 + self._model_upload_fail_string
836 )
File /opt/conda/lib/python3.9/site-packages/google/cloud/aiplatform/training_jobs.py:918, in _TrainingJob._get_model(self, block)
908 """Helper method to get and instantiate the Model to Upload.
909
910 Returns:
(...)
915 RuntimeError: If Training failed.
916 """
917 if block:
--> 918 self._block_until_complete()
920 if self.has_failed:
921 raise RuntimeError(
922 f"Training Pipeline {self.resource_name} failed. No model available."
923 )
File /opt/conda/lib/python3.9/site-packages/google/cloud/aiplatform/training_jobs.py:961, in _TrainingJob._block_until_complete(self)
958 self._wait_callback()
959 time.sleep(_JOB_WAIT_TIME)
--> 961 self._raise_failure()
963 _LOGGER.log_action_completed_against_resource("run", "completed", self)
965 if self._gca_resource.model_to_upload and not self.has_failed:
File /opt/conda/lib/python3.9/site-packages/google/cloud/aiplatform/training_jobs.py:978, in _TrainingJob._raise_failure(self)
971 """Helper method to raise failure if TrainingPipeline fails.
972
973 Raises:
974 RuntimeError: If training failed.
975 """
977 if self._gca_resource.error.code != code_pb2.OK:
--> 978 raise RuntimeError("Training failed with:\n%s" % self._gca_resource.error)
RuntimeError: Training failed with:
code: 8
message: "The following quota metrics exceed quota limits: aiplatform.googleapis.com/custom_model_training_c2_cpus"
Error code 8 is the gRPC RESOURCE_EXHAUSTED status. Cloud Console shows that the assigned quota for aiplatform.googleapis.com/custom_model_training_c2_cpus is zero in every region.
Switching to an N1 or E2 machine type could help, since those draw on aiplatform.googleapis.com/custom_model_training_cpus, which has a more generous limit (20 or 42 CPUs, depending on the region).
I've tested with e2-standard-4 and the pipeline runs as expected. I'm not sure whether this workaround conflicts with the Qwiklabs policy of no deviation from the instructions! :roll_eyes:
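For anyone hitting the same error, here is a minimal sketch of that workaround. It assumes the variables defined earlier in the lab notebook (IMAGE_URI, tabular_dataset, MODEL_NAME, BASE_OUTPUT_DIR, PROJECT_ID, CMD_ARGS, TENSORBOARD_RESOURCE_NAME) are still in scope; the only change from the lab code is the machine_type argument:

from google.cloud import aiplatform

# Same job definition as in the lab, unchanged.
job = aiplatform.CustomContainerTrainingJob(
    display_name="online-retail-clv-3M-dnn-regressor",
    container_uri=IMAGE_URI,
    model_serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-3:latest",
)
model = job.run(
    dataset=tabular_dataset,
    model_display_name=MODEL_NAME,
    base_output_dir=BASE_OUTPUT_DIR,
    bigquery_destination=f"bq://{PROJECT_ID}",
    predefined_split_column_name="data_split",
    args=CMD_ARGS,
    replica_count=1,
    # e2-standard-4 counts against custom_model_training_cpus rather than
    # custom_model_training_c2_cpus, so it stays within the default quota.
    machine_type="e2-standard-4",
    tensorboard=TENSORBOARD_RESOURCE_NAME,
    service_account=f"vertex-custom-training-sa@{PROJECT_ID}.iam.gserviceaccount.com",
)

An N1 machine type (e.g. n1-standard-4) should count against the same quota metric and work the same way.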