Skip to content

Commit

Permalink
[k8s] Allow spot instances on supported k8s clusters (#3675)
Browse files Browse the repository at this point in the history
* Add spot support

* comment

* patch get_spot_label for tests
  • Loading branch information
romilbhardwaj authored and Michaelvll committed Aug 23, 2024
1 parent 4be92f3 commit c9ad0d6
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 2 deletions.
15 changes: 14 additions & 1 deletion sky/clouds/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,8 @@ def ssh_key_secret_field_name(self):
def _unsupported_features_for_resources(
cls, resources: 'resources_lib.Resources'
) -> Dict[clouds.CloudImplementationFeatures, str]:
unsupported_features = cls._CLOUD_UNSUPPORTED_FEATURES
unsupported_features = cls._CLOUD_UNSUPPORTED_FEATURES.copy()
# Features to be disabled for exec auth
is_exec_auth, message = kubernetes_utils.is_kubeconfig_exec_auth()
if is_exec_auth:
assert isinstance(message, str), message
Expand All @@ -110,6 +111,11 @@ def _unsupported_features_for_resources(
# Pod does not have permissions to terminate itself with exec auth.
unsupported_features[
clouds.CloudImplementationFeatures.AUTO_TERMINATE] = message
# Allow spot instances if supported by the cluster
spot_label_key, _ = kubernetes_utils.get_spot_label()
if spot_label_key is not None:
unsupported_features.pop(
clouds.CloudImplementationFeatures.SPOT_INSTANCE, None)
return unsupported_features

@classmethod
Expand Down Expand Up @@ -302,6 +308,11 @@ def make_deploy_resources_variables(

fuse_device_required = bool(resources.requires_fuse)

# Configure spot labels, if requested and supported
spot_label_key, spot_label_value = None, None
if resources.use_spot:
spot_label_key, spot_label_value = kubernetes_utils.get_spot_label()

deploy_vars = {
'instance_type': resources.instance_type,
'custom_resources': custom_resources,
Expand All @@ -323,6 +334,8 @@ def make_deploy_resources_variables(
'k8s_fuse_device_required': fuse_device_required,
# Namespace to run the FUSE device manager in
'k8s_skypilot_system_namespace': _SKYPILOT_SYSTEM_NAMESPACE,
'k8s_spot_label_key': spot_label_key,
'k8s_spot_label_value': spot_label_value,
'image_id': image_id,
}

Expand Down
38 changes: 38 additions & 0 deletions sky/provision/kubernetes/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1528,6 +1528,44 @@ def get_autoscaler_type(
return autoscaler_type


# Known spot node labels, keyed by the autoscaler type value of the cluster
# flavor that applies them. Each entry maps to a (label key, label value)
# pair. To enable spot instances on another cluster type, register its spot
# label key and value here.
SPOT_LABEL_MAP = {
    kubernetes_enums.KubernetesAutoscalerType.GKE.value: (
        'cloud.google.com/gke-spot',
        'true',
    ),
}


def get_spot_label() -> Tuple[Optional[str], Optional[str]]:
    """Get the spot label key and value for using spot instances, if supported.

    Checks if the underlying cluster supports spot instances by checking nodes
    for known spot label keys and values. If found, returns the spot label key
    and value. If not, checks if autoscaler is configured and returns
    appropriate labels. If neither are found, returns (None, None).

    Returns:
        Tuple[Optional[str], Optional[str]]: Tuple containing the spot label
            key and value. Both elements are None if spot instances are not
            supported.
    """
    # Check if the cluster supports spot instances by checking nodes for known
    # spot label keys and values
    for node in get_kubernetes_nodes():
        # A node's labels may be None when it carries no labels at all; treat
        # that as an empty mapping instead of failing the membership test.
        node_labels = node.metadata.labels or {}
        for key, value in SPOT_LABEL_MAP.values():
            if node_labels.get(key) == value:
                return key, value

    # Check if autoscaler is configured. Allow spot instances if autoscaler type
    # is known to support spot instances.
    autoscaler_type = get_autoscaler_type()
    if autoscaler_type == kubernetes_enums.KubernetesAutoscalerType.GKE:
        return SPOT_LABEL_MAP[autoscaler_type.value]

    return None, None


def dict_to_k8s_object(object_dict: Dict[str, Any], object_type: 'str') -> Any:
"""Converts a dictionary to a Kubernetes object.
Expand Down
15 changes: 14 additions & 1 deletion sky/templates/kubernetes-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -276,9 +276,22 @@ available_node_types:
restartPolicy: Never

# Add node selector if GPUs or spot instances are requested:
{% if k8s_acc_label_key is not none and k8s_acc_label_value is not none %}
{% if (k8s_acc_label_key is not none and k8s_acc_label_value is not none) or (k8s_spot_label_key is not none) %}
nodeSelector:
{% if k8s_acc_label_key is not none and k8s_acc_label_value is not none %}
{{k8s_acc_label_key}}: {{k8s_acc_label_value}}
{% endif %}
{% if k8s_spot_label_key is not none %}
{{k8s_spot_label_key}}: {{k8s_spot_label_value|tojson}}
{% endif %}
{% endif %}

{% if k8s_spot_label_key is not none %}
tolerations:
- key: {{k8s_spot_label_key}}
operator: Equal
value: {{k8s_spot_label_value|tojson}}
effect: NoSchedule
{% endif %}

# This volume allocates shared memory for Ray to use for its plasma
Expand Down
2 changes: 2 additions & 0 deletions tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,5 @@ def _get_az_mappings(_):
lambda *_args, **_kwargs: [True, []])
monkeypatch.setattr('sky.provision.kubernetes.utils.check_instance_fits',
lambda *_args, **_kwargs: [True, ''])
monkeypatch.setattr('sky.provision.kubernetes.utils.get_spot_label',
lambda *_args, **_kwargs: [None, None])

0 comments on commit c9ad0d6

Please sign in to comment.