Skip to content

Commit

Permalink
Improve RayJob controller quality to alpha
Browse files Browse the repository at this point in the history
1. Implement `spec.shutdownAfterJobFinishes`, delete the cluster once the job finishes if this field is set
2. Add a new status value `JobDeploymentStatusComplete` to indicate that the RayJob is complete
3. Add status fields `Message`, `StartTime`, and `EndTime` to expose more job status information
4. Improve logging in the operators and make the log messages consistent
5. Optimize requeueAfter for time-consuming operations, such as waiting for the dashboard to become ready (the container takes time to start).
  • Loading branch information
Jeffwan committed Jul 23, 2022
1 parent 2616746 commit 5741479
Show file tree
Hide file tree
Showing 9 changed files with 297 additions and 128 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/test-job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@ jobs:
# Default value should work for both pull_request and merge(push) event.
ref: ${{github.event.pull_request.head.sha}}

- name: Install goimports
run: go get golang.org/x/tools/cmd/goimports
- name: Install goimports and gofumpt
run: |
go get golang.org/x/tools/cmd/goimports
go install mvdan.cc/gofumpt@latest
- name: Run gofmt
uses: Jerome1337/gofmt-action@v1.0.4
Expand Down Expand Up @@ -89,9 +91,7 @@ jobs:
if: failure()

- name: Run gofumpt
run: |
go install mvdan.cc/gofumpt@latest
test -z "$(set -o pipefail && $(go env GOPATH)/bin/gofumpt -l apiserver/ ray-operator/ cli/ | tee gofumpt.out)" || { cat gofumpt.out && exit 1; }
run: test -z "$(set -o pipefail && $(go env GOPATH)/bin/gofumpt -l apiserver/ ray-operator/ cli/ | tee gofumpt.out)" || { cat gofumpt.out && exit 1; }

- name: Open this to see how to fix gofumpt if it fails
run: |
Expand Down
22 changes: 18 additions & 4 deletions ray-operator/apis/ray/v1alpha1/rayjob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.

// JobStatus is the Ray Job Status. https://docs.ray.io/en/latest/cluster/jobs-package-ref.html#jobstatus
type JobStatus string

const (
Expand All @@ -17,6 +18,7 @@ const (
JobStatusFailed JobStatus = "FAILED"
)

// JobDeploymentStatus indicates RayJob status including RayCluster lifecycle management and Job submission
type JobDeploymentStatus string

const (
Expand All @@ -26,6 +28,7 @@ const (
JobDeploymentStatusFailedJobDeploy JobDeploymentStatus = "FailedJobDeploy"
JobDeploymentStatusRunning JobDeploymentStatus = "Running"
JobDeploymentStatusFailedToGetJobStatus JobDeploymentStatus = "FailedToGetJobStatus"
JobDeploymentStatusComplete JobDeploymentStatus = "Complete"
)

// RayJobSpec defines the desired state of RayJob
Expand All @@ -37,10 +40,14 @@ type RayJobSpec struct {
Metadata map[string]string `json:"metadata,omitempty"`
// RuntimeEnv is base64 encoded.
RuntimeEnv string `json:"runtimeEnv,omitempty"`
// TODO: If set to true, the rayCluster will be deleted after the rayJob finishes
ShutdownAfterJobFinishes bool `json:"shutdownAfterJobFinishes,omitempty"`
// If jobId is not set, a new jobId will be auto-generated.
JobId string `json:"jobId,omitempty"`
JobId string `json:"jobId,omitempty"`
// ShutdownAfterJobFinishes determines whether to delete the ray cluster once the rayJob succeeds or fails.
ShutdownAfterJobFinishes bool `json:"shutdownAfterJobFinishes,omitempty"`
// TTLSecondsAfterFinished is the TTL to clean up RayCluster.
// It only takes effect when ShutdownAfterJobFinishes is set to true.
TTLSecondsAfterFinished *int32 `json:"ttlSecondsAfterFinished,omitempty"`
// RayClusterSpec is the cluster template to run the job
RayClusterSpec RayClusterSpec `json:"rayClusterSpec,omitempty"`
// clusterSelector is used to select running rayclusters by labels
ClusterSelector map[string]string `json:"clusterSelector,omitempty"`
Expand All @@ -55,7 +62,14 @@ type RayJobStatus struct {
DashboardURL string `json:"dashboardURL,omitempty"`
JobStatus JobStatus `json:"jobStatus,omitempty"`
JobDeploymentStatus JobDeploymentStatus `json:"jobDeploymentStatus,omitempty"`
RayClusterStatus RayClusterStatus `json:"rayClusterStatus,omitempty"`
Message string `json:"message,omitempty"`
// Represents time when the job was acknowledged by the Ray cluster.
// It is not guaranteed to be set in happens-before order across separate operations.
// It is represented in RFC3339 form
StartTime *metav1.Time `json:"startTime,omitempty"`
// Represents time when the job was ended.
EndTime *metav1.Time `json:"endTime,omitempty"`
RayClusterStatus RayClusterStatus `json:"rayClusterStatus,omitempty"`
}

//+kubebuilder:object:root=true
Expand Down
13 changes: 13 additions & 0 deletions ray-operator/apis/ray/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 21 additions & 4 deletions ray-operator/config/crd/bases/ray.io_rayjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,7 @@ spec:
description: Metadata is data to store along with this job.
type: object
rayClusterSpec:
description: 'EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
NOTE: json tags are required.'
description: RayClusterSpec is the cluster template to run the job
properties:
autoscalerOptions:
description: AutoscalerOptions specifies optional configuration
Expand Down Expand Up @@ -11493,9 +11492,13 @@ spec:
description: RuntimeEnv is base64 encoded.
type: string
shutdownAfterJobFinishes:
description: 'TODO: If set to true, the rayCluster will be deleted
after the rayJob finishes'
description: ShutdownAfterJobFinishes will determine whether to delete
the ray cluster once rayJob succeed or fai
type: boolean
ttlSecondsAfterFinished:
description: TTLSecondsAfterFinished is the TTL to clean up RayCluster.
format: int32
type: integer
required:
- entrypoint
type: object
Expand All @@ -11504,13 +11507,22 @@ spec:
properties:
dashboardURL:
type: string
endTime:
description: Represents time when the job was ended.
format: date-time
type: string
jobDeploymentStatus:
description: JobDeploymentStatus indicates RayJob status including
RayCluster lifecycle management and Job submis
type: string
jobId:
description: 'INSERT ADDITIONAL STATUS FIELD - define observed state
of cluster Important: Run "make" to regenerat'
type: string
jobStatus:
description: JobStatus is the Ray Job Status. https://docs.ray.io/en/latest/cluster/jobs-package-ref.
type: string
message:
type: string
rayClusterName:
type: string
Expand Down Expand Up @@ -11553,6 +11565,11 @@ spec:
state of cluster Important: Run "make" to regenerat'
type: string
type: object
startTime:
description: Represents time when the job was acknowledged by the
Ray cluster.
format: date-time
type: string
type: object
type: object
served: true
Expand Down
Loading

0 comments on commit 5741479

Please sign in to comment.