Skip to content

Commit

Permalink
fix: use configured RayCluster head group service account when autosc…
Browse files Browse the repository at this point in the history
…aling

closes #242
  • Loading branch information
davidxia committed May 13, 2022
1 parent c7356da commit 785cd7e
Showing 7 changed files with 128 additions and 16 deletions.
3 changes: 1 addition & 2 deletions ray-operator/controllers/common/pod.go
Original file line number Diff line number Diff line change
@@ -8,7 +8,6 @@ import (

rayiov1alpha1 "github.com/ray-project/kuberay/ray-operator/api/raycluster/v1alpha1"
"github.com/ray-project/kuberay/ray-operator/controllers/utils"

"k8s.io/apimachinery/pkg/api/resource"

logf "sigs.k8s.io/controller-runtime/pkg/log"
@@ -43,7 +42,7 @@ func DefaultHeadPodTemplate(instance rayiov1alpha1.RayCluster, headSpec rayiov1a
if instance.Spec.EnableInTreeAutoscaling != nil && *instance.Spec.EnableInTreeAutoscaling {
headSpec.RayStartParams["no-monitor"] = "true"
// set custom service account with proper roles bound.
podTemplate.Spec.ServiceAccountName = instance.Name
podTemplate.Spec.ServiceAccountName = utils.GetHeadGroupServiceAccountName(&instance)

// Note: Starting with the upcoming Ray 1.11.0, Ray will by default no longer use Redis
// should be possible to drop some of the logic around Redis passwords at that point.
3 changes: 2 additions & 1 deletion ray-operator/controllers/common/rbac.go
Original file line number Diff line number Diff line change
@@ -2,6 +2,7 @@ package common

import (
"github.com/ray-project/kuberay/ray-operator/api/raycluster/v1alpha1"
"github.com/ray-project/kuberay/ray-operator/controllers/utils"
v1 "k8s.io/api/core/v1"
rbacv1 "k8s.io/api/rbac/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -11,7 +12,7 @@ import (
func BuildServiceAccount(cluster *v1alpha1.RayCluster) (*v1.ServiceAccount, error) {
sa := &v1.ServiceAccount{
ObjectMeta: metav1.ObjectMeta{
Name: cluster.Name,
Name: utils.GetHeadGroupServiceAccountName(cluster),
Namespace: cluster.Namespace,
Labels: map[string]string{
RayClusterLabelKey: cluster.Name,
5 changes: 3 additions & 2 deletions ray-operator/controllers/raycluster_controller.go
Original file line number Diff line number Diff line change
@@ -95,7 +95,7 @@ func (r *RayClusterReconciler) Reconcile(ctx context.Context, request ctrl.Reque
}

if instance.DeletionTimestamp != nil && !instance.DeletionTimestamp.IsZero() {
log.Info("RayCluser is being deleted, just ignore", "cluster name", request.Name)
log.Info("RayCluster is being deleted, just ignore", "cluster name", request.Name)
return ctrl.Result{}, nil
}
if err := r.reconcileAutoscalerServiceAccount(instance); err != nil {
@@ -578,7 +578,8 @@ func (r *RayClusterReconciler) reconcileAutoscalerServiceAccount(instance *rayio
}

serviceAccount := &corev1.ServiceAccount{}
namespacedName := types.NamespacedName{Namespace: instance.Namespace, Name: instance.Name}
namespacedName := types.NamespacedName{Namespace: instance.Namespace, Name: utils.GetHeadGroupServiceAccountName(instance)}

if err := r.Get(context.TODO(), namespacedName, serviceAccount); err != nil {
if !errors.IsNotFound(err) {
return err
58 changes: 48 additions & 10 deletions ray-operator/controllers/raycluster_controller_fake_test.go
Original file line number Diff line number Diff line change
@@ -24,9 +24,11 @@ import (
"github.com/ray-project/kuberay/ray-operator/controllers/common"
"github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned/scheme"
"github.com/stretchr/testify/assert"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/selection"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record"
ctrl "sigs.k8s.io/controller-runtime"
clientFake "sigs.k8s.io/controller-runtime/pkg/client/fake"
@@ -43,15 +45,17 @@ import (
)

var (
namespaceStr string
instanceName string
headGroupNameStr string
groupNameStr string
expectReplicaNum int32
testPods []runtime.Object
testRayCluster *rayiov1alpha1.RayCluster
workerSelector labels.Selector
workersToDelete []string
namespaceStr string
instanceName string
enableInTreeAutoscaling bool
headGroupNameStr string
headGroupServiceAccount string
groupNameStr string
expectReplicaNum int32
testPods []runtime.Object
testRayCluster *rayiov1alpha1.RayCluster
workerSelector labels.Selector
workersToDelete []string
)

func setupTest(t *testing.T) {
@@ -61,7 +65,9 @@ func setupTest(t *testing.T) {

namespaceStr = "default"
instanceName = "raycluster-sample"
enableInTreeAutoscaling = true
headGroupNameStr = "head-group"
headGroupServiceAccount = "head-service-account"
groupNameStr = "small-group"
expectReplicaNum = 3
workersToDelete = []string{"pod1", "pod2"}
@@ -152,7 +158,8 @@ func setupTest(t *testing.T) {
Namespace: namespaceStr,
},
Spec: rayiov1alpha1.RayClusterSpec{
RayVersion: "1.0",
RayVersion: "1.0",
EnableInTreeAutoscaling: &enableInTreeAutoscaling,
HeadGroupSpec: rayiov1alpha1.HeadGroupSpec{
ServiceType: "ClusterIP",
Replicas: pointer.Int32Ptr(1),
@@ -166,6 +173,7 @@ func setupTest(t *testing.T) {
},
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
ServiceAccountName: headGroupServiceAccount,
Containers: []corev1.Container{
{
Name: "ray-head",
@@ -555,3 +563,33 @@ func getNotFailedPodItemNum(podList corev1.PodList) int {

return count
}

func TestReconcile_AutoscalerServiceAccount(t *testing.T) {
setupTest(t)
defer tearDown(t)

fakeClient := clientFake.NewClientBuilder().WithRuntimeObjects(testPods...).Build()

saNamespacedName := types.NamespacedName{
Name: headGroupServiceAccount,
Namespace: namespaceStr,
}
sa := corev1.ServiceAccount{}
err := fakeClient.Get(context.Background(), saNamespacedName, &sa)

assert.True(t, errors.IsNotFound(err), "Head group service account should not exist yet")

testRayClusterReconciler := &RayClusterReconciler{
Client: fakeClient,
Recorder: &record.FakeRecorder{},
Scheme: scheme.Scheme,
Log: ctrl.Log.WithName("controllers").WithName("RayCluster"),
}

err = testRayClusterReconciler.reconcileAutoscalerServiceAccount(testRayCluster)
assert.Nil(t, err, "Fail to reconcile autoscaler ServiceAccount")

err = fakeClient.Get(context.Background(), saNamespacedName, &sa)

assert.Nil(t, err, "Fail to get head group ServiceAccount after reconciliation")
}
13 changes: 12 additions & 1 deletion ray-operator/controllers/raycluster_controller_test.go
Original file line number Diff line number Diff line change
@@ -46,14 +46,16 @@ const (
var _ = Context("Inside the default namespace", func() {
ctx := context.TODO()
var workerPods corev1.PodList
var enableInTreeAutoscaling = true

myRayCluster := &rayiov1alpha1.RayCluster{
ObjectMeta: metav1.ObjectMeta{
Name: "raycluster-sample",
Namespace: "default",
},
Spec: rayiov1alpha1.RayClusterSpec{
RayVersion: "1.0",
RayVersion: "1.0",
EnableInTreeAutoscaling: &enableInTreeAutoscaling,
HeadGroupSpec: rayiov1alpha1.HeadGroupSpec{
ServiceType: "ClusterIP",
Replicas: pointer.Int32Ptr(1),
@@ -67,6 +69,7 @@ var _ = Context("Inside the default namespace", func() {
},
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
ServiceAccountName: "head-service-account",
Containers: []corev1.Container{
{
Name: "ray-head",
@@ -174,6 +177,14 @@ var _ = Context("Inside the default namespace", func() {
Expect(pod.Status.Phase).Should(Or(Equal(v1.PodPending), Equal(v1.PodRunning)))
})

It("should create the head group's specified K8s ServiceAccount if it doesn't exist", func() {
saName := utils.GetHeadGroupServiceAccountName(myRayCluster)
sa := &corev1.ServiceAccount{}
Eventually(
getResourceFunc(ctx, client.ObjectKey{Name: saName, Namespace: "default"}, sa),
time.Second*15, time.Millisecond*500).Should(BeNil(), "My head group ServiceAccount = %v", saName)
})

It("should re-create a deleted worker", func() {
Eventually(
listResourceFunc(ctx, &workerPods, filterLabels, &client.ListOptions{Namespace: "default"}),
10 changes: 10 additions & 0 deletions ray-operator/controllers/utils/util.go
Original file line number Diff line number Diff line change
@@ -165,3 +165,13 @@ func FilterContainerByName(containers []corev1.Container, name string) (corev1.C

return corev1.Container{}, fmt.Errorf("can not find container %s", name)
}

// GetHeadGroupServiceAccountName returns the head group service account if it exists.
// Otherwise, it returns the name of the cluster itself.
func GetHeadGroupServiceAccountName(cluster *rayiov1alpha1.RayCluster) string {
headGroupServiceAccountName := cluster.Spec.HeadGroupSpec.Template.Spec.ServiceAccountName
if headGroupServiceAccountName != "" {
return headGroupServiceAccountName
}
return cluster.Name
}
52 changes: 52 additions & 0 deletions ray-operator/controllers/utils/util_test.go
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@ import (

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

rayiov1alpha1 "github.com/ray-project/kuberay/ray-operator/api/raycluster/v1alpha1"
corev1 "k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
)
@@ -65,3 +66,54 @@ func createSomePod() (pod *corev1.Pod) {
},
}
}

func TestGetHeadGroupServiceAccountName(t *testing.T) {
tests := map[string]struct {
input *rayiov1alpha1.RayCluster
want string
}{
"Ray cluster with head group service account": {
input: &rayiov1alpha1.RayCluster{
ObjectMeta: metav1.ObjectMeta{
Name: "raycluster-sample",
Namespace: "default",
},
Spec: rayiov1alpha1.RayClusterSpec{
HeadGroupSpec: rayiov1alpha1.HeadGroupSpec{
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
ServiceAccountName: "my-service-account",
},
},
},
},
},
want: "my-service-account",
},
"Ray cluster without head group service account": {
input: &rayiov1alpha1.RayCluster{
ObjectMeta: metav1.ObjectMeta{
Name: "raycluster-sample",
Namespace: "default",
},
Spec: rayiov1alpha1.RayClusterSpec{
HeadGroupSpec: rayiov1alpha1.HeadGroupSpec{
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{},
},
},
},
},
want: "raycluster-sample",
},
}

for name, tc := range tests {
t.Run(name, func(t *testing.T) {
got := GetHeadGroupServiceAccountName(tc.input)
if got != tc.want {
t.Fatalf("got %s, want %s", got, tc.want)
}
})
}
}

0 comments on commit 785cd7e

Please sign in to comment.