add cluster nodes info test #299

Merged (2 commits) on Jun 28, 2022

Changes from 1 commit:
add cluster nodes info test
wilsonwang371 committed Jun 13, 2022
commit b2f4791b22dae7727c0d7698c84fa800819eb8e0
4 changes: 2 additions & 2 deletions ray-operator/controllers/ray/raycluster_controller_test.go
@@ -39,7 +39,7 @@ import (
)

const (
-DefaultAttempts = 8
+DefaultAttempts = 16
DefaultSleepDurationInSeconds = 3
)

@@ -206,7 +206,7 @@ var _ = Context("Inside the default namespace", func() {
// adding a scale down
Eventually(
getResourceFunc(ctx, client.ObjectKey{Name: myRayCluster.Name, Namespace: "default"}, myRayCluster),
-time.Second*3, time.Millisecond*500).Should(BeNil(), "My raycluster = %v", myRayCluster)
+time.Second*5, time.Millisecond*500).Should(BeNil(), "My raycluster = %v", myRayCluster)
rep := new(int32)
*rep = 2
myRayCluster.Spec.WorkerGroupSpecs[0].Replicas = rep
55 changes: 50 additions & 5 deletions tests/compatibility-test.py
@@ -1,13 +1,13 @@
#!/usr/bin/env python
import logging
import os
+import sys
import tempfile
+import time
import unittest
from string import Template

import docker
-import sys
-import time

ray_version = '1.9.0'
ray_image = "rayproject/ray:1.9.0"
@@ -21,6 +21,7 @@

kuberay_sha = 'nightly'


def shell_run(cmd):
logger.info(cmd)
return os.system(cmd)
@@ -61,6 +62,8 @@ def create_kuberay_cluster():
f.write(raycluster_spec_buf)
raycluster_spec_file = f.name

time.sleep(30)

shell_assert_success('kubectl wait --for=condition=ready pod -n ray-system --all --timeout=1600s')
assert raycluster_spec_file is not None
shell_assert_success('kubectl apply -f {}'.format(raycluster_spec_file))
@@ -85,21 +88,30 @@ def download_images():


class BasicRayTestCase(unittest.TestCase):
-def setUp(self):

+@classmethod
+def setUpClass(cls):
# Ray cluster is running inside a local Kind environment.
# We use port mapping to connect to the Kind environment
# from another local ray container. The local ray container
# outside the Kind environment has the same ray version as the
# ray cluster running inside the Kind environment.
create_cluster()
apply_kuberay_resources()
download_images()
create_kuberay_cluster()

def test_simple_code(self):
# connect from a ray container client to the ray cluster
# inside a local Kind environment and run a simple test
client = docker.from_env()
container = client.containers.run(ray_image,
remove=True,
detach=True,
tty=True,
network_mode='host')
rtn_code, output = container.exec_run(['python',
-'-c', '''
+'-c', '''
import ray
ray.init(address='ray://127.0.0.1:10001')

@@ -125,7 +137,40 @@ def f(x):

client.close()

-def tearDown(self):
+def test_cluster_info(self):
# connect from a ray container client to the ray cluster
# inside a local Kind environment and run a test that
# gets the number of nodes in the ray cluster.
client = docker.from_env()
container = client.containers.run(ray_image,

Member:

containers.run is equivalent to docker run or docker exec? It seems it just creates a new container based on ray_image and executes the following script locally, but does not test against the kuberay cluster. @wilsonwang371

Collaborator Author:

Correct. This container uses the host network and will connect to the ray cluster through the host port exposed by kind.
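
For reference, a minimal, hedged sketch of the pattern described in this reply: it assumes the docker Python SDK is installed locally and that kind forwards the Ray head's client port 10001 to the host (the image tag and address mirror the values used in this test).

```python
# Sketch only: start a throwaway container on the host network and run a
# Ray Client check against the port that kind exposes on the host, which
# forwards to the Ray head pod inside the Kind cluster.
import docker

client = docker.from_env()
container = client.containers.run(
    "rayproject/ray:1.9.0",   # same Ray version as the cluster under test
    remove=True,
    detach=True,
    tty=True,
    network_mode="host",      # host networking reaches kind's exposed port
)
exit_code, output = container.exec_run(
    ["python", "-c",
     "import ray; ray.init(address='ray://127.0.0.1:10001'); print(len(ray.nodes()))"],
    demux=True,
)
stdout, _ = output
print(exit_code, stdout)
container.stop()
client.close()
```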

Collaborator:

Could you add a couple of comments in the code explaining the use of the docker client?

Member:

@wilsonwang371 Let's add a comment here? Otherwise, others may run into the same confusion I did. This code snippet assumes the following prerequisites:

https://github.com/ray-project/kuberay/blob/7f16c1205ee7f484ee08e68d38d563947e31a5fa/tests/config/raycluster-service.yaml#L18-L21

https://github.com/ray-project/kuberay/blob/7f16c1205ee7f484ee08e68d38d563947e31a5fa/tests/config/cluster-config.yaml#L20-L23

Using ray.util.connect probably makes more sense because it won't init the cluster in any scenario.
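
As an illustration of that prerequisite (not part of this PR): the container-based test can only work if the kind node actually exposes the Ray Client port on the host, which a quick local check can confirm.

```python
# Illustrative check, assuming the NodePort / extraPortMappings config linked
# above forwards the Ray Client port 10001 to 127.0.0.1 on the host.
import socket

with socket.create_connection(("127.0.0.1", 10001), timeout=5):
    print("Ray Client port 10001 is reachable on the host")
```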

Collaborator Author:

The reason for not calling ray.util.connect is that we want exactly the same version of Ray for both the ray cluster and the ray client.

Collaborator:

For KubeRay tests in the Ray CI, we install Ray in the host environment to test Ray Client and Job Submission.

Using docker directly works as well. kubectl exec would probably work too.
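
A rough sketch of the kubectl exec alternative mentioned here; the namespace and the rayNodeType=head label selector are assumptions for illustration (based on the labels used in the templates in this repo), not something this PR implements.

```python
# Hypothetical alternative: run the same node-count check inside the Ray
# head pod via kubectl exec instead of a separate docker container.
import subprocess

head_pod = subprocess.check_output(
    ["kubectl", "get", "pods", "-n", "default", "-l", "rayNodeType=head",
     "-o", "jsonpath={.items[0].metadata.name}"],
    text=True,
).strip()

check = "import ray; ray.init(address='auto'); print(len(ray.nodes()))"
out = subprocess.check_output(
    ["kubectl", "exec", "-n", "default", head_pod, "--", "python", "-c", check],
    text=True,
)
print(out.strip())  # expect "2": head plus one worker, as in the test above
```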

Collaborator Author:

Maybe later we can switch to installing a particular version of Ray in the host environment too.

Member:

> The reason for not calling ray.util.connect is that we want exactly the same version of Ray for both the ray cluster and the ray client.

Hmm, is this related? We can still use the exact same version by using ray.util.connect instead of ray.init. Am I missing something?

Collaborator:

ray.init("ray://...") is the preferred API for Ray Client these days.

If I understand right, the issue is that we don't currently have the same Ray version installed in the host CI environment as in the Ray pods in KinD, so Ray Client in the host will fail to connect to the server in the Ray head pod.
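
For context, a hedged side-by-side of the two client APIs discussed in this thread; the address matches the one used in the test, and either way the client-side and server-side Ray versions must match.

```python
# Both forms talk to the Ray Client server on the head node and never start
# a local Ray cluster inside the test process.
import ray

# Preferred form: ray.init with a ray:// address goes through Ray Client.
ray.init(address="ray://127.0.0.1:10001")
print(len(ray.nodes()))
ray.shutdown()

# Older Ray Client API mentioned in this thread as an alternative.
import ray.util
ray.util.connect("127.0.0.1:10001")
print(len(ray.nodes()))
ray.util.disconnect()
```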

remove=True,
detach=True,
tty=True,
network_mode='host')
rtn_code, output = container.exec_run(['python',
'-c', '''
import ray
ray.init(address='ray://127.0.0.1:10001')

print(len(ray.nodes()))
'''],
demux=True)
stdout_str, _ = output

container.stop()

if stdout_str != b'2\n':
print(output, file=sys.stderr)
raise Exception('invalid result.')
if rtn_code != 0:
msg = 'invalid return code {}'.format(rtn_code)
print(msg, file=sys.stderr)
raise Exception(msg)

client.close()

@classmethod
def tearDownClass(cls):
delete_cluster()


61 changes: 61 additions & 0 deletions tests/config/ray-cluster.mini.yaml.template
@@ -55,3 +55,64 @@ spec:
name: dashboard
- containerPort: 10001
name: client
workerGroupSpecs:
- replicas: 1
minReplicas: 1
maxReplicas: 1
groupName: small-group
# the following params are used to complete the ray start command: ray start --block --node-ip-address= ...
rayStartParams:
redis-password: 'LetMeInRay' # Deprecated since Ray 1.11 due to GCS bootstrapping enabled
node-ip-address: $$MY_POD_IP
block: 'true'
#pod template
template:
metadata:
labels:
rayCluster: raycluster-compatibility-test
rayNodeType: worker # will be injected if missing
groupName: small-group # will be injected if missing
# annotations for pod
annotations:
key: value
spec:
initContainers: # to avoid worker crashing before head service is created
- name: init-myservice
image: busybox:1.28
# Change the cluster postfix if you don't have a default setting
command: ['sh', '-c', "until nslookup $$RAY_IP.$$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
containers:
- name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
image: $ray_image
# environment variables to set in the container. Optional.
# Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
env:
- name: TYPE
value: "worker"
- name: RAY_DISABLE_DOCKER_CPU_WARNING
value: "1"
- name: CPU_REQUEST
valueFrom:
resourceFieldRef:
containerName: machine-learning
resource: requests.cpu
- name: MY_POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: MY_POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
ports:
- containerPort: 80
# use volumeMounts. Optional.
# Refer to https://kubernetes.io/docs/concepts/storage/volumes/
volumeMounts:
- mountPath: /var/log
name: log-volume
# use volumes
# Refer to https://kubernetes.io/docs/concepts/storage/volumes/
volumes:
- name: log-volume
emptyDir: {}