update ci tests for mnist example

kubeflow · Dec 3, 2019 · d14b442 · d14b442
1 parent 341decc
commit d14b442
Show file tree

Hide file tree

Showing 17 changed files with 386 additions and 413 deletions.
diff --git a/mnist/README.md b/mnist/README.md
@@ -6,6 +6,7 @@
   - [Prerequisites](#prerequisites)
     - [Deploy Kubeflow](#deploy-kubeflow)
     - [Local Setup](#local-setup)
+    - [GCP Setup](#gcp-setup)
   - [Modifying existing examples](#modifying-existing-examples)
     - [Prepare model](#prepare-model)
     - [Build and push model image.](#build-and-push-model-image)
@@ -53,6 +54,9 @@ You also need the following command line tools:
 
 **Note:** kustomize [v2.0.3](https://github.com/kubernetes-sigs/kustomize/releases/tag/v2.0.3) is recommended because of a [problem](https://github.com/kubernetes-sigs/kustomize/issues/1295) in kustomize v2.1.0.
 
+### GCP Setup
+
+If you are using GCP, you need to enable [Workload Identity](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity) to execute the steps below.
 
 ## Modifying existing examples
 
@@ -225,94 +229,6 @@ kustomize edit add configmap mnist-map-training --from-literal=modelDir=gs://${B
 kustomize edit add configmap mnist-map-training --from-literal=exportDir=gs://${BUCKET}/${MODEL_PATH}/export
 ```
 
-In order to write to GCS we need to supply the TFJob with GCP credentials. We do
-this by telling our training code to use a [Google service account](https://cloud.google.com/docs/authentication/production#obtaining_and_providing_service_account_credentials_manually).
-
-If you followed the [getting started guide for GKE](https://www.kubeflow.org/docs/started/getting-started-gke/) 
-then a number of steps have already been performed for you
-
-  1. We created a Google service account named `${DEPLOYMENT}-user`
-
-     * You can run the following command to list all service accounts in your project
-
-       ```
-       gcloud --project=${PROJECT} iam service-accounts list
-       ```
-
-  2. We stored the private key for this account in a K8s secret named `user-gcp-sa`
-
-     * To see the secrets in your cluster
-     
-       ```
-       kubectl get secrets
-       ```
-
-  3. We granted this service account permission to read/write GCS buckets in this project
-
-     * To see the IAM policy you can do
-
-       ```
-       gcloud projects get-iam-policy ${PROJECT} --format=yaml
-       ```
-
-     * The output should look like the following
-
-       ```
-        bindings:
-        ...
-        - members:
-          - serviceAccount:${DEPLOYMENT}-user@${PROJEC}.iam.gserviceaccount.com
-            ...
-          role: roles/storage.admin
-          ...
-        etag: BwV_BqSmSCY=
-        version: 1
-        ```
-
-To use this service account we perform the following steps
-
-  1. Mount the secret `user-gcp-sa` into the pod and configure the mount path of the secret. 
-       ```
-       kustomize edit add configmap mnist-map-training --from-literal=secretName=user-gcp-sa
-       kustomize edit add configmap mnist-map-training --from-literal=secretMountPath=/var/secrets
-       ```
-
-     * Note: ensure your envrionment is pointed at the same `kubeflow` namespace as the `user-gcp-sa` secret
-
-  2. Next we need to set the environment variable `GOOGLE_APPLICATION_CREDENTIALS` so that our code knows where to look for the service account key.
-
-     ```
-     kustomize edit add configmap mnist-map-training --from-literal=GOOGLE_APPLICATION_CREDENTIALS=/var/secrets/user-gcp-sa.json
-     ```
-
-     * If we look at the spec for our job we can see that the environment variable `GOOGLE_APPLICATION_CREDENTIALS` is set.
-
-       ```
-        kustomize build .
-       ```
-       ```
-        apiVersion: kubeflow.org/v1beta2
-        kind: TFJob
-        metadata:
-          ...
-        spec:
-          tfReplicaSpecs:
-            Chief:
-              replicas: 1
-              template:
-                spec:
-                  containers:
-                  - command:
-                    ..
-                    env:
-                    ...
-                    - name: GOOGLE_APPLICATION_CREDENTIALS
-                      value: /var/secrets/user-gcp-sa.json
-                    ...
-                  ...
-            ...
-       ```
-
 
 You can now submit the job
 

diff --git a/mnist/serving/GCS/deployment_patch.yaml b/mnist/serving/GCS/deployment_patch.yaml
diff --git a/mnist/serving/GCS/kustomization.yaml b/mnist/serving/GCS/kustomization.yaml
@@ -3,11 +3,3 @@ kind: Kustomization
 
 bases:
 - ../base
-
-patchesJson6902:
-- path: deployment_patch.yaml
-  target:
-    group: extensions
-    kind: Deployment
-    name: $(svcName)
-    version: v1beta1
diff --git a/mnist/testing/conftest.py b/mnist/testing/conftest.py
@@ -1,14 +1,62 @@
+import os
 import pytest
 
 def pytest_addoption(parser):
+
   parser.addoption(
-      "--master", action="store", default="", help="IP address of GKE master")
+    "--tfjob_name", help="Name for the TFjob.",
+    type=str, default="mnist-test-" + os.getenv('BUILD_ID'))
+
+  parser.addoption(
+    "--namespace", help=("The namespace to run in. This should correspond to"
+                         "a namespace associated with a Kubeflow namespace."),
+    type=str, default="kubeflow-kubeflow-testing")
+
+  parser.addoption(
+    "--repos", help="The repos to checkout; leave blank to use defaults",
+    type=str, default="")
+
+  parser.addoption(
+    "--trainer_image", help="TFJob training image",
+    type=str, default="gcr.io/kubeflow-ci/mnist/model:build-" + os.getenv('BUILD_ID'))
+
+  parser.addoption(
+    "--train_steps", help="train steps for mnist testing",
+    type=str, default="200")
+
+  parser.addoption(
+    "--batch_size", help="batch size for mnist trainning",
+    type=str, default="100")
 
   parser.addoption(
-      "--namespace", action="store", default="", help="namespace of server")
+    "--learning_rate", help="mnist learnning rate",
+    type=str, default="0.01")
 
   parser.addoption(
-      "--service", action="store", default="",
+    "--num_ps", help="The number of PS",
+    type=str, default="1")
+
+  parser.addoption(
+    "--num_workers", help="The number of Worker",
+    type=str, default="2")
+
+  parser.addoption(
+    "--model_dir", help="Path for model saving",
+    type=str, default="gs://kubeflow-ci-deployment_ci-temp/mnist/models/" + os.getenv('BUILD_ID'))
+
+  parser.addoption(
+    "--export_dir", help="Path for model exporting",
+    type=str, default="gs://kubeflow-ci-deployment_ci-temp/mnist/models/" + os.getenv('BUILD_ID'))
+
+  parser.addoption(
+    "--deploy_name", help="Name for the service deployment",
+    type=str, default="mnist-test-" + os.getenv('BUILD_ID'))
+
+  parser.addoption(
+      "--master", action="store", default="", help="IP address of GKE master")
+
+  parser.addoption(
+      "--service", action="store", default="mnist-test-" + os.getenv('BUILD_ID'),
       help="The name of the mnist K8s service")
 
 @pytest.fixture
@@ -22,3 +70,47 @@ def namespace(request):
 @pytest.fixture
 def service(request):
   return request.config.getoption("--service")
+
+@pytest.fixture
+def tfjob_name(request):
+  return request.config.getoption("--tfjob_name")
+
+@pytest.fixture
+def repos(request):
+  return request.config.getoption("--repos")
+
+@pytest.fixture
+def trainer_image(request):
+  return request.config.getoption("--trainer_image")
+
+@pytest.fixture
+def train_steps(request):
+  return request.config.getoption("--train_steps")
+
+@pytest.fixture
+def batch_size(request):
+  return request.config.getoption("--batch_size")
+
+@pytest.fixture
+def learning_rate(request):
+  return request.config.getoption("--learning_rate")
+
+@pytest.fixture
+def num_ps(request):
+  return request.config.getoption("--num_ps")
+
+@pytest.fixture
+def num_workers(request):
+  return request.config.getoption("--num_workers")
+
+@pytest.fixture
+def model_dir(request):
+  return request.config.getoption("--model_dir")
+
+@pytest.fixture
+def export_dir(request):
+  return request.config.getoption("--export_dir")
+
+@pytest.fixture
+def deploy_name(request):
+  return request.config.getoption("--deploy_name")
diff --git a/mnist/testing/deploy_test.py b/mnist/testing/deploy_test.py
@@ -10,80 +10,75 @@
      * Provides utilities for testing
 
 Manually running the test
- 1. Configure your KUBECONFIG file to point to the desired cluster
- 2. Set --params=name=${NAME},namespace=${NAMESPACE}
-    * name should be the name for your job
-    * namespace should be the namespace to use
- 3. Use the modelBasePath parameter to the model to test.
-     --params=...,modelBasePath=${MODEL_BASE_PATH}
+  pytest deploy_test.py \
+    --deploy_name=mnist-deploy-test-${BUILD_ID} \
+    --namespace=${namespace} \
+    --model_dir=${modelDir} \
+    --export_dir=${modelDir}
 
 """
 
 import logging
 import os
-import subprocess
+import pytest
 
+from kubernetes.config import kube_config
 from kubernetes import client as k8s_client
-from kubeflow.tf_operator import test_runner #pylint: disable=no-name-in-module
 
-from kubeflow.testing import test_util
 from kubeflow.testing import util
 
-# TODO(jlewi): Should we refactor this to use pytest like predict_test
-# and not depend on test_runner.
-class MnistDeployTest(test_util.TestCase):
-  def __init__(self, args):
-    namespace, name, env = test_runner.parse_runtime_params(args)
-    self.app_dir = args.app_dir
-
-    if not self.app_dir:
-      self.app_dir = os.path.join(os.path.dirname(__file__), "..",
-                                  "serving/GCS")
-      self.app_dir = os.path.abspath(self.app_dir)
-      logging.info("--app_dir not set defaulting to: %s", self.app_dir)
-
-    self.env = env
-    self.namespace = namespace
-    self.params = args.params
-    super(MnistDeployTest, self).__init__(class_name="MnistDeployTest",
-                                          name=name)
-
-  def test_serve(self):
-    # We repeat the test multiple times.
-    # This ensures that if we delete the job we can create a new job with the
-    # same name.
-    api_client = k8s_client.ApiClient()
-
-    # TODO (jinchihe) beflow code will be removed once new test-worker image
-    # is publish in https://github.com/kubeflow/testing/issues/373.
-    kusUrl = 'https://github.com/kubernetes-sigs/kustomize/' \
-         'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64'
-    util.run(['wget', '-O', '/usr/local/bin/kustomize', kusUrl], cwd=self.app_dir)
-    util.run(['chmod', 'a+x', '/usr/local/bin/kustomize'], cwd=self.app_dir)
-
-    # Apply the components
-    configmap = 'mnist-map-serving'
-    for pair in self.params.split(","):
-      k, v = pair.split("=", 1)
-      if k == "namespace":
-        util.run(['kustomize', 'edit', 'set', k, v], cwd=self.app_dir)
-      else:
-        util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
-                '--from-literal=' + k + '=' + v], cwd=self.app_dir)
-
-    # Seems the util.run cannot handle pipes case, using check_call.
-    subCmd = 'kustomize build ' + self.app_dir + '| kubectl apply -f -'
-    subprocess.check_call(subCmd, shell=True)
-
-    util.wait_for_deployment(api_client, self.namespace, self.name,
-                             timeout_minutes=4)
-
-    # We don't delete the resources. We depend on the namespace being
-    # garbage collected.
+
+def test_deploy(record_xml_attribute, deploy_name, namespace, model_dir, export_dir):
+
+  util.set_pytest_junit(record_xml_attribute, "test_deploy")
+
+  util.maybe_activate_service_account()
+
+  app_dir = os.path.join(os.path.dirname(__file__), "../serving/GCS")
+  app_dir = os.path.abspath(app_dir)
+  logging.info("--app_dir not set defaulting to: %s", app_dir)
+
+  # TODO (@jinchihe) Using kustomize 2.0.3 to work around below issue:
+  # https://github.com/kubernetes-sigs/kustomize/issues/1295
+  kusUrl = 'https://github.com/kubernetes-sigs/kustomize/' \
+           'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64'
+  util.run(['wget', '-q', '-O', '/usr/local/bin/kustomize', kusUrl], cwd=app_dir)
+  util.run(['chmod', 'a+x', '/usr/local/bin/kustomize'], cwd=app_dir)
+
+  # TODO (@jinchihe): The kubectl need to be upgraded to 1.14.0 due to below issue.
+  # Invalid object doesn't have additional properties ...
+  kusUrl = 'https://storage.googleapis.com/kubernetes-release/' \
+           'release/v1.14.0/bin/linux/amd64/kubectl'
+  util.run(['wget', '-q', '-O', '/usr/local/bin/kubectl', kusUrl], cwd=app_dir)
+  util.run(['chmod', 'a+x', '/usr/local/bin/kubectl'], cwd=app_dir)
+
+  # Configure custom parameters using kustomize
+  configmap = 'mnist-map-serving'
+  util.run(['kustomize', 'edit', 'set', 'namespace', namespace], cwd=app_dir)
+  util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
+           '--from-literal=name' + '=' + deploy_name], cwd=app_dir)
+
+  util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
+            '--from-literal=modelBasePath=' + model_dir], cwd=app_dir)
+  util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
+            '--from-literal=exportDir=' + export_dir], cwd=app_dir)
+
+  # Apply the components
+  util.run(['kustomize', 'build', app_dir, '-o', 'generated.yaml'], cwd=app_dir)
+  util.run(['kubectl', 'apply', '-f', 'generated.yaml'], cwd=app_dir)
+
+  kube_config.load_kube_config()
+  api_client = k8s_client.ApiClient()
+  util.wait_for_deployment(api_client, namespace, deploy_name, timeout_minutes=4)
+
+  # We don't delete the resources. We depend on the namespace being
+  # garbage collected.
 
 if __name__ == "__main__":
-  # TODO(jlewi): It looks like using test_runner we don't exit with an error
-  # if the deployment doesn't succeed. So the Argo workflow continues which
-  # isn't what we want. Might be a good reason to switch to using
-  # pytest.
-  test_runner.main(module=__name__)
+  logging.basicConfig(level=logging.INFO,
+                      format=('%(levelname)s|%(asctime)s'
+                              '|%(pathname)s|%(lineno)d| %(message)s'),
+                      datefmt='%Y-%m-%dT%H:%M:%S',
+                      )
+  logging.getLogger().setLevel(logging.INFO)
+  pytest.main()