Add proper error handling for deploying the tests. (kubeflow#642)

* Add proper error handling for deploying the tests. * Add retries for ksonnet errors, because it looks like with 0.11 we started having problems when the GPU and non-GPU tests both try to add the environment. * If the ksonnet environment already exists, `ks env add` fails with an error; we should ignore that error and keep going. Fix kubeflow#640 * Add retries to test_runner. * Fix lint. * Fix lint. * Remove YAML files.
yph152 · Jun 18, 2018 · 3900ad8 · 3900ad8
1 parent ea93817
commit 3900ad8
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 6 deletions.
diff --git a/py/deploy.py b/py/deploy.py
@@ -7,17 +7,19 @@
 import argparse
 import datetime
 import logging
+import re
+import retrying
 import subprocess
 import time
 import uuid
 
+from kubeflow.testing import util
 from kubernetes import client as k8s_client
 from kubernetes.client import rest
 from googleapiclient import discovery
 from google.cloud import storage  # pylint: disable=no-name-in-module
 
 from py import test_util
-from py import util
 
 
 def _setup_namespace(api_client, name):
@@ -46,6 +48,7 @@ def _setup_namespace(api_client, name):
 
 # TODO(jlewi): We should probably make this a reusable function since a
 # lot of test code code use it.
+@retrying.retry
 def ks_deploy(app_dir, component, params, env=None, account=None):
   """Deploy the specified ksonnet component.
 
@@ -76,7 +79,11 @@ def ks_deploy(app_dir, component, params, env=None, account=None):
 
   logging.info("Using app directory: %s", app_dir)
 
-  util.run(["ks", "env", "add", env], cwd=app_dir)
+  try:
+    util.run(["ks", "env", "add", env], cwd=app_dir)
+  except subprocess.CalledProcessError as e:
+    if not re.search(".*environment.*already exists.*", e.output):
+      raise
 
   for k, v in params.iteritems():
     util.run(

diff --git a/py/test_runner.py b/py/test_runner.py
@@ -7,15 +7,17 @@
 import json
 import os
 import re
+import retrying
+import subprocess
 import time
 import uuid
 
 from kubernetes import client as k8s_client
 from kubernetes.client import rest
 
 from google.cloud import storage  # pylint: disable=no-name-in-module
+from kubeflow.testing import util
 from py import test_util
-from py import util
 from py import tf_job_client
 
 
@@ -46,9 +48,9 @@ def wait_for_delete(client,
         tf_job_client.TF_JOB_GROUP, version, namespace,
         tf_job_client.TF_JOB_PLURAL, name)
     except rest.ApiException as e:
-      logging.exception("rest.ApiException thrown")
       if e.status == httplib.NOT_FOUND:
         return
+      logging.exception("rest.ApiException thrown")
       raise
     if status_callback:
       status_callback(results)
@@ -210,7 +212,7 @@ def parse_events(events):
 
   return pods, services
 
-
+@retrying.retry
 def run_test(args):  # pylint: disable=too-many-branches,too-many-statements
   """Run a test."""
   gcs_client = storage.Client(project=args.project)
@@ -236,7 +238,11 @@ def run_test(args):  # pylint: disable=too-many-branches,too-many-statements
   # Create a new environment for this run
   env = "test-env-{0}".format(salt)
 
-  util.run(["ks", "env", "add", env], cwd=args.app_dir)
+  try:
+    util.run(["ks", "env", "add", env], cwd=args.app_dir)
+  except subprocess.CalledProcessError as e:
+    if not re.search(".*environment.*already exists.*", e.output):
+      raise
 
   name = None
   namespace = None