Skip to content

Commit

Permalink
Add proper error handling for deploying the tests.
Browse files Browse the repository at this point in the history
* Add retries for ksonnet errors: with 0.11 we started having problems
  because the GPU and non-GPU tests both try to add the environment.

* If the ksonnet environment already exists, adding it again causes an error;
  we should ignore that error and keep going.

Fix kubeflow#640
  • Loading branch information
jlewi committed Jun 12, 2018
1 parent 16b2a7a commit dbfabcb
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 4 deletions.
10 changes: 9 additions & 1 deletion py/deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,13 @@
import argparse
import datetime
import logging
import re
import retrying
import subprocess
import time
import uuid

from kubeflow.testing import util
from kubernetes import client as k8s_client
from kubernetes.client import rest
from googleapiclient import discovery
Expand Down Expand Up @@ -46,6 +49,7 @@ def _setup_namespace(api_client, name):

# TODO(jlewi): We should probably make this a reusable function since a
# lot of test code could use it.
@retrying.retry
def ks_deploy(app_dir, component, params, env=None, account=None):
"""Deploy the specified ksonnet component.
Expand Down Expand Up @@ -76,7 +80,11 @@ def ks_deploy(app_dir, component, params, env=None, account=None):

logging.info("Using app directory: %s", app_dir)

util.run(["ks", "env", "add", env], cwd=app_dir)
try:
util.run(["ks", "env", "add", env], cwd=app_dir)
except subprocess.CalledProcessError as e:
if not re.search(".*environment.*already exists.*", msg):
raise

for k, v in params.iteritems():
util.run(
Expand Down
6 changes: 3 additions & 3 deletions py/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
from kubernetes.client import rest

from google.cloud import storage # pylint: disable=no-name-in-module
from kubeflow.testing import util
from py import test_util
from py import util
from py import tf_job_client


Expand Down Expand Up @@ -45,10 +45,10 @@ def wait_for_delete(client,
results = crd_api.get_namespaced_custom_object(
tf_job_client.TF_JOB_GROUP, version, namespace,
tf_job_client.TF_JOB_PLURAL, name)
except rest.ApiException as e:
logging.exception("rest.ApiException thrown")
except rest.ApiException as e:
if e.status == httplib.NOT_FOUND:
return
logging.exception("rest.ApiException thrown")
raise
if status_callback:
status_callback(results)
Expand Down

0 comments on commit dbfabcb

Please sign in to comment.