Skip to content

Commit

Permalink
Add proper error handling for deploying the tests. (kubeflow#642)
Browse files Browse the repository at this point in the history
* Add proper error handling for deploying the tests.

* Add retries for ksonnet errors because it looks like with 0.11 we start
  having problems because GPU and non-GPU tests both try to add the environment.

* If the ksonnet environment already exists this will cause an error;
  we should keep going.

Fix kubeflow#640

* Add retries to test_runner
* Fix lint

* Fix lint.

* Remove YAML files.
  • Loading branch information
jlewi authored and k8s-ci-robot committed Jun 12, 2018
1 parent 239fa2a commit 48cd45e
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 6 deletions.
11 changes: 9 additions & 2 deletions py/deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,19 @@
import argparse
import datetime
import logging
import re
import retrying
import subprocess
import time
import uuid

from kubeflow.testing import util
from kubernetes import client as k8s_client
from kubernetes.client import rest
from googleapiclient import discovery
from google.cloud import storage # pylint: disable=no-name-in-module

from py import test_util
from py import util


def _setup_namespace(api_client, name):
Expand Down Expand Up @@ -46,6 +48,7 @@ def _setup_namespace(api_client, name):

# TODO(jlewi): We should probably make this a reusable function since a
# lot of test code could use it.
@retrying.retry
def ks_deploy(app_dir, component, params, env=None, account=None):
"""Deploy the specified ksonnet component.
Expand Down Expand Up @@ -76,7 +79,11 @@ def ks_deploy(app_dir, component, params, env=None, account=None):

logging.info("Using app directory: %s", app_dir)

util.run(["ks", "env", "add", env], cwd=app_dir)
try:
util.run(["ks", "env", "add", env], cwd=app_dir)
except subprocess.CalledProcessError as e:
if not re.search(".*environment.*already exists.*", e.output):
raise

for k, v in params.iteritems():
util.run(
Expand Down
14 changes: 10 additions & 4 deletions py/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,17 @@
import json
import os
import re
import retrying
import subprocess
import time
import uuid

from kubernetes import client as k8s_client
from kubernetes.client import rest

from google.cloud import storage # pylint: disable=no-name-in-module
from kubeflow.testing import util
from py import test_util
from py import util
from py import tf_job_client


Expand Down Expand Up @@ -46,9 +48,9 @@ def wait_for_delete(client,
tf_job_client.TF_JOB_GROUP, version, namespace,
tf_job_client.TF_JOB_PLURAL, name)
except rest.ApiException as e:
logging.exception("rest.ApiException thrown")
if e.status == httplib.NOT_FOUND:
return
logging.exception("rest.ApiException thrown")
raise
if status_callback:
status_callback(results)
Expand Down Expand Up @@ -210,7 +212,7 @@ def parse_events(events):

return pods, services


@retrying.retry
def run_test(args): # pylint: disable=too-many-branches,too-many-statements
"""Run a test."""
gcs_client = storage.Client(project=args.project)
Expand All @@ -236,7 +238,11 @@ def run_test(args): # pylint: disable=too-many-branches,too-many-statements
# Create a new environment for this run
env = "test-env-{0}".format(salt)

util.run(["ks", "env", "add", env], cwd=args.app_dir)
try:
util.run(["ks", "env", "add", env], cwd=args.app_dir)
except subprocess.CalledProcessError as e:
if not re.search(".*environment.*already exists.*", e.output):
raise

name = None
namespace = None
Expand Down

0 comments on commit 48cd45e

Please sign in to comment.