Skip to content

Commit

Permalink
e2e: existing_arrikto: initial e2e test (kubeflow#4154)
Browse files Browse the repository at this point in the history
* kfctl: existing_arrikto: initial e2e test

Signed-off-by: Yannis Zarkadas <yanniszark@arrikto.com>

* fix deployment name and review comment

Signed-off-by: Yannis Zarkadas <yanniszark@arrikto.com>

* fix bad syntax with extend

Signed-off-by: Yannis Zarkadas <yanniszark@arrikto.com>

* increase timeout for service ip discovery

Signed-off-by: Yannis Zarkadas <yanniszark@arrikto.com>

* update deployments to reflect latest manifests

Signed-off-by: Yannis Zarkadas <yanniszark@arrikto.com>

* skip knative for existing_arrikto

Signed-off-by: Yannis Zarkadas <yanniszark@arrikto.com>

* bump machine size and increase timeout

Signed-off-by: Yannis Zarkadas <yanniszark@arrikto.com>

* python script: fix error in delete step

Related:
googleapis/google-api-python-client#299

Signed-off-by: Yannis Zarkadas <yanniszark@arrikto.com>

* be more conservative in adding new test

Signed-off-by: Yannis Zarkadas <yanniszark@arrikto.com>

* revert checklist

Signed-off-by: Yannis Zarkadas <yanniszark@arrikto.com>
  • Loading branch information
yanniszark authored and k8s-ci-robot committed Oct 14, 2019
1 parent 3b27111 commit 269f5c9
Show file tree
Hide file tree
Showing 12 changed files with 188 additions and 33 deletions.
2 changes: 1 addition & 1 deletion bootstrap/pkg/kfapp/existing_arrikto/existing.go
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ func generateCert(addr string) ([]byte, []byte, error) {

func getLBAddress(kubeclient client.Client) (string, error) {
// Get IngressGateway Service's address
const maxRetries = 40
const maxRetries = 80
var lbIngresses []corev1.LoadBalancerIngress
svc := &corev1.Service{}
lbServiceName := types.NamespacedName{Name: "istio-ingressgateway", Namespace: "istio-system"}
Expand Down
24 changes: 24 additions & 0 deletions prow_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,30 @@ workflows:
kwargs:
use_basic_auth: true
config_path: https://raw.githubusercontent.com/kubeflow/manifests/master/kfdef/kfctl_gcp_basic_auth.yaml
# E2E tests for kfctl_existing_arrikto
- app_dir: kubeflow/kubeflow/testing/workflows
component: kfctl_go_test
name: kfctl-go-existing
job_types:
# Enable once we have confirmed the stability of the test
# - presubmit
- postsubmit
include_dirs:
- bootstrap/*
- dependencies/*
- kubeflow/*
- testing/*
params:
platform: gke
gkeApiVersion: v1
workflowName: kfctl-go
useBasicAuth: false
useIstio: true
testEndpoint: false
configPath: https://raw.githubusercontent.com/kubeflow/manifests/master/kfdef/kfctl_existing_arrikto.yaml
cluster_creation_script: create_existing_cluster.sh
cluster_deletion_script: delete_existing_cluster.py
nameSuffix: existing_arrikto
# Only run kfctl presubmit test with basic auth if
# files related to basic auth are modified.
- py_func: kubeflow.kubeflow.ci.kfctl_e2e_workflow.create_workflow
Expand Down
16 changes: 16 additions & 0 deletions testing/kfctl/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,14 @@ def pytest_addoption(parser):
"--use_istio", action="store", default="False",
help="Use istio.")

parser.addoption(
"--cluster_creation_script", action="store", default="",
help="The script to use to create a K8s cluster before running kfctl.")

# Optional hook for configs that do not manage their own cluster: the
# delete test runs this script instead of invoking kfctl delete.
parser.addoption(
    "--cluster_deletion_script", action="store", default="",
    help="The script to use to delete the K8s cluster after the tests have run.")

@pytest.fixture
def app_path(request):
return request.config.getoption("--app_path")
Expand All @@ -62,6 +70,14 @@ def project(request):
def config_path(request):
return request.config.getoption("--config_path")

# Fixture exposing the value of the --cluster_creation_script command-line
# option (an empty string when the option is not given).
@pytest.fixture
def cluster_creation_script(request):
return request.config.getoption("--cluster_creation_script")

# Fixture exposing the value of the --cluster_deletion_script command-line
# option (an empty string when the option is not given).
@pytest.fixture
def cluster_deletion_script(request):
return request.config.getoption("--cluster_deletion_script")

@pytest.fixture
def build_and_apply(request):
value = request.config.getoption("--build_and_apply").lower()
Expand Down
75 changes: 50 additions & 25 deletions testing/kfctl/kf_is_ready_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
import subprocess
import tempfile
import uuid
import yaml
from retrying import retry

import pytest

from kubeflow.testing import util
from testing import deploy_utils

def test_kf_is_ready(namespace, use_basic_auth, use_istio):
def test_kf_is_ready(namespace, use_basic_auth, use_istio, app_path):
"""Test that Kubeflow was successfully deployed.
Args:
Expand All @@ -32,37 +33,67 @@ def test_kf_is_ready(namespace, use_basic_auth, use_istio):
# Verify that components are actually deployed.
# TODO(jlewi): We need to parameterize this list based on whether
# we are using IAP or basic auth.
# TODO(yanniszark): This list is incomplete and missing a lot of components.
deployment_names = [
"argo-ui",
"centraldashboard",
"cloud-endpoints-controller",
"jupyter-web-app-deployment",
"metadata-db",
"metadata-deployment",
"metadata-ui",
"minio",
"ml-pipeline",
"ml-pipeline-persistenceagent",
"ml-pipeline-scheduledworkflow",
"ml-pipeline-ui",
"ml-pipeline-viewer-controller-deployment",
"mysql",
"notebook-controller-deployment",
"tf-job-operator",
"profiles-deployment",
"pytorch-operator",
"katib-controller",
"tf-job-operator",
"workflow-controller",
]

stateful_set_names = [
"kfserving-controller-manager",
stateful_set_names = []

with open(os.path.join(app_path, "app.yaml")) as f:
kfdef = yaml.safe_load(f)
platform = kfdef["spec"]["platform"]

ingress_related_deployments = [
"istio-citadel",
"istio-egressgateway",
"istio-galley",
"istio-ingressgateway",
"istio-pilot",
"istio-policy",
"istio-sidecar-injector",
"istio-telemetry",
"istio-tracing",
"kiali",
"prometheus",
]

ingress_related_deployments = []
ingress_related_stateful_sets = []

if use_basic_auth:
deployment_names.extend(["basic-auth-login"])
ingress_related_stateful_sets.extend(["backend-updater"])
else:
ingress_related_deployments.extend(["iap-enabler"])
ingress_related_stateful_sets.extend(["backend-updater"])
knative_namespace = "knative-serving"
knative_related_deployments = [
"activator",
"autoscaler",
"controller",
]

if platform == "gcp":
deployment_names.extend(["cloud-endpoints-controller"])
stateful_set_names.extend(["kfserving-controller-manager"])
if use_basic_auth:
deployment_names.extend(["basic-auth-login"])
ingress_related_stateful_sets.extend(["backend-updater"])
else:
ingress_related_deployments.extend(["iap-enabler"])
ingress_related_stateful_sets.extend(["backend-updater"])
elif platform == "existing_arrikto":
deployment_names.extend(["dex"])
ingress_related_deployments.extend(["authservice"])
knative_related_deployments = []


# TODO(jlewi): Might want to parallelize this.
for deployment_name in deployment_names:
Expand All @@ -89,15 +120,9 @@ def test_kf_is_ready(namespace, use_basic_auth, use_istio):

# TODO(jlewi): We should verify that the ingress is created and healthy.

knative_namespace = "knative-serving"
knative_related_deployments = [
"activator",
"autoscaler",
"controller",
]
for deployment_name in knative_related_deployments:
logging.info("Verifying that deployment %s started...", deployment_name)
util.wait_for_deployment(api_client, knative_namespace, deployment_name, 10)
logging.info("Verifying that deployment %s started...", deployment_name)
util.wait_for_deployment(api_client, knative_namespace, deployment_name, 10)

if __name__ == "__main__":
logging.basicConfig(level=logging.INFO,
Expand Down
9 changes: 8 additions & 1 deletion testing/kfctl/kfctl_delete_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,14 @@ def get_endpoints_list(project):

return endpoints

def test_kfctl_delete(kfctl_path, app_path, project):
def test_kfctl_delete(kfctl_path, app_path, project, cluster_deletion_script):

# TODO(yanniszark): split this into a separate workflow step
if cluster_deletion_script:
logging.info("cluster_deletion_script specified: %s", cluster_deletion_script)
util.run(["/bin/bash", "-c", cluster_deletion_script])
return

if not kfctl_path:
raise ValueError("kfctl_path is required")

Expand Down
12 changes: 10 additions & 2 deletions testing/kfctl/kfctl_go_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from kubeflow.kubeflow.ci import kfctl_go_test_utils as kfctl_util
from kubeflow.testing import util

def test_build_kfctl_go(app_path, project, use_basic_auth, use_istio, config_path, build_and_apply):
def test_build_kfctl_go(app_path, project, use_basic_auth, use_istio, config_path, build_and_apply, cluster_creation_script):
"""Test building and deploying Kubeflow.
Args:
Expand All @@ -15,6 +15,7 @@ def test_build_kfctl_go(app_path, project, use_basic_auth, use_istio, config_pat
use_basic_auth: Whether to use basic_auth.
use_istio: Whether to use Istio or not
config_path: Path to the KFDef spec file.
cluster_creation_script: script invoked to create a new cluster
build_and_apply: whether to build and apply or apply
"""
# Need to activate account for scopes.
Expand All @@ -24,11 +25,18 @@ def test_build_kfctl_go(app_path, project, use_basic_auth, use_istio, config_pat
"--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
])

# TODO(yanniszark): split this into a separate workflow step
if cluster_creation_script:
logging.info("Cluster creation script specified: %s", cluster_creation_script)
util.run(["/bin/bash", "-c", cluster_creation_script])


kfctl_path = kfctl_util.build_kfctl_go()
app_path = kfctl_util.kfctl_deploy_kubeflow(
app_path, project, use_basic_auth,
use_istio, config_path, kfctl_path, build_and_apply)
kfctl_util.verify_kubeconfig(app_path)
if not cluster_creation_script:
kfctl_util.verify_kubeconfig(app_path)

if __name__ == "__main__":
logging.basicConfig(
Expand Down
34 changes: 34 additions & 0 deletions testing/kfctl/scripts/create_existing_cluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash
#
# Creates a single-node GKE cluster for the kfctl existing_arrikto E2E test,
# fetches credentials for it and grants the active gcloud account
# cluster-admin.
#
# Required environment:
#   REPO_NAME, BUILD_ID  - used to build a per-run cluster name
#                          (presumably set by the CI system; verify).
# The cluster name format must stay in sync with delete_existing_cluster.py,
# which rebuilds the same "kfctl-arr-<REPO_NAME>-<BUILD_ID>" name.

set -e

# Fail fast with a clear message instead of creating a cluster with a
# malformed name such as "kfctl-arr--".
: "${REPO_NAME:?REPO_NAME must be set}"
: "${BUILD_ID:?BUILD_ID must be set}"

export PROJECT="kubeflow-ci"
export GCP_ZONE="us-central1-a"

# Assign first and export afterwards: `export VAR="$(cmd)"` returns the exit
# status of `export`, so a failing gcloud call would not trip `set -e`.
GCP_USER="$(gcloud config list account --format "value(core.account)")"
GCP_PROJECT="$(gcloud config list project --format "value(core.project)")"
CLUSTER_NAME="kfctl-arr-${REPO_NAME}-${BUILD_ID}"
CLUSTER_VERSION="$(gcloud container get-server-config --zone="${GCP_ZONE}" --format="value(validMasterVersions[0])")"
export GCP_USER GCP_PROJECT CLUSTER_NAME CLUSTER_VERSION

############################
# Create and setup cluster #
############################

gcloud container clusters create "${CLUSTER_NAME}" \
  --project "${GCP_PROJECT}" \
  --zone "${GCP_ZONE}" \
  --username "admin" \
  --cluster-version "${CLUSTER_VERSION}" \
  --machine-type "custom-6-23040" --num-nodes "1" \
  --image-type "UBUNTU" \
  --local-ssd-count=4 \
  --disk-type "pd-ssd" --disk-size "50" \
  --no-enable-cloud-logging --no-enable-cloud-monitoring \
  --no-enable-ip-alias \
  --enable-network-policy \
  --enable-autoupgrade --enable-autorepair

echo "Getting credentials for newly created cluster..."
# Pass the project explicitly so the lookup targets the same project the
# cluster was created in.
gcloud container clusters get-credentials "${CLUSTER_NAME}" \
  --project "${GCP_PROJECT}" \
  --zone="${GCP_ZONE}"

echo "Setting up GKE RBAC..."
kubectl create clusterrolebinding cluster-admin-binding --clusterrole=cluster-admin --user="${GCP_USER}"
28 changes: 28 additions & 0 deletions testing/kfctl/scripts/delete_existing_cluster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env python3

import os
import logging
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
from kubeflow.testing import util


def must_getenv(name):
    """Return the value of environment variable `name`, failing if it is missing.

    Args:
      name: Name of the environment variable to read.

    Returns:
      The variable's value as a string.

    Raises:
      ValueError: If the variable is unset or set to the empty string.
    """
    value = os.getenv(name)
    # Check the looked-up value, not the variable name: the name is always a
    # non-empty string, so testing it would never detect a missing variable.
    if not value:
        logging.fatal("Environment variable %s is not set", name)
        raise ValueError("Environment variable %s is not set" % name)
    return value


# Script entry point: deletes the GKE cluster that create_existing_cluster.sh
# created for this test run.
if __name__ == "__main__":

# Authenticate gcloud with the service-account key file named by
# GOOGLE_APPLICATION_CREDENTIALS.
util.run([
"gcloud", "auth", "activate-service-account", "--key-file",
must_getenv("GOOGLE_APPLICATION_CREDENTIALS")
])

# Rebuild the per-run cluster name; the format matches CLUSTER_NAME in
# create_existing_cluster.sh ("kfctl-arr-<REPO_NAME>-<BUILD_ID>").
cluster_name = "kfctl-arr-" + must_getenv("REPO_NAME") + "-" + must_getenv("BUILD_ID")
credentials = GoogleCredentials.get_application_default()
# Build a client for the 'container' API, version v1, with discovery caching
# turned off.
# NOTE(review): cache_discovery=False presumably relates to the
# google-api-python-client#299 issue cited in the commit message - confirm.
service = discovery.build('container', 'v1', credentials=credentials, cache_discovery=False)
# Project and zone are hard-coded here; they must match where the cluster was
# created.
util.delete_cluster(service, cluster_name, "kubeflow-ci", "us-central1-a")
2 changes: 1 addition & 1 deletion testing/test_deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ def test_successful_deployment(deployment_name):
if i == retries:
raise Exception('Deployment failed: ' + deployment_name)
try:
output = util.run(["kubectl", "get", "deployment", deployment_name])
output = util.run(["kubectl", "get", "deployment", deployment_name, "-n", "kubeflow"])
logging.info("output = \n" + output)
if output.count('\n') == 1:
output = output.split('\n')[1]
Expand Down
14 changes: 12 additions & 2 deletions testing/workflows/components/kfctl_go_test.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,13 @@ local runPath = srcDir + "/testing/workflows/run.sh";
local kfCtlPath = srcDir + "/bootstrap/bin/kfctl";
local kubeConfig = testDir + "/kfctl_test/.kube/kubeconfig";

// Name for the Kubeflow app.
// cluster_creation_script specifies the script to run in order to create
// a cluster before running kfctl.
// Only applicable to configs that don't create their own clusters.
local cluster_creation_script = if (params.cluster_creation_script=="") then "" else srcDir + "/testing/kfctl/scripts/" + params.cluster_creation_script;
local cluster_deletion_script = if (params.cluster_deletion_script=="") then "" else srcDir + "/testing/kfctl/scripts/" + params.cluster_deletion_script;


// This needs to be unique for each test run because it is
// used to name GCP resources
// We take the suffix of the name because it should provide some random salt.
Expand Down Expand Up @@ -185,11 +191,13 @@ local nameSuffix1 = if util.toBool(params.useBasicAuth) then
"basic-auth"
else
"iap";
local nameSuffix = if util.toBool(params.useIstio) then
local nameSuffix2 = if util.toBool(params.useIstio) then
nameSuffix1 + "-istio"
else
nameSuffix1;

local nameSuffix = if (params.nameSuffix=="") then nameSuffix2 else params.nameSuffix;

// Create a list of dictionaries.
// Each item is a dictionary describing one step in the graph.
local dagTemplates = [
Expand Down Expand Up @@ -226,6 +234,7 @@ local dagTemplates = [
"--use_basic_auth=" + params.useBasicAuth,
"--use_istio=" + params.useIstio,
"--config_path=" + params.configPath,
"--cluster_creation_script=" + cluster_creation_script,
// Increase the log level so that info level log statements show up.
"--log-cli-level=info",
"--junitxml=" + artifactsDir + "/junit_kfctl-build-test" + nameSuffix + ".xml",
Expand Down Expand Up @@ -312,6 +321,7 @@ local deleteStep = if deleteKubeflow then
"--junitxml=" + artifactsDir + "/junit_kfctl-go-delete-test.xml",
"--app_path=" + appDir,
"--kfctl_path=" + kfCtlPath,
"--cluster_deletion_script=" + cluster_deletion_script,
],
working_dir=srcDir+ "/testing/kfctl",
),
Expand Down
3 changes: 3 additions & 0 deletions testing/workflows/components/params.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@
useIstio: "true",
testEndpoint: "false",
configPath: "bootstrap/config/kfctl_gcp_iap_master.yaml",
cluster_creation_script: "",
cluster_deletion_script: "",
nameSuffix: "",
},
click_deploy_test: {
bucket: "kubeflow-ci_temp",
Expand Down
2 changes: 1 addition & 1 deletion testing/workflows/components/workflows.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@
"jupyter_test.py",
// Test timeout in seconds.
"--namespace=" + tests.stepsNamespace,
"--timeout=500",
"--timeout=1000",
"--junitxml=" + tests.artifactsDir + "/junit_jupyter-test.xml",
],
workingDir: tests.srcDir + "/kubeflow/jupyter/tests",
Expand Down

0 comments on commit 269f5c9

Please sign in to comment.