Skip to content

Commit

Permalink
Revert "Allow creation of clusters larger than 500 nodes"
Browse files Browse the repository at this point in the history
  • Loading branch information
zmerlynn committed Dec 11, 2015
1 parent aaa1fe6 commit 9492fd6
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 111 deletions.
25 changes: 11 additions & 14 deletions cluster/gce/upgrade.sh
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,6 @@ function upgrade-nodes() {
#
# Vars set:
# SANITIZED_VERSION
# INSTANCE_GROUPS
# KUBELET_TOKEN
# KUBE_PROXY_TOKEN
# CA_CERT_BASE64
Expand All @@ -185,7 +184,7 @@ function prepare-node-upgrade() {
echo "== Preparing node upgrade (to ${KUBE_VERSION}). ==" >&2
SANITIZED_VERSION=$(echo ${KUBE_VERSION} | sed 's/[\.\+]/-/g')

detect-node-names # sets INSTANCE_GROUPS
detect-node-names

# TODO(zmerlynn): Refactor setting scope flags.
local scope_flags=
Expand Down Expand Up @@ -232,18 +231,16 @@ function do-node-upgrade() {
subgroup="alpha compute"
fi
local template_name=$(get-template-name-from-version ${SANITIZED_VERSION})
for group in ${INSTANCE_GROUPS[@]}; do
gcloud ${subgroup} rolling-updates \
--project="${PROJECT}" \
--zone="${ZONE}" \
start \
--group="${group}" \
--template="${template_name}" \
--instance-startup-timeout=300s \
--max-num-concurrent-instances=1 \
--max-num-failed-instances=0 \
--min-instance-update-time=0s
done
gcloud ${subgroup} rolling-updates \
--project="${PROJECT}" \
--zone="${ZONE}" \
start \
--group="${NODE_INSTANCE_PREFIX}-group" \
--template="${template_name}" \
--instance-startup-timeout=300s \
--max-num-concurrent-instances=1 \
--max-num-failed-instances=0 \
--min-instance-update-time=0s

# TODO(zmerlynn): Wait for the rolling-update to finish.

Expand Down
143 changes: 46 additions & 97 deletions cluster/gce/util.sh
Original file line number Diff line number Diff line change
Expand Up @@ -253,24 +253,12 @@ function upload-server-tars() {
# NODE_INSTANCE_PREFIX
# Vars set:
# NODE_NAMES
# INSTANCE_GROUPS
function detect-node-names {
detect-project
INSTANCE_GROUPS=()
INSTANCE_GROUPS+=($(gcloud compute instance-groups managed list --zone "${ZONE}" --project "${PROJECT}" | grep ${NODE_INSTANCE_PREFIX} | cut -f1 -d" "))
NODE_NAMES=()
if [[ -n "${INSTANCE_GROUPS[@]:-}" ]]; then
for group in "${INSTANCE_GROUPS[@]}"; do
NODE_NAMES+=($(gcloud compute instance-groups managed list-instances \
"${group}" --zone "${ZONE}" --project "${PROJECT}" \
--format=yaml | grep instance: | cut -d ' ' -f 2))
done
echo "INSTANCE_GROUPS=${INSTANCE_GROUPS[*]}" >&2
echo "NODE_NAMES=${NODE_NAMES[*]}" >&2
else
echo "INSTANCE_GROUPS=" >&2
echo "NODE_NAMES=" >&2
fi
NODE_NAMES=($(gcloud compute instance-groups managed list-instances \
"${NODE_INSTANCE_PREFIX}-group" --zone "${ZONE}" --project "${PROJECT}" \
--format=yaml | grep instance: | cut -d ' ' -f 2))
echo "NODE_NAMES=${NODE_NAMES[*]}" >&2
}

# Detect the information about the minions
Expand Down Expand Up @@ -725,43 +713,17 @@ function kube-up {

create-node-instance-template $template_name

local defaulted_max_instances_per_mig=${MAX_INSTANCES_PER_MIG:-500}

if [[ ${defaulted_max_instances_per_mig} -le "0" ]]; then
echo "MAX_INSTANCES_PER_MIG cannot be negative. Assuming default 500"
defaulted_max_instances_per_mig=500
fi
local num_migs=$(((${NUM_NODES} + ${defaulted_max_instances_per_mig} - 1) / ${defaulted_max_instances_per_mig}))
local instances_per_mig=$(((${NUM_NODES} + ${num_migs} - 1) / ${num_migs}))
local last_mig_size=$((${NUM_NODES} - (${num_migs} - 1) * ${instances_per_mig}))

#TODO: parallelize this loop to speed up the process
for i in $(seq $((${num_migs} - 1))); do
gcloud compute instance-groups managed \
create "${NODE_INSTANCE_PREFIX}-group-$i" \
--project "${PROJECT}" \
--zone "${ZONE}" \
--base-instance-name "${NODE_INSTANCE_PREFIX}" \
--size "${instances_per_mig}" \
--template "$template_name" || true;
gcloud compute instance-groups managed wait-until-stable \
"${NODE_INSTANCE_PREFIX}-group-$i" \
--zone "${ZONE}" \
--project "${PROJECT}" || true;
done

gcloud compute instance-groups managed \
create "${NODE_INSTANCE_PREFIX}-group-${num_migs}" \
create "${NODE_INSTANCE_PREFIX}-group" \
--project "${PROJECT}" \
--zone "${ZONE}" \
--base-instance-name "${NODE_INSTANCE_PREFIX}" \
--size "${last_mig_size}" \
--size "${NUM_NODES}" \
--template "$template_name" || true;
gcloud compute instance-groups managed wait-until-stable \
"${NODE_INSTANCE_PREFIX}-group-${num_migs}" \
--zone "${ZONE}" \
--project "${PROJECT}" || true;

"${NODE_INSTANCE_PREFIX}-group" \
--zone "${ZONE}" \
--project "${PROJECT}" || true;
detect-node-names
detect-master

Expand All @@ -780,12 +742,9 @@ function kube-up {
METRICS+="--custom-metric-utilization metric=custom.cloudmonitoring.googleapis.com/kubernetes.io/memory/node_reservation,"
METRICS+="utilization-target=${TARGET_NODE_UTILIZATION},utilization-target-type=GAUGE "

echo "Creating node autoscalers."

for i in $(seq ${num_migs}); do
gcloud compute instance-groups managed set-autoscaling "${NODE_INSTANCE_PREFIX}-group-$i" --zone "${ZONE}" --project "${PROJECT}" \
--min-num-replicas "${AUTOSCALER_MIN_NODES}" --max-num-replicas "${AUTOSCALER_MAX_NODES}" ${METRICS} || true
done
echo "Creating node autoscaler."
gcloud compute instance-groups managed set-autoscaling "${NODE_INSTANCE_PREFIX}-group" --zone "${ZONE}" --project "${PROJECT}" \
--min-num-replicas "${AUTOSCALER_MIN_NODES}" --max-num-replicas "${AUTOSCALER_MAX_NODES}" ${METRICS} || true
fi

echo "Waiting up to ${KUBE_CLUSTER_INITIALIZATION_TIMEOUT} seconds for cluster initialization."
Expand Down Expand Up @@ -851,51 +810,46 @@ function kube-up {
# down the firewall rules and routes.
function kube-down {
detect-project
detect-node-names # For INSTANCE_GROUPS

echo "Bringing down cluster"
set +e # Do not stop on error

# Delete autoscaler for nodes if present. We assume that all or none instance groups have an autoscaler
# Delete autoscaler for nodes if present.
local autoscaler
autoscaler=( $(gcloud compute instance-groups managed list --zone "${ZONE}" --project "${PROJECT}" \
| grep "${NODE_INSTANCE_PREFIX}-group-1" \
| grep "${NODE_INSTANCE_PREFIX}-group" \
| awk '{print $7}') )
if [[ "${autoscaler:-}" == "yes" ]]; then
for group in ${INSTANCE_GROUPS[@]}; do
gcloud compute instance-groups managed stop-autoscaling "${group}" --zone "${ZONE}" --project "${PROJECT}"
done
gcloud compute instance-groups managed stop-autoscaling "${NODE_INSTANCE_PREFIX}-group" --zone "${ZONE}" --project "${PROJECT}"
fi

# Get the name of the managed instance group template before we delete the
# managed instance group. (The name of the managed instance group template may
# change during a cluster upgrade.)
local template=$(get-template "${PROJECT}" "${ZONE}" "${NODE_INSTANCE_PREFIX}-group-1")
local template=$(get-template "${PROJECT}" "${ZONE}" "${NODE_INSTANCE_PREFIX}-group")

# The gcloud APIs don't return machine parseable error codes/retry information. Therefore the best we can
# do is parse the output and special case particular responses we are interested in.
for group in ${INSTANCE_GROUPS[@]}; do
if gcloud compute instance-groups managed describe "${group}" --project "${PROJECT}" --zone "${ZONE}" &>/dev/null; then
deleteCmdOutput=$(gcloud compute instance-groups managed delete --zone "${ZONE}" \
--project "${PROJECT}" \
--quiet \
"${group}")
if [[ "$deleteCmdOutput" != "" ]]; then
# Managed instance group deletion is done asynchronously, we must wait for it to complete, or subsequent steps fail
deleteCmdOperationId=$(echo $deleteCmdOutput | grep "Operation:" | sed "s/.*Operation:[[:space:]]*\([^[:space:]]*\).*/\1/g")
if [[ "$deleteCmdOperationId" != "" ]]; then
deleteCmdStatus="PENDING"
while [[ "$deleteCmdStatus" != "DONE" ]]
do
sleep 5
deleteCmdOperationOutput=$(gcloud compute instance-groups managed --zone "${ZONE}" --project "${PROJECT}" get-operation $deleteCmdOperationId)
deleteCmdStatus=$(echo $deleteCmdOperationOutput | grep -i "status:" | sed "s/.*status:[[:space:]]*\([^[:space:]]*\).*/\1/g")
echo "Waiting for MIG deletion to complete. Current status: " $deleteCmdStatus
done
fi
if gcloud compute instance-groups managed describe "${NODE_INSTANCE_PREFIX}-group" --project "${PROJECT}" --zone "${ZONE}" &>/dev/null; then
deleteCmdOutput=$(gcloud compute instance-groups managed delete --zone "${ZONE}" \
--project "${PROJECT}" \
--quiet \
"${NODE_INSTANCE_PREFIX}-group")
if [[ "$deleteCmdOutput" != "" ]]; then
# Managed instance group deletion is done asynchronously, we must wait for it to complete, or subsequent steps fail
deleteCmdOperationId=$(echo $deleteCmdOutput | grep "Operation:" | sed "s/.*Operation:[[:space:]]*\([^[:space:]]*\).*/\1/g")
if [[ "$deleteCmdOperationId" != "" ]]; then
deleteCmdStatus="PENDING"
while [[ "$deleteCmdStatus" != "DONE" ]]
do
sleep 5
deleteCmdOperationOutput=$(gcloud compute instance-groups managed --zone "${ZONE}" --project "${PROJECT}" get-operation $deleteCmdOperationId)
deleteCmdStatus=$(echo $deleteCmdOperationOutput | grep -i "status:" | sed "s/.*status:[[:space:]]*\([^[:space:]]*\).*/\1/g")
echo "Waiting for MIG deletion to complete. Current status: " $deleteCmdStatus
done
fi
fi
done
fi

if gcloud compute instance-templates describe --project "${PROJECT}" "${template}" &>/dev/null; then
gcloud compute instance-templates delete \
Expand Down Expand Up @@ -1028,13 +982,12 @@ function get-template {
# KUBE_RESOURCE_FOUND
function check-resources {
detect-project
detect-node-names

echo "Looking for already existing resources"
KUBE_RESOURCE_FOUND=""

if [[ -n "${INSTANCE_GROUPS[@]:-}" ]]; then
KUBE_RESOURCE_FOUND="Managed instance groups ${INSTANCE_GROUPS[@]}"
if gcloud compute instance-groups managed describe --project "${PROJECT}" --zone "${ZONE}" "${NODE_INSTANCE_PREFIX}-group" &>/dev/null; then
KUBE_RESOURCE_FOUND="Managed instance group ${NODE_INSTANCE_PREFIX}-group"
return 1
fi

Expand Down Expand Up @@ -1137,13 +1090,11 @@ function prepare-push() {
create-node-instance-template $tmp_template_name

local template_name="${NODE_INSTANCE_PREFIX}-template"
for group in ${INSTANCE_GROUPS[@]}; do
gcloud compute instance-groups managed \
set-instance-template "${group}" \
--template "$tmp_template_name" \
--zone "${ZONE}" \
--project "${PROJECT}" || true;
done
gcloud compute instance-groups managed \
set-instance-template "${NODE_INSTANCE_PREFIX}-group" \
--template "$tmp_template_name" \
--zone "${ZONE}" \
--project "${PROJECT}" || true;

gcloud compute instance-templates delete \
--project "${PROJECT}" \
Expand All @@ -1152,13 +1103,11 @@ function prepare-push() {

create-node-instance-template "$template_name"

for group in ${INSTANCE_GROUPS[@]}; do
gcloud compute instance-groups managed \
set-instance-template "${group}" \
--template "$template_name" \
--zone "${ZONE}" \
--project "${PROJECT}" || true;
done
gcloud compute instance-groups managed \
set-instance-template "${NODE_INSTANCE_PREFIX}-group" \
--template "$template_name" \
--zone "${ZONE}" \
--project "${PROJECT}" || true;

gcloud compute instance-templates delete \
--project "${PROJECT}" \
Expand Down

1 comment on commit 9492fd6

@k8s-teamcity-mesosphere

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TeamCity OSS :: Kubernetes Mesos :: 4 - Smoke Tests Build 9066 outcome was SUCCESS
Summary: Tests passed: 1, ignored: 205 Build time: 00:04:33

Please sign in to comment.