From 9e5fd874cc20b47be248af0f46e20a405649dbb0 Mon Sep 17 00:00:00 2001
From: Zach Loafman
Date: Fri, 3 Apr 2015 14:48:39 -0700
Subject: [PATCH 1/2] First hack at upgrade script for GCE:

Address #6075: Shoot the master VM while saving the master-pd. This
requires a couple of minor changes to configure-vm.sh, some of which
would also be necessary for reboot. In particular, I changed it so
that the kube-token instance metadata is no longer required after
inception; instead, we mount the master-pd and check whether we've
already created the known tokens file before blocking on the instance
metadata.

Also partially addresses #6099 in bash by refactoring the kube-push
path.
---
 cluster/gce/configure-vm.sh |  81 +++++++++------
 cluster/gce/upgrade.sh      | 191 ++++++++++++++++++++++++++++++++++++
 cluster/gce/util.sh         |  55 +++++++----
 3 files changed, 279 insertions(+), 48 deletions(-)
 create mode 100755 cluster/gce/upgrade.sh

diff --git a/cluster/gce/configure-vm.sh b/cluster/gce/configure-vm.sh
index df095c8b45027..780a8266169e1 100644
--- a/cluster/gce/configure-vm.sh
+++ b/cluster/gce/configure-vm.sh
@@ -21,6 +21,8 @@ set -o pipefail
 # If we have any arguments at all, this is a push and not just setup.
 is_push=$@
 
+readonly KNOWN_TOKENS_FILE="/srv/salt-overlay/salt/kube-apiserver/known_tokens.csv"
+
 function ensure-install-dir() {
   INSTALL_DIR="/var/cache/kubernetes-install"
   mkdir -p ${INSTALL_DIR}
@@ -55,17 +57,6 @@ for k,v in yaml.load(sys.stdin).iteritems():
     print "readonly {var}={value}".format(var = k, value = pipes.quote(str(v)))
 ''' < "${kube_env_yaml}")
 
-  # We bake the KUBELET_TOKEN in separately to avoid auth information
-  # having to be re-communicated on kube-push. (Otherwise the client
-  # has to keep the bearer token around to handle generating a valid
-  # kube-env.)
-  if [[ -z "${KUBELET_TOKEN:-}" ]]; then
-    until KUBELET_TOKEN=$(curl-metadata kube-token); do
-      echo 'Waiting for metadata KUBELET_TOKEN...'
-      sleep 3
-    done
-  fi
-
   # Infer master status from presence in node pool
   if [[ $(hostname) = ${NODE_INSTANCE_PREFIX}* ]]; then
     KUBERNETES_MASTER="false"
@@ -82,6 +73,19 @@ for k,v in yaml.load(sys.stdin).iteritems():
   fi
 }
 
+function ensure-kube-token() {
+  # We bake the KUBELET_TOKEN in separately to avoid auth information
+  # having to be re-communicated on kube-push. (Otherwise the client
+  # has to keep the bearer token around to handle generating a valid
+  # kube-env.)
+  if [[ -z "${KUBELET_TOKEN:-}" ]] && [[ ! -e "${KNOWN_TOKENS_FILE}" ]]; then
+    until KUBELET_TOKEN=$(curl-metadata kube-token); do
+      echo 'Waiting for metadata KUBELET_TOKEN...'
+      sleep 3
+    done
+  fi
+}
+
 function remove-docker-artifacts() {
   echo "== Deleting docker0 =="
   # Forcibly install bridge-utils (options borrowed from Salt logs).
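For readers without the rest of configure-vm.sh at hand: curl-metadata
is a small helper defined elsewhere in this script, not shown in the
patch. A minimal sketch of the equivalent fetch, assuming the standard
GCE v1 metadata endpoint and the request header of the era (neither is
introduced by this change):

function curl-metadata() {
  # Fetch one instance attribute from the GCE metadata server. --fail
  # turns HTTP errors into a non-zero exit status, which is what lets
  # the until-loop in ensure-kube-token keep retrying until the
  # kube-token attribute actually exists.
  curl --fail --silent -H 'X-Google-Metadata-Request: True' \
    "http://metadata/computeMetadata/v1/instance/attributes/${1}"
}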
@@ -240,27 +244,41 @@ admission_control: '$(echo "$ADMISSION_CONTROL" | sed -e "s/'/''/g")'
 EOF
 }
 
-# This should only happen on cluster initialization
+# This should only happen on cluster initialization. Uses
+# MASTER_HTPASSWD to generate the nginx/htpasswd file, and the
+# KUBELET_TOKEN, plus /dev/urandom, to generate known_tokens.csv
+# (KNOWN_TOKENS_FILE). After the first boot and on upgrade, these
+# files exist on the master-pd and should never be touched again
+# (except perhaps to add a service account; see the NB below).
 function create-salt-auth() {
-  mkdir -p /srv/salt-overlay/salt/nginx
-  echo "${MASTER_HTPASSWD}" > /srv/salt-overlay/salt/nginx/htpasswd
-
-  mkdir -p /srv/salt-overlay/salt/kube-apiserver
-  known_tokens_file="/srv/salt-overlay/salt/kube-apiserver/known_tokens.csv"
-  (umask 077;
-    echo "${KUBELET_TOKEN},kubelet,kubelet" > "${known_tokens_file}")
-
-  mkdir -p /srv/salt-overlay/salt/kubelet
-  kubelet_auth_file="/srv/salt-overlay/salt/kubelet/kubernetes_auth"
-  (umask 077;
-    echo "{\"BearerToken\": \"${KUBELET_TOKEN}\", \"Insecure\": true }" > "${kubelet_auth_file}")
-
-  # Generate tokens for other "service accounts". Append to known_tokens.
-  local -r service_accounts=("system:scheduler" "system:controller_manager" "system:logging" "system:monitoring" "system:dns")
-  for account in "${service_accounts[@]}"; do
-    token=$(dd if=/dev/urandom bs=128 count=1 2>/dev/null | base64 | tr -d "=+/" | dd bs=32 count=1 2>/dev/null)
-    echo "${token},${account},${account}" >> "${known_tokens_file}"
-  done
+  local -r htpasswd_file="/srv/salt-overlay/salt/nginx/htpasswd"
+
+  if [[ ! -e "${htpasswd_file}" ]]; then
+    mkdir -p /srv/salt-overlay/salt/nginx
+    echo "${MASTER_HTPASSWD}" > "${htpasswd_file}"
+  fi
+
+  if [[ ! -e "${KNOWN_TOKENS_FILE}" ]]; then
+    mkdir -p /srv/salt-overlay/salt/kube-apiserver
+    (umask 077;
+      echo "${KUBELET_TOKEN},kubelet,kubelet" > "${KNOWN_TOKENS_FILE}")
+
+    mkdir -p /srv/salt-overlay/salt/kubelet
+    kubelet_auth_file="/srv/salt-overlay/salt/kubelet/kubernetes_auth"
+    (umask 077;
+      echo "{\"BearerToken\": \"${KUBELET_TOKEN}\", \"Insecure\": true }" > "${kubelet_auth_file}")
+
+    # Generate tokens for other "service accounts". Append to known_tokens.
+    #
+    # NB: If this list ever changes, this script actually has to
+    # change to detect the existence of this file, kill any deleted
+    # old tokens and add any new tokens (to handle the upgrade case).
+    local -r service_accounts=("system:scheduler" "system:controller_manager" "system:logging" "system:monitoring" "system:dns")
+    for account in "${service_accounts[@]}"; do
+      token=$(dd if=/dev/urandom bs=128 count=1 2>/dev/null | base64 | tr -d "=+/" | dd bs=32 count=1 2>/dev/null)
+      echo "${token},${account},${account}" >> "${KNOWN_TOKENS_FILE}"
+    done
+  fi
 }
 
 function download-release() {
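The token pipeline above is dense enough to deserve unpacking: it
draws 128 bytes of randomness, base64-encodes them, deletes the
characters that would be awkward in a CSV ("=", "+", "/"), and keeps
only the first 32 characters via the second dd. A standalone sketch,
runnable in any bash shell:

# Same pipeline as create-salt-auth, pulled out for inspection.
token=$(dd if=/dev/urandom bs=128 count=1 2>/dev/null | # 128 random bytes
  base64 |                                              # ~172 chars of base64
  tr -d "=+/" |                                         # drop padding/symbols
  dd bs=32 count=1 2>/dev/null)                         # keep first 32 bytes
echo "${token}"     # a 32-character alphanumeric token
echo "${#token}"    # 32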
@@ -390,6 +408,7 @@ if [[ -z "${is_push}" ]]; then
   ensure-install-dir
   set-kube-env
   [[ "${KUBERNETES_MASTER}" == "true" ]] && mount-master-pd
+  ensure-kube-token
   create-salt-pillar
   create-salt-auth
   download-release
diff --git a/cluster/gce/upgrade.sh b/cluster/gce/upgrade.sh
new file mode 100755
index 0000000000000..6f129dff923f2
--- /dev/null
+++ b/cluster/gce/upgrade.sh
@@ -0,0 +1,191 @@
+#!/bin/bash
+
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# !!! EXPERIMENTAL !!! Upgrade script for GCE. Expect this to get
+# rewritten in Go in relatively short order, but it allows us to start
+# testing the concepts.
+
+set -o errexit
+set -o nounset
+set -o pipefail
+
+# VERSION_REGEX matches things like "v0.13.1"
+readonly VERSION_REGEX="^v(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)$"
+
+# CI_VERSION_REGEX matches things like "v0.14.1-341-ge0c9d9e"
+readonly CI_VERSION_REGEX="^v(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)-(.*)$"
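A quick illustration of how the two patterns divide the version space,
using the bash =~ operator exactly as tars_from_version does further
down (the sample version strings are hypothetical):

for v in v0.13.1 v0.14.1-341-ge0c9d9e v0.13 0.13.1; do
  if [[ ${v} =~ ${VERSION_REGEX} ]]; then
    echo "${v}: release"       # v0.13.1
  elif [[ ${v} =~ ${CI_VERSION_REGEX} ]]; then
    echo "${v}: ci"            # v0.14.1-341-ge0c9d9e
  else
    echo "${v}: rejected"      # v0.13 (two components), 0.13.1 (no leading v)
  fi
done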
+
+if [[ "${KUBERNETES_PROVIDER:-gce}" != "gce" ]]; then
+  echo "!!! ${0} only works on GCE" >&2
+  exit 1
+fi
+
+KUBE_ROOT=$(dirname "${BASH_SOURCE}")/../..
+source "${KUBE_ROOT}/cluster/kube-env.sh"
+source "${KUBE_ROOT}/cluster/${KUBERNETES_PROVIDER}/util.sh"
+
+function usage() {
+  echo "!!! EXPERIMENTAL !!!"
+  echo ""
+  echo "${0} [-M|-N] <version>"
+  echo "  Upgrades master and nodes by default"
+  echo "  -M:  Upgrade master only"
+  echo "  -N:  Upgrade nodes only"
+  echo ""
+  echo "(... Fetching current release versions ...)"
+  echo ""
+
+  local latest_release
+  local latest_stable
+  local latest_ci
+
+  latest_stable=$(gsutil cat gs://kubernetes-release/release/stable.txt)
+  latest_release=$(gsutil cat gs://kubernetes-release/release/latest.txt)
+  latest_ci=$(gsutil cat gs://kubernetes-release/ci/latest.txt)
+
+  echo "To upgrade to:"
+  echo "  latest stable: ${0} ${latest_stable}"
+  echo " latest release: ${0} ${latest_release}"
+  echo "      latest ci: ${0} ${latest_ci}"
+}
+
+function upgrade-master() {
+  echo "== Upgrading master to ${SERVER_BINARY_TAR_URL}. Do not interrupt; the master instance will be deleted. =="
+
+  ensure-temp-dir
+  detect-project
+  detect-master
+  get-password
+  set-master-htpasswd
+
+  # Delete the master instance. Note that the master-pd is created
+  # with auto-delete=no, so it should not be deleted.
+  gcloud compute instances delete \
+    --project "${PROJECT}" \
+    --quiet \
+    --zone "${ZONE}" \
+    "${MASTER_NAME}"
+
+  write-master-env
+  gcloud compute instances create "${MASTER_NAME}" \
+    --address "${MASTER_NAME}-ip" \
+    --project "${PROJECT}" \
+    --zone "${ZONE}" \
+    --machine-type "${MASTER_SIZE}" \
+    --image-project="${IMAGE_PROJECT}" \
+    --image "${IMAGE}" \
+    --tags "${MASTER_TAG}" \
+    --network "${NETWORK}" \
+    --scopes "storage-ro" "compute-rw" \
+    --can-ip-forward \
+    --metadata-from-file \
+      "startup-script=${KUBE_ROOT}/cluster/gce/configure-vm.sh" \
+      "kube-env=${KUBE_TEMP}/master-kube-env.yaml" \
+    --disk name="${MASTER_NAME}-pd" device-name=master-pd mode=rw boot=no auto-delete=no
+
+  wait-for-master
+}
+
+function wait-for-master() {
+  echo "== Waiting for new master to respond to API requests =="
+
+  until curl --insecure --user "${KUBE_USER}:${KUBE_PASSWORD}" --max-time 5 \
+    --fail --output /dev/null --silent "https://${KUBE_MASTER_IP}/api/v1beta1/pods"; do
+    printf "."
+    sleep 2
+  done
+
+  echo "== Done =="
+}
+
+function upgrade-nodes() {
+  echo "== Upgrading nodes to ${SERVER_BINARY_TAR_URL}. =="
+  ensure-temp-dir
+  detect-project
+  detect-minion-names
+  get-password
+  set-master-htpasswd
+  kube-update-nodes upgrade
+  echo "== Done =="
+}
+
+function tars_from_version() {
+  version=${1-}
+
+  if [[ ${version} =~ ${VERSION_REGEX} ]]; then
+    SERVER_BINARY_TAR_URL="https://storage.googleapis.com/kubernetes-release/release/${version}/kubernetes-server-linux-amd64.tar.gz"
+    SALT_TAR_URL="https://storage.googleapis.com/kubernetes-release/release/${version}/kubernetes-salt.tar.gz"
+  elif [[ ${version} =~ ${CI_VERSION_REGEX} ]]; then
+    SERVER_BINARY_TAR_URL="https://storage.googleapis.com/kubernetes-release/ci/${version}/kubernetes-server-linux-amd64.tar.gz"
+    SALT_TAR_URL="https://storage.googleapis.com/kubernetes-release/ci/${version}/kubernetes-salt.tar.gz"
+  else
+    echo "!!! Version not provided or version doesn't match regexp" >&2
+    exit 1
+  fi
+
+  if ! curl -Ss --range 0-1 "${SERVER_BINARY_TAR_URL}" >&/dev/null; then
+    echo "!!! Can't find release at ${SERVER_BINARY_TAR_URL}" >&2
+    exit 1
+  fi
+
+  echo "== Release ${version} validated =="
+}
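Putting tars_from_version and the flag handling below together, the
invocations look like this (the version strings are examples, not
current releases):

  cluster/gce/upgrade.sh -N v0.13.1             # nodes only, release build
  cluster/gce/upgrade.sh v0.14.1-341-ge0c9d9e   # master and nodes, CI build

Flags must precede the version, since getopts stops at the first
non-option argument.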
+
+master_upgrade=true
+node_upgrade=true
+
+while getopts ":MNh" opt; do
+  case ${opt} in
+    M)
+      node_upgrade=false
+      ;;
+    N)
+      master_upgrade=false
+      ;;
+    h)
+      usage
+      exit 0
+      ;;
+    \?)
+      echo "Invalid option: -$OPTARG" >&2
+      usage
+      exit 1
+      ;;
+  esac
+done
+shift $((OPTIND-1))
+
+if [[ $# -lt 1 ]]; then
+  usage
+  exit 1
+fi
+
+if [[ "${master_upgrade}" == "false" ]] && [[ "${node_upgrade}" == "false" ]]; then
+  echo "Can't specify both -M and -N" >&2
+  exit 1
+fi
+
+tars_from_version ${1}
+
+if [[ "${master_upgrade}" == "true" ]]; then
+  upgrade-master
+fi
+
+if [[ "${node_upgrade}" == "true" ]]; then
+  upgrade-nodes
+fi
+
+"${KUBE_ROOT}/cluster/validate-cluster.sh"
diff --git a/cluster/gce/util.sh b/cluster/gce/util.sh
index 83b2c369b7ae2..19b564aac78dd 100755
--- a/cluster/gce/util.sh
+++ b/cluster/gce/util.sh
@@ -229,7 +229,6 @@ function detect-minions () {
 # Vars set:
 #   KUBE_MASTER
 #   KUBE_MASTER_IP
-#   KUBE_MASTER_IP_INTERNAL
 function detect-master () {
   detect-project
   KUBE_MASTER=${MASTER_NAME}
@@ -835,22 +834,7 @@ function kube-push {
   echo "Pushing to master (log at ${OUTPUT}/kube-push-${KUBE_MASTER}.log) ..."
   cat ${KUBE_ROOT}/cluster/gce/configure-vm.sh | gcloud compute ssh --ssh-flag="-o LogLevel=quiet" --project "${PROJECT}" --zone "${ZONE}" "${KUBE_MASTER}" --command "sudo bash -s -- --push" &> ${OUTPUT}/kube-push-"${KUBE_MASTER}".log
 
-  echo "Pushing metadata to minions... "
-  write-node-env
-  for (( i=0; i<${#MINION_NAMES[@]}; i++)); do
-    add-instance-metadata-from-file "${MINION_NAMES[$i]}" "kube-env=${KUBE_TEMP}/node-kube-env.yaml" &
-  done
-  wait-for-jobs
-  echo "Done"
-
-  for (( i=0; i<${#MINION_NAMES[@]}; i++)); do
-    echo "Starting push to node (log at ${OUTPUT}/kube-push-${MINION_NAMES[$i]}.log) ..."
-    cat ${KUBE_ROOT}/cluster/gce/configure-vm.sh | gcloud compute ssh --ssh-flag="-o LogLevel=quiet" --project "${PROJECT}" --zone "${ZONE}" "${MINION_NAMES[$i]}" --command "sudo bash -s -- --push" &> ${OUTPUT}/kube-push-"${MINION_NAMES[$i]}".log &
-  done
-
-  echo -n "Waiting for node pushes... "
-  wait-for-jobs
-  echo "Done"
+  kube-update-nodes push
 
   # TODO(zmerlynn): Re-create instance-template with the new
   # node-kube-env. This isn't important until the node-ip-range issue
@@ -869,6 +853,43 @@ function kube-push {
   echo
 }
 
+# Push or upgrade nodes.
+#
+# TODO: This really needs to trampoline somehow to the configure-vm.sh
+# from the .tar.gz that we're actually pushing onto the node, because
+# that configuration shifts over versions. Right now, we're blasting
+# the configure-vm from our version instead.
+#
+# Assumed vars:
+#   KUBE_ROOT
+#   MINION_NAMES
+#   KUBE_TEMP
+#   PROJECT
+#   ZONE
+function kube-update-nodes() {
+  action=${1}
+
+  OUTPUT=${KUBE_ROOT}/_output/logs
+  mkdir -p ${OUTPUT}
+
+  echo "Updating node metadata... "
+  write-node-env
+  for (( i=0; i<${#MINION_NAMES[@]}; i++)); do
+    add-instance-metadata-from-file "${MINION_NAMES[$i]}" "kube-env=${KUBE_TEMP}/node-kube-env.yaml" &
+  done
+  wait-for-jobs
+  echo "Done"
+
+  for (( i=0; i<${#MINION_NAMES[@]}; i++)); do
+    echo "Starting ${action} on node (log at ${OUTPUT}/kube-${action}-${MINION_NAMES[$i]}.log) ..."
+    cat ${KUBE_ROOT}/cluster/gce/configure-vm.sh | gcloud compute ssh --ssh-flag="-o LogLevel=quiet" --project "${PROJECT}" --zone "${ZONE}" "${MINION_NAMES[$i]}" --command "sudo bash -s -- --push" &> ${OUTPUT}/kube-${action}-"${MINION_NAMES[$i]}".log &
+  done
+
+  echo -n "Waiting..."
+  wait-for-jobs
+  echo "Done"
+}
+
 # -----------------------------------------------------------------------------
 # Cluster specific test helpers used from hack/e2e-test.sh
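The cat ... | gcloud compute ssh ... --command "sudo bash -s -- --push"
idiom above is the crux of both kube-push and kube-update-nodes:
bash -s reads the program from stdin (the piped configure-vm.sh), and
the words after -- become its positional parameters. A minimal local
demonstration of the same mechanism, with a throwaway stdin script:

# "$@" inside the stdin script is "--push", which is exactly what
# configure-vm.sh's `is_push=$@` check looks for.
echo 'echo "args: $@"' | bash -s -- --push
# prints: args: --push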
From 616c6be65378347e6225947acef203dc8ea9a498 Mon Sep 17 00:00:00 2001
From: Zach Loafman
Date: Mon, 6 Apr 2015 08:35:02 -0700
Subject: [PATCH 2/2] Refactor the "gcloud compute instances create" call as
 well.

---
 cluster/gce/upgrade.sh | 18 +--------------
 cluster/gce/util.sh    | 52 ++++++++++++++++++++++++++++++------------
 2 files changed, 38 insertions(+), 32 deletions(-)

diff --git a/cluster/gce/upgrade.sh b/cluster/gce/upgrade.sh
index 6f129dff923f2..88a809e994972 100755
--- a/cluster/gce/upgrade.sh
+++ b/cluster/gce/upgrade.sh
@@ -79,23 +79,7 @@ function upgrade-master() {
     --zone "${ZONE}" \
     "${MASTER_NAME}"
 
-  write-master-env
-  gcloud compute instances create "${MASTER_NAME}" \
-    --address "${MASTER_NAME}-ip" \
-    --project "${PROJECT}" \
-    --zone "${ZONE}" \
-    --machine-type "${MASTER_SIZE}" \
-    --image-project="${IMAGE_PROJECT}" \
-    --image "${IMAGE}" \
-    --tags "${MASTER_TAG}" \
-    --network "${NETWORK}" \
-    --scopes "storage-ro" "compute-rw" \
-    --can-ip-forward \
-    --metadata-from-file \
-      "startup-script=${KUBE_ROOT}/cluster/gce/configure-vm.sh" \
-      "kube-env=${KUBE_TEMP}/master-kube-env.yaml" \
-    --disk name="${MASTER_NAME}-pd" device-name=master-pd mode=rw boot=no auto-delete=no
-
+  create-master-instance "${MASTER_NAME}-ip"
   wait-for-master
 }
 
diff --git a/cluster/gce/util.sh b/cluster/gce/util.sh
index 19b564aac78dd..c15631fccfd20 100755
--- a/cluster/gce/util.sh
+++ b/cluster/gce/util.sh
@@ -487,6 +487,42 @@ function write-node-env {
   build-kube-env false "${KUBE_TEMP}/node-kube-env.yaml"
 }
 
+# create-master-instance creates the master instance. If called with
+# an argument, the argument is used as the name of a reserved IP
+# address for the master. (In the case of upgrade/repair, we re-use
+# the same IP.)
+#
+# It requires a whole slew of assumed variables, partially due to the
+# call to write-master-env. Listing them would be rather futile.
+# Instead, we list the required calls to ensure any additional
+# variables are set:
+#   ensure-temp-dir
+#   detect-project
+#   get-password
+#   set-master-htpasswd
+#
+function create-master-instance {
+  local address_opt=""
+  [[ -n ${1:-} ]] && address_opt="--address ${1}"
+
+  write-master-env
+  gcloud compute instances create "${MASTER_NAME}" \
+    ${address_opt} \
+    --project "${PROJECT}" \
+    --zone "${ZONE}" \
+    --machine-type "${MASTER_SIZE}" \
+    --image-project="${IMAGE_PROJECT}" \
+    --image "${IMAGE}" \
+    --tags "${MASTER_TAG}" \
+    --network "${NETWORK}" \
+    --scopes "storage-ro" "compute-rw" \
+    --can-ip-forward \
+    --metadata-from-file \
+      "startup-script=${KUBE_ROOT}/cluster/gce/configure-vm.sh" \
+      "kube-env=${KUBE_TEMP}/master-kube-env.yaml" \
+    --disk name="${MASTER_NAME}-pd" device-name=master-pd mode=rw boot=no auto-delete=no
+}
+
 # Instantiate a kubernetes cluster
 #
 # Assumed vars
@@ -546,21 +582,7 @@ function kube-up {
   # https://github.com/GoogleCloudPlatform/kubernetes/issues/3168
   KUBELET_TOKEN=$(dd if=/dev/urandom bs=128 count=1 2>/dev/null | base64 | tr -d "=+/" | dd bs=32 count=1 2>/dev/null)
 
-  write-master-env
-  gcloud compute instances create "${MASTER_NAME}" \
-    --project "${PROJECT}" \
-    --zone "${ZONE}" \
-    --machine-type "${MASTER_SIZE}" \
-    --image-project="${IMAGE_PROJECT}" \
-    --image "${IMAGE}" \
-    --tags "${MASTER_TAG}" \
-    --network "${NETWORK}" \
-    --scopes "storage-ro" "compute-rw" \
-    --can-ip-forward \
-    --metadata-from-file \
-      "startup-script=${KUBE_ROOT}/cluster/gce/configure-vm.sh" \
-      "kube-env=${KUBE_TEMP}/master-kube-env.yaml" \
-    --disk name="${MASTER_NAME}-pd" device-name=master-pd mode=rw boot=no auto-delete=no &
+  create-master-instance &
 
   # Create a single firewall rule for all minions.
   create-firewall-rule "${MINION_TAG}-all" "${CLUSTER_IP_RANGE}" "${MINION_TAG}" &
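One closing note on create-master-instance: ${address_opt} is
deliberately left unquoted so that an empty value disappears entirely
and a non-empty "--address NAME" splits into two arguments. A
hypothetical array-based variant (not in the patch) makes that intent
explicit, with one caveat worth knowing:

# Sketch only: gather the optional flag in an array and expand it
# with "${opts[@]}"; echo stands in for the real gcloud call.
function demo-master-address {
  local -a opts=()
  [[ -n ${1:-} ]] && opts=(--address "${1}")
  # Caveat: under `set -o nounset`, bash releases before 4.4 treat
  # expansion of an empty array as an unbound-variable error, which
  # is a real reason to prefer the unquoted-string form on the
  # distro images of this era.
  echo gcloud compute instances create "my-master" "${opts[@]}"
}
demo-master-address                  # gcloud compute instances create my-master
demo-master-address "my-master-ip"   # ... --address my-master-ip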