Skip to content

Commit

Permalink
duplicate linux CI cluster
Browse files Browse the repository at this point in the history
This PR duplicates the linux CI cluster. This is the first in a
three-PR plan to implement #6400 safely while people are working.

I usually do cluster updates over the weekend because they require
shutting down the entire CI system for about two hours. This is
unfortunately not practical while people are working, and timezones make
it difficult for me to find a time where people are not working during
the week.

So instead the plan is as follows:

1. Create a duplicate of our CI cluster (this PR).
2. Wait for the new cluster to be operational (~90-120 minutes, in my experience).
3. In the Azure Pipelines config screen, disable all the nodes of the
   "old" cluster, so all new jobs get assigned to the temp cluster. Wait
   for all jobs to finish on the old cluster.
4. Update the old cluster. Wait for it to be deployed. (Second PR.)
5. In Azure, disable temp nodes, wait for jobs to drain.
6. Delete temp nodes (third PR).

Reviewing this PR is best done by verifying you can reproduce the
following shell session:

```
$ diff vsts_agent_linux.tf vsts_agent_linux_temp.tf
4,7c4,5
< resource "secret_resource" "vsts-token" {}
<
< data "template_file" "vsts-agent-linux-startup" {
<   template = "${file("${path.module}/vsts_agent_linux_startup.sh")}"
---
> data "template_file" "vsts-agent-linux-startup-temp" {
>   template =
"${file("${path.module}/vsts_agent_linux_startup_temp.sh")}"
16c14
< resource "google_compute_region_instance_group_manager"
"vsts-agent-linux" {
---
> resource "google_compute_region_instance_group_manager"
"vsts-agent-linux-temp" {
18,19c16,17
<   name               = "vsts-agent-linux"
<   base_instance_name = "vsts-agent-linux"
---
>   name               = "vsts-agent-linux-temp"
>   base_instance_name = "vsts-agent-linux-temp"
24,25c22,23
<     name              = "vsts-agent-linux"
<     instance_template =
"${google_compute_instance_template.vsts-agent-linux.self_link}"
---
>     name              = "vsts-agent-linux-temp"
>     instance_template =
"${google_compute_instance_template.vsts-agent-linux-temp.self_link}"
36,37c34,35
< resource "google_compute_instance_template" "vsts-agent-linux" {
<   name_prefix  = "vsts-agent-linux-"
---
> resource "google_compute_instance_template" "vsts-agent-linux-temp" {
>   name_prefix  = "vsts-agent-linux-temp-"
52c50
<     startup-script =
"${data.template_file.vsts-agent-linux-startup.rendered}"
---
>     startup-script =
"${data.template_file.vsts-agent-linux-startup-temp.rendered}"
$ diff vsts_agent_linux_startup.sh vsts_agent_linux_startup_temp.sh
149c149
< su --command "sh <(curl https://nixos.org/nix/install) --daemon"
--login vsts
---
> su --command "sh <(curl -sSfL https://nixos.org/nix/install) --daemon"
--login vsts
$
```

and reviewing that diff, rather than looking at the added files in their
entirety. The name changes are benign and needed for Terraform to
appropriately keep track of which node belongs to the old vs the temp
group. The only change that matters is that the new group's startup script
passes the `-sSfL` flags to `curl` when fetching the Nix installer, so the
new nodes will actually boot up. (Hopefully.)

CHANGELOG_BEGIN
CHANGELOG_END
  • Loading branch information
garyverhaegen-da committed Jun 18, 2020
1 parent fba5747 commit 4f1deb6
Show file tree
Hide file tree
Showing 2 changed files with 282 additions and 0 deletions.
202 changes: 202 additions & 0 deletions infra/vsts_agent_linux_startup_temp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
#!/usr/bin/env bash
# Copyright (c) 2020 Digital Asset (Switzerland) GmbH and/or its affiliates. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Startup script for a CI agent machine.
# Strict mode: stop on the first failing command, on use of an unset
# variable, and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

## Hardening

# Halt the machine whenever this script exits. The trap is on EXIT, so it
# fires on every exit path of the script, including strict-mode aborts.
trap 'shutdown -h now' EXIT

# Use 8.8.8.8 for DNS in place of the default nameserver (the metadata server).
printf 'nameserver 8.8.8.8\n' > /etc/resolv.conf

# Remove this script's own file from disk.
rm --verbose --force "$0"

## Install system dependencies

# Refresh the package index, then install the base tooling used by the rest
# of this script and by the agent.
apt-get --quiet update
apt-get --quiet --yes install \
  curl \
  sudo \
  bzip2 \
  rsync \
  jq \
  liblttng-ust0 \
  libcurl3 \
  libkrb5-3 \
  libicu55 \
  zlib1g \
  git \
  netcat \
  apt-transport-https \
  software-properties-common

# Shared libraries Chrome needs, so Puppeteer tests on the gsg can run.
# The list comes from:
# https://github.com/puppeteer/puppeteer/blob/a3d1536a6b6e282a43521bea28aef027a7133df8/docs/troubleshooting.md#chrome-headless-doesnt-launch-on-unix
# Background: https://github.com/digital-asset/daml/pull/5540
apt-get --quiet --yes install \
  gconf-service libasound2 libatk1.0-0 libatk-bridge2.0-0 \
  libc6 libcairo2 libcups2 libdbus-1-3 \
  libexpat1 libfontconfig1 libgcc1 libgconf-2-4 \
  libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 \
  libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 \
  libx11-6 libx11-xcb1 libxcb1 \
  libxcomposite1 libxcursor1 libxdamage1 libxext6 \
  libxfixes3 libxi6 libxrandr2 libxrender1 \
  libxss1 libxtst6 \
  ca-certificates fonts-liberation libappindicator1 libnss3 \
  lsb-release xdg-utils wget

# Install the logging agent via Google's installer script.
# -f makes curl exit non-zero on an HTTP error instead of printing the error
# page; together with pipefail this aborts the script on a failed download
# rather than feeding an error body to bash.
curl -sSfL https://dl.google.com/cloudagents/install-logging-agent.sh | bash

# Install Docker, pinned to one version built for the running Ubuntu release.
# NOTE: this file is rendered as a Terraform template, so shell variables are
# deliberately written in the plain dollar-name form; the dollar-brace form
# would be treated as a Terraform interpolation.
DOCKER_VERSION="5:18.09.5~3-0~ubuntu-$(lsb_release -cs)"
# Trust Docker's package signing key and print its fingerprint to the log.
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
apt-key fingerprint 0EBFCD88
add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
apt-get update
# Quoted so the version string always reaches apt-get as a single word.
apt-get install -qy "docker-ce=$DOCKER_VERSION" "docker-ce-cli=$DOCKER_VERSION" containerd.io

# Enable the docker service so it is started at boot.
systemctl enable docker

## Install the VSTS agent

# Dedicated account (uid/gid 3000) that the agent is installed for and runs as.
groupadd --gid 3000 vsts
useradd \
  --create-home \
  --gid 3000 \
  --shell /bin/bash \
  --uid 3000 \
  vsts
# Add the vsts user to the docker group.
usermod -aG docker vsts

# Download, unpack and register the agent as the vsts user.
# The heredoc delimiter is quoted, so the outer shell expands nothing in the
# body; the only substitutions are the three Terraform placeholders below,
# filled in when the template is rendered. A non-zero exit from this inner
# script makes su fail, which aborts the outer script under errexit.
su --login vsts <<'AGENT_SETUP'
set -euo pipefail
# Single-quoted so the rendered values are taken literally by the shell
# (no word splitting or expansion), matching how the token is quoted in the
# instance template's shutdown script.
VSTS_ACCOUNT='${vsts_account}'
VSTS_POOL='${vsts_pool}'
VSTS_TOKEN='${vsts_token}'
mkdir -p ~/agent
cd ~/agent
echo 'assignment=default' > .capabilities
echo Determining matching VSTS agent...
# Ask the account for the available linux-x64 agent packages.
VSTS_AGENT_RESPONSE=$(curl -sSfL \
  -u "user:$VSTS_TOKEN" \
  -H 'Accept:application/json;api-version=3.0-preview' \
  "https://$VSTS_ACCOUNT.visualstudio.com/_apis/distributedtask/packages/agent?platform=linux-x64")
# Sort the packages by [major, minor, patch, url] and keep the url of the
# last (highest) entry; jq prints "null" when the list is empty.
VSTS_AGENT_URL=$(echo "$VSTS_AGENT_RESPONSE" \
  | jq -r '.value | map([.version.major,.version.minor,.version.patch,.downloadUrl]) | sort | .[length-1] | .[3]')
if [[ -z "$VSTS_AGENT_URL" || "$VSTS_AGENT_URL" == "null" ]]; then
  echo "error: could not determine a matching VSTS agent - check that account '$VSTS_ACCOUNT' is correct and the token is valid for that account" >&2
  exit 1
fi
echo Downloading and installing VSTS agent...
curl -sSfL "$VSTS_AGENT_URL" | tar -xz --no-same-owner
# env.sh is sourced with nounset switched off, then nounset is restored.
set +u
source ./env.sh
set -u
# Register this machine, named after its hostname, in the pool; --replace
# takes over an existing registration with the same name.
./config.sh \
  --acceptTeeEula \
  --agent "$(hostname)" \
  --auth PAT \
  --pool "$VSTS_POOL" \
  --replace \
  --token "$VSTS_TOKEN" \
  --unattended \
  --url "https://$VSTS_ACCOUNT.visualstudio.com"
AGENT_SETUP

## Hardening

# Give root ownership of the agent's top-level scripts, its bin directory and
# its externals directory (presumably so jobs running as vsts cannot alter
# them - confirm).
chown -R root:root \
  /home/vsts/agent/*.sh \
  /home/vsts/agent/bin \
  /home/vsts/agent/externals

## Install Nix

# The installer has to be run by a user with sudo access, so vsts gets
# passwordless sudo for the duration of the installation only.
printf '%s\n' 'vsts ALL=(ALL:ALL) NOPASSWD:ALL' > /etc/sudoers.d/nix_installation
su --login --command "sh <(curl -sSfL https://nixos.org/nix/install) --daemon" vsts
rm /etc/sudoers.d/nix_installation

# Note: "hydra.da-int.net" survives in the name of the first public key for
# legacy reasons only; it has nothing to do with the DNS hostname of the
# cache currently in use.
cat > /etc/nix/nix.conf <<'NIX_CONF'
binary-cache-public-keys = hydra.da-int.net-1:6Oy2+KYvI7xkAOg0gJisD7Nz/6m8CmyKMbWfSKUe03g= cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY= hydra.nixos.org-1:CNHJZBh9K4tP3EKF6FkkgeVYsS3ohTl+oS0Qa8bezVs=
binary-caches = https://nix-cache.da-ext.net https://cache.nixos.org
build-users-group = nixbld
cores = 1
max-jobs = 0
sandbox = relaxed
NIX_CONF

# Restart the daemon after writing the configuration.
systemctl restart nix-daemon

# Warm up local caches by building dev-env and current daml master.
# This is allowed to fail: a CI machine with half-warmed caches is still
# better than no machine, hence the trailing `|| true` on the subshell.
su --login vsts <<'CACHE_WARMUP'
# user-wide bazel disk cache override
echo "build:linux --disk_cache=~/.bazel-cache" > ~/.bazelrc
# clone and build
(
  git clone https://github.com/digital-asset/daml
  cd daml
  ./ci/dev-env-install.sh
  ./build.sh "_$(uname)"
) || true
CACHE_WARMUP

# Purge old agents, using the script from the checkout made during warm-up.
# Best effort: any failure (including a missing checkout) is ignored.
# The three values are filled in by Terraform when the template is rendered;
# they are single-quoted so the shell takes them literally (no word
# splitting or expansion), matching how the token is quoted in the instance
# template's shutdown script.
su --login vsts <<'PURGE_OLD_AGENTS'
cd daml && \
VSTS_ACCOUNT='${vsts_account}' VSTS_POOL='${vsts_pool}' VSTS_TOKEN='${vsts_token}' ./ci/azure-cleanup/purge_old_agents.py || true
PURGE_OLD_AGENTS

# Remove /home/vsts/daml folder that might be present from cache warmup
rm -R /home/vsts/daml || true

## Finish

# Fake local web server, taken from the docker image.
# Loops forever: each round listens once on port 80 with nc, answers the
# connection with a 302 redirect to the account's agent pool page, and throws
# away whatever the client sent. The account name is filled in by Terraform
# when this template is rendered.
web-server() {
  while :; do
    printf 'HTTP/1.1 302 Found\r\nLocation: https://%s.visualstudio.com/_admin/_AgentPool\r\n\r\n' "${vsts_account}" \
      | nc -l -p 80 -q 0 > /dev/null
  done
}
# Launch the redirect server in the background so it keeps answering while
# the agent runs in the foreground below.
web-server &

# Start the VSTS agent
# Runs ./run.sh from the agent directory as the vsts user. su stays in the
# foreground, so this script (and with it the EXIT trap set at the top) only
# ends once the agent process does.
su --login --command "cd /home/vsts/agent && exec ./run.sh" - vsts
80 changes: 80 additions & 0 deletions infra/vsts_agent_linux_temp.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Copyright (c) 2020 Digital Asset (Switzerland) GmbH and/or its affiliates. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Renders the startup script for the temporary Linux agent group, filling in
# the VSTS account, pool and token placeholders used inside the script.
data "template_file" "vsts-agent-linux-startup-temp" {
  template = "${file("${path.module}/vsts_agent_linux_startup_temp.sh")}"

  vars = {
    vsts_account = "digitalasset"
    vsts_pool    = "linux-pool"

    # The secret_resource itself is declared outside this file.
    vsts_token = "${secret_resource.vsts-token.value}"
  }
}

# Regional managed instance group holding the temporary Linux CI agents:
# 10 instances built from the "vsts-agent-linux-temp" instance template.
resource "google_compute_region_instance_group_manager" "vsts-agent-linux-temp" {
  provider = "google-beta"
  region   = "${local.region}"

  name               = "vsts-agent-linux-temp"
  base_instance_name = "vsts-agent-linux-temp"
  target_size        = 10

  version {
    name              = "vsts-agent-linux-temp"
    instance_template = "${google_compute_instance_template.vsts-agent-linux-temp.self_link}"
  }

  # Template changes are rolled out proactively by replacing instances, with
  # a surge of at most 3 and a 60 second minimum ready time.
  update_policy {
    type            = "PROACTIVE"
    minimal_action  = "REPLACE"
    max_surge_fixed = 3
    min_ready_sec   = 60
  }
}

# Instance template for the temporary Linux CI agent nodes: n1-standard-8
# machines with a 200 GB SSD boot disk, booted from the Ubuntu 16.04 LTS image.
resource "google_compute_instance_template" "vsts-agent-linux-temp" {
# name_prefix (not name): the full template name is generated from this prefix.
name_prefix = "vsts-agent-linux-temp-"
machine_type = "n1-standard-8"
labels = "${local.labels}"

disk {
disk_size_gb = 200
disk_type = "pd-ssd"
source_image = "ubuntu-os-cloud/ubuntu-1604-lts"
}

# Create the replacement template before destroying the old one.
# NOTE(review): presumably needed because the instance group manager
# references this template - confirm.
lifecycle {
create_before_destroy = true
}

metadata {
# Rendered startup script (see the template_file data source it references).
startup-script = "${data.template_file.vsts-agent-linux-startup-temp.rendered}"

# On shutdown, run the agent's `config.sh remove` as the vsts user from the
# agent directory, passing the token through the VSTS_AGENT_INPUT_TOKEN
# environment variable. The heredoc body below is the script itself, so it
# is left untouched.
shutdown-script = <<EOS
#!/usr/bin/env bash
set -euo pipefail
cd /home/vsts/agent
su vsts <<SHUTDOWN_AGENT
export VSTS_AGENT_INPUT_TOKEN='${secret_resource.vsts-token.value}'
./config.sh remove --unattended --auth PAT
SHUTDOWN_AGENT
EOS
}

network_interface {
network = "default"

// Ephemeral IP to get access to the Internet
access_config {}
}

# Instances run as the log-writer service account with the cloud-platform scope.
service_account {
email = "log-writer@da-dev-gcp-daml-language.iam.gserviceaccount.com"
scopes = ["cloud-platform"]
}

# Non-preemptible instances that are terminated on host maintenance and are
# not restarted automatically.
scheduling {
automatic_restart = false
on_host_maintenance = "TERMINATE"
preemptible = false
}
}

0 comments on commit 4f1deb6

Please sign in to comment.