Skip to content
This repository has been archived by the owner on Nov 15, 2023. It is now read-only.

[CI] Add bootnode checking CI jobs #6889

Merged
merged 29 commits into from
Mar 21, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
0dfafb2
Add check_bootnode script and github action
s3krit Mar 14, 2023
5bb0a9e
fix mktemp for linux machines
s3krit Mar 14, 2023
2350fb5
Update check_bootnodes.sh
s3krit Mar 14, 2023
7470da0
fix ephemeral ports and fetch polkadot
s3krit Mar 15, 2023
2d31104
Merge branch 'mp-bootnode-checker' of github.com:paritytech/polkadot …
s3krit Mar 15, 2023
134127c
fix check-bootnodes.yml
s3krit Mar 15, 2023
1af8d0b
increase node spawn holdoff
s3krit Mar 15, 2023
c668286
disable fail-fast
s3krit Mar 15, 2023
1cebfd8
refactor, separate out check_bootnodes and make it posix-compliant
s3krit Mar 15, 2023
1170e84
add new job for detecting new bootnodes
s3krit Mar 15, 2023
feec2e4
fix check-bootnodes.yml
s3krit Mar 15, 2023
da4da9a
only check all bootnodes on release
s3krit Mar 15, 2023
540dd97
Add test bad bootnode
s3krit Mar 15, 2023
5ee885a
fix paths
s3krit Mar 15, 2023
5b465ab
fix paths and git... hopefully
s3krit Mar 15, 2023
9d1a4be
this better work...
s3krit Mar 15, 2023
1e106e7
fix
s3krit Mar 15, 2023
db27961
test
s3krit Mar 15, 2023
fc9a45e
last test
s3krit Mar 15, 2023
db701fb
Revert "Add test bad bootnode"
s3krit Mar 15, 2023
c815413
Merge remote-tracking branch 'origin' into mp-bootnode-checker
s3krit Mar 16, 2023
1733e63
Update check_bootnodes.sh
s3krit Mar 16, 2023
9f76d0e
optimisations
s3krit Mar 16, 2023
9dc9a6a
Merge branch 'mp-bootnode-checker' of github.com:paritytech/polkadot …
s3krit Mar 16, 2023
3f9d31b
increase holdoff to 5 seconds
s3krit Mar 16, 2023
ea5f71c
dont delete chainspec til we kill the node
s3krit Mar 16, 2023
45a94cc
Update check-bootnodes.yml
s3krit Mar 20, 2023
cce201e
Remove checking bootnodes on pushing of this branch
s3krit Mar 20, 2023
90c3faa
Merge remote-tracking branch 'origin/master' into mp-bootnode-checker
Mar 21, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Add check_bootnode script and github action
  • Loading branch information
s3krit committed Mar 14, 2023
commit 0dfafb204993000d91284e9a7b50e212f3dfecfd
18 changes: 18 additions & 0 deletions .github/workflows/check-bootnodes.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Checks all runtimes we care about (kusama, polkadot, westend) and ensures
# the bootnodes in their respective chainspecs are contactable.
#
# NOTE(review): scripts/ci/github/check_bootnodes.sh launches a `polkadot`
# binary; no step here installs one, so it presumably has to be available on
# the runner's PATH already - confirm.

name: Check all bootnodes
on: push

jobs:
  check_bootnodes:
    strategy:
      # Each runtime is checked independently; do not cancel the remaining
      # matrix jobs when one runtime reports an unreachable bootnode.
      fail-fast: false
      matrix:
        runtime: [westend, kusama, polkadot]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout sources
        uses: actions/checkout@v3
      - name: Check ${{ matrix.runtime }} bootnodes
        shell: bash
        run: scripts/ci/github/check_bootnodes.sh ${{ matrix.runtime }}
113 changes: 113 additions & 0 deletions scripts/ci/github/check_bootnodes.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#!/usr/bin/env bash

# In this script, we check each bootnode for each runtime and ensure they are contactable.
# We do this by removing every bootnode from the chainspec with the exception of the one
# we want to check. Then we spin up a node using this new chainspec, wait a little while
# and then check our local node's RPC endpoint for the number of peers. If the node hasn't
# been able to contact any other nodes, we can reason that the bootnode we used is not well-connected
# or is otherwise uncontactable.

# Root of the polkadot dir (this script lives three levels below it)
ROOT="$(dirname "${0}")/../../.."
# Name of the runtime whose chainspec is checked (first script argument)
RUNTIME="$1"

# Run cleanup exactly when the script exits. INT/TERM are turned into a real
# exit (which then fires the EXIT trap); trapping them with the cleanup
# function directly would run cleanup but let the script carry on afterwards.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

#######################################
# Tear down everything the script started or created.
# Globals:
#   CHAINSPEC_TMPDIR (read) - temporary chainspec directory; may be unset if
#                             the script stops before creating it
# Outputs:
#   a status line on stdout
#######################################
cleanup(){
  echo "[+] Script interrupted or ended. Cleaning up..."
  # Kill all the polkadot processes
  killall polkadot > /dev/null 2>&1
  # Remove the temporary chainspec copies so repeated runs don't pile up
  if [ -n "${CHAINSPEC_TMPDIR:-}" ] && [ -d "$CHAINSPEC_TMPDIR" ]; then
    rm -rf -- "$CHAINSPEC_TMPDIR"
  fi
}

#######################################
# Check whether a single bootnode of $RUNTIME is contactable.
#
# Writes a copy of the chainspec whose bootNodes list holds only the requested
# entry, starts a local polkadot node from it, waits, and then asks the node's
# RPC endpoint (system_health) how many peers it has.
#
# Globals:
#   RUNTIME, CHAINSPEC_FILE, CHAINSPEC_TMPDIR (read)
#   BOOTNODE_CHECK_WAIT (read, optional) - seconds to wait before querying
#                                          the node; defaults to 60
#   GOOD_BOOTNODES, BAD_BOOTNODES (appended to; the append is lost when the
#                                  function runs in a background subshell)
# Arguments:
#   $1 - zero-based index of the bootnode in the chainspec's bootNodes array
# Outputs:
#   progress messages on stdout
# Returns:
#   0 if the node reports at least one peer, 1 otherwise
#######################################
check_bootnode(){
  local bootnode_index="$1"
  local wait_secs="${BOOTNODE_CHECK_WAIT:-60}"
  local tmp_chainspec_file="$RUNTIME.$bootnode_index.tmp.json"
  local final_chainspec_file="$RUNTIME.$bootnode_index.final.json"
  local bootnode base_port rpc_port polkadot_pid peers

  # Copy the chainspec file to a temporary location to avoid weird race conditions when running in parallel
  cp "$CHAINSPEC_FILE" "$CHAINSPEC_TMPDIR/$tmp_chainspec_file" || return 1
  pushd "$CHAINSPEC_TMPDIR" > /dev/null || return 1
  # Keep only the bootnode under test; the index is passed as a jq variable
  # rather than being spliced into the program text.
  jq --argjson i "$bootnode_index" '.bootNodes |= [.[$i]]' \
    < "$tmp_chainspec_file" > "$final_chainspec_file"
  bootnode=$( jq -r '.bootNodes[0]' < "$final_chainspec_file" )

  # Get the first ephemeral port. Linux exposes the range through procfs,
  # BSD/macOS through sysctl; fall back to the IANA dynamic range start.
  base_port=""
  if [ -r /proc/sys/net/ipv4/ip_local_port_range ]; then
    read -r base_port _ < /proc/sys/net/ipv4/ip_local_port_range
  else
    base_port=$(sysctl -n net.inet.ip.portrange.first 2> /dev/null)
  fi
  case "$base_port" in
    ''|*[!0-9]*) base_port=49152 ;;
  esac
  rpc_port=$((base_port + bootnode_index))

  echo "[+] Checking bootnode $bootnode"
  polkadot --chain "$final_chainspec_file" --no-mdns --rpc-port="$rpc_port" --tmp > /dev/null 2>&1 &
  polkadot_pid=$!
  # We just spun up a bunch of nodes... probably want to wait a bit.
  sleep "$wait_secs"
  popd > /dev/null || return 1

  # Check the health endpoint of the RPC node
  peers="$(curl -s -X POST -H "Content-Type: application/json" --data '{"jsonrpc":"2.0","method":"system_health","params":[],"id":1}' "http://localhost:$rpc_port" | jq -r '.result.peers')"
  # Clean up the node
  kill -9 "$polkadot_pid"

  # Sometimes due to machine load or other reasons, we don't get a response from the RPC node.
  # Anything that is not a plain non-negative integer (empty output, "null",
  # an error document) is treated as zero peers.
  case "$peers" in
    ''|*[!0-9]*) peers=0 ;;
  esac

  if [ "$peers" -gt 0 ]; then
    echo "[+] $peers peers found for $bootnode"
    echo " Bootnode appears contactable"
    GOOD_BOOTNODES+=("$bootnode")
    return 0
  else
    echo "[!] $peers peers found for $bootnode"
    echo " Bootnode appears unreachable"
    BAD_BOOTNODES+=("$bootnode")
    return 1
  fi
}

# Chainspec of the runtime under test
CHAINSPEC_FILE="$ROOT/node/service/chain-specs/$RUNTIME.json"
if [ ! -r "$CHAINSPEC_FILE" ]; then
  echo "[!] Cannot read chainspec file $CHAINSPEC_FILE" >&2
  exit 1
fi
# count the number of bootnodes
BOOTNODES=$( jq -r '.bootNodes | length' "$CHAINSPEC_FILE" )
# Without a numeric count no check would be spawned and the run would be
# reported as a success, so treat a failed/odd jq result as an error.
case "$BOOTNODES" in
  ''|*[!0-9]*)
    echo "[!] Could not determine the number of bootnodes in $CHAINSPEC_FILE" >&2
    exit 1
    ;;
esac
# Make a temporary dir for chainspec files. The XXXXXX suffix is required by
# GNU mktemp when -t is given a template.
CHAINSPEC_TMPDIR="$(mktemp -d -t "${RUNTIME}_chainspecs.XXXXXX")" || {
  echo "[!] Could not create a temporary chainspec dir" >&2
  exit 1
}
echo "[+] Using $CHAINSPEC_TMPDIR as temporary chainspec dir"
# Store an array of the bad bootnodes
BAD_BOOTNODES=()
GOOD_BOOTNODES=()
PIDS=()
echo "[+] Checking $BOOTNODES bootnodes for $RUNTIME"
# Check each bootnode in parallel. A C-style loop is used instead of `seq`
# so that a count of zero spawns nothing (some `seq` implementations count
# downwards when the end is smaller than the start).
for ((i = 0; i < BOOTNODES; i++)); do
  check_bootnode "$i" &
  PIDS+=("$!")
  # Hold off one second between attempting to spawn nodes
  sleep 1
done
RESPS=()
# Wait for all the nodes to finish, recording each exit status in spawn order
for pid in "${PIDS[@]}"; do
  wait "$pid"
  RESPS+=("$?")
done
echo
# The checks ran in background subshells, so anything they appended to the
# arrays is gone; rebuild both lists here from the recorded exit statuses.
for i in "${!RESPS[@]}"; do
  bootnode_name="$( jq -r --argjson i "$i" '.bootNodes[$i]' < "$CHAINSPEC_FILE" )"
  if [ "${RESPS[$i]}" -eq 0 ]; then
    GOOD_BOOTNODES+=("$bootnode_name")
  else
    BAD_BOOTNODES+=("$bootnode_name")
  fi
done

# If we've got any uncontactable bootnodes for this runtime, print them and
# fail. Cleanup is left to the script's EXIT trap, which fires on `exit`;
# calling it here as well would run it twice.
if [ "${#BAD_BOOTNODES[@]}" -gt 0 ]; then
  echo "[!] Bad bootnodes found for $RUNTIME:"
  for bad_bootnode in "${BAD_BOOTNODES[@]}"; do
    echo " $bad_bootnode"
  done
  exit 1
fi

echo "[+] All bootnodes for $RUNTIME are contactable"
exit 0