From b509cad58cd3fbc2f7efbb2d6fa7ac8c8eef07bd Mon Sep 17 00:00:00 2001 From: wjurayj-algo <65556361+wjurayj-algo@users.noreply.github.com> Date: Wed, 24 Jun 2020 10:40:54 -0400 Subject: [PATCH] Algoh fix (#1183) Added algod client health check to prevent node shutdown due to timeout in healthy node. --- cmd/algoh/client.go | 1 + cmd/algoh/deadman.go | 43 +++-- cmd/algoh/main.go | 22 +-- cmd/algoh/mockClient.go | 11 ++ .../cli/algoh/expect/algohExpectCommon.exp | 109 ++++++++++++ .../cli/algoh/expect/algohTimeoutTest.exp | 87 ++++++++++ .../cli/algoh/expect/algoh_expect_test.go | 158 ++++++++++++++++++ .../expect/disabled_profiler_config.json | 1 + test/e2e-go/cli/algoh/expect/host-config.json | 1 + 9 files changed, 413 insertions(+), 20 deletions(-) create mode 100644 test/e2e-go/cli/algoh/expect/algohExpectCommon.exp create mode 100644 test/e2e-go/cli/algoh/expect/algohTimeoutTest.exp create mode 100644 test/e2e-go/cli/algoh/expect/algoh_expect_test.go create mode 100644 test/e2e-go/cli/algoh/expect/disabled_profiler_config.json create mode 100644 test/e2e-go/cli/algoh/expect/host-config.json diff --git a/cmd/algoh/client.go b/cmd/algoh/client.go index 07b1b9c769..56b23a0043 100644 --- a/cmd/algoh/client.go +++ b/cmd/algoh/client.go @@ -28,4 +28,5 @@ type Client interface { Status() (generatedV2.NodeStatusResponse, error) Block(round uint64) (v1.Block, error) GetGoRoutines(ctx context.Context) (string, error) + HealthCheck() error } diff --git a/cmd/algoh/deadman.go b/cmd/algoh/deadman.go index 33320f7306..ef1654c9f1 100644 --- a/cmd/algoh/deadman.go +++ b/cmd/algoh/deadman.go @@ -22,6 +22,7 @@ import ( "sync" "time" + "github.com/algorand/go-algorand/config" "github.com/algorand/go-algorand/daemon/algod/api/spec/v1" "github.com/algorand/go-algorand/logging/telemetryspec" ) @@ -33,9 +34,10 @@ type deadManWatcher struct { client Client done <-chan struct{} wg *sync.WaitGroup + algodConfig config.Local } -func makeDeadManWatcher(timeout int64, client Client, uploadOnError bool, done <-chan struct{}, wg *sync.WaitGroup) deadManWatcher { +func makeDeadManWatcher(timeout int64, client Client, uploadOnError bool, done <-chan struct{}, wg *sync.WaitGroup, algodConfig config.Local) deadManWatcher { var deadManTime time.Duration if timeout == 0 { deadManTime = time.Hour * (10 * 365 * 24) // Don't fire for 10 years @@ -50,6 +52,7 @@ func makeDeadManWatcher(timeout int64, client Client, uploadOnError bool, done < uploadOnError: uploadOnError, done: done, wg: wg, + algodConfig: algodConfig, } } @@ -90,15 +93,27 @@ func (w deadManWatcher) onBlock(block v1.Block) { } func (w deadManWatcher) reportDeadManTimeout(curBlock uint64) (err error) { - goRoutines, err := getGoRoutines(w.client) - if err != nil { - goRoutines = fmt.Sprintf("Error dumping goroutines: %v", err) - } - - details := telemetryspec.DeadManTriggeredEventDetails{ - Timeout: int64(w.timeout.Seconds()), - CurrentBlock: curBlock, - GoRoutines: goRoutines, + var details telemetryspec.DeadManTriggeredEventDetails + if w.algodConfig.EnableProfiler { + goRoutines, err := getGoRoutines(w.client) + if err != nil { + goRoutines = fmt.Sprintf("Error dumping goroutines: %v", err) + } + details = telemetryspec.DeadManTriggeredEventDetails{ + Timeout: int64(w.timeout.Seconds()), + CurrentBlock: curBlock, + GoRoutines: goRoutines, + } + } else { + healthCheck, err := getHealthCheck(w.client) + if err != nil { + healthCheck = fmt.Sprintf("Error performing health check : %v", err) + } + details = telemetryspec.DeadManTriggeredEventDetails{ + Timeout: int64(w.timeout.Seconds()), + CurrentBlock: curBlock, + GoRoutines: healthCheck, + } } log.EventWithDetails(telemetryspec.HostApplicationState, telemetryspec.DeadManTriggeredEvent, details) @@ -117,3 +132,11 @@ func getGoRoutines(client Client) (goRoutines string, err error) { } return } + +func getHealthCheck(client Client) (healthCheck string, err error) { + err = client.HealthCheck() + if err == nil { + healthCheck = "Node is healthy" + } + return +} diff --git a/cmd/algoh/main.go b/cmd/algoh/main.go index 04bcdcb4f3..bed536e118 100644 --- a/cmd/algoh/main.go +++ b/cmd/algoh/main.go @@ -100,6 +100,12 @@ func main() { reportErrorf("Can't convert data directory's path to absolute, %v\n", dataDir) } + algodConfig, err := config.LoadConfigFromDisk(absolutePath) + + if err != nil && !os.IsNotExist(err) { + log.Fatalf("Cannot load config: %v", err) + } + if _, err := os.Stat(absolutePath); err != nil { reportErrorf("Data directory %s does not appear to be valid\n", dataDir) } @@ -112,7 +118,7 @@ func main() { done := make(chan struct{}) log := logging.Base() - configureLogging(genesis, log, absolutePath, done) + configureLogging(genesis, log, absolutePath, done, algodConfig) defer log.CloseTelemetry() exeDir, err = util.ExeDir() @@ -175,7 +181,7 @@ func main() { var wg sync.WaitGroup - deadMan := makeDeadManWatcher(algohConfig.DeadManTimeSec, algodClient, algohConfig.UploadOnError, done, &wg) + deadMan := makeDeadManWatcher(algohConfig.DeadManTimeSec, algodClient, algohConfig.UploadOnError, done, &wg, algodConfig) wg.Add(1) listeners := []blockListener{deadMan} @@ -259,7 +265,7 @@ func getNodeController() nodecontrol.NodeController { return nc } -func configureLogging(genesis bookkeeping.Genesis, log logging.Logger, rootPath string, abort chan struct{}) { +func configureLogging(genesis bookkeeping.Genesis, log logging.Logger, rootPath string, abort chan struct{}, algodConfig config.Local) { log = logging.Base() liveLog := fmt.Sprintf("%s/host.log", rootPath) @@ -272,7 +278,7 @@ func configureLogging(genesis bookkeeping.Genesis, log logging.Logger, rootPath log.SetJSONFormatter() log.SetLevel(logging.Debug) - initTelemetry(genesis, log, rootPath, abort) + initTelemetry(genesis, log, rootPath, abort, algodConfig) // if we have the telemetry enabled, we want to use it's sessionid as part of the // collected metrics decorations. @@ -281,7 +287,7 @@ func configureLogging(genesis bookkeeping.Genesis, log logging.Logger, rootPath fmt.Fprintln(writer, "++++++++++++++++++++++++++++++++++++++++") } -func initTelemetry(genesis bookkeeping.Genesis, log logging.Logger, dataDirectory string, abort chan struct{}) { +func initTelemetry(genesis bookkeeping.Genesis, log logging.Logger, dataDirectory string, abort chan struct{}, algodConfig config.Local) { // Enable telemetry hook in daemon to send logs to cloud // If ALGOTEST env variable is set, telemetry is disabled - allows disabling telemetry for tests isTest := os.Getenv("ALGOTEST") != "" @@ -304,14 +310,10 @@ func initTelemetry(genesis bookkeeping.Genesis, log logging.Logger, dataDirector } if log.GetTelemetryEnabled() { - cfg, err := config.LoadConfigFromDisk(dataDirectory) - if err != nil && !os.IsNotExist(err) { - log.Fatalf("Cannot load config: %v", err) - } // If the telemetry URI is not set, periodically check SRV records for new telemetry URI if log.GetTelemetryURI() == "" { - network.StartTelemetryURIUpdateService(time.Minute, cfg, genesis.Network, log, abort) + network.StartTelemetryURIUpdateService(time.Minute, algodConfig, genesis.Network, log, abort) } // For privacy concerns, we don't want to provide the full data directory to telemetry. diff --git a/cmd/algoh/mockClient.go b/cmd/algoh/mockClient.go index ce379ca36a..209fda2fa4 100644 --- a/cmd/algoh/mockClient.go +++ b/cmd/algoh/mockClient.go @@ -50,6 +50,7 @@ type mockClient struct { StatusCalls int BlockCalls map[uint64]int GetGoRoutinesCalls int + HealthCheckCalls int error []error status []generatedV2.NodeStatusResponse routine []string @@ -111,3 +112,13 @@ func (c *mockClient) GetGoRoutines(ctx context.Context) (r string, e error) { e = c.nextError() return } + +func (c *mockClient) HealthCheck() (e error) { + c.HealthCheckCalls++ + // Repeat last healthcheck... + if len(c.routine) > 1 { + c.routine = c.routine[1:] + } + e = c.nextError() + return +} diff --git a/test/e2e-go/cli/algoh/expect/algohExpectCommon.exp b/test/e2e-go/cli/algoh/expect/algohExpectCommon.exp new file mode 100644 index 0000000000..0fb4dec439 --- /dev/null +++ b/test/e2e-go/cli/algoh/expect/algohExpectCommon.exp @@ -0,0 +1,109 @@ +# Algoh Expect Utility Package +namespace eval ::Algoh { + + # Export Procedures + + namespace export Abort + namespace export Info + namespace export CreateNetwork + + namespace export StopNode + namespace export StartNode + + + # My Variables + set version 1.0 + set description "Algodh Expect Package" + + # Variable for the path of the script + variable home [file join [pwd] [file dirname [info script]]] +} + +# Definition of the procedure MyProcedure +proc ::Algoh::Info {} { + puts Algoh::description +} + +proc ::Algoh::Abort { ERROR } { + puts "Aborting with error: $ERROR" + exit 1 +} + +package provide Algoh $Algoh::version +package require Tcl 8.0 + + +# Start the network +proc ::Algoh::CreateNetwork { NETWORK_NAME NETWORK_TEMPLATE TEST_ROOT_DIR } { + + # Running on ARM64, it seems that network creation is pretty slow. + # 30 second won't be enough here, so I'm changing this to 90 seconds. + set timeout 90 + + if { [catch { + # Create network + puts "network create $NETWORK_NAME" + spawn goal network create --network $NETWORK_NAME --template $NETWORK_TEMPLATE --rootdir $TEST_ROOT_DIR + expect { + timeout { close; ::Algoh::Abort "Timed out creating network" } + "^Network $NETWORK_NAME created under.*" { puts "Network $NETWORK_NAME created" ; close } + eof { catch wait result; if { [lindex $result 3] != 0 } { puts "Unable to create network"; Algoh::Abort } } + } + } EXCEPTION ] } { + ::AlgorandGoal::Abort "ERROR in CreateNetwork: $EXCEPTION" + } +} + +proc ::Algoh::StartNode { TEST_ALGO_DIR } { + + puts "Primary node start" + # Start node + if { [catch { + spawn goal node start -d $TEST_ALGO_DIR + expect { + timeout { close; ::Algoh::Abort "Timed out starting relay node" } + "^Algorand node successfully started!* { puts "Primary relay node started"; close } + eof { catch wait result; if { [lindex $result 3] != 0 } { puts "Unable to start node"; Algoh::Abort } } + } + } EXCEPTION ] } { + ::Algoh::Abort "ERROR in StartNode: $EXCEPTION" + } +} + +proc ::Algoh::StopNode { TEST_ALGO_DIR } { + set timeout 15 + + if { [catch { + puts "node stop with $TEST_ALGO_DIR" + spawn goal node stop -d $TEST_ALGO_DIR + expect { + timeout { close; ::Algoh::Abort "Did not recieve appropriate message during node stop"} + "*The node was successfully stopped.*" {puts "Node stopped successfully"; close} + eof { catch wait result; if { [lindex $result 3] != 0 } { puts "Unable to stop node"; Algoh::Abort } } + + } + } EXCEPTION] } { + ::Algoh::Abort "ERROR in StopNode: $EXCEPTION" + } +} + +# Stop the network +proc ::Algoh::StopNetwork { NETWORK_NAME TEST_ROOT_DIR } { + set timeout 60 + set NETWORK_STOP_MESSAGE "" + puts "Stopping network: $NETWORK_NAME" + spawn goal network stop -r $TEST_ROOT_DIR + expect { + timeout { + close + puts "Timed out shutting down network" + puts "TEST_ROOT_DIR $::TEST_ROOT_DIR" + puts "NETWORK_NAME $::NETWORK_NAME" + exit 1 + } + "Network Stopped under.*" {set NETWORK_STOP_MESSAGE $expect_out(buffer); close} + eof { catch wait result; if { [lindex $result 3] != 0 } { puts "Unable to stop network"; Algoh::Abort } } + + } + puts $NETWORK_STOP_MESSAGE +} diff --git a/test/e2e-go/cli/algoh/expect/algohTimeoutTest.exp b/test/e2e-go/cli/algoh/expect/algohTimeoutTest.exp new file mode 100644 index 0000000000..20716bef85 --- /dev/null +++ b/test/e2e-go/cli/algoh/expect/algohTimeoutTest.exp @@ -0,0 +1,87 @@ +#!/usr/bin/expect -f +set err 0 +log_user 1 + + + +if { [catch { + source algohExpectCommon.exp + + set TEST_ALGO_DIR [lindex $argv 0] + set TEST_DATA_DIR [lindex $argv 1] + + exec mkdir -p $TEST_ALGO_DIR + + set TIME_STAMP [clock seconds] + set timeout 5 + + set TEST_ROOT_DIR $TEST_ALGO_DIR/root + set TEST_PRIMARY_NODE_DIR $TEST_ROOT_DIR/Primary + set TEST_NODE_DIR $TEST_ROOT_DIR/Node + set NETWORK_NAME test_net_expect_$TIME_STAMP + set NETWORK_TEMPLATE "$TEST_DATA_DIR/nettemplates/TwoNodes50Each.json" + puts "TEST_ALGO_DIR: $TEST_ALGO_DIR" + + + ::Algoh::CreateNetwork $NETWORK_NAME $NETWORK_TEMPLATE $TEST_ROOT_DIR + + ::Algoh::StartNode $TEST_PRIMARY_NODE_DIR + + exec cat ./disabled_profiler_config.json > $TEST_NODE_DIR/config.json + exec cat ./host-config.json > $TEST_NODE_DIR/host-config.json + + set PRIMARY_ADDR "" + spawn cat $TEST_ROOT_DIR/Primary/algod-listen.net + expect { + -regexp {http://[0-9\.]+:[0-9]+} { regexp -- {[0-9.]+:[0-9]+} $expect_out(0,string) PRIMARY_ADDR; close;} + timeout {puts "missed our case"; close; exit 1} + } + + puts "regex match: $PRIMARY_ADDR" + + #start hosted node in the background + spawn $env(GOPATH)/bin/algoh -d $TEST_NODE_DIR -p $PRIMARY_ADDR + expect { + "^Logging to: *" {puts "algoh startup successful"} + timeout {puts "algoh failed to start"; close; exit 1} + } + + #allow algoh time to put files in Node dir + spawn sleep 5 + set timeout 5 + expect { + timeout {puts "algod should be fully running"; close} + } + + #wait until Node approves blocks to the network + set timeout 60 + spawn $env(GOPATH)/bin/goal node wait -d $TEST_NODE_DIR -w 61 + expect { + eof {puts "successfully communicating with relay node"} + "Timed out waiting for node to make progress" {puts "timed out waiting for connection to relay node"; close; exit 1} + timeout {puts "should not reached this case"; close; exit 1} + } + + + ::Algoh::StopNode $TEST_PRIMARY_NODE_DIR + + + set timeout 201 + spawn $env(GOPATH)/bin/goal node wait -d $TEST_NODE_DIR -w 200 + expect { + "^Cannot contact Algorand node: open $TEST_NODE_DIR/algod.net: no such file or directory." {puts "ERROR: node shutdown"; close; exit 1} + "^Timed out waiting for node to make progress" {puts "node correctly continued running despite relay shutdown"; close} + timeout {puts "should not reached this case", close; exit 1} + } + + ::Algoh::StopNetwork $NETWORK_NAME $TEST_ROOT_DIR + + exec rm -d -r -f $TEST_ALGO_DIR + puts "Basic Algoh Test Successful" + exit 0 +} EXCEPTION ] } { + puts "ERROR in algoh test: $EXCEPTION" + + exec rm -d -r -f $TEST_ALGO_DIR + exit 1 +} diff --git a/test/e2e-go/cli/algoh/expect/algoh_expect_test.go b/test/e2e-go/cli/algoh/expect/algoh_expect_test.go new file mode 100644 index 0000000000..adb17abbfe --- /dev/null +++ b/test/e2e-go/cli/algoh/expect/algoh_expect_test.go @@ -0,0 +1,158 @@ +// Copyright (C) 2019-2020 Algorand, Inc. +// This file is part of go-algorand +// +// go-algorand is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// go-algorand is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with go-algorand. If not, see . + +package expect + +import ( + "bytes" + "io/ioutil" + "os" + "os/exec" + "path" + "path/filepath" + "regexp" + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +type algohExpectFixture struct { + testDir string + testDataDir string + testDirTmp bool + t *testing.T + testFilter string +} + +func (f *algohExpectFixture) initialize(t *testing.T) (err error) { + f.t = t + f.testDir = os.Getenv("TESTDIR") + if f.testDir == "" { + f.testDir, _ = ioutil.TempDir("", "tmp") + f.testDir = filepath.Join(f.testDir, "expect") + err = os.MkdirAll(f.testDir, 0755) + if err != nil { + f.t.Errorf("error creating test dir %s, with error %v", f.testDir, err) + return + } + f.testDirTmp = true + } + f.testDataDir = os.Getenv("TESTDATADIR") + if f.testDataDir == "" { + f.testDataDir = os.ExpandEnv("${GOPATH}/src/github.com/algorand/go-algorand/test/testdata") + } + + f.testFilter = os.Getenv("TESTFILTER") + if f.testFilter == "" { + f.testFilter = ".*" + } + return +} + +func (f *algohExpectFixture) getTestDir(testName string) (workingDir, algoDir string, err error) { + testName = strings.Replace(testName, ".exp", "", -1) + workingDir = filepath.Join(f.testDir, testName) + err = os.Mkdir(workingDir, 0755) + if err != nil { + f.t.Errorf("error creating test dir %s, with error %v", workingDir, err) + return + } + algoDir = filepath.Join(workingDir, "algoh") + err = os.Mkdir(algoDir, 0755) + if err != nil { + f.t.Errorf("error creating algo dir %s, with error %v", algoDir, err) + return + } + return +} + +func (f *algohExpectFixture) removeTestDir(workingDir string) (err error) { + err = os.RemoveAll(workingDir) + if err != nil { + f.t.Errorf("error removing test dir %s, with error %v", workingDir, err) + return + } + return +} + +// TesthWithExpect Process all expect script files with suffix Test.exp within the test/e2e-go/cli/algoh/expect directory +func TestAlgohWithExpect(t *testing.T) { + var f algohExpectFixture + var execCommand = exec.Command + expectFiles := make(map[string]string) // map expect test to full file name. + err := filepath.Walk(".", func(path string, info os.FileInfo, err error) error { + if strings.HasSuffix(info.Name(), "Test.exp") { + expectFiles[info.Name()] = path + } + return nil + }) + require.NoError(t, err) + err = f.initialize(t) + require.NoError(t, err) + + for testName := range expectFiles { + if match, _ := regexp.MatchString(f.testFilter, testName); match { + t.Run(testName, func(t *testing.T) { + workingDir, algoDir, err := f.getTestDir(testName) + require.NoError(t, err) + t.Logf("algoDir: %s\ntestDataDir:%s\n", algoDir, f.testDataDir) + cmd := execCommand("expect", testName, algoDir, f.testDataDir) + var outBuf bytes.Buffer + cmd.Stdout = &outBuf + + // Set stderr to be a file descriptor. In other way Go's exec.Cmd::writerDescriptor + // attaches goroutine reading that blocks on io.Copy from stderr. + // Cmd::CombinedOutput sets stderr to stdout and also blocks. + // Cmd::Start + Cmd::Wait with manual pipes redirection etc also blocks. + // Wrapping 'expect' with 'expect "$@" 2>&1' also blocks on stdout reading. + // Cmd::Output with Cmd::Stderr == nil works but stderr get lost. + // Using os.File as stderr does not trigger goroutine creation, instead exec.Cmd relies on os.File implementation. + errFile, err := os.OpenFile(path.Join(workingDir, "stderr.txt"), os.O_CREATE|os.O_RDWR, 0) + if err != nil { + t.Logf("failed opening stderr temp file: %s\n", err.Error()) + t.Fail() + } + defer errFile.Close() // Close might error but we Sync it before leaving the scope + cmd.Stderr = errFile + + err = cmd.Run() + if err != nil { + var stderr string + var ferr error + if ferr = errFile.Sync(); ferr == nil { + if _, ferr = errFile.Seek(0, 0); ferr == nil { + if info, ferr := errFile.Stat(); ferr == nil { + errData := make([]byte, info.Size()) + if _, ferr = errFile.Read(errData); ferr == nil { + stderr = string(errData) + } + } + } + } + if ferr != nil { + stderr = ferr.Error() + } + t.Logf("err running '%s': %s\nstdout: %s\nstderr: %s\n", testName, err, string(outBuf.Bytes()), stderr) + t.Fail() + } else { + // t.Logf("stdout: %s", string(outBuf.Bytes())) + f.removeTestDir(workingDir) + } + }) + } + } +} diff --git a/test/e2e-go/cli/algoh/expect/disabled_profiler_config.json b/test/e2e-go/cli/algoh/expect/disabled_profiler_config.json new file mode 100644 index 0000000000..f299ebfe04 --- /dev/null +++ b/test/e2e-go/cli/algoh/expect/disabled_profiler_config.json @@ -0,0 +1 @@ +{ "GossipFanout": 1, "EndpointAddress": "127.0.0.1:0", "DNSBootstrapID": "", "EnableProfiler": false, "IncomingConnectionsLimit": 0, "RunHosted": true } \ No newline at end of file diff --git a/test/e2e-go/cli/algoh/expect/host-config.json b/test/e2e-go/cli/algoh/expect/host-config.json new file mode 100644 index 0000000000..a9391f8897 --- /dev/null +++ b/test/e2e-go/cli/algoh/expect/host-config.json @@ -0,0 +1 @@ +{ "DeadManTimeSec": 32, "StatusDelayMS": 45000, "StallDelayMS": 60000, "UploadOnError": true, "SendBlockStats": false } \ No newline at end of file