Skip to content

Commit

Permalink
Algoh fix (algorand#1183)
Browse files Browse the repository at this point in the history
Added algod client health check to prevent node shutdown due to timeout in healthy node.
  • Loading branch information
wjurayj-algo authored Jun 24, 2020
1 parent 5a074a2 commit b509cad
Show file tree
Hide file tree
Showing 9 changed files with 413 additions and 20 deletions.
1 change: 1 addition & 0 deletions cmd/algoh/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,5 @@ type Client interface {
Status() (generatedV2.NodeStatusResponse, error)
Block(round uint64) (v1.Block, error)
GetGoRoutines(ctx context.Context) (string, error)
HealthCheck() error
}
43 changes: 33 additions & 10 deletions cmd/algoh/deadman.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"sync"
"time"

"github.com/algorand/go-algorand/config"
"github.com/algorand/go-algorand/daemon/algod/api/spec/v1"
"github.com/algorand/go-algorand/logging/telemetryspec"
)
Expand All @@ -33,9 +34,10 @@ type deadManWatcher struct {
client Client
done <-chan struct{}
wg *sync.WaitGroup
algodConfig config.Local
}

func makeDeadManWatcher(timeout int64, client Client, uploadOnError bool, done <-chan struct{}, wg *sync.WaitGroup) deadManWatcher {
func makeDeadManWatcher(timeout int64, client Client, uploadOnError bool, done <-chan struct{}, wg *sync.WaitGroup, algodConfig config.Local) deadManWatcher {
var deadManTime time.Duration
if timeout == 0 {
deadManTime = time.Hour * (10 * 365 * 24) // Don't fire for 10 years
Expand All @@ -50,6 +52,7 @@ func makeDeadManWatcher(timeout int64, client Client, uploadOnError bool, done <
uploadOnError: uploadOnError,
done: done,
wg: wg,
algodConfig: algodConfig,
}
}

Expand Down Expand Up @@ -90,15 +93,27 @@ func (w deadManWatcher) onBlock(block v1.Block) {
}

func (w deadManWatcher) reportDeadManTimeout(curBlock uint64) (err error) {
goRoutines, err := getGoRoutines(w.client)
if err != nil {
goRoutines = fmt.Sprintf("Error dumping goroutines: %v", err)
}

details := telemetryspec.DeadManTriggeredEventDetails{
Timeout: int64(w.timeout.Seconds()),
CurrentBlock: curBlock,
GoRoutines: goRoutines,
var details telemetryspec.DeadManTriggeredEventDetails
if w.algodConfig.EnableProfiler {
goRoutines, err := getGoRoutines(w.client)
if err != nil {
goRoutines = fmt.Sprintf("Error dumping goroutines: %v", err)
}
details = telemetryspec.DeadManTriggeredEventDetails{
Timeout: int64(w.timeout.Seconds()),
CurrentBlock: curBlock,
GoRoutines: goRoutines,
}
} else {
healthCheck, err := getHealthCheck(w.client)
if err != nil {
healthCheck = fmt.Sprintf("Error performing health check : %v", err)
}
details = telemetryspec.DeadManTriggeredEventDetails{
Timeout: int64(w.timeout.Seconds()),
CurrentBlock: curBlock,
GoRoutines: healthCheck,
}
}
log.EventWithDetails(telemetryspec.HostApplicationState, telemetryspec.DeadManTriggeredEvent, details)

Expand All @@ -117,3 +132,11 @@ func getGoRoutines(client Client) (goRoutines string, err error) {
}
return
}

func getHealthCheck(client Client) (healthCheck string, err error) {
err = client.HealthCheck()
if err == nil {
healthCheck = "Node is healthy"
}
return
}
22 changes: 12 additions & 10 deletions cmd/algoh/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,12 @@ func main() {
reportErrorf("Can't convert data directory's path to absolute, %v\n", dataDir)
}

algodConfig, err := config.LoadConfigFromDisk(absolutePath)

if err != nil && !os.IsNotExist(err) {
log.Fatalf("Cannot load config: %v", err)
}

if _, err := os.Stat(absolutePath); err != nil {
reportErrorf("Data directory %s does not appear to be valid\n", dataDir)
}
Expand All @@ -112,7 +118,7 @@ func main() {

done := make(chan struct{})
log := logging.Base()
configureLogging(genesis, log, absolutePath, done)
configureLogging(genesis, log, absolutePath, done, algodConfig)
defer log.CloseTelemetry()

exeDir, err = util.ExeDir()
Expand Down Expand Up @@ -175,7 +181,7 @@ func main() {

var wg sync.WaitGroup

deadMan := makeDeadManWatcher(algohConfig.DeadManTimeSec, algodClient, algohConfig.UploadOnError, done, &wg)
deadMan := makeDeadManWatcher(algohConfig.DeadManTimeSec, algodClient, algohConfig.UploadOnError, done, &wg, algodConfig)
wg.Add(1)

listeners := []blockListener{deadMan}
Expand Down Expand Up @@ -259,7 +265,7 @@ func getNodeController() nodecontrol.NodeController {
return nc
}

func configureLogging(genesis bookkeeping.Genesis, log logging.Logger, rootPath string, abort chan struct{}) {
func configureLogging(genesis bookkeeping.Genesis, log logging.Logger, rootPath string, abort chan struct{}, algodConfig config.Local) {
log = logging.Base()

liveLog := fmt.Sprintf("%s/host.log", rootPath)
Expand All @@ -272,7 +278,7 @@ func configureLogging(genesis bookkeeping.Genesis, log logging.Logger, rootPath
log.SetJSONFormatter()
log.SetLevel(logging.Debug)

initTelemetry(genesis, log, rootPath, abort)
initTelemetry(genesis, log, rootPath, abort, algodConfig)

// if we have the telemetry enabled, we want to use it's sessionid as part of the
// collected metrics decorations.
Expand All @@ -281,7 +287,7 @@ func configureLogging(genesis bookkeeping.Genesis, log logging.Logger, rootPath
fmt.Fprintln(writer, "++++++++++++++++++++++++++++++++++++++++")
}

func initTelemetry(genesis bookkeeping.Genesis, log logging.Logger, dataDirectory string, abort chan struct{}) {
func initTelemetry(genesis bookkeeping.Genesis, log logging.Logger, dataDirectory string, abort chan struct{}, algodConfig config.Local) {
// Enable telemetry hook in daemon to send logs to cloud
// If ALGOTEST env variable is set, telemetry is disabled - allows disabling telemetry for tests
isTest := os.Getenv("ALGOTEST") != ""
Expand All @@ -304,14 +310,10 @@ func initTelemetry(genesis bookkeeping.Genesis, log logging.Logger, dataDirector
}

if log.GetTelemetryEnabled() {
cfg, err := config.LoadConfigFromDisk(dataDirectory)
if err != nil && !os.IsNotExist(err) {
log.Fatalf("Cannot load config: %v", err)
}

// If the telemetry URI is not set, periodically check SRV records for new telemetry URI
if log.GetTelemetryURI() == "" {
network.StartTelemetryURIUpdateService(time.Minute, cfg, genesis.Network, log, abort)
network.StartTelemetryURIUpdateService(time.Minute, algodConfig, genesis.Network, log, abort)
}

// For privacy concerns, we don't want to provide the full data directory to telemetry.
Expand Down
11 changes: 11 additions & 0 deletions cmd/algoh/mockClient.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ type mockClient struct {
StatusCalls int
BlockCalls map[uint64]int
GetGoRoutinesCalls int
HealthCheckCalls int
error []error
status []generatedV2.NodeStatusResponse
routine []string
Expand Down Expand Up @@ -111,3 +112,13 @@ func (c *mockClient) GetGoRoutines(ctx context.Context) (r string, e error) {
e = c.nextError()
return
}

func (c *mockClient) HealthCheck() (e error) {
c.HealthCheckCalls++
// Repeat last healthcheck...
if len(c.routine) > 1 {
c.routine = c.routine[1:]
}
e = c.nextError()
return
}
109 changes: 109 additions & 0 deletions test/e2e-go/cli/algoh/expect/algohExpectCommon.exp
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Algoh Expect Utility Package
namespace eval ::Algoh {

# Export Procedures

namespace export Abort
namespace export Info
namespace export CreateNetwork

namespace export StopNode
namespace export StartNode


# My Variables
set version 1.0
set description "Algodh Expect Package"

# Variable for the path of the script
variable home [file join [pwd] [file dirname [info script]]]
}

# Definition of the procedure MyProcedure
proc ::Algoh::Info {} {
puts Algoh::description
}

proc ::Algoh::Abort { ERROR } {
puts "Aborting with error: $ERROR"
exit 1
}

package provide Algoh $Algoh::version
package require Tcl 8.0


# Start the network
proc ::Algoh::CreateNetwork { NETWORK_NAME NETWORK_TEMPLATE TEST_ROOT_DIR } {

# Running on ARM64, it seems that network creation is pretty slow.
# 30 second won't be enough here, so I'm changing this to 90 seconds.
set timeout 90

if { [catch {
# Create network
puts "network create $NETWORK_NAME"
spawn goal network create --network $NETWORK_NAME --template $NETWORK_TEMPLATE --rootdir $TEST_ROOT_DIR
expect {
timeout { close; ::Algoh::Abort "Timed out creating network" }
"^Network $NETWORK_NAME created under.*" { puts "Network $NETWORK_NAME created" ; close }
eof { catch wait result; if { [lindex $result 3] != 0 } { puts "Unable to create network"; Algoh::Abort } }
}
} EXCEPTION ] } {
::AlgorandGoal::Abort "ERROR in CreateNetwork: $EXCEPTION"
}
}

proc ::Algoh::StartNode { TEST_ALGO_DIR } {

puts "Primary node start"
# Start node
if { [catch {
spawn goal node start -d $TEST_ALGO_DIR
expect {
timeout { close; ::Algoh::Abort "Timed out starting relay node" }
"^Algorand node successfully started!* { puts "Primary relay node started"; close }
eof { catch wait result; if { [lindex $result 3] != 0 } { puts "Unable to start node"; Algoh::Abort } }
}
} EXCEPTION ] } {
::Algoh::Abort "ERROR in StartNode: $EXCEPTION"
}
}

proc ::Algoh::StopNode { TEST_ALGO_DIR } {
set timeout 15

if { [catch {
puts "node stop with $TEST_ALGO_DIR"
spawn goal node stop -d $TEST_ALGO_DIR
expect {
timeout { close; ::Algoh::Abort "Did not recieve appropriate message during node stop"}
"*The node was successfully stopped.*" {puts "Node stopped successfully"; close}
eof { catch wait result; if { [lindex $result 3] != 0 } { puts "Unable to stop node"; Algoh::Abort } }

}
} EXCEPTION] } {
::Algoh::Abort "ERROR in StopNode: $EXCEPTION"
}
}

# Stop the network
proc ::Algoh::StopNetwork { NETWORK_NAME TEST_ROOT_DIR } {
set timeout 60
set NETWORK_STOP_MESSAGE ""
puts "Stopping network: $NETWORK_NAME"
spawn goal network stop -r $TEST_ROOT_DIR
expect {
timeout {
close
puts "Timed out shutting down network"
puts "TEST_ROOT_DIR $::TEST_ROOT_DIR"
puts "NETWORK_NAME $::NETWORK_NAME"
exit 1
}
"Network Stopped under.*" {set NETWORK_STOP_MESSAGE $expect_out(buffer); close}
eof { catch wait result; if { [lindex $result 3] != 0 } { puts "Unable to stop network"; Algoh::Abort } }

}
puts $NETWORK_STOP_MESSAGE
}
87 changes: 87 additions & 0 deletions test/e2e-go/cli/algoh/expect/algohTimeoutTest.exp
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/usr/bin/expect -f
set err 0
log_user 1



if { [catch {
source algohExpectCommon.exp

set TEST_ALGO_DIR [lindex $argv 0]
set TEST_DATA_DIR [lindex $argv 1]

exec mkdir -p $TEST_ALGO_DIR

set TIME_STAMP [clock seconds]
set timeout 5

set TEST_ROOT_DIR $TEST_ALGO_DIR/root
set TEST_PRIMARY_NODE_DIR $TEST_ROOT_DIR/Primary
set TEST_NODE_DIR $TEST_ROOT_DIR/Node
set NETWORK_NAME test_net_expect_$TIME_STAMP
set NETWORK_TEMPLATE "$TEST_DATA_DIR/nettemplates/TwoNodes50Each.json"
puts "TEST_ALGO_DIR: $TEST_ALGO_DIR"


::Algoh::CreateNetwork $NETWORK_NAME $NETWORK_TEMPLATE $TEST_ROOT_DIR

::Algoh::StartNode $TEST_PRIMARY_NODE_DIR

exec cat ./disabled_profiler_config.json > $TEST_NODE_DIR/config.json
exec cat ./host-config.json > $TEST_NODE_DIR/host-config.json

set PRIMARY_ADDR ""
spawn cat $TEST_ROOT_DIR/Primary/algod-listen.net
expect {
-regexp {http://[0-9\.]+:[0-9]+} { regexp -- {[0-9.]+:[0-9]+} $expect_out(0,string) PRIMARY_ADDR; close;}
timeout {puts "missed our case"; close; exit 1}
}

puts "regex match: $PRIMARY_ADDR"

#start hosted node in the background
spawn $env(GOPATH)/bin/algoh -d $TEST_NODE_DIR -p $PRIMARY_ADDR
expect {
"^Logging to: *" {puts "algoh startup successful"}
timeout {puts "algoh failed to start"; close; exit 1}
}

#allow algoh time to put files in Node dir
spawn sleep 5
set timeout 5
expect {
timeout {puts "algod should be fully running"; close}
}

#wait until Node approves blocks to the network
set timeout 60
spawn $env(GOPATH)/bin/goal node wait -d $TEST_NODE_DIR -w 61
expect {
eof {puts "successfully communicating with relay node"}
"Timed out waiting for node to make progress" {puts "timed out waiting for connection to relay node"; close; exit 1}
timeout {puts "should not reached this case"; close; exit 1}
}


::Algoh::StopNode $TEST_PRIMARY_NODE_DIR


set timeout 201
spawn $env(GOPATH)/bin/goal node wait -d $TEST_NODE_DIR -w 200
expect {
"^Cannot contact Algorand node: open $TEST_NODE_DIR/algod.net: no such file or directory." {puts "ERROR: node shutdown"; close; exit 1}
"^Timed out waiting for node to make progress" {puts "node correctly continued running despite relay shutdown"; close}
timeout {puts "should not reached this case", close; exit 1}
}

::Algoh::StopNetwork $NETWORK_NAME $TEST_ROOT_DIR

exec rm -d -r -f $TEST_ALGO_DIR
puts "Basic Algoh Test Successful"
exit 0
} EXCEPTION ] } {
puts "ERROR in algoh test: $EXCEPTION"

exec rm -d -r -f $TEST_ALGO_DIR
exit 1
}
Loading

0 comments on commit b509cad

Please sign in to comment.