From a4321ac741a58e4b7bf4d2c4cc4dd48cc7d3b80f Mon Sep 17 00:00:00 2001
From: Arthur Kepler <610274+excalq@users.noreply.github.com>
Date: Fri, 21 Apr 2023 03:07:40 -0700
Subject: [PATCH] Add reload timestamps to metrics (#99)

* Use version sort add_metrics_to_readme.sh

* #93: Adds logstash_stats_pipeline_up metrics

* Add newline to EOF

* Adds reload timestamp metrics

* Fixes #96, Last_error field isn't properly defined

* Adds timestamp metrics to test/snapshot/readme

* Remove pipeline up metric

---------

Co-authored-by: Jakub Surdej
---
 README.md                                        |  5 ++++-
 collectors/nodestats/nodestats_collector_test.go |  2 ++
 collectors/nodestats/pipeline_subcollector.go    | 12 ++++++++++--
 scripts/add_metrics_to_readme.sh                 |  2 +-
 scripts/snapshots/metric_names.txt               |  2 ++
 5 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index d4bc1c24..6324a74f 100644
--- a/README.md
+++ b/README.md
@@ -304,10 +304,13 @@ Table of exported metrics:
 | logstash_stats_pipeline_queue_events_queue_size | counter | Number of events that the queue can accommodate |
 | logstash_stats_pipeline_queue_max_size_in_bytes | counter | Maximum size of given queue in bytes. |
 | logstash_stats_pipeline_reloads_failures | counter | Number of failed pipeline reloads. |
+| logstash_stats_pipeline_reloads_last_failure_timestamp | gauge | Timestamp of last failed pipeline reload. |
+| logstash_stats_pipeline_reloads_last_success_timestamp | gauge | Timestamp of last successful pipeline reload. |
 | logstash_stats_pipeline_reloads_successes | counter | Number of successful pipeline reloads. |
-| logstash_stats_process_cpu_load_average_15m | gauge | Total 15m system load average. |
+| logstash_stats_pipeline_up | gauge | Whether the pipeline is up or not. |
 | logstash_stats_process_cpu_load_average_1m | gauge | Total 1m system load average. |
 | logstash_stats_process_cpu_load_average_5m | gauge | Total 5m system load average. |
+| logstash_stats_process_cpu_load_average_15m | gauge | Total 15m system load average. |
 | logstash_stats_process_cpu_percent | gauge | CPU usage of the process. |
 | logstash_stats_process_cpu_total_millis | gauge | Total CPU time used by the process. |
 | logstash_stats_process_max_file_descriptors | gauge | Limit of open file descriptors. |
diff --git a/collectors/nodestats/nodestats_collector_test.go b/collectors/nodestats/nodestats_collector_test.go
index 923ec2b6..67d44950 100644
--- a/collectors/nodestats/nodestats_collector_test.go
+++ b/collectors/nodestats/nodestats_collector_test.go
@@ -77,6 +77,8 @@ func TestCollectNotNil(t *testing.T) {
 		"logstash_stats_pipeline_queue_max_size_in_bytes",
 		"logstash_stats_pipeline_reloads_failures",
 		"logstash_stats_pipeline_reloads_successes",
+		"logstash_stats_pipeline_reloads_last_success_timestamp",
+		"logstash_stats_pipeline_reloads_last_failure_timestamp",
 		"logstash_stats_process_cpu_percent",
 		"logstash_stats_process_cpu_total_millis",
 		"logstash_stats_process_cpu_load_average_1m",
diff --git a/collectors/nodestats/pipeline_subcollector.go b/collectors/nodestats/pipeline_subcollector.go
index f6d41d2b..f2c3f236 100644
--- a/collectors/nodestats/pipeline_subcollector.go
+++ b/collectors/nodestats/pipeline_subcollector.go
@@ -43,6 +43,9 @@ func NewPipelineSubcollector() *PipelineSubcollector {
 		ReloadsSuccesses: descHelper.NewDescWithHelpAndLabel("reloads_successes", "Number of successful pipeline reloads.", "pipeline_id"),
 		ReloadsFailures:  descHelper.NewDescWithHelpAndLabel("reloads_failures", "Number of failed pipeline reloads.", "pipeline_id"),
 
+		ReloadsLastSuccessTimestamp: descHelper.NewDescWithHelpAndLabel("reloads_last_success_timestamp", "Timestamp of last successful pipeline reload.", "pipeline_id"),
+		ReloadsLastFailureTimestamp: descHelper.NewDescWithHelpAndLabel("reloads_last_failure_timestamp", "Timestamp of last failed pipeline reload.", "pipeline_id"),
+
 		QueueEventsCount:         descHelper.NewDescWithHelpAndLabel("queue_events_count", "Number of events in the queue.", "pipeline_id"),
 		QueueEventsQueueSize:     descHelper.NewDescWithHelpAndLabel("queue_events_queue_size", "Number of events that the queue can accommodate", "pipeline_id"),
 		QueueMaxQueueSizeInBytes: descHelper.NewDescWithHelpAndLabel("queue_max_size_in_bytes", "Maximum size of given queue in bytes.", "pipeline_id"),
@@ -59,11 +62,16 @@ func (collector *PipelineSubcollector) Collect(pipeStats *responses.SinglePipeli
 	ch <- prometheus.MustNewConstMetric(collector.EventsDuration, prometheus.CounterValue, float64(pipeStats.Events.DurationInMillis), pipelineID)
 	ch <- prometheus.MustNewConstMetric(collector.EventsQueuePushDuration, prometheus.CounterValue, float64(pipeStats.Events.QueuePushDurationInMillis), pipelineID)
 
-	// todo: add restart timestamps
-
 	ch <- prometheus.MustNewConstMetric(collector.ReloadsSuccesses, prometheus.CounterValue, float64(pipeStats.Reloads.Successes), pipelineID)
 	ch <- prometheus.MustNewConstMetric(collector.ReloadsFailures, prometheus.CounterValue, float64(pipeStats.Reloads.Failures), pipelineID)
+	if pipeStats.Reloads.LastSuccessTimestamp != nil {
+		ch <- prometheus.NewMetricWithTimestamp(*pipeStats.Reloads.LastSuccessTimestamp, prometheus.MustNewConstMetric(collector.ReloadsLastSuccessTimestamp, prometheus.GaugeValue, 1, pipelineID))
+	}
+	if pipeStats.Reloads.LastFailureTimestamp != nil {
+		ch <- prometheus.NewMetricWithTimestamp(*pipeStats.Reloads.LastFailureTimestamp, prometheus.MustNewConstMetric(collector.ReloadsLastFailureTimestamp, prometheus.GaugeValue, 1, pipelineID))
+	}
+
 	ch <- prometheus.MustNewConstMetric(collector.QueueEventsCount, prometheus.CounterValue, float64(pipeStats.Queue.EventsCount), pipelineID)
 	ch <- prometheus.MustNewConstMetric(collector.QueueEventsQueueSize, prometheus.CounterValue, float64(pipeStats.Queue.QueueSizeInBytes), pipelineID)
 	ch <- prometheus.MustNewConstMetric(collector.QueueMaxQueueSizeInBytes, prometheus.CounterValue, float64(pipeStats.Queue.MaxQueueSizeInBytes), pipelineID)
diff --git a/scripts/add_metrics_to_readme.sh b/scripts/add_metrics_to_readme.sh
index bf098c1d..51c5482c 100755
--- a/scripts/add_metrics_to_readme.sh
+++ b/scripts/add_metrics_to_readme.sh
@@ -17,7 +17,7 @@ endLine=$(grep -n "^" $FILE | awk -F: '{print $1}')
 
 metricsTable=$(echo "| Name | Type | Description |
 | ----------- | ----------- | ----------- |
-$(getMetrics | sort)")
+$(getMetrics | sort --version-sort)")
 
 for ((i=0; i<${#LINES[@]}; i++)); do
     if [ $i -eq $startLine ]; then
diff --git a/scripts/snapshots/metric_names.txt b/scripts/snapshots/metric_names.txt
index 5afe3953..4ee321c6 100644
--- a/scripts/snapshots/metric_names.txt
+++ b/scripts/snapshots/metric_names.txt
@@ -24,6 +24,8 @@ logstash_stats_pipeline_queue_events_queue_size
 logstash_stats_pipeline_queue_max_size_in_bytes
 logstash_stats_pipeline_reloads_failures
 logstash_stats_pipeline_reloads_successes
+logstash_stats_pipeline_reloads_last_success_timestamp
+logstash_stats_pipeline_reloads_last_failure_timestamp
 logstash_stats_process_cpu_percent
 logstash_stats_process_cpu_total_millis
 logstash_stats_process_cpu_load_average_1m
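
For readers unfamiliar with the pattern used in pipeline_subcollector.go above: prometheus.NewMetricWithTimestamp from the Prometheus Go client wraps an already-built metric so that the exposed sample carries an explicit timestamp (here the reload time reported by the Logstash API) instead of the scrape time, while the gauge value itself is fixed at 1. What follows is a minimal standalone sketch of that pattern, assuming a hand-built descriptor and a hypothetical "main" pipeline_id label; the exporter itself builds its descriptors through descHelper.NewDescWithHelpAndLabel as shown in the diff.

package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
)

func main() {
	// Hand-built descriptor for illustration only; the exporter derives its
	// descriptors via descHelper.NewDescWithHelpAndLabel instead.
	desc := prometheus.NewDesc(
		"logstash_stats_pipeline_reloads_last_success_timestamp",
		"Timestamp of last successful pipeline reload.",
		[]string{"pipeline_id"}, nil,
	)

	// Pretend the Logstash node stats API reported this reload time.
	lastReload := time.Date(2023, 4, 21, 3, 7, 40, 0, time.UTC)

	// Same pattern as the patch: a constant gauge with value 1, wrapped so the
	// exposed sample carries the reload time rather than the scrape time.
	metric := prometheus.NewMetricWithTimestamp(
		lastReload,
		prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, 1, "main"),
	)

	// Inspect the resulting sample; in the exporter the metric is sent on the
	// collector's channel (ch <- metric) instead of being printed.
	var out dto.Metric
	if err := metric.Write(&out); err != nil {
		panic(err)
	}
	fmt.Printf("timestamp_ms=%d value=%v\n", out.GetTimestampMs(), out.GetGauge().GetValue())
}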