Add taskrun gauge metrics for k8s throttling because of defined resource quotas or k8s live node constraints

This commit adds two new experimental gauge metrics that count the number of TaskRuns whose
underlying Pods are currently not scheduled to run by Kubernetes:
- one metric counts TaskRuns whose Pods cannot be scheduled because Kubernetes ResourceQuota policies within the Namespace prevent it
- a second metric counts TaskRuns whose Pods cannot be scheduled because Node-level CPU or Memory utilization leaves no room for them
See the sketch after the change summary below for the TaskRun condition these metrics key on.
gabemontero authored and tekton-robot committed Jun 5, 2023
1 parent 74b2f11 commit 4924b51
Showing 3 changed files with 131 additions and 10 deletions.
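For context (this sketch is not part of the commit): while Kubernetes refuses to schedule a TaskRun's Pod, Tekton's pod package reports that state on the TaskRun's Succeeded condition, which stays Unknown with a throttling reason. The new gauges key on exactly that shape; the package name below is hypothetical.

package example

import (
	corev1 "k8s.io/api/core/v1"
	"knative.dev/pkg/apis"

	"github.com/tektoncd/pipeline/pkg/pod"
)

// throttledCondition is the condition shape the new gauges look for on a
// running TaskRun: Succeeded is still Unknown, and the reason names the
// constraint that is holding the Pod back.
var throttledCondition = apis.Condition{
	Type:   apis.ConditionSucceeded,
	Status: corev1.ConditionUnknown,
	Reason: pod.ReasonExceededResourceQuota, // or pod.ReasonExceededNodeResources
}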
2 changes: 2 additions & 0 deletions docs/metrics.md
@@ -20,6 +20,8 @@ We expose several kinds of exporters, including Prometheus, Google Stackdriver,
| `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt; | experimental |
| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=&lt;status&gt; | experimental |
| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | experimental |
| `tekton_pipelines_controller_taskruns_pod_latency` | Gauge | `namespace`=&lt;taskruns-namespace&gt; <br> `pod`= &lt; taskrun_pod_name&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> | experimental |
| `tekton_pipelines_controller_cloudevent_count` | Counter | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt;| experimental |
| `tekton_pipelines_controller_client_latency_[bucket, sum, count]` | Histogram | | experimental |
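The table rows above give the fully qualified metric names. As a rough illustration only, and not how the Tekton controller itself wires exporters (it configures them through knative.dev/pkg/metrics and the config-observability ConfigMap), a standalone OpenCensus Prometheus exporter using the same namespace would expose the two new LastValue views as gauges under those names:

package main

import (
	"log"
	"net/http"

	ocprom "contrib.go.opencensus.io/exporter/prometheus"
	"go.opencensus.io/stats/view"
)

func main() {
	// Hypothetical standalone wiring: export registered OpenCensus views
	// with the same namespace prefix the controller uses.
	pe, err := ocprom.NewExporter(ocprom.Options{Namespace: "tekton_pipelines_controller"})
	if err != nil {
		log.Fatalf("creating Prometheus exporter: %v", err)
	}
	view.RegisterExporter(pe)

	// Scraping /metrics would then show, among others:
	//   tekton_pipelines_controller_running_taskruns_throttled_by_quota_count
	//   tekton_pipelines_controller_running_taskruns_throttled_by_node_count
	http.Handle("/metrics", pe)
	log.Fatal(http.ListenAndServe(":9090", nil))
}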
57 changes: 48 additions & 9 deletions pkg/taskrunmetrics/metrics.go
@@ -27,6 +27,7 @@ import (
"github.com/tektoncd/pipeline/pkg/apis/pipeline"
"github.com/tektoncd/pipeline/pkg/apis/pipeline/v1beta1"
listers "github.com/tektoncd/pipeline/pkg/client/listers/pipeline/v1beta1"
"github.com/tektoncd/pipeline/pkg/pod"
"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
@@ -51,12 +52,14 @@ var (
statusTag = tag.MustNewKey("status")
podTag = tag.MustNewKey("pod")

trDurationView *view.View
prTRDurationView *view.View
trCountView *view.View
runningTRsCountView *view.View
podLatencyView *view.View
cloudEventsView *view.View
trDurationView *view.View
prTRDurationView *view.View
trCountView *view.View
runningTRsCountView *view.View
runningTRsThrottledByQuotaCountView *view.View
runningTRsThrottledByNodeCountView *view.View
podLatencyView *view.View
cloudEventsView *view.View

trDuration = stats.Float64(
"taskrun_duration_seconds",
@@ -76,6 +79,14 @@ var (
"Number of taskruns executing currently",
stats.UnitDimensionless)

runningTRsThrottledByQuotaCount = stats.Float64("running_taskruns_throttled_by_quota_count",
"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of defined ResourceQuotas. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
stats.UnitDimensionless)

runningTRsThrottledByNodeCount = stats.Float64("running_taskruns_throttled_by_node_count",
"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of Node level constraints. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
stats.UnitDimensionless)

podLatency = stats.Float64("taskruns_pod_latency",
"scheduling latency for the taskruns pods",
stats.UnitMilliseconds)
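These measures follow the same OpenCensus pattern as the existing running_taskruns_count gauge: a Float64 measure paired, further down, with a view that uses LastValue aggregation, so the exported value is simply whatever was recorded last. A minimal, self-contained sketch of that pattern, with hypothetical names:

package main

import (
	"context"
	"log"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
)

func main() {
	// A Float64 measure plus a LastValue view behaves as a gauge.
	throttled := stats.Float64("example_throttled_count",
		"Example gauge for currently throttled work items",
		stats.UnitDimensionless)

	v := &view.View{
		Description: throttled.Description(),
		Measure:     throttled,
		Aggregation: view.LastValue(),
	}
	if err := view.Register(v); err != nil {
		log.Fatalf("registering view: %v", err)
	}

	// Each Record overwrites the previous value; recording 0 is what lets
	// the gauge fall back once nothing is throttled any more.
	stats.Record(context.Background(), throttled.M(3))
	stats.Record(context.Background(), throttled.M(0))
}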
@@ -203,6 +214,16 @@ func viewRegister(cfg *config.Metrics) error {
Measure: runningTRsCount,
Aggregation: view.LastValue(),
}
runningTRsThrottledByQuotaCountView = &view.View{
Description: runningTRsThrottledByQuotaCount.Description(),
Measure: runningTRsThrottledByQuotaCount,
Aggregation: view.LastValue(),
}
runningTRsThrottledByNodeCountView = &view.View{
Description: runningTRsThrottledByNodeCount.Description(),
Measure: runningTRsThrottledByNodeCount,
Aggregation: view.LastValue(),
}
podLatencyView = &view.View{
Description: podLatency.Description(),
Measure: podLatency,
@@ -220,6 +241,8 @@ func viewRegister(cfg *config.Metrics) error {
prTRDurationView,
trCountView,
runningTRsCountView,
runningTRsThrottledByQuotaCountView,
runningTRsThrottledByNodeCountView,
podLatencyView,
cloudEventsView,
)
@@ -231,6 +254,8 @@ func viewUnregister() {
prTRDurationView,
trCountView,
runningTRsCountView,
runningTRsThrottledByQuotaCountView,
runningTRsThrottledByNodeCountView,
podLatencyView,
cloudEventsView,
)
@@ -344,9 +369,21 @@ func (r *Recorder) RunningTaskRuns(ctx context.Context, lister listers.TaskRunLi
}

var runningTrs int
var trsThrottledByQuota int
var trsThrottledByNode int
for _, pr := range trs {
if !pr.IsDone() {
runningTrs++
if pr.IsDone() {
continue
}
runningTrs++
succeedCondition := pr.Status.GetCondition(apis.ConditionSucceeded)
if succeedCondition != nil && succeedCondition.Status == corev1.ConditionUnknown {
switch succeedCondition.Reason {
case pod.ReasonExceededResourceQuota:
trsThrottledByQuota++
case pod.ReasonExceededNodeResources:
trsThrottledByNode++
}
}
}
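Since the rendered diff interleaves the removed and added lines of this loop, here is the resulting counting logic pulled out into a small standalone helper (the helper name and package are hypothetical; the logic mirrors the hunk above): only TaskRuns whose Succeeded condition is still Unknown contribute to the throttled gauges, classified by the reason set by pkg/pod.

package taskrunmetricsexample

import (
	corev1 "k8s.io/api/core/v1"
	"knative.dev/pkg/apis"

	"github.com/tektoncd/pipeline/pkg/apis/pipeline/v1beta1"
	"github.com/tektoncd/pipeline/pkg/pod"
)

// countRunningAndThrottled mirrors the loop added in this commit: it tallies
// running TaskRuns and, for those whose Succeeded condition is still Unknown,
// classifies the throttling reason reported by pkg/pod.
func countRunningAndThrottled(trs []*v1beta1.TaskRun) (running, byQuota, byNode int) {
	for _, tr := range trs {
		if tr.IsDone() {
			continue
		}
		running++
		cond := tr.Status.GetCondition(apis.ConditionSucceeded)
		if cond == nil || cond.Status != corev1.ConditionUnknown {
			continue
		}
		switch cond.Reason {
		case pod.ReasonExceededResourceQuota: // blocked by a Namespace ResourceQuota
			byQuota++
		case pod.ReasonExceededNodeResources: // blocked by Node-level CPU/memory pressure
			byNode++
		}
	}
	return running, byQuota, byNode
}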

@@ -355,6 +392,8 @@ func (r *Recorder) RunningTaskRuns(ctx context.Context, lister listers.TaskRunLi
return err
}
metrics.Record(ctx, runningTRsCount.M(float64(runningTrs)))
metrics.Record(ctx, runningTRsThrottledByNodeCount.M(float64(trsThrottledByNode)))
metrics.Record(ctx, runningTRsThrottledByQuotaCount.M(float64(trsThrottledByQuota)))

return nil
}
@@ -374,7 +413,7 @@ func (r *Recorder) ReportRunningTaskRuns(ctx context.Context, lister listers.Tas
return

case <-delay.C:
// Every 30s surface a metric for the number of running tasks.
// Every 30s surface a metric for the number of running tasks, as well as those running tasks that are currently throttled by k8s.
if err := r.RunningTaskRuns(ctx, lister); err != nil {
logger.Warnf("Failed to log the metrics : %v", err)
}
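The hunk above only touches the comment inside the reporting loop; for readers unfamiliar with it, the surrounding pattern is a periodic re-computation of these gauges until the context is cancelled. A generic sketch of that pattern (not the controller's exact code), with hypothetical names:

package taskrunmetricsexample

import (
	"context"
	"log"
	"time"
)

// reportEvery re-derives gauge values on a fixed interval until ctx is
// cancelled, so counts like the throttled gauges track current cluster state.
func reportEvery(ctx context.Context, interval time.Duration, report func(context.Context) error) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			if err := report(ctx); err != nil {
				log.Printf("failed to report metrics: %v", err)
			}
		}
	}
}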
82 changes: 81 additions & 1 deletion pkg/taskrunmetrics/metrics_test.go
@@ -28,6 +28,7 @@ import (
"github.com/tektoncd/pipeline/pkg/apis/pipeline/v1beta1"
faketaskruninformer "github.com/tektoncd/pipeline/pkg/client/injection/informers/pipeline/v1beta1/taskrun/fake"
"github.com/tektoncd/pipeline/pkg/names"
"github.com/tektoncd/pipeline/pkg/pod"
ttesting "github.com/tektoncd/pipeline/pkg/reconciler/testing"
"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
@@ -418,6 +419,85 @@ func TestRecordRunningTaskRunsCount(t *testing.T) {
metricstest.CheckLastValueData(t, "running_taskruns_count", map[string]string{}, 1)
}

func TestRecordRunningTaskRunsThrottledCounts(t *testing.T) {
for _, tc := range []struct {
status corev1.ConditionStatus
reason string
nodeCount float64
quotaCount float64
}{
{
status: corev1.ConditionTrue,
reason: "",
},
{
status: corev1.ConditionTrue,
reason: pod.ReasonExceededResourceQuota,
},
{
status: corev1.ConditionTrue,
reason: pod.ReasonExceededNodeResources,
},
{
status: corev1.ConditionFalse,
reason: "",
},
{
status: corev1.ConditionFalse,
reason: pod.ReasonExceededResourceQuota,
},
{
status: corev1.ConditionFalse,
reason: pod.ReasonExceededNodeResources,
},
{
status: corev1.ConditionUnknown,
reason: "",
},
{
status: corev1.ConditionUnknown,
reason: pod.ReasonExceededResourceQuota,
quotaCount: 1,
},
{
status: corev1.ConditionUnknown,
reason: pod.ReasonExceededNodeResources,
nodeCount: 1,
},
} {
unregisterMetrics()
tr := &v1beta1.TaskRun{
ObjectMeta: metav1.ObjectMeta{Name: names.SimpleNameGenerator.RestrictLengthWithRandomSuffix("taskrun-")},
Status: v1beta1.TaskRunStatus{
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: apis.ConditionSucceeded,
Status: tc.status,
Reason: tc.reason,
}},
},
},
}
ctx, _ := ttesting.SetupFakeContext(t)
informer := faketaskruninformer.Get(ctx)
if err := informer.Informer().GetIndexer().Add(tr); err != nil {
t.Fatalf("Adding TaskRun to informer: %v", err)
}

ctx = getConfigContext()
metrics, err := NewRecorder(ctx)
if err != nil {
t.Fatalf("NewRecorder: %v", err)
}

if err := metrics.RunningTaskRuns(ctx, informer.Lister()); err != nil {
t.Errorf("RunningTaskRuns: %v", err)
}
metricstest.CheckLastValueData(t, "running_taskruns_throttled_by_quota_count", map[string]string{}, tc.quotaCount)
metricstest.CheckLastValueData(t, "running_taskruns_throttled_by_node_count", map[string]string{}, tc.nodeCount)
}
}

func TestRecordPodLatency(t *testing.T) {
creationTime := metav1.Now()

@@ -685,7 +765,7 @@ func TestRecordCloudEvents(t *testing.T) {
}

func unregisterMetrics() {
metricstest.Unregister("taskrun_duration_seconds", "pipelinerun_taskrun_duration_seconds", "taskrun_count", "running_taskruns_count", "taskruns_pod_latency", "cloudevent_count")
metricstest.Unregister("taskrun_duration_seconds", "pipelinerun_taskrun_duration_seconds", "taskrun_count", "running_taskruns_count", "running_taskruns_throttled_by_quota_count", "running_taskruns_throttled_by_node_count", "taskruns_pod_latency", "cloudevent_count")

// Allow the recorder singleton to be recreated.
once = sync.Once{}
