Add taskrun gauge metrics for k8s throttling because of defined resource quotas or k8s live node constraints

This commit adds two new experimental gauge metrics that count the number of TaskRuns whose
underlying Pods are currently not scheduled to run by Kubernetes:
- one metric counts TaskRuns whose Pods cannot be scheduled because Kubernetes ResourceQuota policies within the Namespace prevent it
- a second metric counts TaskRuns whose Pods cannot be scheduled because Node-level CPU or Memory utilization leaves no room for them
See the sketch after the change summary below for the TaskRun condition these metrics key on.
gabemontero authored and tekton-robot committed Jun 5, 2023
1 parent 74b2f11 commit 4924b51
Showing 3 changed files with 131 additions and 10 deletions.
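For context (this sketch is not part of the commit): while Kubernetes refuses to schedule a TaskRun's Pod, Tekton's pod package reports that state on the TaskRun's Succeeded condition, which stays Unknown with a throttling reason. The new gauges key on exactly that shape; the package name below is hypothetical.

package example

import (
	corev1 "k8s.io/api/core/v1"
	"knative.dev/pkg/apis"

	"github.com/tektoncd/pipeline/pkg/pod"
)

// throttledCondition is the condition shape the new gauges look for on a
// running TaskRun: Succeeded is still Unknown, and the reason names the
// constraint that is holding the Pod back.
var throttledCondition = apis.Condition{
	Type:   apis.ConditionSucceeded,
	Status: corev1.ConditionUnknown,
	Reason: pod.ReasonExceededResourceQuota, // or pod.ReasonExceededNodeResources
}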
2 changes: 2 additions & 0 deletions docs/metrics.md
@@ -20,6 +20,8 @@ We expose several kinds of exporters, including Prometheus, Google Stackdriver,
| `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt; | experimental |
| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=&lt;status&gt; | experimental |
| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | experimental |
| `tekton_pipelines_controller_taskruns_pod_latency` | Gauge | `namespace`=&lt;taskruns-namespace&gt; <br> `pod`= &lt; taskrun_pod_name&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> | experimental |
| `tekton_pipelines_controller_cloudevent_count` | Counter | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt;| experimental |
| `tekton_pipelines_controller_client_latency_[bucket, sum, count]` | Histogram | | experimental |
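The table rows above give the fully qualified metric names. As a rough illustration only, and not how the Tekton controller itself wires exporters (it configures them through knative.dev/pkg/metrics and the config-observability ConfigMap), a standalone OpenCensus Prometheus exporter using the same namespace would expose the two new LastValue views as gauges under those names:

package main

import (
	"log"
	"net/http"

	ocprom "contrib.go.opencensus.io/exporter/prometheus"
	"go.opencensus.io/stats/view"
)

func main() {
	// Hypothetical standalone wiring: export registered OpenCensus views
	// with the same namespace prefix the controller uses.
	pe, err := ocprom.NewExporter(ocprom.Options{Namespace: "tekton_pipelines_controller"})
	if err != nil {
		log.Fatalf("creating Prometheus exporter: %v", err)
	}
	view.RegisterExporter(pe)

	// Scraping /metrics would then show, among others:
	//   tekton_pipelines_controller_running_taskruns_throttled_by_quota_count
	//   tekton_pipelines_controller_running_taskruns_throttled_by_node_count
	http.Handle("/metrics", pe)
	log.Fatal(http.ListenAndServe(":9090", nil))
}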
57 changes: 48 additions & 9 deletions pkg/taskrunmetrics/metrics.go
@@ -27,6 +27,7 @@ import (
"github.com/tektoncd/pipeline/pkg/apis/pipeline"
"github.com/tektoncd/pipeline/pkg/apis/pipeline/v1beta1"
listers "github.com/tektoncd/pipeline/pkg/client/listers/pipeline/v1beta1"
"github.com/tektoncd/pipeline/pkg/pod"
"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
@@ -51,12 +52,14 @@ var (
statusTag = tag.MustNewKey("status")
podTag = tag.MustNewKey("pod")

trDurationView *view.View
prTRDurationView *view.View
trCountView *view.View
runningTRsCountView *view.View
podLatencyView *view.View
cloudEventsView *view.View
trDurationView *view.View
prTRDurationView *view.View
trCountView *view.View
runningTRsCountView *view.View
runningTRsThrottledByQuotaCountView *view.View
runningTRsThrottledByNodeCountView *view.View
podLatencyView *view.View
cloudEventsView *view.View

trDuration = stats.Float64(
"taskrun_duration_seconds",
@@ -76,6 +79,14 @@ var (
"Number of taskruns executing currently",
stats.UnitDimensionless)

runningTRsThrottledByQuotaCount = stats.Float64("running_taskruns_throttled_by_quota_count",
"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of defined ResourceQuotas. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
stats.UnitDimensionless)

runningTRsThrottledByNodeCount = stats.Float64("running_taskruns_throttled_by_node_count",
"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of Node level constraints. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
stats.UnitDimensionless)

podLatency = stats.Float64("taskruns_pod_latency",
"scheduling latency for the taskruns pods",
stats.UnitMilliseconds)
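These measures follow the same OpenCensus pattern as the existing running_taskruns_count gauge: a Float64 measure paired, further down, with a view that uses LastValue aggregation, so the exported value is simply whatever was recorded last. A minimal, self-contained sketch of that pattern, with hypothetical names:

package main

import (
	"context"
	"log"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
)

func main() {
	// A Float64 measure plus a LastValue view behaves as a gauge.
	throttled := stats.Float64("example_throttled_count",
		"Example gauge for currently throttled work items",
		stats.UnitDimensionless)

	v := &view.View{
		Description: throttled.Description(),
		Measure:     throttled,
		Aggregation: view.LastValue(),
	}
	if err := view.Register(v); err != nil {
		log.Fatalf("registering view: %v", err)
	}

	// Each Record overwrites the previous value; recording 0 is what lets
	// the gauge fall back once nothing is throttled any more.
	stats.Record(context.Background(), throttled.M(3))
	stats.Record(context.Background(), throttled.M(0))
}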
@@ -203,6 +214,16 @@ func viewRegister(cfg *config.Metrics) error {
Measure: runningTRsCount,
Aggregation: view.LastValue(),
}
runningTRsThrottledByQuotaCountView = &view.View{
Description: runningTRsThrottledByQuotaCount.Description(),
Measure: runningTRsThrottledByQuotaCount,
Aggregation: view.LastValue(),
}
runningTRsThrottledByNodeCountView = &view.View{
Description: runningTRsThrottledByNodeCount.Description(),
Measure: runningTRsThrottledByNodeCount,
Aggregation: view.LastValue(),
}
podLatencyView = &view.View{
Description: podLatency.Description(),
Measure: podLatency,
@@ -220,6 +241,8 @@ func viewRegister(cfg *config.Metrics) error {
prTRDurationView,
trCountView,
runningTRsCountView,
runningTRsThrottledByQuotaCountView,
runningTRsThrottledByNodeCountView,
podLatencyView,
cloudEventsView,
)
@@ -231,6 +254,8 @@ func viewUnregister() {
prTRDurationView,
trCountView,
runningTRsCountView,
runningTRsThrottledByQuotaCountView,
runningTRsThrottledByNodeCountView,
podLatencyView,
cloudEventsView,
)
@@ -344,9 +369,21 @@ func (r *Recorder) RunningTaskRuns(ctx context.Context, lister listers.TaskRunLi
}

var runningTrs int
var trsThrottledByQuota int
var trsThrottledByNode int
for _, pr := range trs {
if !pr.IsDone() {
runningTrs++
if pr.IsDone() {
continue
}
runningTrs++
succeedCondition := pr.Status.GetCondition(apis.ConditionSucceeded)
if succeedCondition != nil && succeedCondition.Status == corev1.ConditionUnknown {
switch succeedCondition.Reason {
case pod.ReasonExceededResourceQuota:
trsThrottledByQuota++
case pod.ReasonExceededNodeResources:
trsThrottledByNode++
}
}
}
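Since the rendered diff interleaves the removed and added lines of this loop, here is the resulting counting logic pulled out into a small standalone helper (the helper name and package are hypothetical; the logic mirrors the hunk above): only TaskRuns whose Succeeded condition is still Unknown contribute to the throttled gauges, classified by the reason set by pkg/pod.

package taskrunmetricsexample

import (
	corev1 "k8s.io/api/core/v1"
	"knative.dev/pkg/apis"

	"github.com/tektoncd/pipeline/pkg/apis/pipeline/v1beta1"
	"github.com/tektoncd/pipeline/pkg/pod"
)

// countRunningAndThrottled mirrors the loop added in this commit: it tallies
// running TaskRuns and, for those whose Succeeded condition is still Unknown,
// classifies the throttling reason reported by pkg/pod.
func countRunningAndThrottled(trs []*v1beta1.TaskRun) (running, byQuota, byNode int) {
	for _, tr := range trs {
		if tr.IsDone() {
			continue
		}
		running++
		cond := tr.Status.GetCondition(apis.ConditionSucceeded)
		if cond == nil || cond.Status != corev1.ConditionUnknown {
			continue
		}
		switch cond.Reason {
		case pod.ReasonExceededResourceQuota: // blocked by a Namespace ResourceQuota
			byQuota++
		case pod.ReasonExceededNodeResources: // blocked by Node-level CPU/memory pressure
			byNode++
		}
	}
	return running, byQuota, byNode
}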

@@ -355,6 +392,8 @@ func (r *Recorder) RunningTaskRuns(ctx context.Context, lister listers.TaskRunLi
return err
}
metrics.Record(ctx, runningTRsCount.M(float64(runningTrs)))
metrics.Record(ctx, runningTRsThrottledByNodeCount.M(float64(trsThrottledByNode)))
metrics.Record(ctx, runningTRsThrottledByQuotaCount.M(float64(trsThrottledByQuota)))

return nil
}
@@ -374,7 +413,7 @@ func (r *Recorder) ReportRunningTaskRuns(ctx context.Context, lister listers.Tas
return

case <-delay.C:
// Every 30s surface a metric for the number of running tasks.
// Every 30s surface a metric for the number of running tasks, as well as those running tasks that are currently throttled by k8s.
if err := r.RunningTaskRuns(ctx, lister); err != nil {
logger.Warnf("Failed to log the metrics : %v", err)
}
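The hunk above only touches the comment inside the reporting loop; for readers unfamiliar with it, the surrounding pattern is a periodic re-computation of these gauges until the context is cancelled. A generic sketch of that pattern (not the controller's exact code), with hypothetical names:

package taskrunmetricsexample

import (
	"context"
	"log"
	"time"
)

// reportEvery re-derives gauge values on a fixed interval until ctx is
// cancelled, so counts like the throttled gauges track current cluster state.
func reportEvery(ctx context.Context, interval time.Duration, report func(context.Context) error) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			if err := report(ctx); err != nil {
				log.Printf("failed to report metrics: %v", err)
			}
		}
	}
}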
82 changes: 81 additions & 1 deletion pkg/taskrunmetrics/metrics_test.go
@@ -28,6 +28,7 @@ import (
"github.com/tektoncd/pipeline/pkg/apis/pipeline/v1beta1"
faketaskruninformer "github.com/tektoncd/pipeline/pkg/client/injection/informers/pipeline/v1beta1/taskrun/fake"
"github.com/tektoncd/pipeline/pkg/names"
"github.com/tektoncd/pipeline/pkg/pod"
ttesting "github.com/tektoncd/pipeline/pkg/reconciler/testing"
"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
@@ -418,6 +419,85 @@ func TestRecordRunningTaskRunsCount(t *testing.T) {
metricstest.CheckLastValueData(t, "running_taskruns_count", map[string]string{}, 1)
}

func TestRecordRunningTaskRunsThrottledCounts(t *testing.T) {
for _, tc := range []struct {
status corev1.ConditionStatus
reason string
nodeCount float64
quotaCount float64
}{
{
status: corev1.ConditionTrue,
reason: "",
},
{
status: corev1.ConditionTrue,
reason: pod.ReasonExceededResourceQuota,
},
{
status: corev1.ConditionTrue,
reason: pod.ReasonExceededNodeResources,
},
{
status: corev1.ConditionFalse,
reason: "",
},
{
status: corev1.ConditionFalse,
reason: pod.ReasonExceededResourceQuota,
},
{
status: corev1.ConditionFalse,
reason: pod.ReasonExceededNodeResources,
},
{
status: corev1.ConditionUnknown,
reason: "",
},
{
status: corev1.ConditionUnknown,
reason: pod.ReasonExceededResourceQuota,
quotaCount: 1,
},
{
status: corev1.ConditionUnknown,
reason: pod.ReasonExceededNodeResources,
nodeCount: 1,
},
} {
unregisterMetrics()
tr := &v1beta1.TaskRun{
ObjectMeta: metav1.ObjectMeta{Name: names.SimpleNameGenerator.RestrictLengthWithRandomSuffix("taskrun-")},
Status: v1beta1.TaskRunStatus{
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: apis.ConditionSucceeded,
Status: tc.status,
Reason: tc.reason,
}},
},
},
}
ctx, _ := ttesting.SetupFakeContext(t)
informer := faketaskruninformer.Get(ctx)
if err := informer.Informer().GetIndexer().Add(tr); err != nil {
t.Fatalf("Adding TaskRun to informer: %v", err)
}

ctx = getConfigContext()
metrics, err := NewRecorder(ctx)
if err != nil {
t.Fatalf("NewRecorder: %v", err)
}

if err := metrics.RunningTaskRuns(ctx, informer.Lister()); err != nil {
t.Errorf("RunningTaskRuns: %v", err)
}
metricstest.CheckLastValueData(t, "running_taskruns_throttled_by_quota_count", map[string]string{}, tc.quotaCount)
metricstest.CheckLastValueData(t, "running_taskruns_throttled_by_node_count", map[string]string{}, tc.nodeCount)
}
}

func TestRecordPodLatency(t *testing.T) {
creationTime := metav1.Now()

@@ -685,7 +765,7 @@ func TestRecordCloudEvents(t *testing.T) {
}

func unregisterMetrics() {
metricstest.Unregister("taskrun_duration_seconds", "pipelinerun_taskrun_duration_seconds", "taskrun_count", "running_taskruns_count", "taskruns_pod_latency", "cloudevent_count")
metricstest.Unregister("taskrun_duration_seconds", "pipelinerun_taskrun_duration_seconds", "taskrun_count", "running_taskruns_count", "running_taskruns_throttled_by_quota_count", "running_taskruns_throttled_by_node_count", "taskruns_pod_latency", "cloudevent_count")

// Allow the recorder singleton to be recreated.
once = sync.Once{}
