From 46dd680739c900064e5fd098b085aec20f276d90 Mon Sep 17 00:00:00 2001 From: Mustafa Demirhan <4033879+mdemirhan@users.noreply.github.com> Date: Tue, 18 Dec 2018 03:11:34 +0000 Subject: [PATCH] Export reconciler metrics. (#2742) --- cmd/controller/main.go | 3 + .../100-grafana-dash-knative-reconciler.yaml | 622 ++++++++++++++++++ .../metrics/prometheus/100-grafana.yaml | 11 + .../100-prometheus-scrape-config.yaml | 27 + 4 files changed, 663 insertions(+) create mode 100644 config/monitoring/metrics/prometheus/100-grafana-dash-knative-reconciler.yaml diff --git a/cmd/controller/main.go b/cmd/controller/main.go index 06033e6e0193..588c05460506 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -41,6 +41,7 @@ import ( clientset "github.com/knative/serving/pkg/client/clientset/versioned" informers "github.com/knative/serving/pkg/client/informers/externalversions" "github.com/knative/serving/pkg/logging" + "github.com/knative/serving/pkg/metrics" "github.com/knative/serving/pkg/reconciler" "github.com/knative/serving/pkg/reconciler/v1alpha1/clusteringress" "github.com/knative/serving/pkg/reconciler/v1alpha1/configuration" @@ -192,6 +193,8 @@ func main() { // Watch the logging config map and dynamically update logging levels. configMapWatcher.Watch(logging.ConfigName, logging.UpdateLevelFromConfigMap(logger, atomicLevel, component)) + // Watch the observability config map and dynamically update metrics exporter. + configMapWatcher.Watch(metrics.ObservabilityConfigName, metrics.UpdateExporterFromConfigMap(component, logger)) // These are non-blocking. 
kubeInformerFactory.Start(stopCh) diff --git a/config/monitoring/metrics/prometheus/100-grafana-dash-knative-reconciler.yaml b/config/monitoring/metrics/prometheus/100-grafana-dash-knative-reconciler.yaml new file mode 100644 index 000000000000..fcea0621e337 --- /dev/null +++ b/config/monitoring/metrics/prometheus/100-grafana-dash-knative-reconciler.yaml @@ -0,0 +1,622 @@ +# Copyright 2018 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-definition-knative-reconciler + namespace: knative-monitoring +data: + knative-reconciler-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "description": "Knative Serving - Reconciler", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 7, + "panels": [], + "title": "Aggregate", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": false, + 
"max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "sum by (reconciler)(60 * rate(controller_reconcile_count[1m]))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{reconciler}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Reconcile Count (per min)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 5, + "panels": [], + "title": "Per Reconciler", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": true, + "targets": [ + { + 
"expr": "sum(60 * rate(controller_reconcile_count{reconciler=\"$reconciler\"}[1m]))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{reconciler}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "$reconciler Reconcile Count (per min)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 11, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(controller_reconcile_latency_bucket{reconciler=\"$reconciler\"}[1m])) by (le))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99th", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.90, sum(rate(controller_reconcile_latency_bucket{reconciler=\"$reconciler\"}[1m])) by (le))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "90th", + "refId": "B" + }, + { + "expr": 
"histogram_quantile(0.50, sum(rate(controller_reconcile_latency_bucket{reconciler=\"$reconciler\"}[1m])) by (le))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "50th", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "$reconciler Reconcile Latency Percentiles", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 9, + "panels": [], + "title": "Per Reconciler & Key", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "sum(60 * rate(controller_reconcile_count{reconciler=\"$reconciler\", key=\"$key\"}[1m]))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{reconciler}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "$reconciler/$key Reconcile Count (per 
min)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 13, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(controller_reconcile_latency_bucket{reconciler=\"$reconciler\", key=\"$key\"}[1m])) by (le))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99th", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.90, sum(rate(controller_reconcile_latency_bucket{reconciler=\"$reconciler\", key=\"$key\"}[1m])) by (le))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "90th", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.50, sum(rate(controller_reconcile_latency_bucket{reconciler=\"$reconciler\", key=\"$key\"}[1m])) by (le))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "50th", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "$reconciler/$key Reconcile 
Latency Percentiles", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "refresh": "5s", + "schemaVersion": 16, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Reconciler", + "multi": false, + "name": "reconciler", + "options": [], + "query": "label_values(controller_reconcile_count, reconciler)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Key", + "multi": false, + "name": "key", + "options": [], + "query": "label_values(controller_reconcile_count{reconciler=\"$reconciler\"}, key)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Knative Serving - Reconciler", + "uid": "j0oFdEYiz", + "version": 10 + } diff --git a/config/monitoring/metrics/prometheus/100-grafana.yaml b/config/monitoring/metrics/prometheus/100-grafana.yaml index f7ad4db92ecb..5e4b2317d0d7 100644 --- 
a/config/monitoring/metrics/prometheus/100-grafana.yaml +++ b/config/monitoring/metrics/prometheus/100-grafana.yaml @@ -47,6 +47,12 @@ data: type: file options: folder: /grafana-dashboard-definition/knative-efficiency + - name: 'knative-reconciler' + org_id: 1 + folder: '' + type: file + options: + folder: /grafana-dashboard-definition/knative-reconciler - name: 'istio' org_id: 1 folder: '' @@ -174,6 +180,8 @@ spec: mountPath: /grafana-dashboard-definition/knative - name: grafana-dashboard-definition-knative-efficiency mountPath: /grafana-dashboard-definition/knative-efficiency + - name: grafana-dashboard-definition-knative-reconciler + mountPath: /grafana-dashboard-definition/knative-reconciler - name: grafana-dashboard-definition-istio mountPath: /grafana-dashboard-definition/istio - name: grafana-dashboard-definition-mixer @@ -225,6 +233,9 @@ spec: - name: grafana-dashboard-definition-knative-efficiency configMap: name: grafana-dashboard-definition-knative-efficiency + - name: grafana-dashboard-definition-knative-reconciler + configMap: + name: grafana-dashboard-definition-knative-reconciler - name: grafana-dashboard-definition-istio configMap: name: grafana-dashboard-definition-istio diff --git a/config/monitoring/metrics/prometheus/100-prometheus-scrape-config.yaml b/config/monitoring/metrics/prometheus/100-prometheus-scrape-config.yaml index a2825f32fd28..1cbc32a8d383 100644 --- a/config/monitoring/metrics/prometheus/100-prometheus-scrape-config.yaml +++ b/config/monitoring/metrics/prometheus/100-prometheus-scrape-config.yaml @@ -26,6 +26,33 @@ data: scrape_timeout: 10s evaluation_interval: 30s scrape_configs: + # Controller endpoint + - job_name: controller + scrape_interval: 3s + scrape_timeout: 3s + kubernetes_sd_configs: + - role: pod + relabel_configs: + # Scrape only the targets matching the following metadata + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app, __meta_kubernetes_pod_container_port_name] + action: keep + 
regex: knative-serving;controller;metrics + # Rename metadata labels to be reader friendly + - source_labels: [__meta_kubernetes_namespace] + action: replace + regex: (.*) + target_label: namespace + replacement: $1 + - source_labels: [__meta_kubernetes_pod_name] + action: replace + regex: (.*) + target_label: pod + replacement: $1 + - source_labels: [__meta_kubernetes_service_name] + action: replace + regex: (.*) + target_label: service + replacement: $1 # Autoscaler endpoint - job_name: autoscaler scrape_interval: 3s