diff --git a/pkg/kubelet/kubeletconfig/status/BUILD b/pkg/kubelet/kubeletconfig/status/BUILD
index 666b3b42bef36..fe925bfc90525 100644
--- a/pkg/kubelet/kubeletconfig/status/BUILD
+++ b/pkg/kubelet/kubeletconfig/status/BUILD
@@ -11,6 +11,7 @@ go_library(
     importpath = "k8s.io/kubernetes/pkg/kubelet/kubeletconfig/status",
     deps = [
         "//pkg/kubelet/kubeletconfig/util/log:go_default_library",
+        "//pkg/kubelet/metrics:go_default_library",
         "//pkg/util/node:go_default_library",
         "//vendor/k8s.io/api/core/v1:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
diff --git a/pkg/kubelet/kubeletconfig/status/status.go b/pkg/kubelet/kubeletconfig/status/status.go
index ae8cdec069c4a..750a504cf3e73 100644
--- a/pkg/kubelet/kubeletconfig/status/status.go
+++ b/pkg/kubelet/kubeletconfig/status/status.go
@@ -25,6 +25,7 @@ import (
 	"k8s.io/apimachinery/pkg/types"
 	clientset "k8s.io/client-go/kubernetes"
 	utillog "k8s.io/kubernetes/pkg/kubelet/kubeletconfig/util/log"
+	"k8s.io/kubernetes/pkg/kubelet/metrics"
 	nodeutil "k8s.io/kubernetes/pkg/util/node"
 )
 
@@ -176,6 +177,24 @@ func (s *nodeConfigStatus) Sync(client clientset.Interface, nodeName string) {
 		status.Error = s.errorOverride
 	}
 
+	// update metrics based on the status we will sync
+	metrics.SetConfigError(len(status.Error) > 0)
+	err = metrics.SetAssignedConfig(status.Assigned)
+	if err != nil {
+		err = fmt.Errorf("failed to update Assigned config metric, error: %v", err)
+		return
+	}
+	err = metrics.SetActiveConfig(status.Active)
+	if err != nil {
+		err = fmt.Errorf("failed to update Active config metric, error: %v", err)
+		return
+	}
+	err = metrics.SetLastKnownGoodConfig(status.LastKnownGood)
+	if err != nil {
+		err = fmt.Errorf("failed to update LastKnownGood config metric, error: %v", err)
+		return
+	}
+
 	// apply the status to a copy of the node so we don't modify the object in the informer's store
 	newNode := oldNode.DeepCopy()
 	newNode.Status.Config = status
diff --git a/pkg/kubelet/metrics/BUILD b/pkg/kubelet/metrics/BUILD
index 2f8d8839738a1..8e6041502bcff 100644
--- a/pkg/kubelet/metrics/BUILD
+++ b/pkg/kubelet/metrics/BUILD
@@ -10,9 +10,12 @@ go_library(
     srcs = ["metrics.go"],
     importpath = "k8s.io/kubernetes/pkg/kubelet/metrics",
     deps = [
+        "//pkg/features:go_default_library",
         "//pkg/kubelet/container:go_default_library",
         "//vendor/github.com/golang/glog:go_default_library",
         "//vendor/github.com/prometheus/client_golang/prometheus:go_default_library",
+        "//vendor/k8s.io/api/core/v1:go_default_library",
+        "//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library",
     ],
 )
 
diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go
index 4e9470f5a52e9..058253258e4df 100644
--- a/pkg/kubelet/metrics/metrics.go
+++ b/pkg/kubelet/metrics/metrics.go
@@ -17,11 +17,15 @@ limitations under the License.
 package metrics
 
 import (
+	"fmt"
 	"sync"
 	"time"
 
 	"github.com/golang/glog"
 	"github.com/prometheus/client_golang/prometheus"
+	corev1 "k8s.io/api/core/v1"
+	utilfeature "k8s.io/apiserver/pkg/util/feature"
+	"k8s.io/kubernetes/pkg/features"
 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
 )
 
@@ -47,6 +51,17 @@ const (
 	// Metrics keys of device plugin operations
 	DevicePluginRegistrationCountKey = "device_plugin_registration_count"
 	DevicePluginAllocationLatencyKey = "device_plugin_alloc_latency_microseconds"
+
+	// Metric keys for node config
+	AssignedConfigKey             = "node_config_assigned"
+	ActiveConfigKey               = "node_config_active"
+	LastKnownGoodConfigKey        = "node_config_last_known_good"
+	ConfigErrorKey                = "node_config_error"
+	ConfigSourceLabelKey          = "node_config_source"
+	ConfigSourceLabelValueLocal   = "local"
+	ConfigUIDLabelKey             = "node_config_uid"
+	ConfigResourceVersionLabelKey = "node_config_resource_version"
+	KubeletConfigKeyLabelKey      = "node_config_kubelet_key"
 )
 
 var (
@@ -150,6 +165,40 @@ var (
 		},
 		[]string{"resource_name"},
 	)
+
+	// Metrics for node config
+
+	AssignedConfig = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Subsystem: KubeletSubsystem,
+			Name:      AssignedConfigKey,
+			Help:      "The node's understanding of intended config. The count is always 1.",
+		},
+		[]string{ConfigSourceLabelKey, ConfigUIDLabelKey, ConfigResourceVersionLabelKey, KubeletConfigKeyLabelKey},
+	)
+	ActiveConfig = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Subsystem: KubeletSubsystem,
+			Name:      ActiveConfigKey,
+			Help:      "The config source the node is actively using. The count is always 1.",
+		},
+		[]string{ConfigSourceLabelKey, ConfigUIDLabelKey, ConfigResourceVersionLabelKey, KubeletConfigKeyLabelKey},
+	)
+	LastKnownGoodConfig = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Subsystem: KubeletSubsystem,
+			Name:      LastKnownGoodConfigKey,
+			Help:      "The config source the node will fall back to when it encounters certain errors. The count is always 1.",
+		},
+		[]string{ConfigSourceLabelKey, ConfigUIDLabelKey, ConfigResourceVersionLabelKey, KubeletConfigKeyLabelKey},
+	)
+	ConfigError = prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Subsystem: KubeletSubsystem,
+			Name:      ConfigErrorKey,
+			Help:      "This metric is true (1) if the node is experiencing a configuration-related error, false (0) otherwise.",
+		},
+	)
 )
 
 var registerMetrics sync.Once
@@ -172,6 +221,12 @@ func Register(containerCache kubecontainer.RuntimeCache, collectors ...prometheu
 		prometheus.MustRegister(EvictionStatsAge)
 		prometheus.MustRegister(DevicePluginRegistrationCount)
 		prometheus.MustRegister(DevicePluginAllocationLatency)
+		if utilfeature.DefaultFeatureGate.Enabled(features.DynamicKubeletConfig) {
+			prometheus.MustRegister(AssignedConfig)
+			prometheus.MustRegister(ActiveConfig)
+			prometheus.MustRegister(LastKnownGoodConfig)
+			prometheus.MustRegister(ConfigError)
+		}
 		for _, collector := range collectors {
 			prometheus.MustRegister(collector)
 		}
@@ -232,3 +287,85 @@ func (pc *podAndContainerCollector) Collect(ch chan<- prometheus.Metric) {
 		prometheus.GaugeValue,
 		float64(runningContainers))
 }
+
+const configMapAPIPathFmt = "/api/v1/namespaces/%s/configmaps/%s"
+
+// configLabels returns the complete label set describing source. A nil source
+// is reported as the local config; a source with a ConfigMap is reported by its
+// API path, UID, ResourceVersion and KubeletConfigKey. Any other source yields
+// an error.
+func configLabels(source *corev1.NodeConfigSource) (map[string]string, error) {
+	if source == nil {
+		return map[string]string{
+			// prometheus requires all of the labels that can be set on the metric
+			ConfigSourceLabelKey:          ConfigSourceLabelValueLocal,
+			ConfigUIDLabelKey:             "",
+			ConfigResourceVersionLabelKey: "",
+			KubeletConfigKeyLabelKey:      "",
+		}, nil
+	}
+	if source.ConfigMap != nil {
+		return map[string]string{
+			ConfigSourceLabelKey:          fmt.Sprintf(configMapAPIPathFmt, source.ConfigMap.Namespace, source.ConfigMap.Name),
+			ConfigUIDLabelKey:             string(source.ConfigMap.UID),
+			ConfigResourceVersionLabelKey: source.ConfigMap.ResourceVersion,
+			KubeletConfigKeyLabelKey:      source.ConfigMap.KubeletConfigKey,
+		}, nil
+	}
+	return nil, fmt.Errorf("unrecognized config source type, all source subfields were nil")
+}
+
+// track labels across metric updates, so we can delete old label sets and prevent leaks
+var (
+	assignedConfigLabels      = map[string]string{}
+	activeConfigLabels        = map[string]string{}
+	lastKnownGoodConfigLabels = map[string]string{}
+)
+
+// setConfigSeries makes the series labeled for source the only one this package
+// has set on vec: it deletes the series recorded in *current, records the new
+// labels in *current, and sets the new series to a constant 1.
+// If the labels cannot be computed, vec and *current are left untouched.
+// It performs no locking of its own.
+func setConfigSeries(vec *prometheus.GaugeVec, current *map[string]string, source *corev1.NodeConfigSource) error {
+	// compute the timeseries labels from the source
+	labels, err := configLabels(source)
+	if err != nil {
+		return err
+	}
+	// clean up the old timeseries (With creates a new one for each distinct label set)
+	vec.Delete(*current)
+	// record the new timeseries
+	*current = labels
+	// expose the new timeseries with a constant count of 1
+	vec.With(labels).Set(1)
+	return nil
+}
+
+// SetAssignedConfig points the AssignedConfig metric at source, replacing the
+// previously reported series. It returns an error if source is unrecognized.
+func SetAssignedConfig(source *corev1.NodeConfigSource) error {
+	return setConfigSeries(AssignedConfig, &assignedConfigLabels, source)
+}
+
+// SetActiveConfig points the ActiveConfig metric at source, replacing the
+// previously reported series. It returns an error if source is unrecognized.
+func SetActiveConfig(source *corev1.NodeConfigSource) error {
+	return setConfigSeries(ActiveConfig, &activeConfigLabels, source)
+}
+
+// SetLastKnownGoodConfig points the LastKnownGoodConfig metric at source, replacing
+// the previously reported series. It returns an error if source is unrecognized.
+func SetLastKnownGoodConfig(source *corev1.NodeConfigSource) error {
+	return setConfigSeries(LastKnownGoodConfig, &lastKnownGoodConfigLabels, source)
+}
+
+// SetConfigError sets the ConfigError metric to 1 when err is true and to 0
+// otherwise.
+func SetConfigError(err bool) {
+	if err {
+		ConfigError.Set(1)
+	} else {
+		ConfigError.Set(0)
+	}
+}
diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD
index 9113fa745d9be..2520218911eea 100644
--- a/test/e2e_node/BUILD
+++ b/test/e2e_node/BUILD
@@ -137,6 +137,7 @@ go_test(
         "//pkg/kubelet/types:go_default_library",
         "//pkg/security/apparmor:go_default_library",
         "//test/e2e/framework:go_default_library",
+        "//test/e2e/framework/metrics:go_default_library",
         "//test/e2e_node/services:go_default_library",
         "//test/utils/image:go_default_library",
         "//vendor/github.com/blang/semver:go_default_library",
@@ -147,6 +148,7 @@ go_test(
         "//vendor/github.com/onsi/gomega:go_default_library",
         "//vendor/github.com/onsi/gomega/gstruct:go_default_library",
         "//vendor/github.com/onsi/gomega/types:go_default_library",
+        "//vendor/github.com/prometheus/common/model:go_default_library",
         "//vendor/k8s.io/api/core/v1:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/api/equality:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
@@ -164,7 +166,6 @@ go_test(
     ] + select({
         "@io_bazel_rules_go//go/platform:linux": [
             "//test/e2e/common:go_default_library",
-            "//test/e2e/framework/metrics:go_default_library",
             "//test/e2e_node/system:go_default_library",
             "//test/utils:go_default_library",
             "//vendor/github.com/kardianos/osext:go_default_library",
diff --git a/test/e2e_node/dynamic_kubelet_config_test.go b/test/e2e_node/dynamic_kubelet_config_test.go
index 3d709b99d1072..e29cd65a44025 100644
--- a/test/e2e_node/dynamic_kubelet_config_test.go
+++ b/test/e2e_node/dynamic_kubelet_config_test.go
@@ -18,6 +18,7 @@ package e2e_node
 
 import (
 	"fmt"
+	"reflect"
 	"strings"
 	"time"
 
@@ -27,12 +28,17 @@ import (
 	apiequality "k8s.io/apimachinery/pkg/api/equality"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig"
 	controller "k8s.io/kubernetes/pkg/kubelet/kubeletconfig"
 	"k8s.io/kubernetes/pkg/kubelet/kubeletconfig/status"
+	"k8s.io/kubernetes/pkg/kubelet/metrics"
+	frameworkmetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
 
 	"k8s.io/kubernetes/test/e2e/framework"
 
+	"github.com/prometheus/common/model"
+
 	. "github.com/onsi/ginkgo"
 	. "github.com/onsi/gomega"
 )
@@ -45,8 +51,6 @@ type expectNodeConfigStatus struct {
 	// If true, expect Status.Config.Active == Status.Config.LastKnownGood,
 	// otherwise expect Status.Config.Active == Status.Config.Assigned.
 	lkgActive bool
-	// If true, skip checking Status.Config.LastKnownGood == this.lastKnownGood in the status.
-	skipLkg bool
 }
 
 type nodeConfigTestCase struct {
@@ -809,6 +813,8 @@ func (tc *nodeConfigTestCase) run(f *framework.Framework, fn func(f *framework.F
 	tc.checkNodeConfigSource(f)
 	// check status
 	tc.checkConfigStatus(f)
+	// check that the Kubelet's config-related metrics are correct
+	tc.checkConfigMetrics(f)
 	// check expectConfig
 	if tc.expectConfig != nil {
 		tc.checkConfig(f)
@@ -929,7 +935,7 @@ func expectConfigStatus(tc *nodeConfigTestCase, actual *apiv1.NodeConfigStatus)
 		errs = append(errs, spew.Sprintf("expected Assigned %#v but got %#v", expectAssigned, actual.Assigned))
 	}
 	// check LastKnownGood matches tc.expectConfigStatus.lastKnownGood
-	if !tc.expectConfigStatus.skipLkg && !apiequality.Semantic.DeepEqual(tc.expectConfigStatus.lastKnownGood, actual.LastKnownGood) {
+	if !apiequality.Semantic.DeepEqual(tc.expectConfigStatus.lastKnownGood, actual.LastKnownGood) {
 		errs = append(errs, spew.Sprintf("expected LastKnownGood %#v but got %#v", tc.expectConfigStatus.lastKnownGood, actual.LastKnownGood))
 	}
 	// check Active matches Assigned or LastKnownGood, depending on tc.expectConfigStatus.lkgActive
@@ -1016,6 +1022,115 @@ func (tc *nodeConfigTestCase) checkEvent(f *framework.Framework) {
 	}, timeout, interval).Should(BeNil())
 }
 
+// checkConfigMetrics makes sure the Kubelet's config related metrics are as we expect, given the test case.
+// It polls the Kubelet's metrics until the assigned, active, last-known-good, and error metrics
+// equal the samples derived from tc, and fails the test if they still differ after the timeout.
+func (tc *nodeConfigTestCase) checkConfigMetrics(f *framework.Framework) {
+	const (
+		timeout                = time.Minute
+		interval               = time.Second
+		assignedConfigKey      = metrics.KubeletSubsystem + "_" + metrics.AssignedConfigKey
+		activeConfigKey        = metrics.KubeletSubsystem + "_" + metrics.ActiveConfigKey
+		lastKnownGoodConfigKey = metrics.KubeletSubsystem + "_" + metrics.LastKnownGoodConfigKey
+		configErrorKey         = metrics.KubeletSubsystem + "_" + metrics.ConfigErrorKey
+	)
+	// local config helper
+	mkLocalSample := func(name model.LabelValue) *model.Sample {
+		return &model.Sample{
+			Metric: model.Metric(map[model.LabelName]model.LabelValue{
+				model.MetricNameLabel:                 name,
+				metrics.ConfigSourceLabelKey:          metrics.ConfigSourceLabelValueLocal,
+				metrics.ConfigUIDLabelKey:             "",
+				metrics.ConfigResourceVersionLabelKey: "",
+				metrics.KubeletConfigKeyLabelKey:      "",
+			}),
+			Value: 1,
+		}
+	}
+	// remote config helper
+	mkRemoteSample := func(name model.LabelValue, source *apiv1.NodeConfigSource) *model.Sample {
+		return &model.Sample{
+			Metric: model.Metric(map[model.LabelName]model.LabelValue{
+				model.MetricNameLabel:                 name,
+				metrics.ConfigSourceLabelKey:          model.LabelValue(fmt.Sprintf("/api/v1/namespaces/%s/configmaps/%s", source.ConfigMap.Namespace, source.ConfigMap.Name)),
+				metrics.ConfigUIDLabelKey:             model.LabelValue(source.ConfigMap.UID),
+				metrics.ConfigResourceVersionLabelKey: model.LabelValue(source.ConfigMap.ResourceVersion),
+				metrics.KubeletConfigKeyLabelKey:      model.LabelValue(source.ConfigMap.KubeletConfigKey),
+			}),
+			Value: 1,
+		}
+	}
+	// error helper
+	mkErrorSample := func(expectError bool) *model.Sample {
+		v := model.SampleValue(0)
+		if expectError {
+			v = model.SampleValue(1)
+		}
+		return &model.Sample{
+			Metric: model.Metric(map[model.LabelName]model.LabelValue{model.MetricNameLabel: configErrorKey}),
+			Value:  v,
+		}
+	}
+	// construct expected metrics
+	// assigned
+	assignedSamples := model.Samples{mkLocalSample(assignedConfigKey)}
+	assignedSource := tc.configSource.DeepCopy()
+	if assignedSource != nil && assignedSource.ConfigMap != nil {
+		// the expected labels carry the UID and ResourceVersion of the test case's ConfigMap
+		assignedSource.ConfigMap.UID = tc.configMap.UID
+		assignedSource.ConfigMap.ResourceVersion = tc.configMap.ResourceVersion
+		assignedSamples = model.Samples{mkRemoteSample(assignedConfigKey, assignedSource)}
+	}
+	// last-known-good
+	lastKnownGoodSamples := model.Samples{mkLocalSample(lastKnownGoodConfigKey)}
+	lastKnownGoodSource := tc.expectConfigStatus.lastKnownGood
+	if lastKnownGoodSource != nil && lastKnownGoodSource.ConfigMap != nil {
+		lastKnownGoodSamples = model.Samples{mkRemoteSample(lastKnownGoodConfigKey, lastKnownGoodSource)}
+	}
+	// active
+	activeSamples := model.Samples{mkLocalSample(activeConfigKey)}
+	activeSource := assignedSource
+	// when lkgActive is set, the active config is expected to be the last-known-good one
+	if tc.expectConfigStatus.lkgActive {
+		activeSource = lastKnownGoodSource
+	}
+	if activeSource != nil && activeSource.ConfigMap != nil {
+		activeSamples = model.Samples{mkRemoteSample(activeConfigKey, activeSource)}
+	}
+	// error
+	errorSamples := model.Samples{mkErrorSample(len(tc.expectConfigStatus.err) > 0)}
+	// expected metrics
+	expect := frameworkmetrics.KubeletMetrics(map[string]model.Samples{
+		assignedConfigKey:      assignedSamples,
+		activeConfigKey:        activeSamples,
+		lastKnownGoodConfigKey: lastKnownGoodSamples,
+		configErrorKey:         errorSamples,
+	})
+	// wait for expected metrics to appear
+	Eventually(func() error {
+		actual, err := getKubeletMetrics(sets.NewString(
+			assignedConfigKey,
+			activeConfigKey,
+			lastKnownGoodConfigKey,
+			configErrorKey,
+		))
+		if err != nil {
+			return err
+		}
+		// clear timestamps from actual, so DeepEqual is time-invariant
+		for _, samples := range actual {
+			for _, sample := range samples {
+				sample.Timestamp = 0
+			}
+		}
+		// compare to expected
+		if !reflect.DeepEqual(expect, actual) {
+			return fmt.Errorf("checkConfigMetrics: case: %s: expect metrics %s but got %s", tc.desc, spew.Sprintf("%#v", expect), spew.Sprintf("%#v", actual))
+		}
+		return nil
+	}, timeout, interval).Should(BeNil())
+}
+
 // constructs the expected SelfLink for a config map
 func configMapAPIPath(cm *apiv1.ConfigMap) string {
 	return fmt.Sprintf("/api/v1/namespaces/%s/configmaps/%s", cm.Namespace, cm.Name)
diff --git a/test/e2e_node/util.go b/test/e2e_node/util.go
index cec7974bc4c5a..0cd2dca5540da 100644
--- a/test/e2e_node/util.go
+++ b/test/e2e_node/util.go
@@ -334,6 +334,25 @@ func logKubeletMetrics(metricKeys ...string) {
 	}
 }
 
+// getKubeletMetrics grabs the metrics of the local Kubelet (node name from the
+// test context, port 10255) and returns only those whose names are in
+// filterMetricNames. It returns an error if the metrics cannot be grabbed.
+func getKubeletMetrics(filterMetricNames sets.String) (metrics.KubeletMetrics, error) {
+	// grab Kubelet metrics
+	ms, err := metrics.GrabKubeletMetricsWithoutProxy(framework.TestContext.NodeName + ":10255")
+	if err != nil {
+		return nil, err
+	}
+
+	filtered := metrics.NewKubeletMetrics()
+	for name, samples := range ms {
+		if filterMetricNames.Has(name) {
+			filtered[name] = samples
+		}
+	}
+	return filtered, nil
+}
+
 // runCommand runs the cmd and returns the combined stdout and stderr, or an
 // error if the command failed.
 func runCommand(cmd ...string) (string, error) {