Add job namespace to tf_operator_jobs_* counters (#1283)
* Add job namespace to 'tf_operator_jobs_*' counters

* Update `tfJobsDeletedCount`
alembiewski authored Jun 28, 2021
1 parent 6fd9489 commit af5bdd5
Showing 4 changed files with 43 additions and 28 deletions.
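Every hunk below follows the same two-step pattern: a plain promauto.NewCounter becomes a promauto.NewCounterVec keyed by a single job_namespace label, and each Inc() call site becomes WithLabelValues(<namespace>).Inc(). Here is a minimal, self-contained sketch of that pattern, assuming the standard client_golang API; the standalone main package, the /metrics server, and the "kubeflow" namespace are illustrative, not part of this commit:

package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// Before: tfJobsDeletedCount = promauto.NewCounter(prometheus.CounterOpts{...})
// After: a counter vector partitioned by the namespace of the job.
var tfJobsDeletedCount = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: "tf_operator_jobs_deleted_total",
		Help: "Counts number of TF jobs deleted",
	},
	[]string{"job_namespace"},
)

func main() {
	// WithLabelValues creates (or fetches) the child counter for the given
	// label value on first use; only observed namespaces appear in /metrics.
	tfJobsDeletedCount.WithLabelValues("kubeflow").Inc()

	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":8080", nil))
}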
15 changes: 9 additions & 6 deletions pkg/controller.v1/tensorflow/controller.go
@@ -65,10 +65,13 @@ var (
 	// key function but it should be just fine for non delete events.
 	KeyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc
 
-	tfJobsDeletedCount = promauto.NewCounter(prometheus.CounterOpts{
-		Name: "tf_operator_jobs_deleted_total",
-		Help: "Counts number of TF jobs deleted",
-	})
+	tfJobsDeletedCount = promauto.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "tf_operator_jobs_deleted_total",
+			Help: "Counts number of TF jobs deleted",
+		},
+		[]string{"job_namespace"},
+	)
 )
 
 // TFController is the type for TFJob Controller, which manages
@@ -245,7 +248,7 @@ func (tc *TFController) processNextWorkItem() bool {
 	if err != nil {
 		if err == errNotExists {
 			logger.Infof("TFJob has been deleted: %v", key)
-			tfJobsDeletedCount.Inc()
+			tfJobsDeletedCount.WithLabelValues(tfJob.Namespace).Inc()
 			return true
 		}
 
@@ -308,7 +311,7 @@ func (tc *TFController) syncTFJob(key string) (bool, error) {
 	if err != nil {
 		if err == errNotExists {
 			logger.Infof("TFJob has been deleted: %v", key)
-			tfJobsDeletedCount.Inc()
+			tfJobsDeletedCount.WithLabelValues(namespace).Inc()
 			return true, nil
 		}
 		return false, err
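Note the difference between the two call sites above: processNextWorkItem still holds the tfJob object, while syncTFJob labels the metric with a namespace variable because the job may already be gone by the time the key is processed. The surrounding code is not shown in these hunks, but a client-go controller conventionally recovers the namespace from the work-queue key; a hedged sketch of that step (the key value is invented):

package main

import (
	"fmt"

	"k8s.io/client-go/tools/cache"
)

func main() {
	// Keys produced by cache.DeletionHandlingMetaNamespaceKeyFunc (see the
	// KeyFunc variable above) have the form "<namespace>/<name>".
	key := "kubeflow/mnist-training" // hypothetical queue key

	namespace, name, err := cache.SplitMetaNamespaceKey(key)
	if err != nil {
		panic(err)
	}
	fmt.Println(namespace, name) // kubeflow mnist-training
}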
13 changes: 8 additions & 5 deletions pkg/controller.v1/tensorflow/job.go
@@ -26,10 +26,13 @@ const (
 )
 
 var (
-	tfJobsCreatedCount = promauto.NewCounter(prometheus.CounterOpts{
-		Name: "tf_operator_jobs_created_total",
-		Help: "Counts number of TF jobs created",
-	})
+	tfJobsCreatedCount = promauto.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "tf_operator_jobs_created_total",
+			Help: "Counts number of TF jobs created",
+		},
+		[]string{"job_namespace"},
+	)
 )
 
 // DeleteJob implements ControllerInterface interface.
@@ -128,7 +131,7 @@ func (tc *TFController) addTFJob(obj interface{}) {
 		return
 	}
 	tc.enqueueTFJob(obj)
-	tfJobsCreatedCount.Inc()
+	tfJobsCreatedCount.WithLabelValues(tfJob.Namespace).Inc()
 }
 
 // updateTFJob enqueues the current tfjob.
13 changes: 8 additions & 5 deletions pkg/controller.v1/tensorflow/pod.go
@@ -53,10 +53,13 @@ const (
 )
 
 var (
-	tfJobsRestartCount = promauto.NewCounter(prometheus.CounterOpts{
-		Name: "tf_operator_jobs_restarted_total",
-		Help: "Counts number of TF jobs restarted",
-	})
+	tfJobsRestartCount = promauto.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "tf_operator_jobs_restarted_total",
+			Help: "Counts number of TF jobs restarted",
+		},
+		[]string{"job_namespace"},
+	)
 )
 
 // reconcilePods checks and updates pods for each given TFReplicaSpec.
@@ -149,7 +152,7 @@ func (tc *TFController) ReconcilePods(
 				commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err)
 				return err
 			}
-			tfJobsRestartCount.Inc()
+			tfJobsRestartCount.WithLabelValues(tfJob.Namespace).Inc()
 		}
 	}
 
30 changes: 18 additions & 12 deletions pkg/controller.v1/tensorflow/status.go
@@ -44,14 +44,20 @@ const (
 )
 
 var (
-	tfJobsSuccessCount = promauto.NewCounter(prometheus.CounterOpts{
-		Name: "tf_operator_jobs_successful_total",
-		Help: "Counts number of TF jobs successful",
-	})
-	tfJobsFailureCount = promauto.NewCounter(prometheus.CounterOpts{
-		Name: "tf_operator_jobs_failed_total",
-		Help: "Counts number of TF jobs failed",
-	})
+	tfJobsSuccessCount = promauto.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "tf_operator_jobs_successful_total",
+			Help: "Counts number of TF jobs successful",
+		},
+		[]string{"job_namespace"},
+	)
+	tfJobsFailureCount = promauto.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "tf_operator_jobs_failed_total",
+			Help: "Counts number of TF jobs failed",
+		},
+		[]string{"job_namespace"},
+	)
 )
 
 func (tc *TFController) UpdateJobStatus(job interface{}, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, jobStatus *commonv1.JobStatus) error {
@@ -137,7 +143,7 @@ func (tc *TFController) UpdateJobStatus(job interface{}, replicas map[commonv1.R
 					commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err)
 					return err
 				}
-				tfJobsSuccessCount.Inc()
+				tfJobsSuccessCount.WithLabelValues(tfJob.Namespace).Inc()
 			}
 		}
 	} else {
@@ -159,7 +165,7 @@ func (tc *TFController) UpdateJobStatus(job interface{}, replicas map[commonv1.R
 					commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err)
 					return err
 				}
-				tfJobsSuccessCount.Inc()
+				tfJobsSuccessCount.WithLabelValues(tfJob.Namespace).Inc()
 			} else if running > 0 {
 				// Some workers are still running, leave a running condition.
 				msg := fmt.Sprintf("TFJob %s/%s is running.",
@@ -184,7 +190,7 @@ func (tc *TFController) UpdateJobStatus(job interface{}, replicas map[commonv1.R
 		if restart {
 			// job is restarting, no need to set it failed
 			// we know it because we update the status condition when reconciling the replicas
-			tfJobsFailureCount.Inc()
+			tfJobsFailureCount.WithLabelValues(tfJob.Namespace).Inc()
 		} else {
 			msg := fmt.Sprintf("TFJob %s/%s has failed because %d %s replica(s) failed.",
 				tfJob.Namespace, tfJob.Name, failed, rtype)
@@ -199,7 +205,7 @@ func (tc *TFController) UpdateJobStatus(job interface{}, replicas map[commonv1.R
 				commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err)
 				return err
 			}
-			tfJobsFailureCount.Inc()
+			tfJobsFailureCount.WithLabelValues(tfJob.Namespace).Inc()
 		}
 	}
 }
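The commit adds no tests for the new label, but per-namespace children are straightforward to exercise with client_golang's testutil helpers. A sketch of such a check (the test and the namespace values are invented, not part of this change):

package tensorflow_test

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestJobsCreatedCountPerNamespace(t *testing.T) {
	// A stand-in for tfJobsCreatedCount, built without promauto so the test
	// does not touch the default registry.
	jobsCreated := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "tf_operator_jobs_created_total",
			Help: "Counts number of TF jobs created",
		},
		[]string{"job_namespace"},
	)

	jobsCreated.WithLabelValues("team-a").Inc()
	jobsCreated.WithLabelValues("team-a").Inc()
	jobsCreated.WithLabelValues("team-b").Inc()

	// testutil.ToFloat64 reads the current value of a single child counter.
	if got := testutil.ToFloat64(jobsCreated.WithLabelValues("team-a")); got != 2 {
		t.Errorf("team-a: got %v, want 2", got)
	}
	if got := testutil.ToFloat64(jobsCreated.WithLabelValues("team-b")); got != 1 {
		t.Errorf("team-b: got %v, want 1", got)
	}
}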
