Restart job on failure for Always,OnFailure Policy (#1572)

Fixes #1570, together with kubeflow/common#189. There can be pod-level failures caused by the system, which would previously have caused the entire job to fail under every restart policy except ExitCode.
kubeflow · Jun 9, 2022 · 58cc3a0 · 58cc3a0
1 parent 7761578
commit 58cc3a0
Showing 1 changed file with 1 addition and 1 deletion.
diff --git a/pkg/controller.v1/pytorch/pytorchjob_controller.go b/pkg/controller.v1/pytorch/pytorchjob_controller.go
@@ -419,7 +419,7 @@ func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{},
 		}
 
 		if failed > 0 {
-			if spec.RestartPolicy == commonv1.RestartPolicyExitCode {
+			if spec.RestartPolicy != commonv1.RestartPolicyNever {
 				msg := fmt.Sprintf("PyTorchJob %s is restarting because %d %s replica(s) failed.", pytorchjob.Name, failed, rtype)
 				r.Recorder.Event(pytorchjob, corev1.EventTypeWarning, commonutil.JobRestartingReason, msg)
 				err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRestarting, commonutil.JobRestartingReason, msg)