Skip to content

Commit

Permalink
Fix PyTorchJob status inaccuracy when task replicas are scaled down
Browse files Browse the repository at this point in the history
  • Loading branch information
PeterChg committed May 17, 2022
1 parent af3cfc2 commit 305349b
Showing 1 changed file with 4 additions and 3 deletions.
7 changes: 4 additions & 3 deletions pkg/controller.v1/pytorch/pytorchjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -355,9 +355,10 @@ func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{},
expected := *(spec.Replicas) - succeeded
running := status.Active
failed := status.Failed
specReplicas := *spec.Replicas

logrus.Infof("PyTorchJob=%s, ReplicaType=%s expected=%d, running=%d, succeeded=%d , failed=%d",
pytorchjob.Name, rtype, expected, running, succeeded, failed)
logrus.Infof("PyTorchJob=%s, ReplicaType=%s expected=%d, running=%d, succeeded=%d, failed=%d, Replicas=%d",
pytorchjob.Name, rtype, expected, running, succeeded, failed, specReplicas)

if ContainsMasterSpec(replicas) {
if rtype == commonv1.ReplicaType(pytorchv1.PyTorchReplicaTypeMaster) {
Expand Down Expand Up @@ -418,7 +419,7 @@ func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{},
}
}

if failed > 0 {
if failed > 0 && (specReplicas > succeeded+running) {
if spec.RestartPolicy == commonv1.RestartPolicyExitCode {
msg := fmt.Sprintf("PyTorchJob %s is restarting because %d %s replica(s) failed.", pytorchjob.Name, failed, rtype)
r.Recorder.Event(pytorchjob, corev1.EventTypeWarning, commonutil.JobRestartingReason, msg)
Expand Down

0 comments on commit 305349b

Please sign in to comment.