Skip to content

Commit

Permalink
Merge pull request #48075 from clamoriniere1A/feature/job_failure_policy
Browse files Browse the repository at this point in the history
Automatic merge from submit-queue (batch tested with PRs 51335, 51364, 51130, 48075, 50920)

[API] Feature/job failure policy

**What this PR does / why we need it**: Implements the Backoff policy and failed pod limit defined in kubernetes/community#583

**Which issue this PR fixes**: 
fixes #27997, fixes #30243

**Special notes for your reviewer**:
This is a WIP PR, I updated the api batchv1.JobSpec in order to prepare the backoff policy implementation in the JobController.

**Release note**:
```release-note
Add backoff policy and failed pod limit for a job
```
  • Loading branch information
Kubernetes Submit Queue authored Sep 3, 2017
2 parents 94d9457 + 2286936 commit 73ed961
Show file tree
Hide file tree
Showing 21 changed files with 261 additions and 105 deletions.
7 changes: 6 additions & 1 deletion api/openapi-spec/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -57933,10 +57933,15 @@
],
"properties": {
"activeDeadlineSeconds": {
"description": "Optional duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer",
"description": "Specifies the duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer",
"type": "integer",
"format": "int64"
},
"backoffLimit": {
"description": "Specifies the number of retries before marking this job failed. Defaults to 6",
"type": "integer",
"format": "int32"
},
"completions": {
"description": "Specifies the desired number of successfully finished pods the job should be run with. Setting to nil means that the success of any pod signals the success of all pods, and allows parallelism to have any positive value. Setting to 1 means that parallelism is limited to 1 and the success of that pod signals the success of the job. More info: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/",
"type": "integer",
Expand Down
7 changes: 6 additions & 1 deletion api/swagger-spec/batch_v1.json
Original file line number Diff line number Diff line change
Expand Up @@ -1385,7 +1385,12 @@
"activeDeadlineSeconds": {
"type": "integer",
"format": "int64",
"description": "Optional duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer"
"description": "Specifies the duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer"
},
"backoffLimit": {
"type": "integer",
"format": "int32",
"description": "Specifies the number of retries before marking this job failed. Defaults to 6"
},
"selector": {
"$ref": "v1.LabelSelector",
Expand Down
7 changes: 6 additions & 1 deletion api/swagger-spec/batch_v1beta1.json
Original file line number Diff line number Diff line change
Expand Up @@ -1440,7 +1440,12 @@
"activeDeadlineSeconds": {
"type": "integer",
"format": "int64",
"description": "Optional duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer"
"description": "Specifies the duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer"
},
"backoffLimit": {
"type": "integer",
"format": "int32",
"description": "Specifies the number of retries before marking this job failed. Defaults to 6"
},
"selector": {
"$ref": "v1.LabelSelector",
Expand Down
7 changes: 6 additions & 1 deletion api/swagger-spec/batch_v2alpha1.json
Original file line number Diff line number Diff line change
Expand Up @@ -1440,7 +1440,12 @@
"activeDeadlineSeconds": {
"type": "integer",
"format": "int64",
"description": "Optional duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer"
"description": "Specifies the duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer"
},
"backoffLimit": {
"type": "integer",
"format": "int32",
"description": "Specifies the number of retries before marking this job failed. Defaults to 6"
},
"selector": {
"$ref": "v1.LabelSelector",
Expand Down
9 changes: 8 additions & 1 deletion docs/api-reference/batch/v1/definitions.html
Original file line number Diff line number Diff line change
Expand Up @@ -2407,12 +2407,19 @@ <h3 id="_v1_jobspec">v1.JobSpec</h3>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">activeDeadlineSeconds</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">Optional duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">Specifies the duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">false</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">integer (int64)</p></td>
<td class="tableblock halign-left valign-top"></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">backoffLimit</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">Specifies the number of retries before marking this job failed. Defaults to 6</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">false</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">integer (int32)</p></td>
<td class="tableblock halign-left valign-top"></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">selector</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">A label query over pods that should match the pod count. Normally, the system sets this field for you. More info: <a href="https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors">https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors</a></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">false</p></td>
Expand Down
9 changes: 8 additions & 1 deletion docs/api-reference/batch/v1beta1/definitions.html
Original file line number Diff line number Diff line change
Expand Up @@ -2441,12 +2441,19 @@ <h3 id="_v1_jobspec">v1.JobSpec</h3>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">activeDeadlineSeconds</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">Optional duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">Specifies the duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">false</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">integer (int64)</p></td>
<td class="tableblock halign-left valign-top"></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">backoffLimit</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">Specifies the number of retries before marking this job failed. Defaults to 6</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">false</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">integer (int32)</p></td>
<td class="tableblock halign-left valign-top"></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">selector</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">A label query over pods that should match the pod count. Normally, the system sets this field for you. More info: <a href="https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors">https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors</a></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">false</p></td>
Expand Down
9 changes: 8 additions & 1 deletion docs/api-reference/batch/v2alpha1/definitions.html
Original file line number Diff line number Diff line change
Expand Up @@ -2414,12 +2414,19 @@ <h3 id="_v1_jobspec">v1.JobSpec</h3>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">activeDeadlineSeconds</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">Optional duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">Specifies the duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">false</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">integer (int64)</p></td>
<td class="tableblock halign-left valign-top"></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">backoffLimit</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">Specifies the number of retries before marking this job failed. Defaults to 6</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">false</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">integer (int32)</p></td>
<td class="tableblock halign-left valign-top"></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">selector</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">A label query over pods that should match the pod count. Normally, the system sets this field for you. More info: <a href="https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors">https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors</a></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">false</p></td>
Expand Down
2 changes: 2 additions & 0 deletions pkg/apis/batch/fuzzer/fuzzer.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,10 @@ var Funcs = func(codecs runtimeserializer.CodecFactory) []interface{} {
c.FuzzNoCustom(j) // fuzz self without calling this function again
completions := int32(c.Rand.Int31())
parallelism := int32(c.Rand.Int31())
backoffLimit := int32(c.Rand.Int31())
j.Completions = &completions
j.Parallelism = &parallelism
j.BackoffLimit = &backoffLimit
if c.Rand.Int31()%2 == 0 {
j.ManualSelector = newBool(true)
} else {
Expand Down
10 changes: 10 additions & 0 deletions pkg/apis/batch/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,16 @@ type JobSpec struct {
// +optional
ActiveDeadlineSeconds *int64

// Optional number of retries before marking this job failed.
// Defaults to 6
// +optional
BackoffLimit *int32

// TODO enabled it when https://github.com/kubernetes/kubernetes/issues/28486 has been fixed
// Optional number of failed pods to retain.
// +optional
// FailedPodsLimit *int32

// A label query over pods that should match the pod count.
// Normally, the system sets this field for you.
// +optional
Expand Down
2 changes: 2 additions & 0 deletions pkg/apis/batch/v1/conversion.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ func Convert_batch_JobSpec_To_v1_JobSpec(in *batch.JobSpec, out *batchv1.JobSpec
out.Parallelism = in.Parallelism
out.Completions = in.Completions
out.ActiveDeadlineSeconds = in.ActiveDeadlineSeconds
out.BackoffLimit = in.BackoffLimit
out.Selector = in.Selector
if in.ManualSelector != nil {
out.ManualSelector = new(bool)
Expand All @@ -71,6 +72,7 @@ func Convert_v1_JobSpec_To_batch_JobSpec(in *batchv1.JobSpec, out *batch.JobSpec
out.Parallelism = in.Parallelism
out.Completions = in.Completions
out.ActiveDeadlineSeconds = in.ActiveDeadlineSeconds
out.BackoffLimit = in.BackoffLimit
out.Selector = in.Selector
if in.ManualSelector != nil {
out.ManualSelector = new(bool)
Expand Down
4 changes: 4 additions & 0 deletions pkg/apis/batch/v1/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ func SetDefaults_Job(obj *batchv1.Job) {
obj.Spec.Parallelism = new(int32)
*obj.Spec.Parallelism = 1
}
if obj.Spec.BackoffLimit == nil {
obj.Spec.BackoffLimit = new(int32)
*obj.Spec.BackoffLimit = 6
}
labels := obj.Spec.Template.Labels
if labels != nil && len(obj.Labels) == 0 {
obj.Labels = labels
Expand Down
105 changes: 66 additions & 39 deletions pkg/apis/batch/v1/defaults_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func TestSetDefaultJob(t *testing.T) {
expected *batchv1.Job
expectLabels bool
}{
"both unspecified -> sets both to 1": {
"All unspecified -> sets all to default values": {
original: &batchv1.Job{
Spec: batchv1.JobSpec{
Template: v1.PodTemplateSpec{
Expand All @@ -48,13 +48,14 @@ func TestSetDefaultJob(t *testing.T) {
},
expected: &batchv1.Job{
Spec: batchv1.JobSpec{
Completions: newInt32(1),
Parallelism: newInt32(1),
Completions: newInt32(1),
Parallelism: newInt32(1),
BackoffLimit: newInt32(6),
},
},
expectLabels: true,
},
"both unspecified -> sets both to 1 and no default labels": {
"All unspecified -> all integers are defaulted and no default labels": {
original: &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{"mylabel": "myvalue"},
Expand All @@ -67,12 +68,13 @@ func TestSetDefaultJob(t *testing.T) {
},
expected: &batchv1.Job{
Spec: batchv1.JobSpec{
Completions: newInt32(1),
Parallelism: newInt32(1),
Completions: newInt32(1),
Parallelism: newInt32(1),
BackoffLimit: newInt32(6),
},
},
},
"WQ: Parallelism explicitly 0 and completions unset -> no change": {
"WQ: Parallelism explicitly 0 and completions unset -> BackoffLimit is defaulted": {
original: &batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: newInt32(0),
Expand All @@ -83,12 +85,13 @@ func TestSetDefaultJob(t *testing.T) {
},
expected: &batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: newInt32(0),
Parallelism: newInt32(0),
BackoffLimit: newInt32(6),
},
},
expectLabels: true,
},
"WQ: Parallelism explicitly 2 and completions unset -> no change": {
"WQ: Parallelism explicitly 2 and completions unset -> BackoffLimit is defaulted": {
original: &batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: newInt32(2),
Expand All @@ -99,12 +102,13 @@ func TestSetDefaultJob(t *testing.T) {
},
expected: &batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: newInt32(2),
Parallelism: newInt32(2),
BackoffLimit: newInt32(6),
},
},
expectLabels: true,
},
"Completions explicitly 2 and parallelism unset -> parallelism is defaulted": {
"Completions explicitly 2 and others unset -> parallelism and BackoffLimit are defaulted": {
original: &batchv1.Job{
Spec: batchv1.JobSpec{
Completions: newInt32(2),
Expand All @@ -115,47 +119,70 @@ func TestSetDefaultJob(t *testing.T) {
},
expected: &batchv1.Job{
Spec: batchv1.JobSpec{
Completions: newInt32(2),
Parallelism: newInt32(1),
Completions: newInt32(2),
Parallelism: newInt32(1),
BackoffLimit: newInt32(6),
},
},
expectLabels: true,
},
"Both set -> no change": {
"BackoffLimit explicitly 5 and others unset -> parallelism and completions are defaulted": {
original: &batchv1.Job{
Spec: batchv1.JobSpec{
Completions: newInt32(10),
Parallelism: newInt32(11),
BackoffLimit: newInt32(5),
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{Labels: defaultLabels},
},
},
},
expected: &batchv1.Job{
Spec: batchv1.JobSpec{
Completions: newInt32(10),
Parallelism: newInt32(11),
Completions: newInt32(1),
Parallelism: newInt32(1),
BackoffLimit: newInt32(5),
},
},
expectLabels: true,
},
"All set -> no change": {
original: &batchv1.Job{
Spec: batchv1.JobSpec{
Completions: newInt32(8),
Parallelism: newInt32(9),
BackoffLimit: newInt32(10),
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{Labels: defaultLabels},
},
},
},
expected: &batchv1.Job{
Spec: batchv1.JobSpec{
Completions: newInt32(8),
Parallelism: newInt32(9),
BackoffLimit: newInt32(10),
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{Labels: defaultLabels},
},
},
},
expectLabels: true,
},
"Both set, flipped -> no change": {
"All set, flipped -> no change": {
original: &batchv1.Job{
Spec: batchv1.JobSpec{
Completions: newInt32(11),
Parallelism: newInt32(10),
Completions: newInt32(11),
Parallelism: newInt32(10),
BackoffLimit: newInt32(9),
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{Labels: defaultLabels},
},
},
},
expected: &batchv1.Job{
Spec: batchv1.JobSpec{
Completions: newInt32(11),
Parallelism: newInt32(10),
Completions: newInt32(11),
Parallelism: newInt32(10),
BackoffLimit: newInt32(9),
},
},
expectLabels: true,
Expand All @@ -171,22 +198,11 @@ func TestSetDefaultJob(t *testing.T) {
t.Errorf("%s: unexpected object: %v", name, actual)
t.FailNow()
}
if (actual.Spec.Completions == nil) != (expected.Spec.Completions == nil) {
t.Errorf("%s: got different *completions than expected: %v %v", name, actual.Spec.Completions, expected.Spec.Completions)
}
if actual.Spec.Completions != nil && expected.Spec.Completions != nil {
if *actual.Spec.Completions != *expected.Spec.Completions {
t.Errorf("%s: got different completions than expected: %d %d", name, *actual.Spec.Completions, *expected.Spec.Completions)
}
}
if (actual.Spec.Parallelism == nil) != (expected.Spec.Parallelism == nil) {
t.Errorf("%s: got different *Parallelism than expected: %v %v", name, actual.Spec.Parallelism, expected.Spec.Parallelism)
}
if actual.Spec.Parallelism != nil && expected.Spec.Parallelism != nil {
if *actual.Spec.Parallelism != *expected.Spec.Parallelism {
t.Errorf("%s: got different parallelism than expected: %d %d", name, *actual.Spec.Parallelism, *expected.Spec.Parallelism)
}
}

validateDefaultInt32(t, name, "Completions", actual.Spec.Completions, expected.Spec.Completions)
validateDefaultInt32(t, name, "Parallelism", actual.Spec.Parallelism, expected.Spec.Parallelism)
validateDefaultInt32(t, name, "BackoffLimit", actual.Spec.BackoffLimit, expected.Spec.BackoffLimit)

if test.expectLabels != reflect.DeepEqual(actual.Labels, actual.Spec.Template.Labels) {
if test.expectLabels {
t.Errorf("%s: expected: %v, got: %v", name, actual.Spec.Template.Labels, actual.Labels)
Expand All @@ -198,6 +214,17 @@ func TestSetDefaultJob(t *testing.T) {
}
}

func validateDefaultInt32(t *testing.T, name string, field string, actual *int32, expected *int32) {
if (actual == nil) != (expected == nil) {
t.Errorf("%s: got different *%s than expected: %v %v", name, field, actual, expected)
}
if actual != nil && expected != nil {
if *actual != *expected {
t.Errorf("%s: got different %s than expected: %d %d", name, field, *actual, *expected)
}
}
}

func roundTrip(t *testing.T, obj runtime.Object) runtime.Object {
data, err := runtime.Encode(api.Codecs.LegacyCodec(SchemeGroupVersion), obj)
if err != nil {
Expand Down
2 changes: 2 additions & 0 deletions pkg/apis/batch/v1/zz_generated.conversion.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ func autoConvert_v1_JobSpec_To_batch_JobSpec(in *v1.JobSpec, out *batch.JobSpec,
out.Parallelism = (*int32)(unsafe.Pointer(in.Parallelism))
out.Completions = (*int32)(unsafe.Pointer(in.Completions))
out.ActiveDeadlineSeconds = (*int64)(unsafe.Pointer(in.ActiveDeadlineSeconds))
out.BackoffLimit = (*int32)(unsafe.Pointer(in.BackoffLimit))
out.Selector = (*meta_v1.LabelSelector)(unsafe.Pointer(in.Selector))
out.ManualSelector = (*bool)(unsafe.Pointer(in.ManualSelector))
if err := api_v1.Convert_v1_PodTemplateSpec_To_api_PodTemplateSpec(&in.Template, &out.Template, s); err != nil {
Expand All @@ -173,6 +174,7 @@ func autoConvert_batch_JobSpec_To_v1_JobSpec(in *batch.JobSpec, out *v1.JobSpec,
out.Parallelism = (*int32)(unsafe.Pointer(in.Parallelism))
out.Completions = (*int32)(unsafe.Pointer(in.Completions))
out.ActiveDeadlineSeconds = (*int64)(unsafe.Pointer(in.ActiveDeadlineSeconds))
out.BackoffLimit = (*int32)(unsafe.Pointer(in.BackoffLimit))
out.Selector = (*meta_v1.LabelSelector)(unsafe.Pointer(in.Selector))
out.ManualSelector = (*bool)(unsafe.Pointer(in.ManualSelector))
if err := api_v1.Convert_api_PodTemplateSpec_To_v1_PodTemplateSpec(&in.Template, &out.Template, s); err != nil {
Expand Down
Loading

0 comments on commit 73ed961

Please sign in to comment.