Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: introduce health checks for Numaflow CRDs #20297

Merged
merged 1 commit into from
Oct 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
-- Health assessment for a Numaflow InterStepBufferService resource.
--
-- Reads the global `obj` (the resource manifest; presumably injected by the
-- host that evaluates this script -- confirm against the runtime docs) and
-- returns a table with two fields:
--   status  : "Degraded", "Healthy" or "Progressing"
--   message : human-readable explanation taken from the resource status
--
-- Decision rules (only applied once status.observedGeneration matches
-- metadata.generation, i.e. the reported status refers to the current spec):
--   * Degraded    - ChildrenResourcesHealthy is "False" or phase is "Failed"
--   * Healthy     - ChildrenResourcesHealthy is "True" and phase is "Running"
--   * Progressing - everything else, including a missing or stale status
local hs = {}

-- Stays nil until a ChildrenResourcesHealthy condition is found. A nil
-- sentinel is used because tables compare by reference in Lua: a check such
-- as `t ~= {}` is always true and cannot detect "no condition found".
local healthyCondition = nil

if obj.status ~= nil then
  if obj.status.conditions ~= nil then
    for _, condition in ipairs(obj.status.conditions) do
      if condition.type == "ChildrenResourcesHealthy" then
        -- No break: if the type appears more than once, the last entry wins.
        healthyCondition = condition
      end
    end
  end

  if obj.metadata.generation == obj.status.observedGeneration then
    local childrenUnhealthy = healthyCondition ~= nil and healthyCondition.status == "False"
    local childrenHealthy = healthyCondition ~= nil and healthyCondition.status == "True"

    if childrenUnhealthy or obj.status.phase == "Failed" then
      hs.status = "Degraded"
      if obj.status.phase == "Failed" then
        -- A failed phase takes precedence; report the top-level message.
        hs.message = obj.status.message
      else
        hs.message = healthyCondition.message
      end
      return hs
    elseif childrenHealthy and obj.status.phase == "Running" then
      hs.status = "Healthy"
      hs.message = healthyCondition.message
      return hs
    end
  end
end

-- Fallback: no status yet, status is for an older generation, or the
-- conditions/phase do not yet indicate a terminal healthy/degraded state.
hs.status = "Progressing"
hs.message = "Waiting for InterStepBufferService status"
return hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Expected results for the InterStepBufferService health script.
# Each entry pairs an input manifest (inputPath) with the health status and
# message the script is expected to return for it (healthStatus).
# NOTE(review): the trailing "\n" in the Healthy/Degraded messages comes from
# the block-scalar condition messages in the fixtures -- keep them in sync.
tests:
- healthStatus:
status: Progressing
message: "Waiting for InterStepBufferService status"
inputPath: testdata/progressing.yaml
- healthStatus:
status: Healthy
message: "partitioned roll out complete: 3 new pods have been updated...\n"
inputPath: testdata/healthy.yaml
- healthStatus:
status: Degraded
message: "Waiting for 3 pods to be ready...\n"
inputPath: testdata/degraded.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
apiVersion: numaflow.numaproj.io/v1alpha1
kind: InterStepBufferService
metadata:
creationTimestamp: "2024-10-08T16:32:12Z"
finalizers:
- isbsvc-controller
generation: 1
name: test-isbservice-rollout
namespace: numaplane-system
ownerReferences:
- apiVersion: numaplane.numaproj.io/v1alpha1
blockOwnerDeletion: true
controller: true
kind: ISBServiceRollout
name: test-isbservice-rollout
uid: 28fc22f1-83f3-441c-bf76-dd4a11ce3891
resourceVersion: "348334"
uid: 06a5cb44-f3f9-42f8-ab9e-7d2f10323f9e
spec:
jetstream:
containerTemplate:
resources:
limits:
memory: 10Mi
persistence:
volumeSize: 10Mi
version: 2.9.6
status:
conditions:
- lastTransitionTime: "2024-10-08T16:32:37Z"
message: |
Waiting for 3 pods to be ready...
reason: Unavailable
status: "False"
type: ChildrenResourcesHealthy
- lastTransitionTime: "2024-10-08T16:32:37Z"
message: Successful
reason: Successful
status: "True"
type: Configured
- lastTransitionTime: "2024-10-08T16:32:37Z"
message: Successful
reason: Successful
status: "True"
type: Deployed
config:
jetstream:
auth:
basic:
password:
key: client-auth-password
name: isbsvc-test-isbservice-rollout-js-client-auth
user:
key: client-auth-user
name: isbsvc-test-isbservice-rollout-js-client-auth
streamConfig: |
consumer:
ackwait: 60s
maxackpending: 25000
otbucket:
history: 1
maxbytes: 0
maxvaluesize: 0
replicas: 3
storage: 0
ttl: 3h
procbucket:
history: 1
maxbytes: 0
maxvaluesize: 0
replicas: 3
storage: 0
ttl: 72h
stream:
duplicates: 60s
maxage: 72h
maxbytes: -1
maxmsgs: 100000
replicas: 3
retention: 0
storage: 0
url: nats://isbsvc-test-isbservice-rollout-js-svc.numaplane-system.svc:4222
message: |
Unavailable: Waiting for 3 pods to be ready...
observedGeneration: 1
phase: Running
type: jetstream
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
apiVersion: numaflow.numaproj.io/v1alpha1
kind: InterStepBufferService
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"numaflow.numaproj.io/v1alpha1","kind":"InterStepBufferService","metadata":{"annotations":{},"name":"default","namespace":"numaflow-system"},"spec":{"jetstream":{"persistence":{"volumeSize":"3Gi"},"version":"latest"}}}
creationTimestamp: "2024-10-08T18:21:09Z"
finalizers:
- isbsvc-controller
generation: 1
name: default
namespace: numaflow-system
resourceVersion: "357862"
uid: e175db66-3918-4ef8-993d-12b37eb9a964
spec:
jetstream:
persistence:
volumeSize: 3Gi
replicas: 3
version: latest
status:
conditions:
- lastTransitionTime: "2024-10-08T18:21:53Z"
message: |
partitioned roll out complete: 3 new pods have been updated...
reason: Healthy
status: "True"
type: ChildrenResourcesHealthy
- lastTransitionTime: "2024-10-08T18:21:53Z"
message: Successful
reason: Successful
status: "True"
type: Configured
- lastTransitionTime: "2024-10-08T18:21:53Z"
message: Successful
reason: Successful
status: "True"
type: Deployed
config:
jetstream:
auth:
basic:
password:
key: client-auth-password
name: isbsvc-default-js-client-auth
user:
key: client-auth-user
name: isbsvc-default-js-client-auth
streamConfig: |
consumer:
ackwait: 60s
maxackpending: 25000
otbucket:
history: 1
maxbytes: 0
maxvaluesize: 0
replicas: 3
storage: 0
ttl: 3h
procbucket:
history: 1
maxbytes: 0
maxvaluesize: 0
replicas: 3
storage: 0
ttl: 72h
stream:
duplicates: 60s
maxage: 72h
maxbytes: -1
maxmsgs: 100000
replicas: 3
retention: 0
storage: 0
url: nats://isbsvc-default-js-svc.numaflow-system.svc:4222
observedGeneration: 1
phase: Running
type: jetstream
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
apiVersion: numaflow.numaproj.io/v1alpha1
kind: InterStepBufferService
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"numaflow.numaproj.io/v1alpha1","kind":"InterStepBufferService","metadata":{"annotations":{},"name":"default","namespace":"numaflow-system"},"spec":{"jetstream":{"persistence":{"volumeSize":"3Gi"},"version":"latest"}}}
creationTimestamp: "2024-10-08T18:21:09Z"
finalizers:
- isbsvc-controller
generation: 2
name: default
namespace: numaflow-system
resourceVersion: "357862"
uid: e175db66-3918-4ef8-993d-12b37eb9a964
spec:
jetstream:
persistence:
volumeSize: 3Gi
replicas: 3
version: latest
status:
conditions:
- lastTransitionTime: "2024-10-08T18:21:53Z"
message: |
partitioned roll out complete: 3 new pods have been updated...
reason: Healthy
status: "True"
type: ChildrenResourcesHealthy
- lastTransitionTime: "2024-10-08T18:21:53Z"
message: Successful
reason: Successful
status: "True"
type: Configured
- lastTransitionTime: "2024-10-08T18:21:53Z"
message: Successful
reason: Successful
status: "True"
type: Deployed
config:
jetstream:
auth:
basic:
password:
key: client-auth-password
name: isbsvc-default-js-client-auth
user:
key: client-auth-user
name: isbsvc-default-js-client-auth
streamConfig: |
consumer:
ackwait: 60s
maxackpending: 25000
otbucket:
history: 1
maxbytes: 0
maxvaluesize: 0
replicas: 3
storage: 0
ttl: 3h
procbucket:
history: 1
maxbytes: 0
maxvaluesize: 0
replicas: 3
storage: 0
ttl: 72h
stream:
duplicates: 60s
maxage: 72h
maxbytes: -1
maxmsgs: 100000
replicas: 3
retention: 0
storage: 0
url: nats://isbsvc-default-js-svc.numaflow-system.svc:4222
observedGeneration: 1
phase: Running
type: jetstream
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
-- Health assessment for a Numaflow MonoVertex resource.
--
-- Reads the global `obj` (the resource manifest; presumably injected by the
-- host that evaluates this script -- confirm against the runtime docs) and
-- returns a table with two fields:
--   status  : "Degraded", "Suspended", "Healthy" or "Progressing"
--   message : human-readable explanation
--
-- Decision rules (only applied once status.observedGeneration matches
-- metadata.generation, i.e. the reported status refers to the current spec),
-- evaluated in this order:
--   * Degraded    - PodsHealthy or DaemonHealthy is "False", or phase is "Failed"
--   * Suspended   - phase is "Paused"
--   * Healthy     - both conditions are "True" and phase is "Running"
--   * Progressing - everything else, including a missing or stale status
local hs = {}

-- Both stay nil until the matching condition is found. A nil sentinel is used
-- because tables compare by reference in Lua: a check such as `t ~= {}` is
-- always true and cannot detect "no condition found".
local podsHealth = nil
local daemonHealth = nil

if obj.status ~= nil then
  if obj.status.conditions ~= nil then
    for _, condition in ipairs(obj.status.conditions) do
      -- No break: if a type appears more than once, the last entry wins.
      if condition.type == "PodsHealthy" then
        podsHealth = condition
      end
      if condition.type == "DaemonHealthy" then
        daemonHealth = condition
      end
    end
  end

  if obj.metadata.generation == obj.status.observedGeneration then
    local podsFailing = podsHealth ~= nil and podsHealth.status == "False"
    local daemonFailing = daemonHealth ~= nil and daemonHealth.status == "False"
    local podsOk = podsHealth ~= nil and podsHealth.status == "True"
    local daemonOk = daemonHealth ~= nil and daemonHealth.status == "True"

    if podsFailing or daemonFailing or obj.status.phase == "Failed" then
      hs.status = "Degraded"
      if obj.status.phase == "Failed" then
        -- A failed phase takes precedence; report the top-level message.
        hs.message = obj.status.message
      else
        hs.message = "Subresources are unhealthy"
      end
      return hs
    elseif obj.status.phase == "Paused" then
      hs.status = "Suspended"
      hs.message = "MonoVertex is paused"
      return hs
    elseif podsOk and daemonOk and obj.status.phase == "Running" then
      hs.status = "Healthy"
      hs.message = "MonoVertex is healthy"
      return hs
    end
  end
end

-- Fallback: no status yet, status is for an older generation, or the
-- conditions/phase do not yet indicate a terminal state.
hs.status = "Progressing"
hs.message = "Waiting for MonoVertex status"
return hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Expected results for the MonoVertex health script.
# Each entry pairs an input manifest (inputPath) with the health status and
# message the script is expected to return for it (healthStatus).
tests:
- healthStatus:
status: Progressing
message: "Waiting for MonoVertex status"
inputPath: testdata/progressing.yaml
- healthStatus:
status: Healthy
message: "MonoVertex is healthy"
inputPath: testdata/healthy.yaml
- healthStatus:
status: Degraded
message: "Subresources are unhealthy"
inputPath: testdata/degraded.yaml
- healthStatus:
status: Suspended
message: "MonoVertex is paused"
inputPath: testdata/suspended.yaml
Loading