Skip to content

Commit

Permalink
Merge pull request kubernetes#1261 from ricky-rav/az_lb_master
Browse files Browse the repository at this point in the history
Bug 2086092: UPSTREAM: 108284: fix: exclude non-ready nodes and deleted nodes from azure load balancers
  • Loading branch information
deads2k authored Jun 1, 2022
2 parents beaaed6 + 1fa0a91 commit 899fd9f
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 22 deletions.
63 changes: 51 additions & 12 deletions staging/src/k8s.io/legacy-cloud-providers/azure/azure.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ import (
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"
cloudprovider "k8s.io/cloud-provider"
cloudproviderapi "k8s.io/cloud-provider/api"
"k8s.io/klog/v2"
"k8s.io/legacy-cloud-providers/azure/auth"
azcache "k8s.io/legacy-cloud-providers/azure/cache"
Expand Down Expand Up @@ -793,7 +794,7 @@ func (az *Cloud) updateNodeCaches(prevNode, newNode *v1.Node) {
}
}

//Remove from nodeZones cache if using depreciated LabelFailureDomainBetaZone
// Remove from nodeZones cache if using deprecated LabelFailureDomainBetaZone
prevZoneFailureDomain, ok := prevNode.ObjectMeta.Labels[v1.LabelFailureDomainBetaZone]
if ok && az.isAvailabilityZone(prevZoneFailureDomain) {
az.nodeZones[prevZone].Delete(prevNode.ObjectMeta.Name)
Expand All @@ -808,16 +809,17 @@ func (az *Cloud) updateNodeCaches(prevNode, newNode *v1.Node) {
delete(az.nodeResourceGroups, prevNode.ObjectMeta.Name)
}

// Remove from unmanagedNodes cache.
managed, ok := prevNode.ObjectMeta.Labels[managedByAzureLabel]
if ok && managed == "false" {
isNodeManagedByCloudProvider := !ok || managed != "false"

// Remove from unmanagedNodes cache
if !isNodeManagedByCloudProvider {
az.unmanagedNodes.Delete(prevNode.ObjectMeta.Name)
az.excludeLoadBalancerNodes.Delete(prevNode.ObjectMeta.Name)
}

// Remove from excludeLoadBalancerNodes cache.
if _, hasExcludeBalancerLabel := prevNode.ObjectMeta.Labels[v1.LabelNodeExcludeBalancers]; hasExcludeBalancerLabel {
az.excludeLoadBalancerNodes.Delete(prevNode.ObjectMeta.Name)
// if the node is being deleted from the cluster, exclude it from load balancers
if newNode == nil {
az.excludeLoadBalancerNodes.Insert(prevNode.ObjectMeta.Name)
}
}

Expand All @@ -840,16 +842,35 @@ func (az *Cloud) updateNodeCaches(prevNode, newNode *v1.Node) {
az.nodeResourceGroups[newNode.ObjectMeta.Name] = strings.ToLower(newRG)
}

// Add to unmanagedNodes cache.
_, hasExcludeBalancerLabel := newNode.ObjectMeta.Labels[v1.LabelNodeExcludeBalancers]
managed, ok := newNode.ObjectMeta.Labels[managedByAzureLabel]
if ok && managed == "false" {
isNodeManagedByCloudProvider := !ok || managed != "false"

// Update unmanagedNodes cache
if !isNodeManagedByCloudProvider {
az.unmanagedNodes.Insert(newNode.ObjectMeta.Name)
az.excludeLoadBalancerNodes.Insert(newNode.ObjectMeta.Name)
}

// Add to excludeLoadBalancerNodes cache.
if _, hasExcludeBalancerLabel := newNode.ObjectMeta.Labels[v1.LabelNodeExcludeBalancers]; hasExcludeBalancerLabel {
// Update excludeLoadBalancerNodes cache
switch {
case !isNodeManagedByCloudProvider:
az.excludeLoadBalancerNodes.Insert(newNode.ObjectMeta.Name)

case hasExcludeBalancerLabel:
az.excludeLoadBalancerNodes.Insert(newNode.ObjectMeta.Name)

case !isNodeReady(newNode) && getCloudTaint(newNode.Spec.Taints) == nil:
// If not in ready state and not a newly created node, add to excludeLoadBalancerNodes cache.
// New nodes (tainted with "node.cloudprovider.kubernetes.io/uninitialized") should not be
// excluded from load balancers regardless of their state, so as to reduce the number of
// VMSS API calls and not provoke VMScaleSetActiveModelsCountLimitReached.
// (https://github.com/kubernetes-sigs/cloud-provider-azure/issues/851)
az.excludeLoadBalancerNodes.Insert(newNode.ObjectMeta.Name)

default:
// Nodes not falling into the three cases above are valid backends and
// should not appear in excludeLoadBalancerNodes cache.
az.excludeLoadBalancerNodes.Delete(newNode.ObjectMeta.Name)
}
}
}
Expand Down Expand Up @@ -975,3 +996,21 @@ func (az *Cloud) ShouldNodeExcludedFromLoadBalancer(nodeName string) (bool, erro

return az.excludeLoadBalancerNodes.Has(nodeName), nil
}

func isNodeReady(node *v1.Node) bool {
for _, cond := range node.Status.Conditions {
if cond.Type == v1.NodeReady && cond.Status == v1.ConditionTrue {
return true
}
}
return false
}

func getCloudTaint(taints []v1.Taint) *v1.Taint {
for _, taint := range taints {
if taint.Key == cloudproviderapi.TaintExternalCloudProvider {
return &taint
}
}
return nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -1125,11 +1125,8 @@ func (az *Cloud) reconcileLoadBalancer(clusterName string, service *v1.Service,
if err != nil && !errors.Is(err, cloudprovider.InstanceNotFound) {
return nil, err
}

// If a node is not supposed to be included in the LB, it
// would not be in the `nodes` slice. We need to check the nodes that
// have been added to the LB's backendpool, find the unwanted ones and
// delete them from the pool.
// If the node appears in the local cache of nodes to exclude,
// delete it from the load balancer backend pool.
shouldExcludeLoadBalancer, err := az.ShouldNodeExcludedFromLoadBalancer(nodeName)
if err != nil {
klog.Errorf("ShouldNodeExcludedFromLoadBalancer(%s) failed with error: %v", nodeName, err)
Expand Down
82 changes: 77 additions & 5 deletions staging/src/k8s.io/legacy-cloud-providers/azure/azure_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import (
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/informers"
cloudprovider "k8s.io/cloud-provider"
cloudproviderapi "k8s.io/cloud-provider/api"
servicehelpers "k8s.io/cloud-provider/service/helpers"
"k8s.io/legacy-cloud-providers/azure/auth"
"k8s.io/legacy-cloud-providers/azure/clients/interfaceclient/mockinterfaceclient"
Expand Down Expand Up @@ -3228,7 +3229,7 @@ func TestUpdateNodeCaches(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()
az := GetTestCloud(ctrl)

// delete node appearing in unmanagedNodes and excludeLoadBalancerNodes
zone := fmt.Sprintf("%s-0", az.Location)
nodesInZone := sets.NewString("prevNode")
az.nodeZones = map[string]sets.String{zone: nodesInZone}
Expand All @@ -3248,14 +3249,16 @@ func TestUpdateNodeCaches(t *testing.T) {
Name: "prevNode",
},
}

// node is deleted from the cluster
az.updateNodeCaches(&prevNode, nil)
assert.Equal(t, 0, len(az.nodeZones[zone]))
assert.Equal(t, 0, len(az.nodeResourceGroups))
assert.Equal(t, 0, len(az.unmanagedNodes))
assert.Equal(t, 0, len(az.excludeLoadBalancerNodes))
// deleted node should be excluded from load balancer
assert.Equal(t, 1, len(az.excludeLoadBalancerNodes))
assert.Equal(t, 0, len(az.nodeNames))

// add new (unmanaged and to-be-excluded) node
newNode := v1.Node{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
Expand All @@ -3267,12 +3270,12 @@ func TestUpdateNodeCaches(t *testing.T) {
Name: "newNode",
},
}

// new node is added to the cluster
az.updateNodeCaches(nil, &newNode)
assert.Equal(t, 1, len(az.nodeZones[zone]))
assert.Equal(t, 1, len(az.nodeResourceGroups))
assert.Equal(t, 1, len(az.unmanagedNodes))
assert.Equal(t, 1, len(az.excludeLoadBalancerNodes))
assert.Equal(t, 2, len(az.excludeLoadBalancerNodes))
assert.Equal(t, 1, len(az.nodeNames))
}

Expand Down Expand Up @@ -3310,6 +3313,75 @@ func TestUpdateNodeCacheExcludeLoadBalancer(t *testing.T) {
az.updateNodeCaches(&prevNode, &newNode)
assert.Equal(t, 1, len(az.excludeLoadBalancerNodes))

// a non-ready node should be excluded
az.unmanagedNodes = sets.NewString()
az.excludeLoadBalancerNodes = sets.NewString()
az.nodeNames = sets.NewString()
nonReadyNode := v1.Node{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
v1.LabelTopologyZone: zone,
},
Name: "aNode",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionFalse,
},
},
},
}
az.updateNodeCaches(nil, &nonReadyNode)
assert.Equal(t, 1, len(az.excludeLoadBalancerNodes))

// node becomes ready, it should be removed from az.excludeLoadBalancerNodes
readyNode := v1.Node{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
v1.LabelTopologyZone: zone,
},
Name: "aNode",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionTrue,
},
},
},
}
az.updateNodeCaches(&nonReadyNode, &readyNode)
assert.Equal(t, 0, len(az.excludeLoadBalancerNodes))

// new non-ready node with taint is added to the cluster: don't exclude it
nonReadyTaintedNode := v1.Node{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
v1.LabelTopologyZone: zone,
},
Name: "anotherNode",
},
Spec: v1.NodeSpec{
Taints: []v1.Taint{{
Key: cloudproviderapi.TaintExternalCloudProvider,
Value: "aValue",
Effect: v1.TaintEffectNoSchedule},
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionFalse,
},
},
},
}
az.updateNodeCaches(nil, &nonReadyTaintedNode)
assert.Equal(t, 0, len(az.excludeLoadBalancerNodes))
}

func TestGetActiveZones(t *testing.T) {
Expand Down

0 comments on commit 899fd9f

Please sign in to comment.