Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NodeController doesn't evict Pods if no Nodes are Ready #25571

Merged
merged 1 commit into from
May 20, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 80 additions & 37 deletions pkg/controller/node/nodecontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ import (
"k8s.io/kubernetes/pkg/util/metrics"
utilruntime "k8s.io/kubernetes/pkg/util/runtime"
"k8s.io/kubernetes/pkg/util/sets"
"k8s.io/kubernetes/pkg/util/system"
"k8s.io/kubernetes/pkg/util/wait"
"k8s.io/kubernetes/pkg/version"
"k8s.io/kubernetes/pkg/watch"
Expand Down Expand Up @@ -123,6 +124,11 @@ type NodeController struct {

forcefullyDeletePod func(*api.Pod) error
nodeExistsInCloudProvider func(string) (bool, error)

// If in network segmentation mode NodeController won't evict Pods from unhealthy Nodes.
// It is enabled when all Nodes observed by the NodeController are NotReady and disabled
// when NC sees any healthy Node. This is a temporary fix for v1.3.
networkSegmentationMode bool
}

// NewNodeController returns a new node controller to sync instances from cloudprovider.
Expand All @@ -141,10 +147,10 @@ func NewNodeController(
recorder := eventBroadcaster.NewRecorder(api.EventSource{Component: "controllermanager"})
eventBroadcaster.StartLogging(glog.Infof)
if kubeClient != nil {
glog.Infof("Sending events to api server.")
glog.V(0).Infof("Sending events to api server.")
eventBroadcaster.StartRecordingToSink(&unversionedcore.EventSinkImpl{Interface: kubeClient.Core().Events("")})
} else {
glog.Infof("No api server defined - no events will be sent to API server.")
glog.V(0).Infof("No api server defined - no events will be sent to API server.")
}

if kubeClient != nil && kubeClient.Core().GetRESTClient().GetRateLimiter() != nil {
Expand Down Expand Up @@ -282,7 +288,7 @@ func (nc *NodeController) Run(period time.Duration) {
}

if completed {
glog.Infof("All pods terminated on %s", value.Value)
glog.V(2).Infof("All pods terminated on %s", value.Value)
nc.recordNodeEvent(value.Value, api.EventTypeNormal, "TerminatedAllPods", fmt.Sprintf("Terminated all Pods on Node %s.", value.Value))
return true, 0
}
Expand Down Expand Up @@ -371,7 +377,7 @@ func (nc *NodeController) maybeDeleteTerminatingPod(obj interface{}) {
node := nodeObj.(*api.Node)
v, err := version.Parse(node.Status.NodeInfo.KubeletVersion)
if err != nil {
glog.Infof("couldn't parse verions %q of minion: %v", node.Status.NodeInfo.KubeletVersion, err)
glog.V(0).Infof("couldn't parse verions %q of minion: %v", node.Status.NodeInfo.KubeletVersion, err)
utilruntime.HandleError(nc.forcefullyDeletePod(pod))
return
}
Expand Down Expand Up @@ -407,7 +413,7 @@ func forcefullyDeletePod(c clientset.Interface, pod *api.Pod) error {
var zero int64
err := c.Core().Pods(pod.Namespace).Delete(pod.Name, &api.DeleteOptions{GracePeriodSeconds: &zero})
if err == nil {
glog.Infof("forceful deletion of %s succeeded", pod.Name)
glog.V(4).Infof("forceful deletion of %s succeeded", pod.Name)
}
return err
}
Expand Down Expand Up @@ -449,13 +455,14 @@ func (nc *NodeController) monitorNodeStatus() error {
// reduce lists/decouple this from monitoring status.
nc.reconcileNodeCIDRs(nodes)
}
seenReady := false
for i := range nodes.Items {
var gracePeriod time.Duration
var lastReadyCondition api.NodeCondition
var readyCondition *api.NodeCondition
var observedReadyCondition api.NodeCondition
var currentReadyCondition *api.NodeCondition
node := &nodes.Items[i]
for rep := 0; rep < nodeStatusUpdateRetry; rep++ {
gracePeriod, lastReadyCondition, readyCondition, err = nc.tryUpdateNodeStatus(node)
gracePeriod, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeStatus(node)
if err == nil {
break
}
Expand All @@ -474,28 +481,32 @@ func (nc *NodeController) monitorNodeStatus() error {

decisionTimestamp := nc.now()

if readyCondition != nil {
if currentReadyCondition != nil {
// Check eviction timeout against decisionTimestamp
if lastReadyCondition.Status == api.ConditionFalse &&
if observedReadyCondition.Status == api.ConditionFalse &&
decisionTimestamp.After(nc.nodeStatusMap[node.Name].readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
if nc.evictPods(node.Name) {
glog.V(4).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout)
}
}
if lastReadyCondition.Status == api.ConditionUnknown &&
if observedReadyCondition.Status == api.ConditionUnknown &&
decisionTimestamp.After(nc.nodeStatusMap[node.Name].probeTimestamp.Add(nc.podEvictionTimeout)) {
if nc.evictPods(node.Name) {
glog.V(4).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout-gracePeriod)
}
}
if lastReadyCondition.Status == api.ConditionTrue {
if observedReadyCondition.Status == api.ConditionTrue {
// We do not treat a master node as a part of the cluster for network segmentation checking.
if !system.IsMasterNode(node) {
seenReady = true
}
if nc.cancelPodEviction(node.Name) {
glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
}
}

// Report node event.
if readyCondition.Status != api.ConditionTrue && lastReadyCondition.Status == api.ConditionTrue {
if currentReadyCondition.Status != api.ConditionTrue && observedReadyCondition.Status == api.ConditionTrue {
nc.recordNodeStatusChange(node, "NodeNotReady")
if err = nc.markAllPodsNotReady(node.Name); err != nil {
utilruntime.HandleError(fmt.Errorf("Unable to mark all pods NotReady on node %v: %v", node.Name, err))
Expand All @@ -504,14 +515,14 @@ func (nc *NodeController) monitorNodeStatus() error {

// Check with the cloud provider to see if the node still exists. If it
// doesn't, delete the node immediately.
if readyCondition.Status != api.ConditionTrue && nc.cloud != nil {
if currentReadyCondition.Status != api.ConditionTrue && nc.cloud != nil {
exists, err := nc.nodeExistsInCloudProvider(node.Name)
if err != nil {
glog.Errorf("Error determining if node %v exists in cloud: %v", node.Name, err)
continue
}
if !exists {
glog.Infof("Deleting node (no longer present in cloud provider): %s", node.Name)
glog.V(2).Infof("Deleting node (no longer present in cloud provider): %s", node.Name)
nc.recordNodeEvent(node.Name, api.EventTypeNormal, "DeletingNode", fmt.Sprintf("Deleting Node %v because it's not present according to cloud provider", node.Name))
go func(nodeName string) {
defer utilruntime.HandleCrash()
Expand All @@ -527,6 +538,18 @@ func (nc *NodeController) monitorNodeStatus() error {
}
}
}

// NC don't see any Ready Node. We assume that the network is segmented and Nodes cannot connect to API server and
// update their statuses. NC enteres network segmentation mode and cancels all evictions in progress.
if !seenReady {
nc.networkSegmentationMode = true
nc.stopAllPodEvictions()
} else {
if nc.networkSegmentationMode {
nc.forceUpdateAllProbeTimes()
nc.networkSegmentationMode = false
}
}
return nil
}

Expand Down Expand Up @@ -632,13 +655,13 @@ func (nc *NodeController) recordNodeStatusChange(node *api.Node, new_status stri
func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, api.NodeCondition, *api.NodeCondition, error) {
var err error
var gracePeriod time.Duration
var lastReadyCondition api.NodeCondition
readyCondition := nc.getCondition(&node.Status, api.NodeReady)
if readyCondition == nil {
var observedReadyCondition api.NodeCondition
currentReadyCondition := nc.getCondition(&node.Status, api.NodeReady)
if currentReadyCondition == nil {
// If ready condition is nil, then kubelet (or nodecontroller) never posted node status.
// A fake ready condition is created, where LastProbeTime and LastTransitionTime is set
// to node.CreationTimestamp to avoid handle the corner case.
lastReadyCondition = api.NodeCondition{
observedReadyCondition = api.NodeCondition{
Type: api.NodeReady,
Status: api.ConditionUnknown,
LastHeartbeatTime: node.CreationTimestamp,
Expand All @@ -652,7 +675,7 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap
}
} else {
// If ready condition is not nil, make a copy of it, since we may modify it in place later.
lastReadyCondition = *readyCondition
observedReadyCondition = *currentReadyCondition
gracePeriod = nc.nodeMonitorGracePeriod
}

Expand Down Expand Up @@ -683,15 +706,13 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap
probeTimestamp: nc.now(),
readyTransitionTimestamp: nc.now(),
}
nc.nodeStatusMap[node.Name] = savedNodeStatus
} else if savedCondition == nil && observedCondition != nil {
glog.V(1).Infof("Creating timestamp entry for newly observed Node %s", node.Name)
savedNodeStatus = nodeStatusData{
status: node.Status,
probeTimestamp: nc.now(),
readyTransitionTimestamp: nc.now(),
}
nc.nodeStatusMap[node.Name] = savedNodeStatus
} else if savedCondition != nil && observedCondition == nil {
glog.Errorf("ReadyCondition was removed from Status of Node %s", node.Name)
// TODO: figure out what to do in this case. For now we do the same thing as above.
Expand All @@ -700,7 +721,6 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap
probeTimestamp: nc.now(),
readyTransitionTimestamp: nc.now(),
}
nc.nodeStatusMap[node.Name] = savedNodeStatus
} else if savedCondition != nil && observedCondition != nil && savedCondition.LastHeartbeatTime != observedCondition.LastHeartbeatTime {
var transitionTime unversioned.Time
// If ReadyCondition changed since the last time we checked, we update the transition timestamp to "now",
Expand All @@ -713,7 +733,7 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap
transitionTime = savedNodeStatus.readyTransitionTimestamp
}
if glog.V(5) {
glog.Infof("Node %s ReadyCondition updated. Updating timestamp: %+v vs %+v.", node.Name, savedNodeStatus.status, node.Status)
glog.V(5).Infof("Node %s ReadyCondition updated. Updating timestamp: %+v vs %+v.", node.Name, savedNodeStatus.status, node.Status)
} else {
glog.V(3).Infof("Node %s ReadyCondition updated. Updating timestamp.", node.Name)
}
Expand All @@ -722,13 +742,13 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap
probeTimestamp: nc.now(),
readyTransitionTimestamp: transitionTime,
}
nc.nodeStatusMap[node.Name] = savedNodeStatus
}
nc.nodeStatusMap[node.Name] = savedNodeStatus

if nc.now().After(savedNodeStatus.probeTimestamp.Add(gracePeriod)) {
// NodeReady condition was last set longer ago than gracePeriod, so update it to Unknown
// (regardless of its current value) in the master.
if readyCondition == nil {
if currentReadyCondition == nil {
glog.V(2).Infof("node %v is never updated by kubelet", node.Name)
node.Status.Conditions = append(node.Status.Conditions, api.NodeCondition{
Type: api.NodeReady,
Expand All @@ -740,14 +760,14 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap
})
} else {
glog.V(4).Infof("node %v hasn't been updated for %+v. Last ready condition is: %+v",
node.Name, nc.now().Time.Sub(savedNodeStatus.probeTimestamp.Time), lastReadyCondition)
if lastReadyCondition.Status != api.ConditionUnknown {
readyCondition.Status = api.ConditionUnknown
readyCondition.Reason = "NodeStatusUnknown"
readyCondition.Message = fmt.Sprintf("Kubelet stopped posting node status.")
node.Name, nc.now().Time.Sub(savedNodeStatus.probeTimestamp.Time), observedReadyCondition)
if observedReadyCondition.Status != api.ConditionUnknown {
currentReadyCondition.Status = api.ConditionUnknown
currentReadyCondition.Reason = "NodeStatusUnknown"
currentReadyCondition.Message = fmt.Sprintf("Kubelet stopped posting node status.")
// LastProbeTime is the last time we heard from kubelet.
readyCondition.LastHeartbeatTime = lastReadyCondition.LastHeartbeatTime
readyCondition.LastTransitionTime = nc.now()
currentReadyCondition.LastHeartbeatTime = observedReadyCondition.LastHeartbeatTime
currentReadyCondition.LastTransitionTime = nc.now()
}
}

Expand Down Expand Up @@ -776,27 +796,41 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap
}
}

if !api.Semantic.DeepEqual(nc.getCondition(&node.Status, api.NodeReady), &lastReadyCondition) {
if !api.Semantic.DeepEqual(nc.getCondition(&node.Status, api.NodeReady), &observedReadyCondition) {
if _, err = nc.kubeClient.Core().Nodes().UpdateStatus(node); err != nil {
glog.Errorf("Error updating node %s: %v", node.Name, err)
return gracePeriod, lastReadyCondition, readyCondition, err
return gracePeriod, observedReadyCondition, currentReadyCondition, err
} else {
nc.nodeStatusMap[node.Name] = nodeStatusData{
status: node.Status,
probeTimestamp: nc.nodeStatusMap[node.Name].probeTimestamp,
readyTransitionTimestamp: nc.now(),
}
return gracePeriod, lastReadyCondition, readyCondition, nil
return gracePeriod, observedReadyCondition, currentReadyCondition, nil
}
}
}

return gracePeriod, lastReadyCondition, readyCondition, err
return gracePeriod, observedReadyCondition, currentReadyCondition, err
}

// forceUpdateAllProbeTimes bumps all observed timestamps in saved nodeStatuses to now. This makes
// all eviction timer to reset.
func (nc *NodeController) forceUpdateAllProbeTimes() {
now := nc.now()
for k, v := range nc.nodeStatusMap {
v.probeTimestamp = now
v.readyTransitionTimestamp = now
nc.nodeStatusMap[k] = v
}
}

// evictPods queues an eviction for the provided node name, and returns false if the node is already
// queued for eviction.
func (nc *NodeController) evictPods(nodeName string) bool {
if nc.networkSegmentationMode {
return false
}
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
return nc.podEvictor.Add(nodeName)
Expand All @@ -816,6 +850,15 @@ func (nc *NodeController) cancelPodEviction(nodeName string) bool {
return false
}

// stopAllPodEvictions removes any queued evictions for all Nodes.
func (nc *NodeController) stopAllPodEvictions() {
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
glog.V(3).Infof("Cancelling all pod evictions.")
nc.podEvictor.Clear()
nc.terminationEvictor.Clear()
}

// deletePods will delete all pods from master running on given node, and return true
// if any pods were deleted.
func (nc *NodeController) deletePods(nodeName string) (bool, error) {
Expand Down
Loading