diff --git a/pkg/framework/plugins/removefailedpods/failedpods.go b/pkg/framework/plugins/removefailedpods/failedpods.go index c414f9f24d..9a8aa60ac1 100644 --- a/pkg/framework/plugins/removefailedpods/failedpods.go +++ b/pkg/framework/plugins/removefailedpods/failedpods.go @@ -30,6 +30,7 @@ import ( "sigs.k8s.io/descheduler/pkg/descheduler/evictions" podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod" frameworktypes "sigs.k8s.io/descheduler/pkg/framework/types" + "sigs.k8s.io/descheduler/pkg/utils" ) const PluginName = "RemoveFailedPods" @@ -59,8 +60,26 @@ func New(ctx context.Context, args runtime.Object, handle frameworktypes.Handle) } // We can combine Filter and PreEvictionFilter since for this strategy it does not matter where we run PreEvictionFilter + defaultFilter := handle.Evictor().Filter + if failedPodsArgs.EvictFailedSystemCriticalPods { + // The operator has explicitly opted in to evicting failed pods that + // would otherwise be protected by the framework default-evictor's + // SystemCriticalPods check. Wrap the default-evictor filter so that + // pods which are both PodFailed and system-critical are admitted at + // candidate-listing time; all other DefaultEvictor protections + // (DaemonSet, StaticPod, LocalStorage, ...) still apply, as do + // PreEvictionFilter, the PodFailed phase check (below), and + // validateCanEvict. + defaultFilter = func(pod *v1.Pod) bool { + if pod.Status.Phase == v1.PodFailed && utils.IsCriticalPriorityPod(pod) { + return true + } + return handle.Evictor().Filter(pod) + } + } + podFilter, err := podutil.NewOptions(). - WithFilter(podutil.WrapFilterFuncs(handle.Evictor().Filter, handle.Evictor().PreEvictionFilter)). + WithFilter(podutil.WrapFilterFuncs(defaultFilter, handle.Evictor().PreEvictionFilter)). WithNamespaces(includedNamespaces). WithoutNamespaces(excludedNamespaces). WithLabelSelector(failedPodsArgs.LabelSelector). diff --git a/pkg/framework/plugins/removefailedpods/failedpods_test.go b/pkg/framework/plugins/removefailedpods/failedpods_test.go index 010478e946..577e44144d 100644 --- a/pkg/framework/plugins/removefailedpods/failedpods_test.go +++ b/pkg/framework/plugins/removefailedpods/failedpods_test.go @@ -342,6 +342,30 @@ func TestRemoveFailedPods(t *testing.T) { buildTestPod("p2", "node1", newPodStatus("Shutdown", v1.PodFailed, nil, nil), nil), }, }, + { + description: "system-critical priority failed pod is protected by default", + args: createRemoveFailedPodsArgs(false, nil, nil, nil, nil), + nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, + expectedEvictedPodCount: 0, + pods: []*v1.Pod{ + buildCriticalPriorityPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{Reason: "NodeAffinity"}, + })), + }, + }, + { + description: "evictFailedSystemCriticalPods=true, system-critical priority failed pod is evicted", + args: RemoveFailedPodsArgs{ + EvictFailedSystemCriticalPods: true, + }, + nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, + expectedEvictedPodCount: 1, + pods: []*v1.Pod{ + buildCriticalPriorityPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{Reason: "NodeAffinity"}, + })), + }, + }, } for _, tc := range tests { t.Run(tc.description, func(t *testing.T) { @@ -363,13 +387,14 @@ func TestRemoveFailedPods(t *testing.T) { } plugin, err := New(ctx, &RemoveFailedPodsArgs{ - Reasons: tc.args.Reasons, - ExitCodes: tc.args.ExitCodes, - MinPodLifetimeSeconds: tc.args.MinPodLifetimeSeconds, - IncludingInitContainers: tc.args.IncludingInitContainers, - ExcludeOwnerKinds: tc.args.ExcludeOwnerKinds, - LabelSelector: tc.args.LabelSelector, - Namespaces: tc.args.Namespaces, + Reasons: tc.args.Reasons, + ExitCodes: tc.args.ExitCodes, + MinPodLifetimeSeconds: tc.args.MinPodLifetimeSeconds, + IncludingInitContainers: tc.args.IncludingInitContainers, + EvictFailedSystemCriticalPods: tc.args.EvictFailedSystemCriticalPods, + ExcludeOwnerKinds: tc.args.ExcludeOwnerKinds, + LabelSelector: tc.args.LabelSelector, + Namespaces: tc.args.Namespaces, }, handle, ) @@ -414,3 +439,12 @@ func buildTestPod(podName, nodeName string, podStatus v1.PodStatus, deletionTime pod.DeletionTimestamp = deletionTimestamp return pod } + +// buildCriticalPriorityPod builds a failed pod whose priority matches the +// system-critical threshold checked by utils.IsCriticalPriorityPod. +func buildCriticalPriorityPod(podName, nodeName string, podStatus v1.PodStatus) *v1.Pod { + pod := buildTestPod(podName, nodeName, podStatus, nil) + priority := int32(2000000000) + pod.Spec.Priority = &priority + return pod +} diff --git a/pkg/framework/plugins/removefailedpods/types.go b/pkg/framework/plugins/removefailedpods/types.go index b72f379766..3cf8e514f8 100644 --- a/pkg/framework/plugins/removefailedpods/types.go +++ b/pkg/framework/plugins/removefailedpods/types.go @@ -25,11 +25,12 @@ import ( type RemoveFailedPodsArgs struct { metav1.TypeMeta `json:",inline"` - Namespaces *api.Namespaces `json:"namespaces,omitempty"` - LabelSelector *metav1.LabelSelector `json:"labelSelector,omitempty"` - ExcludeOwnerKinds []string `json:"excludeOwnerKinds,omitempty"` - MinPodLifetimeSeconds *uint `json:"minPodLifetimeSeconds,omitempty"` - Reasons []string `json:"reasons,omitempty"` - ExitCodes []int32 `json:"exitCodes,omitempty"` - IncludingInitContainers bool `json:"includingInitContainers,omitempty"` + Namespaces *api.Namespaces `json:"namespaces,omitempty"` + LabelSelector *metav1.LabelSelector `json:"labelSelector,omitempty"` + ExcludeOwnerKinds []string `json:"excludeOwnerKinds,omitempty"` + MinPodLifetimeSeconds *uint `json:"minPodLifetimeSeconds,omitempty"` + Reasons []string `json:"reasons,omitempty"` + ExitCodes []int32 `json:"exitCodes,omitempty"` + IncludingInitContainers bool `json:"includingInitContainers,omitempty"` + EvictFailedSystemCriticalPods bool `json:"evictFailedSystemCriticalPods,omitempty"` }