Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions charts/descheduler/templates/clusterrole.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@ rules:
- apiGroups: ["policy"]
resources: ["poddisruptionbudgets"]
verbs: ["get", "watch", "list"]
- apiGroups: ["apps"]
resources: ["replicasets"]
verbs: ["get", "list", "watch"]
- apiGroups: ["apps"]
resources: ["deployments"]
verbs: ["get", "patch"]
{{- if .Values.leaderElection.enabled }}
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
Expand Down
6 changes: 6 additions & 0 deletions kubernetes/base/rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ rules:
- apiGroups: [""]
resources: ["persistentvolumeclaims"]
verbs: ["get", "watch", "list"]
- apiGroups: ["apps"]
resources: ["replicasets"]
verbs: ["get", "list", "watch"]
- apiGroups: ["apps"]
resources: ["deployments"]
verbs: ["get", "patch"]
---
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
Expand Down
115 changes: 112 additions & 3 deletions pkg/descheduler/evictions/evictions.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,12 @@ import (

"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
policy "k8s.io/api/policy/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/cache"
Expand Down Expand Up @@ -229,6 +231,11 @@ type PodEvictor struct {
erCache *evictionRequestsCache
featureGates featuregate.FeatureGate

// restartedDeployments tracks "namespace/name" of Deployments already rollout-restarted this cycle
restartedDeployments map[string]bool
// lastRolloutRestart indicates the most recent evictPod call used rollout restart
lastRolloutRestart bool

// registeredHandlers contains the registrations of all handlers. It's used to check if all handlers have finished syncing before the scheduling cycles start.
registeredHandlers []cache.ResourceEventHandlerRegistration
}
Expand Down Expand Up @@ -258,6 +265,7 @@ func NewPodEvictor(
metricsEnabled: options.metricsEnabled,
nodePodCount: make(nodePodEvictedCount),
namespacePodCount: make(namespacePodEvictCount),
restartedDeployments: make(map[string]bool),
featureGates: featureGates,
}

Expand Down Expand Up @@ -401,6 +409,7 @@ func (pe *PodEvictor) ResetCounters() {
pe.nodePodCount = make(nodePodEvictedCount)
pe.namespacePodCount = make(namespacePodEvictCount)
pe.totalPodCount = 0
pe.restartedDeployments = make(map[string]bool)
}

func (pe *PodEvictor) evictionRequestsTotal() uint {
Expand Down Expand Up @@ -552,24 +561,124 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
metrics.PodsEvictedTotal.With(map[string]string{"result": "success", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
}

method := "eviction"
if pe.lastRolloutRestart {
method = "rollout-restart"
}

if pe.dryRun {
klog.V(1).InfoS("Evicted pod in dry run mode", "pod", klog.KObj(pod), "reason", opts.Reason, "strategy", opts.StrategyName, "node", pod.Spec.NodeName, "profile", opts.ProfileName)
klog.V(1).InfoS("Evicted pod in dry run mode", "pod", klog.KObj(pod), "reason", opts.Reason, "strategy", opts.StrategyName, "node", pod.Spec.NodeName, "profile", opts.ProfileName, "method", method)
} else {
klog.V(1).InfoS("Evicted pod", "pod", klog.KObj(pod), "reason", opts.Reason, "strategy", opts.StrategyName, "node", pod.Spec.NodeName, "profile", opts.ProfileName)
klog.V(1).InfoS("Evicted pod", "pod", klog.KObj(pod), "reason", opts.Reason, "strategy", opts.StrategyName, "node", pod.Spec.NodeName, "profile", opts.ProfileName, "method", method)
reason := opts.Reason
if len(reason) == 0 {
reason = opts.StrategyName
Comment on lines 561 to 575
Copy link

Copilot AI Feb 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The rollout restart increments the eviction counters (nodePodCount, namespacePodCount, totalPodCount) even though no actual pod eviction occurs. This could be misleading for metrics and limits enforcement. Consider either: (1) not incrementing these counters for rollout restarts since no pod is immediately evicted, or (2) documenting this behavior clearly, as the rollout restart will eventually cause a pod to be terminated by the Deployment controller but at a different time than a direct eviction.

Copilot uses AI. Check for mistakes.
if len(reason) == 0 {
reason = "NotSet"
}
}
pe.eventRecorder.Eventf(pod, nil, v1.EventTypeNormal, reason, "Descheduled", "pod eviction from %v node by sigs.k8s.io/descheduler", pod.Spec.NodeName)
if pe.lastRolloutRestart {
Copy link
Copy Markdown
Contributor

@ingvagabund ingvagabund May 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it always guaranteed pe.lastRolloutRestart was set by pod.Namespace/pod.Name?

pe.eventRecorder.Eventf(pod, nil, v1.EventTypeNormal, reason, "Descheduled", "pod rollout-restarted (single-replica) from %v node by sigs.k8s.io/descheduler", pod.Spec.NodeName)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Annotating the corresponding deployment does not always guarantee the pod will get rolled out. Recording the event might be confusing.

} else {
pe.eventRecorder.Eventf(pod, nil, v1.EventTypeNormal, reason, "Descheduled", "pod eviction from %v node by sigs.k8s.io/descheduler", pod.Spec.NodeName)
}
}
return nil
}

// resolveDeploymentOwner walks the owner chain Pod → ReplicaSet → Deployment
// and returns the Deployment object fetched from the API server.
//
// Both lookups are performed in pod.Namespace using pe.client. Owner
// references are matched on Kind only; the first matching reference wins.
//
// Returns (nil, nil) if the pod has no ReplicaSet owner reference, or if the
// ReplicaSet has no Deployment owner reference. Returns a non-nil error if
// either Get call fails; the underlying error is formatted with %v, so it is
// embedded as text rather than wrapped.
func (pe *PodEvictor) resolveDeploymentOwner(ctx context.Context, pod *v1.Pod) (*appsv1.Deployment, error) {
// Step 1: find the name of the first ReplicaSet listed among the pod's owners.
var rsName string
for _, ref := range pod.OwnerReferences {
if ref.Kind == "ReplicaSet" {
rsName = ref.Name
break
}
}
if rsName == "" {
// No ReplicaSet owner: the pod cannot belong to a Deployment.
return nil, nil
}

// Step 2: fetch the ReplicaSet so its own owner references can be inspected.
rs, err := pe.client.AppsV1().ReplicaSets(pod.Namespace).Get(ctx, rsName, metav1.GetOptions{})
if err != nil {
return nil, fmt.Errorf("failed to get ReplicaSet %s/%s: %v", pod.Namespace, rsName, err)
}

// Step 3: find the name of the first Deployment listed among the ReplicaSet's owners.
var deployName string
for _, ref := range rs.OwnerReferences {
if ref.Kind == "Deployment" {
deployName = ref.Name
break
}
}
if deployName == "" {
// The ReplicaSet is not managed by a Deployment.
return nil, nil
}

// Step 4: fetch the Deployment itself.
deploy, err := pe.client.AppsV1().Deployments(pod.Namespace).Get(ctx, deployName, metav1.GetOptions{})
if err != nil {
return nil, fmt.Errorf("failed to get Deployment %s/%s: %v", pod.Namespace, deployName, err)
}
Comment on lines +603 to +622
Copy link

Copilot AI Feb 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When the ReplicaSet or Deployment is not found (NotFound errors), the function returns an error which causes a fallback to normal eviction. However, NotFound errors for ReplicaSets or Deployments could be transient (e.g., during deletion) or permanent (orphaned pods). Consider distinguishing between NotFound errors and other errors: for NotFound errors, you might want to return (nil, nil) to indicate "not a Deployment pod" rather than an error, so the log message would be more accurate ("not owned by a Deployment" rather than "failed to resolve").

Copilot uses AI. Check for mistakes.

return deploy, nil
}

// rolloutRestartDeployment patches the Deployment's pod template annotation to trigger a rolling restart.
//
// It sets spec.template.metadata.annotations["kubectl.kubernetes.io/restartedAt"]
// to the current time formatted as RFC 3339 and sends the change as a
// strategic merge patch to the Deployment identified by deploy.Namespace and
// deploy.Name. The patched object returned by the API is discarded; only the
// error from the Patch call is returned.
func (pe *PodEvictor) rolloutRestartDeployment(ctx context.Context, deploy *appsv1.Deployment) error {
// Build the JSON patch body; the timestamp is the only variable part.
patch := fmt.Sprintf(`{"spec":{"template":{"metadata":{"annotations":{"kubectl.kubernetes.io/restartedAt":"%s"}}}}}`, time.Now().Format(time.RFC3339))
Copy link

Copilot AI Feb 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The annotation key "kubectl.kubernetes.io/restartedAt" is used here to trigger a rollout restart, which matches the convention used by kubectl rollout restart. However, consider whether descheduler should use its own annotation key (e.g., "descheduler.sigs.k8s.io/restartedAt") to clearly indicate the source of the restart and avoid confusion with manual kubectl restarts. This would also make it easier to track which restarts were initiated by the descheduler versus manual operations.

Copilot uses AI. Check for mistakes.
// Apply the patch; the updated Deployment in the response is not needed.
_, err := pe.client.AppsV1().Deployments(deploy.Namespace).Patch(ctx, deploy.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{})
return err
}

// tryRolloutRestart checks whether the pod belongs to a single-replica
// Deployment whose strategy is not Recreate and which reports no unavailable
// replicas, and if so triggers a rollout restart of that Deployment instead
// of evicting the pod.
//
// It returns two booleans, (handled, ignore):
//   - (false, false): normal eviction should proceed. This covers a failed
//     owner lookup, a pod with no Deployment owner, a Deployment that does not
//     meet the conditions above, and a failed restart patch.
//   - (true, false): a rollout restart was triggered for the Deployment (or
//     would have been, in dry-run mode) and recorded in pe.restartedDeployments.
//   - (true, true): the Deployment is already recorded in
//     pe.restartedDeployments, so nothing further is done for this pod.
//
// Failures are logged here and are not returned to the caller.
func (pe *PodEvictor) tryRolloutRestart(ctx context.Context, pod *v1.Pod) (handled bool, ignore bool) {
deploy, err := pe.resolveDeploymentOwner(ctx, pod)
if err != nil {
klog.V(3).InfoS("Failed to resolve Deployment owner, falling through to normal eviction", "pod", klog.KObj(pod), "err", err)
return false, false
}
if deploy == nil {
// Pod is not owned by a Deployment.
return false, false
}

// An unset spec.replicas is treated as 1.
replicas := int32(1)
if deploy.Spec.Replicas != nil {
replicas = *deploy.Spec.Replicas
}
isRecreate := deploy.Spec.Strategy.Type == appsv1.RecreateDeploymentStrategyType

// Only single-replica, non-Recreate Deployments with no unavailable replicas qualify.
if replicas != 1 || isRecreate || deploy.Status.UnavailableReplicas != 0 {
Copy link

Copilot AI Feb 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The check for deploy.Status.UnavailableReplicas != 0 may not be sufficient to detect unhealthy deployments. Consider also checking if the deployment is currently progressing (e.g., by examining deploy.Status.Conditions for the "Progressing" condition with status "False" or reason "ProgressDeadlineExceeded"). A deployment could have UnavailableReplicas == 0 but still be in a failed state if the previous rollout hasn't completed successfully.

Copilot uses AI. Check for mistakes.
return false, false
}

// Key used to remember which Deployments were already restarted.
deployKey := deploy.Namespace + "/" + deploy.Name
if pe.restartedDeployments[deployKey] {
klog.V(3).InfoS("Deployment already rollout-restarted this cycle, skipping", "deployment", deployKey, "pod", klog.KObj(pod))
return true, true
Copy link

Copilot AI Feb 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When a deployment has already been rollout-restarted in this cycle and a second pod from the same deployment is processed, the function returns (true, true) meaning "handled=true, ignore=true". However, in the calling code (EvictPod), when ignore=true, the counters are NOT incremented (line 549-550 returns early). This creates an inconsistency: the first pod increments counters, but subsequent pods from the same deployment don't. This could lead to under-counting of affected pods in scenarios where multiple pods from a single-replica deployment are candidates for eviction.

Suggested change
return true, true
return true, false

Copilot uses AI. Check for mistakes.
}
// In dry-run mode the patch is skipped, but the restart is still logged and recorded below.
if !pe.dryRun {
if err := pe.rolloutRestartDeployment(ctx, deploy); err != nil {
klog.V(1).InfoS("Failed to rollout restart Deployment, falling through to normal eviction", "deployment", deployKey, "pod", klog.KObj(pod), "err", err)
return false, false
}
}
klog.V(1).InfoS("Triggered rollout restart for single-replica Deployment instead of eviction", "deployment", deployKey, "pod", klog.KObj(pod), "dryRun", pe.dryRun)
// Remember the Deployment so later pods from it take the (true, true) path above.
pe.restartedDeployments[deployKey] = true
Copy link
Copy Markdown
Contributor

@ingvagabund ingvagabund May 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The older deployment keys are not getting garbage collected.

EDIT: ResetCounters() resets it.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might not work well when the descheduling cycle is short.

return true, false
}

// evictPod returns (ignore, err).
func (pe *PodEvictor) evictPod(ctx context.Context, pod *v1.Pod, opts EvictOptions) (bool, error) {
pe.lastRolloutRestart = false

if handled, ignore := pe.tryRolloutRestart(ctx, pod); handled {
pe.lastRolloutRestart = !ignore
return ignore, nil
}
deleteOptions := &metav1.DeleteOptions{
GracePeriodSeconds: pe.gracePeriodSeconds,
}
Expand Down
Loading
Loading