diff --git a/Makefile b/Makefile index e958d19548..86ddc82e9e 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,7 @@ ARCHS = amd64 arm arm64 LDFLAGS=-ldflags "-X ${LDFLAG_LOCATION}.version=${VERSION} -X ${LDFLAG_LOCATION}.buildDate=${BUILD} -X ${LDFLAG_LOCATION}.gitbranch=${BRANCH} -X ${LDFLAG_LOCATION}.gitsha1=${SHA1}" -GOLANGCI_VERSION := v2.8.0 +GOLANGCI_VERSION := v2.12.2 HAS_GOLANGCI := $(shell ls _output/bin/golangci-lint 2> /dev/null) GOFUMPT_VERSION := v0.7.0 @@ -146,7 +146,7 @@ verify-gen: lint: ifndef HAS_GOLANGCI - curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b ./_output/bin ${GOLANGCI_VERSION} + GOBIN=$(CURRENT_DIR)/_output/bin go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@${GOLANGCI_VERSION} endif ./_output/bin/golangci-lint run -v ./_output/bin/golangci-lint fmt -v diff --git a/README.md b/README.md index f66e03c71b..8579ce19b5 100644 --- a/README.md +++ b/README.md @@ -697,6 +697,25 @@ The `topologyBalanceNodeFit` arg is used when balancing topology domains while t topologyBalanceNodeFit: false ``` +The `zoneAwareNodeFit` arg (default: `false`) prevents over-committing a single +under-loaded topology domain within one balancing round. For each candidate pod the +descheduler considers evicting, it requires that at least one specific under-loaded +domain (identified by the constraint's `topologyKey`) has both (a) a node where the pod +fits per the scheduler's per-node predicates AND (b) sufficient remaining aggregate +resource headroom — after subtracting pods already committed to that domain in this +round — to absorb the pod's cpu/memory request and one pod slot. + +Unlike `topologyBalanceNodeFit`, which only checks per-node fit on the union of +under-loaded domain nodes (a check that is stateless across the batch), +`zoneAwareNodeFit` tracks per-domain cumulative state. Use it when batched eviction +would otherwise push more pods toward an under-loaded domain than it can actually +absorb, causing the scheduler to send the excess back to the over-loaded domain +(eviction churn). + +```yaml +zoneAwareNodeFit: true +``` + Strategy parameter `labelSelector` is not utilized when balancing topology domains and is only applied during eviction to determine if the pod can be evicted. [Supported Constraints](https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/#spread-constraint-definition) fields: @@ -714,12 +733,13 @@ Strategy parameter `labelSelector` is not utilized when balancing topology domai **Parameters:** -|Name|Type| -|---|---| -|`namespaces`|(see [namespace filtering](#namespace-filtering))| -|`labelSelector`|(see [label filtering](#label-filtering))| +|Name|Type|Description| +|---|---|---| +|`namespaces`|(see [namespace filtering](#namespace-filtering))|| +|`labelSelector`|(see [label filtering](#label-filtering))|| |`constraints`|(see [whenUnsatisfiable](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#topologyspreadconstraint-v1-core))|| |`topologyBalanceNodeFit`|bool|default `true`. [node fit filtering](#node-fit-filtering) when balancing topology domains| +|`zoneAwareNodeFit`|bool|default `false`. When enabled, gates eviction on cumulative per-topology-domain headroom: a candidate pod is admitted only if some specific under-loaded domain has both a node where it fits AND remaining aggregate cpu/memory/pod-slot headroom (after pods already committed to that domain in this round). Prevents over-committing a single under-loaded domain — a case `topologyBalanceNodeFit` cannot detect.| **Example:** diff --git a/docs/proposals/zone-aware-nodefit.md b/docs/proposals/zone-aware-nodefit.md new file mode 100644 index 0000000000..f66d042141 --- /dev/null +++ b/docs/proposals/zone-aware-nodefit.md @@ -0,0 +1,282 @@ +# RFC: ZoneAwareNodeFit for RemovePodsViolatingTopologySpreadConstraint + +**Status:** Implementable +**Author:** bruno.chauvet@rokt.com +**Date:** 2026-04-15 (revised 2026-05-17) +**Related issues:** kubernetes-sigs/descheduler#1534, kubernetes-sigs/descheduler#1067 + +--- + +## Problem + +`RemovePodsViolatingTopologySpreadConstraint` evicts pods from over-loaded topology +domains in batches: each iteration of `balanceDomains` selects N candidate pods from +an over-loaded domain and lets the scheduler place them in under-loaded domains. The +existing `TopologyBalanceNodeFit` gate (default `true`) checks, for each candidate, +whether the pod can fit on at least one node in the union of all under-loaded domains. + +The check is **stateless across the batch**: every candidate evaluates `PodFitsAnyOtherNode` +against the same node-view (the descheduler does not simulate intermediate scheduler +decisions). So if N candidates each individually fit on a node in the only under-loaded +domain `B`, all N pass the gate — even though `B` may only have aggregate room for +K ≪ N of them. After eviction, the scheduler can place only K pods in `B`; the +remaining N − K either go back to the over-loaded domain (churn — issues #1534, #1067) +or remain Pending. + +### Illustrative scenario + +``` +Domain A (over-loaded): 7 pods on nodes with ample CPU +Domain B (under-loaded by pod count): 1 pod on a node with 250 m CPU allocatable +Existing pod in B uses 100 m → 150 m of aggregate headroom remains in domain B +Candidate pods request 100 m CPU each + +balanceDomains schedules 3 evictions toward domain B. +TopologyBalanceNodeFit evaluates each candidate against the live indexer state +(B1 still has 150 m headroom) → all 3 pass → all 3 evicted. +The scheduler can fit at most 1 of them in B. The other 2 go back to A → churn. +``` + +### Stateless vs cumulative checks, side-by-side + +```mermaid +sequenceDiagram + autonumber + participant BD as balanceDomains + participant TBNF as topologyBalanceNodeFit gate
(stateless per-node check on union of domains) + participant ZANF as zoneAwareNodeFit gate
(cumulative per-domain headroom) + participant Idx as live nodeIndexer
(B1 = 150 m headroom) + participant H as remainingHeadroom map
(zoneB = 150 m initially) + + Note over BD: 3 candidate pods
(each requests 100 m) + + BD->>TBNF: candidate 1 + TBNF->>Idx: PodFitsAnyOtherNode([B1]) + Idx-->>TBNF: yes (150 m ≥ 100 m) + TBNF-->>BD: pass + BD->>ZANF: candidate 1 + ZANF->>H: zoneB headroom? + H-->>ZANF: 150 m + ZANF->>H: decrement by 100 m → 50 m + ZANF-->>BD: pass + + BD->>TBNF: candidate 2 + TBNF->>Idx: PodFitsAnyOtherNode([B1]) + Idx-->>TBNF: yes — indexer is NOT mutated mid-batch + TBNF-->>BD: pass ← bug + BD->>ZANF: candidate 2 + ZANF->>H: zoneB headroom? + H-->>ZANF: 50 m + ZANF-->>BD: reject (50 m < 100 m) + + BD->>TBNF: candidate 3 + TBNF-->>BD: pass ← bug + BD->>ZANF: candidate 3 + ZANF-->>BD: reject + + Note over BD: topologyBalanceNodeFit alone:
3 evictions, scheduler returns 2 → churn
zoneAwareNodeFit (with or without TBNF):
1 eviction, no churn +``` + +The fundamental gap: `TopologyBalanceNodeFit` reasons about **per-node fit on the +union of under-loaded domains**, which mathematically collapses to +`OR(merge(domains))`. It cannot detect cumulative over-commit of a single under-loaded +domain because it never tracks state across the batch. + +## Proposed Change + +Add an opt-in field `ZoneAwareNodeFit *bool` to +`RemovePodsViolatingTopologySpreadConstraintArgs` (default `false`). + +When enabled, `balanceDomains` tracks, **per under-loaded topology domain**, the +aggregate resource headroom remaining for the current batch. Each candidate pod is +admitted only if some specific under-loaded domain has both: + +- (a) at least one node where the pod fits per the scheduler's per-node predicates + (`PodFitsAnyOtherNode`), AND +- (b) sufficient remaining aggregate headroom — after subtracting all pods already + committed to that domain in this batch — to absorb the pod's resource request and + one pod slot. + +On admission, that domain's headroom is decremented. Subsequent candidates see the +decremented state; once a domain is drained, later candidates must find another +qualifying domain or be skipped. + +Per-domain headroom is initialised as `sum(allocatable) − sum(existing pod requests)` +across the domain's nodes, for cpu, memory, and pod count. + +### Why this is qualitatively different from `TopologyBalanceNodeFit` + +The accumulator makes pod-N's decision depend on commitments 1..N-1. There is no +equivalent stateless `OR(merge(...))` formulation, because `PodFitsAnyOtherNode` has +no notion of "headroom remaining after prior commitments in this batch." + +`ZoneAwareNodeFit` is therefore **strictly stricter** than `TopologyBalanceNodeFit`: +any pod that fails `ZoneAwareNodeFit` also fails the cumulative-correctness goal +that `TopologyBalanceNodeFit` cannot express. + +## API + +```yaml +apiVersion: "descheduler/v1alpha2" +kind: DeschedulerPolicy +profiles: + - name: default + plugins: + balance: + enabled: + - name: RemovePodsViolatingTopologySpreadConstraint + pluginConfig: + - name: RemovePodsViolatingTopologySpreadConstraint + args: + topologyBalanceNodeFit: true # unchanged — existing flag + zoneAwareNodeFit: true # NEW — cumulative per-domain gate +``` + +### Field semantics + +| Field | Default | Reasoning level | Catches cumulative over-commit? | +|---|---|---|---| +| `topologyBalanceNodeFit` | `true` | Per-node, stateless across batch | No | +| `zoneAwareNodeFit` | `false` | Per-domain, cumulative within batch | Yes | + +Enabling both is valid: `TopologyBalanceNodeFit` runs first as a fast per-node pre-filter, +then `ZoneAwareNodeFit` applies the cumulative-headroom constraint. + +## Implementation + +### types.go + +```go +// ZoneAwareNodeFit, if set to true, requires that each pod evicted by topology-spread +// balancing have, in at least one specific under-loaded topology domain (as identified +// by the constraint's TopologyKey), (a) a node where the pod fits per the scheduler's +// per-node predicates AND (b) sufficient remaining aggregate resource headroom — after +// accounting for other pods already committed to that domain during the same balancing +// round — to absorb the pod. +ZoneAwareNodeFit *bool `json:"zoneAwareNodeFit,omitempty"` +``` + +### defaults.go + +```go +if args.ZoneAwareNodeFit == nil { + args.ZoneAwareNodeFit = utilptr.To(false) +} +``` + +### topologyspreadconstraint.go + +Two new helpers replace the original duplicate filter + naive OR(OR) check: + +```go +// groupNodesByDomain classifies the supplied nodes (already filtered to under-loaded +// domains) by their topology-key label value. Single pass over the same node slice +// the existing filterNodesBelowIdealAvg returns. +func groupNodesByDomain(nodes []*v1.Node, topologyKey string) map[string][]*v1.Node + +// computeDomainHeadroom returns, per under-loaded domain, the aggregate remaining +// resource headroom (allocatable − already-requested) summed across the domain's +// nodes. Tracks cpu, memory, and pod count. Computed once per balanceDomains call. +func computeDomainHeadroom( + nodesByDomain map[string][]*v1.Node, + nodeIndexer podutil.GetPodsAssignedToNodeFunc, +) map[string]v1.ResourceList + +// podFitsSomeDomainWithHeadroom returns the topology-domain key whose remaining +// headroom AND per-node fit both admit pod. On success it decrements that domain's +// headroom in place and returns ok=true. Iterates domains in descending remaining +// CPU headroom order so pods spread toward the roomiest under-loaded domain first; +// ties fall back to alphabetical for determinism. +func podFitsSomeDomainWithHeadroom( + nodeIndexer podutil.GetPodsAssignedToNodeFunc, + pod *v1.Pod, + nodesByDomain map[string][]*v1.Node, + remainingHeadroom map[string]v1.ResourceList, +) (string, bool) +``` + +In `balanceDomains`: + +```go +zoneAwareNodeFit := utilptr.Deref(d.args.ZoneAwareNodeFit, false) +var nodesByDomain map[string][]*v1.Node +var remainingHeadroom map[string]v1.ResourceList +if zoneAwareNodeFit { + nodesByDomain = groupNodesByDomain(nodesBelowIdealAvg, tsc.TopologyKey) + remainingHeadroom = computeDomainHeadroom(nodesByDomain, getPodsAssignedToNode) +} +``` + +Inside the per-pod loop, after the existing `topologyBalanceNodeFit` gate: + +```go +if zoneAwareNodeFit { + if _, ok := podFitsSomeDomainWithHeadroom( + getPodsAssignedToNode, aboveToEvict[k], nodesByDomain, remainingHeadroom, + ); !ok { + d.logger.V(2).Info( + "ignoring pod for eviction: no target topology domain has fit + remaining headroom", + "pod", klog.KObj(aboveToEvict[k]), + ) + continue + } +} +``` + +## Alternatives Considered + +**Modify `TopologyBalanceNodeFit` semantics to be cumulative.** Silent behavioural change +for existing users. Rejected. + +**Per-node-fit-only gate, no headroom tracking.** This is what the first revision of this +proposal shipped; review surfaced that it collapses to `OR(merge(domains))` and adds +nothing over `TopologyBalanceNodeFit`. Rejected — see Problem section. + +**Threading zone context through `EvictorPlugin.PreEvictionFilter`.** Breaking API change +across all plugins. Rejected in favour of a TSC-local gate. + +## Test Plan + +The acceptance bar for this test plan: each behavioural case must produce a different +result under the redesign than under a naive `OR(merge(domains))` per-node-fit +implementation. Cases marked **gap-catching** would have flagged the prior revision's +churn bug. Cases marked **regression** ensure the new gate still enforces existing +invariants (per-node fit, default-off no-op). + +### Integration cases (`TestTopologySpreadConstraint` table) + +Shared topology unless noted: `zoneA` over-loaded (7 pods on a 2 000 m node); +`zoneB` under-loaded (1 pod on a 250 m node → 150 m aggregate headroom). +Each candidate pod requests 100 m CPU. balanceDomains schedules 3 evictions. + +| # | Scenario | Args | Expected | Catches gap? | +|---|---|---|---|---| +| 1 | ZoneAwareNodeFit caps cumulative load **on top of** TopologyBalanceNodeFit | TBNF=true (default), ZANF=true, evictor nodeFit=false | 1 of 3 | ✅ — prior impl admits 3 (per-node check stateless across batch) | +| 2 | ZoneAwareNodeFit alone caps at aggregate headroom | TBNF=false, ZANF=true, evictor nodeFit=false | 1 of 3 | ✅ — prior impl admits 3 | +| 3 | TopologyBalanceNodeFit alone does **not** cap cumulative load (the bug) | defaults (TBNF=true, ZANF=false), nodeFit=false | 3 of 3 | ❌ baseline contrast (proves the bug exists without the new gate) | +| 4 | Drains domain by domain, caps at sum of headroom across domains (zoneB=150 m, zoneC=250 m → total headroom 400 m fits 4×100 m, but algorithm schedules 4 evictions in 2 iterations; the 4th finds both domains drained) | TBNF=false, ZANF=true, nodeFit=false | 3 of 4 | ✅ — prior impl admits 4 (zoneC always has per-node fit) | +| 5 | Per-node fit still required when aggregate headroom is misleadingly high (zoneB has 5 × 50 m nodes = 250 m aggregate but no single node fits 100 m) | TBNF=false, ZANF=true, nodeFit=false | 0 evictions | ❌ regression — both impls reject | +| 6 | Multi-node-per-domain grouping (zoneB has 2 nodes each contributing 100 m → 200 m aggregate fits 2×100 m) | TBNF=false, ZANF=true, nodeFit=false | 2 evictions | ✅ — would surface a grouping bug that mis-bucketed nodes | +| 7 | Default off — backward compat (ZANF unset, evictor nodeFit handles it) | defaults, nodeFit=true | 0 evictions | ❌ regression — identical to prior versions | + +### Helper unit tests + +| Test | Coverage | +|---|---| +| `TestGroupNodesByDomain` | Classifies by `node.Labels[topologyKey]`; drops nodes missing the label | +| `TestComputeDomainHeadroom` | Aggregates allocatable − sum-of-requests across a domain's nodes; counts cpu (millicores) and pod slots correctly | +| `TestComputeDomainHeadroomFailsClosedOnIndexerError` | When `ListPodsOnANode` errors for a node, that node contributes zero to the domain aggregate (no allocatable, no requests) | +| `TestPodFitsSomeDomainWithHeadroom` — roomiest first | Picks the domain with the highest remaining CPU headroom even when it sorts later alphabetically | +| `TestPodFitsSomeDomainWithHeadroom` — alphabetical tie-break | When two domains have equal headroom, commits to the alphabetically-earlier domain (determinism) | +| `TestPodFitsSomeDomainWithHeadroom` — drain then reject | Three sequential calls with starting headroom = 250 m; first two admit, third rejects (50 m < 100 m) | +| `TestPodFitsSomeDomainWithHeadroom` — aggregate failure | Headroom < pod request → reject | +| `TestPodFitsSomeDomainWithHeadroom` — per-node failure | Aggregate ample but no single node fits (occupant pod consumes capacity) → reject | +| `TestPodFitsSomeDomainWithHeadroom` — empty input | Empty `nodesByDomain` → reject without error | + +## Risk + +**Low.** +- Opt-in (`false` by default). +- Additive: existing `TopologyBalanceNodeFit` is unchanged. +- Localised to `balanceDomains` and a few new helpers in the TSC plugin. +- No changes to framework types, eviction API, or other plugins. diff --git a/hack/lib/go.sh b/hack/lib/go.sh index 311d0f93b7..d0a4242ba0 100644 --- a/hack/lib/go.sh +++ b/hack/lib/go.sh @@ -19,7 +19,7 @@ go::verify_version() { GO_VERSION=($(go version)) - if [[ -z $(echo "${GO_VERSION[2]}" | grep -E 'go1.23|go1.24|go1.25') ]]; then + if [[ -z $(echo "${GO_VERSION[2]}" | grep -E 'go1.24|go1.25|go1.26') ]]; then echo "Unknown go version '${GO_VERSION[2]}', skipping gofmt." exit 1 fi diff --git a/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/defaults.go b/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/defaults.go index 55b4092f87..49016363ea 100644 --- a/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/defaults.go +++ b/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/defaults.go @@ -36,6 +36,9 @@ func SetDefaults_RemovePodsViolatingTopologySpreadConstraintArgs(obj runtime.Obj if args.TopologyBalanceNodeFit == nil { args.TopologyBalanceNodeFit = utilptr.To(true) } + if args.ZoneAwareNodeFit == nil { + args.ZoneAwareNodeFit = utilptr.To(false) + } if len(args.Constraints) == 0 { args.Constraints = append(args.Constraints, v1.DoNotSchedule) } diff --git a/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/defaults_test.go b/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/defaults_test.go index 0901e70874..4b96b9760c 100644 --- a/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/defaults_test.go +++ b/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/defaults_test.go @@ -49,6 +49,7 @@ func TestSetDefaults_RemovePodsViolatingTopologySpreadConstraintArgs(t *testing. LabelSelector: nil, Constraints: []v1.UnsatisfiableConstraintAction{v1.DoNotSchedule}, TopologyBalanceNodeFit: utilptr.To(true), + ZoneAwareNodeFit: utilptr.To(false), }, }, { @@ -63,6 +64,7 @@ func TestSetDefaults_RemovePodsViolatingTopologySpreadConstraintArgs(t *testing. LabelSelector: &metav1.LabelSelector{}, Constraints: []v1.UnsatisfiableConstraintAction{v1.DoNotSchedule, v1.ScheduleAnyway}, TopologyBalanceNodeFit: utilptr.To(true), + ZoneAwareNodeFit: utilptr.To(false), }, }, { @@ -71,6 +73,7 @@ func TestSetDefaults_RemovePodsViolatingTopologySpreadConstraintArgs(t *testing. want: &RemovePodsViolatingTopologySpreadConstraintArgs{ Constraints: []v1.UnsatisfiableConstraintAction{v1.DoNotSchedule}, TopologyBalanceNodeFit: utilptr.To(true), + ZoneAwareNodeFit: utilptr.To(false), }, }, { @@ -81,6 +84,7 @@ func TestSetDefaults_RemovePodsViolatingTopologySpreadConstraintArgs(t *testing. want: &RemovePodsViolatingTopologySpreadConstraintArgs{ TopologyBalanceNodeFit: utilptr.To(false), Constraints: []v1.UnsatisfiableConstraintAction{v1.DoNotSchedule}, + ZoneAwareNodeFit: utilptr.To(false), }, }, { @@ -91,6 +95,18 @@ func TestSetDefaults_RemovePodsViolatingTopologySpreadConstraintArgs(t *testing. want: &RemovePodsViolatingTopologySpreadConstraintArgs{ Constraints: []v1.UnsatisfiableConstraintAction{v1.DoNotSchedule}, TopologyBalanceNodeFit: utilptr.To(true), + ZoneAwareNodeFit: utilptr.To(false), + }, + }, + { + name: "RemovePodsViolatingTopologySpreadConstraintArgs with ZoneAwareNodeFit=true is not overwritten", + in: &RemovePodsViolatingTopologySpreadConstraintArgs{ + ZoneAwareNodeFit: utilptr.To(true), + }, + want: &RemovePodsViolatingTopologySpreadConstraintArgs{ + TopologyBalanceNodeFit: utilptr.To(true), + Constraints: []v1.UnsatisfiableConstraintAction{v1.DoNotSchedule}, + ZoneAwareNodeFit: utilptr.To(true), }, }, } diff --git a/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/topologyspreadconstraint.go b/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/topologyspreadconstraint.go index a47cb77f72..e036a27e32 100644 --- a/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/topologyspreadconstraint.go +++ b/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/topologyspreadconstraint.go @@ -21,6 +21,7 @@ import ( "sort" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" @@ -315,6 +316,28 @@ func (d *RemovePodsViolatingTopologySpreadConstraint) balanceDomains(ctx context eligibleNodes := filterEligibleNodes(ctx, nodes, tsc) nodesBelowIdealAvg := filterNodesBelowIdealAvg(eligibleNodes, sortedDomains, tsc.TopologyKey, idealAvg) + zoneAwareNodeFit := utilptr.Deref(d.args.ZoneAwareNodeFit, false) + + // When zoneAwareNodeFit is enabled, group the same under-loaded nodes by their + // topology-domain value and compute each domain's remaining aggregate headroom. + // The headroom map is mutated as evictions are committed within this call so that + // later candidate pods see headroom already consumed by earlier ones. + // + // The grouping and headroom snapshot are taken once at entry and are *not* recomputed + // as the outer (i, j) loop progresses. As pods get added to podsForEviction the + // effective set of below-ideal domains can shift, but the snapshot's drift is the + // safe (conservative) direction: a domain that becomes saturated mid-batch will stop + // admitting new pods once its tracked headroom is drained, and a domain that becomes + // further below idealAvg is simply not promoted into the gate's view. Recomputing + // per iteration would be more accurate but costs an extra ListPodsOnANode sweep + // across the domain's nodes for every (i, j) step. + var nodesByDomain map[string][]*v1.Node + var remainingHeadroom map[string]v1.ResourceList + if zoneAwareNodeFit { + nodesByDomain = groupNodesByDomain(nodesBelowIdealAvg, tsc.TopologyKey) + remainingHeadroom = computeDomainHeadroom(nodesByDomain, getPodsAssignedToNode) + } + // i is the index for belowOrEqualAvg // j is the index for aboveAvg i := 0 @@ -379,6 +402,23 @@ func (d *RemovePodsViolatingTopologySpreadConstraint) balanceDomains(ctx context continue } + if zoneAwareNodeFit { + // Note: headroom is decremented at this point, before the actual eviction + // is recorded in the eviction loop. If a later eviction filter (rate limit, + // pre-eviction predicate, eviction error) rejects this pod, the chosen + // domain's headroom has been consumed without an actual pod move, making + // the gate conservative on the next candidate. Deferring the subtraction + // would require restructuring the eviction loop to call back into the + // gate on success; the current behaviour favours safety (admit fewer + // pods than nominally possible) over throughput. + targetDomain, ok := podFitsSomeDomainWithHeadroom(getPodsAssignedToNode, aboveToEvict[k], nodesByDomain, remainingHeadroom) + if !ok { + d.logger.V(2).Info("ignoring pod for eviction: no target topology domain has fit + remaining headroom", "pod", klog.KObj(aboveToEvict[k])) + continue + } + d.logger.V(2).Info("ZoneAwareNodeFit admitted pod; target domain headroom decremented", "pod", klog.KObj(aboveToEvict[k]), "targetDomain", targetDomain) + } + podsForEviction[aboveToEvict[k]] = struct{}{} } sortedDomains[j].pods = sortedDomains[j].pods[:len(sortedDomains[j].pods)-movePods] @@ -405,6 +445,150 @@ func filterNodesBelowIdealAvg(nodes []*v1.Node, sortedDomains []topology, topolo return nodesBelowIdealAvg } +// groupNodesByDomain classifies the given nodes by their topology-domain label value. +// Nodes missing the label are skipped. Returns an empty (non-nil) map if no nodes have the label. +func groupNodesByDomain(nodes []*v1.Node, topologyKey string) map[string][]*v1.Node { + result := make(map[string][]*v1.Node) + for _, n := range nodes { + if v, ok := n.Labels[topologyKey]; ok { + result[v] = append(result[v], n) + } + } + return result +} + +// computeDomainHeadroom returns, for each topology domain, the aggregate remaining +// resource headroom (allocatable minus already-requested) summed across the domain's +// nodes. Tracked resources: cpu, memory, pod count. +// +// Fails closed: if the pod indexer cannot list pods on a node, that node is omitted +// from the headroom entirely (neither its allocatable nor any requests are counted) +// and the error is logged. Counting allocatable without subtracting requests would +// over-state available capacity and re-introduce the eviction churn this gate is +// meant to prevent. +func computeDomainHeadroom(nodesByDomain map[string][]*v1.Node, nodeIndexer podutil.GetPodsAssignedToNodeFunc) map[string]v1.ResourceList { + result := make(map[string]v1.ResourceList, len(nodesByDomain)) + for domain, dnodes := range nodesByDomain { + var cpuMilli, memBytes, podSlots int64 + for _, n := range dnodes { + // Filter is intentionally nil so this aggregation reasons over the same pod + // set as nodeAvailableResources / fitsRequest (pkg/descheduler/node/node.go), + // which also calls ListPodsOnANode with no filter. Diverging here would let + // per-node fit and per-domain headroom disagree about what counts as occupied + // capacity; if fitsRequest ever tightens its filter (e.g. to exclude + // terminal/terminating pods), this call site must be updated in the same + // change to keep the two halves of the gate aligned. + podsOnNode, err := podutil.ListPodsOnANode(n.Name, nodeIndexer, nil) + if err != nil { + klog.V(2).ErrorS(err, "ZoneAwareNodeFit: failed to list pods on node; excluding from domain headroom (fail-closed)", "node", n.Name, "domain", domain) + continue + } + cpuMilli += n.Status.Allocatable.Cpu().MilliValue() + memBytes += n.Status.Allocatable.Memory().Value() + podSlots += n.Status.Allocatable.Pods().Value() + for _, p := range podsOnNode { + req, _ := utils.PodRequestsAndLimits(p) + if q, ok := req[v1.ResourceCPU]; ok { + cpuMilli -= q.MilliValue() + } + if q, ok := req[v1.ResourceMemory]; ok { + memBytes -= q.Value() + } + podSlots-- + } + } + result[domain] = v1.ResourceList{ + v1.ResourceCPU: *resource.NewMilliQuantity(cpuMilli, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(memBytes, resource.BinarySI), + v1.ResourcePods: *resource.NewQuantity(podSlots, resource.DecimalSI), + } + } + return result +} + +// podFitsSomeDomainWithHeadroom returns the topology-domain key whose remaining aggregate +// headroom can absorb the pod AND whose nodes include at least one where the pod fits per +// the scheduler's per-node predicates. On success it decrements that domain's headroom by +// the pod's requests (cpu/mem/1 pod slot) and returns ok=true. Returns ok=false if no +// domain qualifies. +// +// Domain selection order: domains are iterated by descending remaining CPU headroom so +// pods are spread toward the roomiest under-loaded domain first; ties (and zero/negative +// headroom) fall back to alphabetical order for determinism. This avoids draining +// alphabetically-earlier domains first and leaving later candidates rejected even though +// a different domain had headroom to spare. +// +// The CPU-only sort key is intentional: CPU is the most commonly request-constrained +// resource for workloads we are spreading across topology domains, and a multi-resource +// composite scoring (e.g. normalized headroom across the pod's actual requests) would +// add complexity without changing behaviour for the typical case. For memory-heavy pods +// in environments with abundant CPU slack but uneven memory headroom, the alphabetical +// fallback can pick a domain with tighter memory than another candidate; the +// headroomCoversPod check below still rejects domains that cannot absorb the pod, so +// this only affects ordering, not correctness. +func podFitsSomeDomainWithHeadroom( + nodeIndexer podutil.GetPodsAssignedToNodeFunc, + pod *v1.Pod, + nodesByDomain map[string][]*v1.Node, + remainingHeadroom map[string]v1.ResourceList, +) (string, bool) { + podReq, _ := utils.PodRequestsAndLimits(pod) + domains := make([]string, 0, len(nodesByDomain)) + for d := range nodesByDomain { + domains = append(domains, d) + } + sort.Slice(domains, func(i, j int) bool { + ci := remainingHeadroom[domains[i]][v1.ResourceCPU] + cj := remainingHeadroom[domains[j]][v1.ResourceCPU] + if ci.MilliValue() != cj.MilliValue() { + return ci.MilliValue() > cj.MilliValue() + } + return domains[i] < domains[j] + }) + + for _, domain := range domains { + if !headroomCoversPod(remainingHeadroom[domain], podReq) { + continue + } + if !node.PodFitsAnyOtherNode(nodeIndexer, pod, nodesByDomain[domain]) { + continue + } + subtractPodFromHeadroom(remainingHeadroom, domain, podReq) + return domain, true + } + return "", false +} + +// headroomCoversPod reports whether the domain's remaining headroom is sufficient to +// absorb pod's cpu/memory request plus one pod slot. Missing keys are treated as zero. +func headroomCoversPod(headroom, podReq v1.ResourceList) bool { + cpuHead := headroom[v1.ResourceCPU] + memHead := headroom[v1.ResourceMemory] + podsHead := headroom[v1.ResourcePods] + cpuReq := podReq[v1.ResourceCPU] + memReq := podReq[v1.ResourceMemory] + return cpuHead.MilliValue() >= cpuReq.MilliValue() && + memHead.Value() >= memReq.Value() && + podsHead.Value() >= 1 +} + +// subtractPodFromHeadroom decrements remainingHeadroom[domain] by the pod's cpu/memory +// request and one pod slot. +func subtractPodFromHeadroom(remainingHeadroom map[string]v1.ResourceList, domain string, podReq v1.ResourceList) { + rl := remainingHeadroom[domain] + cpuReq := podReq[v1.ResourceCPU] + memReq := podReq[v1.ResourceMemory] + cpu := rl[v1.ResourceCPU] + cpu.Sub(cpuReq) + rl[v1.ResourceCPU] = cpu + mem := rl[v1.ResourceMemory] + mem.Sub(memReq) + rl[v1.ResourceMemory] = mem + pods := rl[v1.ResourcePods] + pods.Sub(*resource.NewQuantity(1, resource.DecimalSI)) + rl[v1.ResourcePods] = pods +} + // sortDomains sorts and splits the list of topology domains based on their size // it also sorts the list of pods within the domains based on their node affinity/selector and priority in the following order: // 1. non-evictable pods diff --git a/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/topologyspreadconstraint_test.go b/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/topologyspreadconstraint_test.go index 4a484288ab..a3ea9d6402 100644 --- a/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/topologyspreadconstraint_test.go +++ b/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/topologyspreadconstraint_test.go @@ -9,6 +9,7 @@ import ( appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/sets" @@ -16,6 +17,7 @@ import ( utilptr "k8s.io/utils/ptr" "sigs.k8s.io/descheduler/pkg/api" + podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod" "sigs.k8s.io/descheduler/pkg/framework/plugins/defaultevictor" frameworktesting "sigs.k8s.io/descheduler/pkg/framework/testing" frameworktypes "sigs.k8s.io/descheduler/pkg/framework/types" @@ -1454,6 +1456,219 @@ func TestTopologySpreadConstraint(t *testing.T) { namespaces: []string{"ns1"}, args: RemovePodsViolatingTopologySpreadConstraintArgs{}, }, + // --- ZoneAwareNodeFit test cases --- + // All five share the same topology: zoneA (over-loaded, 7 pods) vs zoneB (under-loaded, + // 1 pod). zoneB has just 250m CPU on a single node, of which 100m is already used by the + // existing pod — i.e. 150m of aggregate CPU headroom, enough for exactly one additional + // 100m-request pod. balanceDomains schedules 3 evictions toward zoneB. + // + // These cases isolate ZoneAwareNodeFit by disabling DefaultEvictor NodeFit and (where + // noted) TopologyBalanceNodeFit, so the only differentiator is the new gate. + { + name: "ZoneAwareNodeFit caps cumulative load on top of TopologyBalanceNodeFit (1 of 3)", + // Critical regression: with the previous OR(merge) implementation of + // ZoneAwareNodeFit, this case would evict 3 (the stateless per-node-fit check + // passes for every candidate because the live indexer reports B1 at 150m + // throughout the loop). The cumulative-headroom gate must reduce this to 1, + // even when TopologyBalanceNodeFit is also enabled. This proves ZoneAwareNodeFit + // adds value over the existing flag — not just as a renamed alias. + nodes: []*v1.Node{ + test.BuildTestNode("A1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("B1", 250, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + }, + pods: createTestPods([]testPodList{ + { + count: 7, + node: "A1", + labels: map[string]string{"foo": "bar"}, + constraints: getDefaultTopologyConstraints(1), + }, + {count: 1, node: "B1", labels: map[string]string{"foo": "bar"}}, + }), + expectedEvictedCount: 1, + namespaces: []string{"ns1"}, + args: RemovePodsViolatingTopologySpreadConstraintArgs{ + // TopologyBalanceNodeFit default-on; ZoneAwareNodeFit on. The new gate + // must still cap evictions at zoneB's 150m headroom. + ZoneAwareNodeFit: utilptr.To(true), + }, + nodeFit: false, + }, + { + name: "ZoneAwareNodeFit=true alone caps evictions at zoneB's aggregate headroom (1 of 3)", + nodes: []*v1.Node{ + test.BuildTestNode("A1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("B1", 250, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + }, + pods: createTestPods([]testPodList{ + { + count: 7, + node: "A1", + labels: map[string]string{"foo": "bar"}, + constraints: getDefaultTopologyConstraints(1), + }, + {count: 1, node: "B1", labels: map[string]string{"foo": "bar"}}, + }), + // zoneB headroom: 250m allocatable - 100m existing = 150m → fits exactly 1 more + // 100m-request pod. Pod 1 commits and decrements headroom to 50m. Pods 2 and 3 are + // rejected by the aggregate-headroom check (50m < 100m) even though per-node fit + // (live indexer view) still says B1 has room. + expectedEvictedCount: 1, + namespaces: []string{"ns1"}, + args: RemovePodsViolatingTopologySpreadConstraintArgs{ + TopologyBalanceNodeFit: utilptr.To(false), + ZoneAwareNodeFit: utilptr.To(true), + }, + nodeFit: false, + }, + { + name: "TopologyBalanceNodeFit=true alone does NOT cap cumulative load (evicts 3)", + // Same topology and same cumulative-overflow condition as the previous case. This + // case shows the existing TopologyBalanceNodeFit gate cannot detect the over-commit + // because it evaluates each candidate pod against a stateless per-node fit check; + // all three candidates individually see B1 as having 150m headroom (the indexer is + // not mutated mid-loop), so all three pass and get evicted — the exact churn the + // new gate prevents. + nodes: []*v1.Node{ + test.BuildTestNode("A1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("B1", 250, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + }, + pods: createTestPods([]testPodList{ + { + count: 7, + node: "A1", + labels: map[string]string{"foo": "bar"}, + constraints: getDefaultTopologyConstraints(1), + }, + {count: 1, node: "B1", labels: map[string]string{"foo": "bar"}}, + }), + expectedEvictedCount: 3, + namespaces: []string{"ns1"}, + args: RemovePodsViolatingTopologySpreadConstraintArgs{}, // defaults: TBNF=true, ZANF=false + nodeFit: false, + }, + { + name: "ZoneAwareNodeFit drains domain by domain and caps total at sum of headroom", + // 3 domains, each under-loaded zone individually tight. Under the previous + // OR(merge) implementation, every candidate would see at least one node in some + // domain with per-node headroom — all candidates pass. The cumulative gate must + // pick the roomiest domain first (zoneC, 250m), commit until it can no longer + // fit, then fall back to zoneB (150m), and reject any 4th candidate because + // both domains are saturated. + // + // zoneB: 250m allocatable - 100m used = 150m headroom → fits 1 × 100m pod + // zoneC: 350m allocatable - 100m used = 250m headroom → fits 2 × 100m pods + nodes: []*v1.Node{ + test.BuildTestNode("A1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("B1", 250, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + test.BuildTestNode("C1", 350, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneC" }), + }, + pods: createTestPods([]testPodList{ + { + count: 7, + node: "A1", + labels: map[string]string{"foo": "bar"}, + constraints: getDefaultTopologyConstraints(1), + }, + {count: 1, node: "B1", labels: map[string]string{"foo": "bar"}}, + {count: 1, node: "C1", labels: map[string]string{"foo": "bar"}}, + }), + // Total batch headroom: 150 + 250 = 400m → fits 3 × 100m pods plus 100m + // stranded across the two domains. The previous OR(merge) implementation + // would admit all 4 candidates; the new gate admits 3 (2 to zoneC, 1 to + // zoneB) and rejects the 4th when both domains are drained below the pod + // request. + expectedEvictedCount: 3, + namespaces: []string{"ns1"}, + args: RemovePodsViolatingTopologySpreadConstraintArgs{ + TopologyBalanceNodeFit: utilptr.To(false), + ZoneAwareNodeFit: utilptr.To(true), + }, + nodeFit: false, + }, + { + name: "ZoneAwareNodeFit still requires per-node fit (aggregate headroom alone is insufficient)", + // zoneB has 5 nodes with 50m CPU each → 250m aggregate headroom (looks like 2 pods' + // worth), but no single node can host a 100m-request pod. The new gate must reject. + nodes: []*v1.Node{ + test.BuildTestNode("A1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("B1", 50, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + test.BuildTestNode("B2", 50, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + test.BuildTestNode("B3", 50, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + test.BuildTestNode("B4", 50, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + test.BuildTestNode("B5", 50, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + }, + pods: createTestPods([]testPodList{ + { + count: 7, + node: "A1", + labels: map[string]string{"foo": "bar"}, + constraints: getDefaultTopologyConstraints(1), + }, + {count: 1, node: "B1", labels: map[string]string{"foo": "bar"}}, + }), + expectedEvictedCount: 0, + namespaces: []string{"ns1"}, + args: RemovePodsViolatingTopologySpreadConstraintArgs{ + TopologyBalanceNodeFit: utilptr.To(false), + ZoneAwareNodeFit: utilptr.To(true), + }, + nodeFit: false, + }, + { + name: "ZoneAwareNodeFit aggregates headroom across multiple nodes in the same below-ideal domain", + // Verifies grouping correctness when more than one node belongs to the same + // below-ideal domain. zoneB has two nodes (B1, B2) each with 200m allocatable + // and a 100m existing pod → 100m per-node headroom, 200m aggregate. A grouping + // bug that mis-bucketed nodes (e.g. counted only one B node) would yield + // expectedEvictedCount=1; correct multi-node aggregation yields 2 commits + // (cumulatively draining the domain) and a 3rd rejection. + nodes: []*v1.Node{ + test.BuildTestNode("A1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("B1", 200, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + test.BuildTestNode("B2", 200, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + }, + pods: createTestPods([]testPodList{ + { + count: 9, + node: "A1", + labels: map[string]string{"foo": "bar"}, + constraints: getDefaultTopologyConstraints(1), + }, + {count: 1, node: "B1", labels: map[string]string{"foo": "bar"}}, + {count: 1, node: "B2", labels: map[string]string{"foo": "bar"}}, + }), + expectedEvictedCount: 2, + namespaces: []string{"ns1"}, + args: RemovePodsViolatingTopologySpreadConstraintArgs{ + TopologyBalanceNodeFit: utilptr.To(false), + ZoneAwareNodeFit: utilptr.To(true), + }, + nodeFit: false, + }, + { + name: "ZoneAwareNodeFit default (false) is no-op — behaviour matches prior versions", + // Default args (TBNF=true, ZANF=false). With nodeFit=true on DefaultEvictor and + // zoneB unable to host more pods, eviction is blocked by the DefaultEvictor's own + // pre-filter — confirming the new flag introduces no behavioural change when off. + nodes: []*v1.Node{ + test.BuildTestNode("A1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("B1", 50, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + }, + pods: createTestPods([]testPodList{ + { + count: 4, + node: "A1", + labels: map[string]string{"foo": "bar"}, + constraints: getDefaultTopologyConstraints(1), + }, + {count: 1, node: "B1", labels: map[string]string{"foo": "bar"}}, + }), + expectedEvictedCount: 0, + namespaces: []string{"ns1"}, + args: RemovePodsViolatingTopologySpreadConstraintArgs{}, + nodeFit: true, + }, } for _, tc := range testCases { @@ -1818,6 +2033,224 @@ func getDefaultTopologyConstraints(maxSkew int32, edits ...func(*v1.TopologySpre return []v1.TopologySpreadConstraint{constraint} } +func TestGroupNodesByDomain(t *testing.T) { + mkNode := func(name, zone string) *v1.Node { + return test.BuildTestNode(name, 1000, 1<<30, 10, func(n *v1.Node) { + if zone != "" { + n.Labels["zone"] = zone + } + }) + } + nodes := []*v1.Node{ + mkNode("A1", "zoneA"), + mkNode("A2", "zoneA"), + mkNode("B1", "zoneB"), + mkNode("X1", ""), // missing label + } + got := groupNodesByDomain(nodes, "zone") + if len(got["zoneA"]) != 2 { + t.Errorf("zoneA: got %d nodes, want 2", len(got["zoneA"])) + } + if len(got["zoneB"]) != 1 { + t.Errorf("zoneB: got %d nodes, want 1", len(got["zoneB"])) + } + if _, ok := got[""]; ok { + t.Error("nodes missing the topology key must not produce an empty-string entry") + } +} + +func TestComputeDomainHeadroom(t *testing.T) { + // zoneA has two nodes: A1 (1000m cpu, no pods), A2 (500m cpu, one 200m pod). + // Expected aggregate headroom: (1000 - 0) + (500 - 200) = 1300m. Pod slots: 10 + 10 - 1 = 19. + a1 := test.BuildTestNode("A1", 1000, 1<<30, 10, nil) + a2 := test.BuildTestNode("A2", 500, 1<<30, 10, nil) + occupant := test.BuildTestPod("occ", 200, 0, "A2", nil) + indexer := podutil.GetPodsAssignedToNodeFunc(func(nodeName string, filter podutil.FilterFunc) ([]*v1.Pod, error) { + if nodeName == "A2" { + return []*v1.Pod{occupant}, nil + } + return nil, nil + }) + + got := computeDomainHeadroom(map[string][]*v1.Node{"zoneA": {a1, a2}}, indexer) + if cpu := got["zoneA"][v1.ResourceCPU]; cpu.MilliValue() != 1300 { + t.Errorf("aggregate CPU: got %dm, want 1300m", cpu.MilliValue()) + } + if pods := got["zoneA"][v1.ResourcePods]; pods.Value() != 19 { + t.Errorf("aggregate pod slots: got %d, want 19", pods.Value()) + } +} + +func TestComputeDomainHeadroomFailsClosedOnIndexerError(t *testing.T) { + // If ListPodsOnANode fails for a node, that node must contribute zero — neither + // its allocatable nor any pod requests should land in the aggregate. Over-counting + // would re-introduce the eviction churn this gate prevents. + a1 := test.BuildTestNode("A1", 1000, 1<<30, 10, nil) + a2 := test.BuildTestNode("A2", 500, 1<<30, 10, nil) + indexer := podutil.GetPodsAssignedToNodeFunc(func(nodeName string, filter podutil.FilterFunc) ([]*v1.Pod, error) { + if nodeName == "A2" { + return nil, fmt.Errorf("simulated indexer failure") + } + return nil, nil + }) + + got := computeDomainHeadroom(map[string][]*v1.Node{"zoneA": {a1, a2}}, indexer) + // Only A1 (1000m, 10 pod slots) contributes; A2 is excluded fail-closed. + if cpu := got["zoneA"][v1.ResourceCPU]; cpu.MilliValue() != 1000 { + t.Errorf("aggregate CPU after fail-closed exclusion: got %dm, want 1000m", cpu.MilliValue()) + } + if pods := got["zoneA"][v1.ResourcePods]; pods.Value() != 10 { + t.Errorf("aggregate pod slots after fail-closed exclusion: got %d, want 10", pods.Value()) + } +} + +func TestPodFitsSomeDomainWithHeadroom(t *testing.T) { + // Build a stub node indexer that reports the supplied pods as living on the named node. + makeIndexer := func(podsByNode map[string][]*v1.Pod) podutil.GetPodsAssignedToNodeFunc { + return podutil.GetPodsAssignedToNodeFunc(func(nodeName string, filter podutil.FilterFunc) ([]*v1.Pod, error) { + out := []*v1.Pod{} + for _, p := range podsByNode[nodeName] { + if filter == nil || filter(p) { + out = append(out, p) + } + } + return out, nil + }) + } + + t.Run("commits to roomiest qualifying domain and decrements its headroom", func(t *testing.T) { + // zoneA has more CPU headroom than zoneB, so even though zoneB sorts first + // alphabetically, the candidate must commit to zoneA. This guards against + // regressing back to an alphabetical-first selection that would drain the + // tighter domain prematurely. + nodesByDomain := map[string][]*v1.Node{ + "zoneB": {test.BuildTestNode("B1", 1000, 1<<30, 10, nil)}, + "zoneA": {test.BuildTestNode("A1", 1000, 1<<30, 10, nil)}, + } + headroom := map[string]v1.ResourceList{ + "zoneA": { + v1.ResourceCPU: *resource.NewMilliQuantity(800, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(1<<28, resource.BinarySI), + v1.ResourcePods: *resource.NewQuantity(5, resource.DecimalSI), + }, + "zoneB": { + v1.ResourceCPU: *resource.NewMilliQuantity(300, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(1<<28, resource.BinarySI), + v1.ResourcePods: *resource.NewQuantity(5, resource.DecimalSI), + }, + } + pod := test.BuildTestPod("candidate", 100, 0, "other", nil) + indexer := makeIndexer(nil) + + domain, ok := podFitsSomeDomainWithHeadroom(indexer, pod, nodesByDomain, headroom) + if !ok || domain != "zoneA" { + t.Fatalf("expected commit to roomiest domain zoneA, got domain=%q ok=%v", domain, ok) + } + if cpu := headroom["zoneA"][v1.ResourceCPU]; cpu.MilliValue() != 700 { + t.Errorf("zoneA CPU headroom after commit: got %dm, want 700m", cpu.MilliValue()) + } + if cpu := headroom["zoneB"][v1.ResourceCPU]; cpu.MilliValue() != 300 { + t.Errorf("zoneB CPU headroom must be untouched: got %dm, want 300m", cpu.MilliValue()) + } + }) + + t.Run("equal headroom falls back to alphabetical for determinism", func(t *testing.T) { + nodesByDomain := map[string][]*v1.Node{ + "zoneB": {test.BuildTestNode("B1", 1000, 1<<30, 10, nil)}, + "zoneA": {test.BuildTestNode("A1", 1000, 1<<30, 10, nil)}, + } + headroom := map[string]v1.ResourceList{ + "zoneA": { + v1.ResourceCPU: *resource.NewMilliQuantity(500, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(1<<28, resource.BinarySI), + v1.ResourcePods: *resource.NewQuantity(5, resource.DecimalSI), + }, + "zoneB": { + v1.ResourceCPU: *resource.NewMilliQuantity(500, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(1<<28, resource.BinarySI), + v1.ResourcePods: *resource.NewQuantity(5, resource.DecimalSI), + }, + } + pod := test.BuildTestPod("candidate", 100, 0, "other", nil) + indexer := makeIndexer(nil) + + domain, ok := podFitsSomeDomainWithHeadroom(indexer, pod, nodesByDomain, headroom) + if !ok || domain != "zoneA" { + t.Fatalf("expected alphabetical tie-break to commit to zoneA, got domain=%q ok=%v", domain, ok) + } + }) + + t.Run("repeated calls drain headroom and then reject", func(t *testing.T) { + nodesByDomain := map[string][]*v1.Node{ + "zoneA": {test.BuildTestNode("A1", 1000, 1<<30, 10, nil)}, + } + headroom := map[string]v1.ResourceList{ + "zoneA": { + v1.ResourceCPU: *resource.NewMilliQuantity(250, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(1<<28, resource.BinarySI), + v1.ResourcePods: *resource.NewQuantity(5, resource.DecimalSI), + }, + } + indexer := makeIndexer(nil) + pod := test.BuildTestPod("p", 100, 0, "other", nil) + + if _, ok := podFitsSomeDomainWithHeadroom(indexer, pod, nodesByDomain, headroom); !ok { + t.Fatal("first call: expected ok=true (250m >= 100m)") + } + if _, ok := podFitsSomeDomainWithHeadroom(indexer, pod, nodesByDomain, headroom); !ok { + t.Fatal("second call: expected ok=true (150m >= 100m)") + } + if _, ok := podFitsSomeDomainWithHeadroom(indexer, pod, nodesByDomain, headroom); ok { + t.Fatal("third call: expected ok=false (50m < 100m)") + } + }) + + t.Run("returns false when no domain has aggregate headroom", func(t *testing.T) { + nodesByDomain := map[string][]*v1.Node{ + "zoneA": {test.BuildTestNode("A1", 1000, 1<<30, 10, nil)}, + } + headroom := map[string]v1.ResourceList{ + "zoneA": { + v1.ResourceCPU: *resource.NewMilliQuantity(50, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(1<<28, resource.BinarySI), + v1.ResourcePods: *resource.NewQuantity(5, resource.DecimalSI), + }, + } + pod := test.BuildTestPod("p", 100, 0, "other", nil) + if _, ok := podFitsSomeDomainWithHeadroom(makeIndexer(nil), pod, nodesByDomain, headroom); ok { + t.Error("expected ok=false when aggregate CPU is insufficient") + } + }) + + t.Run("returns false when no domain node can host the pod (per-node fit fails)", func(t *testing.T) { + // Node has plenty of aggregate headroom by allocatable, but an existing pod on the + // node consumes most of it, leaving no per-node room. Aggregate headroom is supplied + // separately so only the per-node fit check (via fitsRequest) should reject. + smallNode := test.BuildTestNode("A1", 120, 1<<30, 10, nil) + nodesByDomain := map[string][]*v1.Node{"zoneA": {smallNode}} + headroom := map[string]v1.ResourceList{ + "zoneA": { + v1.ResourceCPU: *resource.NewMilliQuantity(10000, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(1<<30, resource.BinarySI), + v1.ResourcePods: *resource.NewQuantity(10, resource.DecimalSI), + }, + } + occupant := test.BuildTestPod("occupant", 100, 0, "A1", nil) + indexer := makeIndexer(map[string][]*v1.Pod{"A1": {occupant}}) + candidate := test.BuildTestPod("candidate", 100, 0, "other", nil) + if _, ok := podFitsSomeDomainWithHeadroom(indexer, candidate, nodesByDomain, headroom); ok { + t.Error("expected ok=false when no node has per-node room despite aggregate headroom") + } + }) + + t.Run("empty nodesByDomain returns false without error", func(t *testing.T) { + pod := test.BuildTestPod("p", 100, 0, "other", nil) + if _, ok := podFitsSomeDomainWithHeadroom(makeIndexer(nil), pod, map[string][]*v1.Node{}, map[string]v1.ResourceList{}); ok { + t.Error("expected ok=false on empty input") + } + }) +} + func TestCheckIdenticalConstraints(t *testing.T) { selector, _ := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}}) diff --git a/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/types.go b/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/types.go index 272dde144a..cf45e2e423 100644 --- a/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/types.go +++ b/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/types.go @@ -33,4 +33,19 @@ type RemovePodsViolatingTopologySpreadConstraintArgs struct { LabelSelector *metav1.LabelSelector `json:"labelSelector,omitempty"` Constraints []v1.UnsatisfiableConstraintAction `json:"constraints,omitempty"` TopologyBalanceNodeFit *bool `json:"topologyBalanceNodeFit,omitempty"` + // ZoneAwareNodeFit, if set to true, requires that each pod evicted by + // topology-spread balancing have, in at least one specific under-loaded + // topology domain (as identified by the constraint's TopologyKey), + // (a) a node where the pod fits per the scheduler's per-node predicates + // AND (b) sufficient remaining aggregate resource headroom — after + // accounting for other pods already committed to that domain during + // the same balancing round — to absorb the pod. + // + // Unlike TopologyBalanceNodeFit, which only checks per-node fit across + // the union of under-loaded domains, ZoneAwareNodeFit reasons about + // per-domain cumulative capacity. This prevents over-committing a single + // under-loaded domain within one balancing round, which would otherwise + // cause the scheduler to push evicted pods back to the over-loaded + // domain (eviction churn). Defaults to false. + ZoneAwareNodeFit *bool `json:"zoneAwareNodeFit,omitempty"` } diff --git a/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/zz_generated.deepcopy.go b/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/zz_generated.deepcopy.go index 223bcbc913..dde145651c 100644 --- a/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/zz_generated.deepcopy.go +++ b/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/zz_generated.deepcopy.go @@ -52,6 +52,11 @@ func (in *RemovePodsViolatingTopologySpreadConstraintArgs) DeepCopyInto(out *Rem *out = new(bool) **out = **in } + if in.ZoneAwareNodeFit != nil { + in, out := &in.ZoneAwareNodeFit, &out.ZoneAwareNodeFit + *out = new(bool) + **out = **in + } return }