diff --git a/cmd/kueue/main.go b/cmd/kueue/main.go index b97872190f1..318c185b1c1 100644 --- a/cmd/kueue/main.go +++ b/cmd/kueue/main.go @@ -299,7 +299,7 @@ func main() { cacheOptions = append(cacheOptions, schdcache.WithResourceTransformations(cfg.Resources.Transformations)) queueOptions = append(queueOptions, qcache.WithResourceTransformations(cfg.Resources.Transformations)) } - if features.Enabled(features.DynamicResourceAllocation) && cfg.Resources != nil && len(cfg.Resources.DeviceClassMappings) > 0 { + if features.Enabled(features.KueueDRAIntegration) && cfg.Resources != nil && len(cfg.Resources.DeviceClassMappings) > 0 { if err := dra.CreateMapperFromConfiguration(cfg.Resources.DeviceClassMappings); err != nil { setupLog.Error(err, "Failed to initialize DRA mapper from configuration") os.Exit(1) diff --git a/keps/2941-DRA/kep.yaml b/keps/2941-DRA/kep.yaml index bf7320150d6..1e0b4b4bc39 100644 --- a/keps/2941-DRA/kep.yaml +++ b/keps/2941-DRA/kep.yaml @@ -35,8 +35,8 @@ milestone: # The following PRR answers are required at alpha release # List the feature gate name and the components for which it must be enabled feature-gates: - - name: DynamicResourceAllocation - - name: DRAExtendedResources + - name: KueueDRAIntegration + - name: KueueDRAIntegrationExtendedResource disable-supported: true # The following PRR answers are required at beta release diff --git a/pkg/cache/queue/cluster_queue.go b/pkg/cache/queue/cluster_queue.go index 9b47a859fc5..9a76658d3b0 100644 --- a/pkg/cache/queue/cluster_queue.go +++ b/pkg/cache/queue/cluster_queue.go @@ -393,7 +393,7 @@ func priorityBoostAnnotationChanged(oldInfo, newInfo *workload.Info) bool { // DRA extended resources are resolved in Reconcile, which can modify TotalRequests // without changing the workload Spec. func draRequestsChanged(oldInfo, newInfo *workload.Info) bool { - if !features.Enabled(features.DynamicResourceAllocation) { + if !features.Enabled(features.KueueDRAIntegration) { return false } return !equality.Semantic.DeepEqual(oldInfo.TotalRequests, newInfo.TotalRequests) diff --git a/pkg/cache/queue/manager.go b/pkg/cache/queue/manager.go index 7e61d620d10..145db2eda16 100644 --- a/pkg/cache/queue/manager.go +++ b/pkg/cache/queue/manager.go @@ -511,7 +511,7 @@ func (m *Manager) AddLocalQueue(ctx context.Context, q *kueue.LocalQueue) error return err } - if !features.Enabled(features.DynamicResourceAllocation) { + if !features.Enabled(features.KueueDRAIntegration) { return nil } diff --git a/pkg/cache/queue/manager_test.go b/pkg/cache/queue/manager_test.go index 7dde5a77ce0..2baab6f1bdc 100644 --- a/pkg/cache/queue/manager_test.go +++ b/pkg/cache/queue/manager_test.go @@ -96,7 +96,7 @@ func TestAddLocalQueueOrphans(t *testing.T) { } func TestAddLocalQueue_DRAReconcileChannelGuaranteedDelivery(t *testing.T) { - features.SetFeatureGateDuringTest(t, features.DynamicResourceAllocation, true) + features.SetFeatureGateDuringTest(t, features.KueueDRAIntegration, true) // Create an admissible workload that triggers dra.NeedsDRAReconcile via HasDRA(). tmplName := "claim-tmpl" diff --git a/pkg/config/validation.go b/pkg/config/validation.go index 2e9e44b39cc..0d15a1e6b24 100644 --- a/pkg/config/validation.go +++ b/pkg/config/validation.go @@ -91,6 +91,7 @@ func Validate(c *configapi.Configuration, scheme *runtime.Scheme) field.ErrorLis allErrs = append(allErrs, validateVisibilityServer(c)...) allErrs = append(allErrs, validateCustomLabels(c)...) allErrs = append(allErrs, validateQuotaCheckStrategy(c)...) + allErrs = append(allErrs, validateDRAFeatureGateDependencies()...) return allErrs } @@ -570,12 +571,18 @@ func LoadAndValidateFeatureGates(featureGateCLI string, featureGateMap map[strin } } - if features.Enabled(features.DRAExtendedResources) { - if !features.Enabled(features.DynamicResourceAllocation) { - allErrs = append(allErrs, field.Invalid(featureGatesPath, "DRAExtendedResources", "DRAExtendedResources requires DynamicResourceAllocation to be enabled")) + allErrs = append(allErrs, validateDRAFeatureGateDependencies()...) + + return allErrs +} + +func validateDRAFeatureGateDependencies() field.ErrorList { + var allErrs field.ErrorList + if features.Enabled(features.KueueDRAIntegrationExtendedResource) { + if !features.Enabled(features.KueueDRAIntegration) { + allErrs = append(allErrs, field.Invalid(featureGatesPath, "KueueDRAIntegrationExtendedResource", "KueueDRAIntegrationExtendedResource requires KueueDRAIntegration to be enabled")) } } - return allErrs } diff --git a/pkg/config/validation_test.go b/pkg/config/validation_test.go index 19592638a9e..fd97a353c44 100644 --- a/pkg/config/validation_test.go +++ b/pkg/config/validation_test.go @@ -1070,6 +1070,21 @@ func TestValidate(t *testing.T) { }, }, }, + "KueueDRAIntegrationExtendedResource requires KueueDRAIntegration": { + cfg: &configapi.Configuration{ + Integrations: defaultIntegrations, + }, + featureGates: map[featuregate.Feature]bool{ + features.KueueDRAIntegrationExtendedResource: true, + features.KueueDRAIntegration: false, + }, + wantErr: field.ErrorList{ + &field.Error{ + Type: field.ErrorTypeInvalid, + Field: "featureGates", + }, + }, + }, } for name, tc := range testCases { @@ -1093,18 +1108,18 @@ func TestLoadAndValidateFeatureGates(t *testing.T) { featureGatesCLI: "", }, "feature gate cli": { - featureGatesCLI: string(features.DynamicResourceAllocation) + "=false", + featureGatesCLI: string(features.KueueDRAIntegration) + "=false", gatesToRestore: map[featuregate.Feature]bool{ - features.DynamicResourceAllocation: false, + features.KueueDRAIntegration: false, }, }, "cannot specify both feature gates": { - featureGatesCLI: string(features.DynamicResourceAllocation) + "=false", + featureGatesCLI: string(features.KueueDRAIntegration) + "=false", featureGateMap: map[string]bool{ - string(features.DynamicResourceAllocation): false, + string(features.KueueDRAIntegration): false, }, gatesToRestore: map[featuregate.Feature]bool{ - features.DynamicResourceAllocation: false, + features.KueueDRAIntegration: false, }, wantErr: field.ErrorList{ &field.Error{ @@ -1218,20 +1233,20 @@ func TestLoadAndValidateFeatureGates(t *testing.T) { }, }, }, - "DRAExtendedResources requires DynamicResourceAllocation": { + "KueueDRAIntegrationExtendedResource requires KueueDRAIntegration": { featureGateMap: map[string]bool{ - string(features.DRAExtendedResources): true, - string(features.DynamicResourceAllocation): false, + string(features.KueueDRAIntegrationExtendedResource): true, + string(features.KueueDRAIntegration): false, }, gatesToRestore: map[featuregate.Feature]bool{ - features.DRAExtendedResources: false, - features.DynamicResourceAllocation: true, + features.KueueDRAIntegrationExtendedResource: false, + features.KueueDRAIntegration: true, }, wantErr: field.ErrorList{ &field.Error{ Type: field.ErrorTypeInvalid, Field: "featureGates", - Detail: "DRAExtendedResources requires DynamicResourceAllocation to be enabled", + Detail: "KueueDRAIntegrationExtendedResource requires KueueDRAIntegration to be enabled", }, }, }, diff --git a/pkg/controller/core/core.go b/pkg/controller/core/core.go index 07ff1079e7e..50d2be5ba2f 100644 --- a/pkg/controller/core/core.go +++ b/pkg/controller/core/core.go @@ -100,7 +100,7 @@ func SetupControllers(mgr ctrl.Manager, qManager *qcache.Manager, cc *schdcache. WithWorkloadCustomLabels(customLabels), WithAdmissionFairSharing(cfg.AdmissionFairSharing), ) - if features.Enabled(features.DynamicResourceAllocation) { + if features.Enabled(features.KueueDRAIntegration) { qManager.SetDRAReconcileChannel(workloadRec.GetDRAReconcileChannel()) } diff --git a/pkg/controller/core/indexer/indexer.go b/pkg/controller/core/indexer/indexer.go index 9a9bd2a7e5b..233b98294f5 100644 --- a/pkg/controller/core/indexer/indexer.go +++ b/pkg/controller/core/indexer/indexer.go @@ -268,7 +268,7 @@ func Setup(ctx context.Context, indexer client.FieldIndexer) error { } } // Index DeviceClasses by extendedResourceName for fast lookup during extended resource translation. - if features.Enabled(features.DRAExtendedResources) { + if features.Enabled(features.KueueDRAIntegrationExtendedResource) { if err := indexer.IndexField(ctx, &resourceapi.DeviceClass{}, DeviceClassExtendedResourceNameIndex, IndexDeviceClassExtendedResourceName); err != nil { return fmt.Errorf("setting index on extendedResourceName for DeviceClass: %w", err) } diff --git a/pkg/controller/core/workload_controller.go b/pkg/controller/core/workload_controller.go index d1faa3a2d4e..9ea2010ad63 100644 --- a/pkg/controller/core/workload_controller.go +++ b/pkg/controller/core/workload_controller.go @@ -288,7 +288,7 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c if workload.HasResourceClaim(&wl) { log.V(3).Info("Workload is inadmissible because it uses resource claims which is not supported") err := workload.PatchAdmissionStatus(ctx, r.client, &wl, r.clock, func(wl *kueue.Workload) (bool, error) { - updated := workload.UnsetQuotaReservationWithCondition(wl, kueue.WorkloadInadmissible, "DynamicResourceAllocation feature does not support use of resource claims", r.clock.Now()) + updated := workload.UnsetQuotaReservationWithCondition(wl, kueue.WorkloadInadmissible, "KueueDRAIntegration feature does not support use of resource claims", r.clock.Now()) if updated && workload.SetRequeuedCondition(wl, kueue.WorkloadInadmissible, "DRA resource claims not supported", false) { updated = true } @@ -323,7 +323,7 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c // Process Extended Resources backed by DRA (new path) var extendedResources map[kueue.PodSetReference]corev1.ResourceList var replacedExtendedResources map[kueue.PodSetReference]sets.Set[corev1.ResourceName] - if features.Enabled(features.DRAExtendedResources) { + if features.Enabled(features.KueueDRAIntegrationExtendedResource) { var extFieldErrs field.ErrorList extendedResources, replacedExtendedResources, extFieldErrs = dra.ResolveExtendedResourceQuota(ctx, r.client, &wl) if len(extFieldErrs) > 0 { diff --git a/pkg/controller/core/workload_controller_test.go b/pkg/controller/core/workload_controller_test.go index c5ee2f33e18..fddc5b7ab96 100644 --- a/pkg/controller/core/workload_controller_test.go +++ b/pkg/controller/core/workload_controller_test.go @@ -424,7 +424,7 @@ func TestReconcile(t *testing.T) { }{ "reconcile DRA ResourceClaim should be rejected as inadmissible": { featureGates: map[featuregate.Feature]bool{ - features.DynamicResourceAllocation: true, + features.KueueDRAIntegration: true, features.MultiKueueOrchestratedPreemption: false, }, workload: utiltestingapi.MakeWorkload("wlWithDRAResourceClaim", "ns"). @@ -453,7 +453,7 @@ func TestReconcile(t *testing.T) { Type: kueue.WorkloadQuotaReserved, Status: metav1.ConditionFalse, Reason: kueue.WorkloadInadmissible, - Message: "DynamicResourceAllocation feature does not support use of resource claims", + Message: "KueueDRAIntegration feature does not support use of resource claims", }). Condition(metav1.Condition{ Type: kueue.WorkloadRequeued, @@ -466,7 +466,7 @@ func TestReconcile(t *testing.T) { }, "reconcile DRA ResourceClaimTemplate should be pre-processed and queued": { featureGates: map[featuregate.Feature]bool{ - features.DynamicResourceAllocation: true, + features.KueueDRAIntegration: true, features.MultiKueueOrchestratedPreemption: false, }, wantDRAResourceTotal: new(int64(1)), @@ -504,7 +504,7 @@ func TestReconcile(t *testing.T) { }, "reconcile DRA ResourceClaimTemplate multi-pod should be pre-processed and queued": { featureGates: map[featuregate.Feature]bool{ - features.DynamicResourceAllocation: true, + features.KueueDRAIntegration: true, features.MultiKueueOrchestratedPreemption: false, }, wantDRAResourceTotal: new(int64(6)), @@ -542,7 +542,7 @@ func TestReconcile(t *testing.T) { }, "reconcile DRA ResourceClaimTemplate with unmapped device class": { featureGates: map[featuregate.Feature]bool{ - features.DynamicResourceAllocation: true, + features.KueueDRAIntegration: true, features.MultiKueueOrchestratedPreemption: false, }, workload: utiltestingapi.MakeWorkload("wlUnmappedDRA", "ns"). @@ -594,7 +594,7 @@ func TestReconcile(t *testing.T) { }, "reconcile DRA ResourceClaimTemplate not found should return error": { featureGates: map[featuregate.Feature]bool{ - features.DynamicResourceAllocation: true, + features.KueueDRAIntegration: true, features.MultiKueueOrchestratedPreemption: false, }, workload: utiltestingapi.MakeWorkload("wlMissingTemplate", "ns"). @@ -2936,7 +2936,7 @@ func TestReconcile(t *testing.T) { }, "should synchronize the status of preemption gates": { featureGates: map[featuregate.Feature]bool{ - features.DynamicResourceAllocation: false, + features.KueueDRAIntegration: false, features.MultiKueueOrchestratedPreemption: true, }, cq: utiltestingapi.MakeClusterQueue("cq").Obj(), @@ -3251,7 +3251,7 @@ func TestReconcile(t *testing.T) { queueOptions := []qcache.Option{qcache.WithPreemptionExpectations(preemptexpectations.New())} qManager := qcache.NewManagerForUnitTests(cl, cqCache, queueOptions...) reconciler := NewWorkloadReconciler(cl, qManager, cqCache, recorder, tc.reconcilerOpts...) - if features.Enabled(features.DynamicResourceAllocation) { + if features.Enabled(features.KueueDRAIntegration) { qManager.SetDRAReconcileChannel(reconciler.GetDRAReconcileChannel()) } // use a fake clock with jitter = 0 to be able to assert on the requeueAt. @@ -3346,7 +3346,7 @@ func TestReconcile(t *testing.T) { } // For DRA tests, verify that workloads are properly queued/cached - if tc.featureGates[features.DynamicResourceAllocation] && testWl != nil && + if tc.featureGates[features.KueueDRAIntegration] && testWl != nil && len(testWl.Spec.PodSets) > 0 && len(testWl.Spec.PodSets[0].Template.Spec.ResourceClaims) > 0 { workloadKey := client.ObjectKeyFromObject(testWl) diff --git a/pkg/dra/extended_resources.go b/pkg/dra/extended_resources.go index 3c52ac71457..4f5cf926902 100644 --- a/pkg/dra/extended_resources.go +++ b/pkg/dra/extended_resources.go @@ -42,13 +42,13 @@ import ( // Note: there is no DeviceClass watcher. If a DeviceClass is created after a workload // was marked inadmissible, requeuing depends on the next QueueInadmissibleWorkloads event. func NeedsDRAReconcile(wl *kueue.Workload) bool { - if !features.Enabled(features.DynamicResourceAllocation) { + if !features.Enabled(features.KueueDRAIntegration) { return false } if workload.HasDRA(wl) { return true } - if !features.Enabled(features.DRAExtendedResources) { + if !features.Enabled(features.KueueDRAIntegrationExtendedResource) { return false } for i := range wl.Spec.PodSets { diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index 70213b39804..4f8c007858a 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -175,7 +175,10 @@ const ( // owner: @alaypatel07 // kep: https://github.com/kubernetes-sigs/kueue/tree/main/keps/2941-DRA // - // Enable quota accounting for Dynamic Resource Allocation (DRA) devices in workloads + // Enable quota accounting for Dynamic Resource Allocation (DRA) devices in workloads. + KueueDRAIntegration featuregate.Feature = "KueueDRAIntegration" + + // Deprecated: planned to be removed in 0.19. Use KueueDRAIntegration instead. DynamicResourceAllocation featuregate.Feature = "DynamicResourceAllocation" // owner: @sohankunkerkar @@ -183,6 +186,9 @@ const ( // // Enable extended resources support for DRA. Allows workloads to request DRA devices // via standard resources.requests using DeviceClass extendedResourceName. + KueueDRAIntegrationExtendedResource featuregate.Feature = "KueueDRAIntegrationExtendedResource" + + // Deprecated: planned to be removed in 0.19. Use KueueDRAIntegrationExtendedResource instead. DRAExtendedResources featuregate.Feature = "DRAExtendedResources" // owner: @MaysaMacedo @@ -484,11 +490,19 @@ var defaultVersionedFeatureGates = map[featuregate.Feature]featuregate.Versioned TASBalancedPlacement: { {Version: version.MustParse("0.15"), Default: false, PreRelease: featuregate.Alpha}, }, + KueueDRAIntegration: { + {Version: version.MustParse("0.18"), Default: false, PreRelease: featuregate.Alpha}, + }, DynamicResourceAllocation: { {Version: version.MustParse("0.14"), Default: false, PreRelease: featuregate.Alpha}, + {Version: version.MustParse("0.18"), Default: false, PreRelease: featuregate.Deprecated, LockToDefault: true}, // remove in 0.19 + }, + KueueDRAIntegrationExtendedResource: { + {Version: version.MustParse("0.18"), Default: false, PreRelease: featuregate.Alpha}, }, DRAExtendedResources: { {Version: version.MustParse("0.17"), Default: false, PreRelease: featuregate.Alpha}, + {Version: version.MustParse("0.18"), Default: false, PreRelease: featuregate.Deprecated, LockToDefault: true}, // remove in 0.19 }, MultiKueueAdaptersForCustomJobs: { {Version: version.MustParse("0.14"), Default: false, PreRelease: featuregate.Alpha}, diff --git a/pkg/workload/workload.go b/pkg/workload/workload.go index 018ba74162e..2b1e84b3f92 100644 --- a/pkg/workload/workload.go +++ b/pkg/workload/workload.go @@ -605,7 +605,7 @@ func totalRequestsFromPodSets(wl *kueue.Workload, info *InfoOptions) []PodSetRes effectiveRequests := dropExcludedResources(specRequests, info.excludedResourcePrefixes) effectiveRequests = applyResourceTransformations(effectiveRequests, info.resourceTransformations) setRes.Requests = resources.NewRequests(effectiveRequests) - if features.Enabled(features.DynamicResourceAllocation) && info.preprocessedDRAResources != nil { + if features.Enabled(features.KueueDRAIntegration) && info.preprocessedDRAResources != nil { // First, remove extended resources that were converted to DRA logical resources if replacedRes, exists := info.replacedExtendedResources[ps.Name]; exists { for extRes := range replacedRes { diff --git a/pkg/workload/workload_test.go b/pkg/workload/workload_test.go index 6d42799f212..16b73052bf9 100644 --- a/pkg/workload/workload_test.go +++ b/pkg/workload/workload_test.go @@ -1677,7 +1677,7 @@ func TestNeedsSecondPass(t *testing.T) { } func TestWithPreprocessedDRAResources(t *testing.T) { - features.SetFeatureGateDuringTest(t, features.DynamicResourceAllocation, true) + features.SetFeatureGateDuringTest(t, features.KueueDRAIntegration, true) cases := map[string]struct { workload kueue.Workload @@ -1798,7 +1798,7 @@ func TestWithPreprocessedDRAResources(t *testing.T) { } func TestWithPreprocessedDRAResourcesReplacesExtendedResources(t *testing.T) { - features.SetFeatureGateDuringTest(t, features.DynamicResourceAllocation, true) + features.SetFeatureGateDuringTest(t, features.KueueDRAIntegration, true) cases := map[string]struct { workload kueue.Workload diff --git a/site/content/en/docs/concepts/dynamic_resource_allocation.md b/site/content/en/docs/concepts/dynamic_resource_allocation.md index a4f59a4774b..1647d75887c 100644 --- a/site/content/en/docs/concepts/dynamic_resource_allocation.md +++ b/site/content/en/docs/concepts/dynamic_resource_allocation.md @@ -6,6 +6,12 @@ description: > Quota management for workloads using Kubernetes Dynamic Resource Allocation (DRA). --- +{{% alert title="Warning" color="warning" %}} +In Kueue 0.18, the DRA feature gates were renamed to avoid conflicts with upstream +Kubernetes feature gates: `DynamicResourceAllocation` is now `KueueDRAIntegration`, +and `DRAExtendedResources` is now `KueueDRAIntegrationExtendedResource`. +{{% /alert %}} + ## Dynamic Resource Allocation [Dynamic Resource Allocation (DRA)](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/) @@ -57,11 +63,11 @@ For setup instructions, see When a Pod requests an extended resource backed by DRA (e.g., `nvidia.com/gpu: 1`), the kube-scheduler auto-creates a `ResourceClaim`. -Without the `DRAExtendedResources` feature gate enabled, Kueue would charge +Without the `KueueDRAIntegrationExtendedResource` feature gate enabled, Kueue would charge quota for both the `resources.requests` entry **and** the auto-created claim, double counting the same device. -With `DRAExtendedResources` enabled, Kueue detects the matching `DeviceClass`, +With `KueueDRAIntegrationExtendedResource` enabled, Kueue detects the matching `DeviceClass`, uses `extendedResourceName` as the quota key, and drops the auto-created claim from accounting. No `deviceClassMappings` configuration is needed — the mapping is discovered from the `DeviceClass` automatically. @@ -69,8 +75,8 @@ mapping is discovered from the `DeviceClass` automatically. {{% alert title="Note" color="info" %}} The extended resource path additionally requires the Kubernetes `DRAExtendedResource` feature gate on kube-apiserver and kube-scheduler -(alpha in Kubernetes 1.34), in addition to Kueue's `DynamicResourceAllocation` -and `DRAExtendedResources` feature gates. +(alpha in Kubernetes 1.34), in addition to Kueue's `KueueDRAIntegration` +and `KueueDRAIntegrationExtendedResource` feature gates. {{% /alert %}} ## Path separation diff --git a/site/content/en/docs/tasks/manage/setup_dra.md b/site/content/en/docs/tasks/manage/setup_dra.md index ef5e9ffea0c..3a16afb7e02 100644 --- a/site/content/en/docs/tasks/manage/setup_dra.md +++ b/site/content/en/docs/tasks/manage/setup_dra.md @@ -29,6 +29,12 @@ Make sure the following conditions are met: for production). - [Kueue is installed](/docs/installation). +{{% alert title="Warning" color="warning" %}} +In Kueue 0.18, the DRA feature gates were renamed to avoid conflicts with upstream +Kubernetes feature gates: `DynamicResourceAllocation` is now `KueueDRAIntegration`, +and `DRAExtendedResources` is now `KueueDRAIntegrationExtendedResource`. +{{% /alert %}} + ## Choose a quota accounting path Kueue supports two paths for accounting DRA devices in quota. Choose the one @@ -36,8 +42,8 @@ that matches how your users submit workloads: | Path | User's Pod spec | Kueue feature gate | Admin configuration | |------|----------------|-------------------|-------------------| -| ResourceClaimTemplate | References a `ResourceClaimTemplate` | `DynamicResourceAllocation` | `deviceClassMappings` required | -| Extended resource | Uses `resources.requests` (e.g., `nvidia.com/gpu: 1`) | `DynamicResourceAllocation` + `DRAExtendedResources` | No mapping needed | +| ResourceClaimTemplate | References a `ResourceClaimTemplate` | `KueueDRAIntegration` | `deviceClassMappings` required | +| Extended resource | Uses `resources.requests` (e.g., `nvidia.com/gpu: 1`) | `KueueDRAIntegration` + `KueueDRAIntegrationExtendedResource` | No mapping needed | ## Set up the ResourceClaimTemplate path @@ -48,7 +54,7 @@ Use this path when your users submit workloads that explicitly reference ### 1. Enable the feature gate -Install or reconfigure Kueue with the `DynamicResourceAllocation` feature gate +Install or reconfigure Kueue with the `KueueDRAIntegration` feature gate enabled. Follow the [custom configuration installation instructions](/docs/installation#install-a-custom-configured-released-version). @@ -61,7 +67,7 @@ Add a `deviceClassMappings` entry to the Kueue Configuration that maps each apiVersion: config.kueue.x-k8s.io/v1beta2 kind: Configuration featureGates: - DynamicResourceAllocation: true + KueueDRAIntegration: true resources: deviceClassMappings: - name: example.com/gpu # Logical resource name for quota @@ -116,8 +122,8 @@ Install or reconfigure Kueue with both feature gates enabled: apiVersion: config.kueue.x-k8s.io/v1beta2 kind: Configuration featureGates: - DynamicResourceAllocation: true - DRAExtendedResources: true + KueueDRAIntegration: true + KueueDRAIntegrationExtendedResource: true ``` The Kubernetes cluster also needs the `DRAExtendedResource` feature gate @@ -171,7 +177,7 @@ kubectl apply -f https://kueue.sigs.k8s.io/examples/dra/sample-dra-queues.yaml ### Why this path exists -Without the `DRAExtendedResources` feature gate, Kueue charges quota for both +Without the `KueueDRAIntegrationExtendedResource` feature gate, Kueue charges quota for both the `resources.requests` entry and the auto-created `ResourceClaim`, double counting the same device. With the feature gate enabled, Kueue detects the matching `DeviceClass` and charges quota only for the extended resource. diff --git a/site/content/en/docs/tasks/run/dra.md b/site/content/en/docs/tasks/run/dra.md index f8d710f8378..7a72e452a32 100644 --- a/site/content/en/docs/tasks/run/dra.md +++ b/site/content/en/docs/tasks/run/dra.md @@ -134,7 +134,7 @@ If the Workload stays in `Pending` state: ### Double counting (extended resource path) If quota usage shows double the expected value (e.g., `2` instead of `1` for -a single GPU), the `DRAExtendedResources` feature gate may not be enabled. +a single GPU), the `KueueDRAIntegrationExtendedResource` feature gate may not be enabled. Ask your administrator to verify the [DRA setup](/docs/tasks/manage/setup_dra). diff --git a/site/data/featuregates/versioned_feature_list.yaml b/site/data/featuregates/versioned_feature_list.yaml index e0d2c9e0a47..f27d09a6e82 100644 --- a/site/data/featuregates/versioned_feature_list.yaml +++ b/site/data/featuregates/versioned_feature_list.yaml @@ -41,12 +41,20 @@ lockToDefault: false preRelease: Alpha version: "0.17" + - default: false + lockToDefault: true + preRelease: Deprecated + version: "0.18" - name: DynamicResourceAllocation versionedSpecs: - default: false lockToDefault: false preRelease: Alpha version: "0.14" + - default: false + lockToDefault: true + preRelease: Deprecated + version: "0.18" - name: ElasticJobsViaWorkloadSlices versionedSpecs: - default: false @@ -105,6 +113,18 @@ lockToDefault: true preRelease: GA version: "0.17" +- name: KueueDRAIntegration + versionedSpecs: + - default: false + lockToDefault: false + preRelease: Alpha + version: "0.18" +- name: KueueDRAIntegrationExtendedResource + versionedSpecs: + - default: false + lockToDefault: false + preRelease: Alpha + version: "0.18" - name: LendingLimit versionedSpecs: - default: false diff --git a/test/compatibility_lifecycle/reference/versioned_feature_list.yaml b/test/compatibility_lifecycle/reference/versioned_feature_list.yaml index e0d2c9e0a47..f27d09a6e82 100644 --- a/test/compatibility_lifecycle/reference/versioned_feature_list.yaml +++ b/test/compatibility_lifecycle/reference/versioned_feature_list.yaml @@ -41,12 +41,20 @@ lockToDefault: false preRelease: Alpha version: "0.17" + - default: false + lockToDefault: true + preRelease: Deprecated + version: "0.18" - name: DynamicResourceAllocation versionedSpecs: - default: false lockToDefault: false preRelease: Alpha version: "0.14" + - default: false + lockToDefault: true + preRelease: Deprecated + version: "0.18" - name: ElasticJobsViaWorkloadSlices versionedSpecs: - default: false @@ -105,6 +113,18 @@ lockToDefault: true preRelease: GA version: "0.17" +- name: KueueDRAIntegration + versionedSpecs: + - default: false + lockToDefault: false + preRelease: Alpha + version: "0.18" +- name: KueueDRAIntegrationExtendedResource + versionedSpecs: + - default: false + lockToDefault: false + preRelease: Alpha + version: "0.18" - name: LendingLimit versionedSpecs: - default: false diff --git a/test/e2e/config/dra/controller_manager_config.yaml b/test/e2e/config/dra/controller_manager_config.yaml index 7d43f63a940..4216ef97918 100644 --- a/test/e2e/config/dra/controller_manager_config.yaml +++ b/test/e2e/config/dra/controller_manager_config.yaml @@ -20,8 +20,8 @@ integrations: - "batch/job" - "pod" featureGates: - DynamicResourceAllocation: true - DRAExtendedResources: true + KueueDRAIntegration: true + KueueDRAIntegrationExtendedResource: true resources: deviceClassMappings: - name: gpu diff --git a/test/integration/multikueue/dra_test.go b/test/integration/multikueue/dra_test.go index dffdd7f12a8..f43b428cfbe 100644 --- a/test/integration/multikueue/dra_test.go +++ b/test/integration/multikueue/dra_test.go @@ -73,7 +73,7 @@ var _ = ginkgo.Describe("MultiKueue with DRA", ginkgo.Label("area:multikueue", " }) ginkgo.BeforeEach(func() { - features.SetFeatureGateDuringTest(ginkgo.GinkgoTB(), features.DynamicResourceAllocation, true) + features.SetFeatureGateDuringTest(ginkgo.GinkgoTB(), features.KueueDRAIntegration, true) managerNs = util.CreateNamespaceFromPrefixWithLog(managerTestCluster.ctx, managerTestCluster.client, "multikueue-dra-") worker1Ns = util.CreateNamespaceWithLog(worker1TestCluster.ctx, worker1TestCluster.client, managerNs.Name) diff --git a/test/integration/singlecluster/controller/dra/dra_test.go b/test/integration/singlecluster/controller/dra/dra_test.go index 3c87dd38b6a..fec0873be45 100644 --- a/test/integration/singlecluster/controller/dra/dra_test.go +++ b/test/integration/singlecluster/controller/dra/dra_test.go @@ -851,7 +851,7 @@ var _ = ginkgo.Describe("DRA Integration", ginkgo.Ordered, ginkgo.ContinueOnFail const extendedResourceName = "example.com/gpu" ginkgo.BeforeAll(func() { - features.SetFeatureGateDuringTest(ginkgo.GinkgoTB(), features.DRAExtendedResources, true) + features.SetFeatureGateDuringTest(ginkgo.GinkgoTB(), features.KueueDRAIntegrationExtendedResource, true) fwk.StopManager(ctx) fwk.StartManager(ctx, cfg, managerSetup(nil)) }) @@ -957,8 +957,8 @@ var _ = ginkgo.Describe("DRA Integration", ginkgo.Ordered, ginkgo.ContinueOnFail ) ginkgo.BeforeAll(func() { - features.SetFeatureGateDuringTest(ginkgo.GinkgoTB(), features.DynamicResourceAllocation, true) - features.SetFeatureGateDuringTest(ginkgo.GinkgoTB(), features.DRAExtendedResources, true) + features.SetFeatureGateDuringTest(ginkgo.GinkgoTB(), features.KueueDRAIntegration, true) + features.SetFeatureGateDuringTest(ginkgo.GinkgoTB(), features.KueueDRAIntegrationExtendedResource, true) deviceClass = &resourcev1.DeviceClass{ ObjectMeta: metav1.ObjectMeta{Name: "gpu.example.com"}, @@ -1043,7 +1043,7 @@ var _ = ginkgo.Describe("DRA Integration", ginkgo.Ordered, ginkgo.ContinueOnFail }) }) - ginkgo.When("DRAExtendedResources feature gate disabled", func() { + ginkgo.When("KueueDRAIntegrationExtendedResource feature gate disabled", func() { var ( ns *corev1.Namespace resourceFlavor *kueue.ResourceFlavor @@ -1054,8 +1054,8 @@ var _ = ginkgo.Describe("DRA Integration", ginkgo.Ordered, ginkgo.ContinueOnFail const extendedResourceName = "example.com/gpu" ginkgo.BeforeAll(func() { - features.SetFeatureGateDuringTest(ginkgo.GinkgoTB(), features.DynamicResourceAllocation, true) - features.SetFeatureGateDuringTest(ginkgo.GinkgoTB(), features.DRAExtendedResources, false) + features.SetFeatureGateDuringTest(ginkgo.GinkgoTB(), features.KueueDRAIntegration, true) + features.SetFeatureGateDuringTest(ginkgo.GinkgoTB(), features.KueueDRAIntegrationExtendedResource, false) fwk.StopManager(ctx) fwk.StartManager(ctx, cfg, managerSetup(nil)) }) diff --git a/test/integration/singlecluster/controller/dra/suite_test.go b/test/integration/singlecluster/controller/dra/suite_test.go index d6641276346..5323ef0205a 100644 --- a/test/integration/singlecluster/controller/dra/suite_test.go +++ b/test/integration/singlecluster/controller/dra/suite_test.go @@ -53,7 +53,7 @@ func TestAPIs(t *testing.T) { } var _ = ginkgo.BeforeSuite(func() { - features.SetFeatureGateDuringTest(ginkgo.GinkgoTB(), features.DynamicResourceAllocation, true) + features.SetFeatureGateDuringTest(ginkgo.GinkgoTB(), features.KueueDRAIntegration, true) fwk = &framework.Framework{ WebhookPath: util.WebhookPath, diff --git a/test/integration/singlecluster/scheduler/fairsharing/suite_test.go b/test/integration/singlecluster/scheduler/fairsharing/suite_test.go index 1a6b7a434fe..9ff91bdede4 100644 --- a/test/integration/singlecluster/scheduler/fairsharing/suite_test.go +++ b/test/integration/singlecluster/scheduler/fairsharing/suite_test.go @@ -57,7 +57,7 @@ func TestScheduler(t *testing.T) { } var _ = ginkgo.BeforeSuite(func() { - features.SetFeatureGateDuringTest(ginkgo.GinkgoTB(), features.DynamicResourceAllocation, true) + features.SetFeatureGateDuringTest(ginkgo.GinkgoTB(), features.KueueDRAIntegration, true) fwk = &framework.Framework{ WebhookPath: util.WebhookPath,