From 32a9a8eded4443b16505c87bc3b3217b19797c04 Mon Sep 17 00:00:00 2001 From: Nour Date: Sun, 17 May 2026 19:01:24 +0300 Subject: [PATCH] Adds an example showing how to query DRA resource pool availability with new ResourcePoolStatusRequest API Signed-off-by: Nour --- demo/resource-pool-status.yaml | 95 +++++++++++++++++++++++++++ demo/scripts/kind-cluster-config.yaml | 3 +- test/e2e/e2e_setup_test.go | 49 ++++++++++++++ test/e2e/e2e_test.go | 9 +++ 4 files changed, 155 insertions(+), 1 deletion(-) create mode 100644 demo/resource-pool-status.yaml diff --git a/demo/resource-pool-status.yaml b/demo/resource-pool-status.yaml new file mode 100644 index 00000000..cc8e93e5 --- /dev/null +++ b/demo/resource-pool-status.yaml @@ -0,0 +1,95 @@ +# Example: DRA Resource Availability Visibility (ResourcePoolStatusRequest) +# +# Demonstrates the +# [DRA resource pool status](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#resource-pool-status) +# feature. +# For more information see +# [KEP-5677](https://github.com/kubernetes/enhancements/issues/5677). +# +# This manifest is self-contained: it creates one Pod that consumes 1 GPU and a +# cluster-scoped `ResourcePoolStatusRequest` that asks +# `kube-controller-manager` to publish a one-shot snapshot of every pool served +# by the example driver. The snapshot lists total / allocated / available / +# unavailable device counts per pool. +# +# `ResourcePoolStatusRequest` is particularly useful for non-admin users: +# `ResourceClaim`s are namespaced, so a user cannot ordinarily inspect claims +# in other namespaces. A cluster-scoped `ResourcePoolStatusRequest` lets them +# see aggregate consumption without that visibility. The driver itself needs +# no code changes; the aggregation is computed entirely by +# `kube-controller-manager` from existing `ResourceSlice`s and +# `ResourceClaim`s. 
+# +# Expected: once the controller reconciles the request, `.status` reports +# `allocatedDevices: 1` for the pool backing the consumer Pod. Inspect with: +# kubectl wait --for=condition=Complete \ +# resourcepoolstatusrequest/gpu-pool-status --timeout=30s +# kubectl get resourcepoolstatusrequest/gpu-pool-status -o yaml +# +# `ResourcePoolStatusRequest` is one-shot. To refresh the snapshot after +# launching or tearing down workloads, delete and re-create the request: +# kubectl delete resourcepoolstatusrequest/gpu-pool-status +# kubectl apply --filename=demo/resource-pool-status.yaml +# +# Driver requirements: +# Profile: gpu +# GPUs: 1 +# +# Cluster requirements: +# Kubernetes 1.36+ +# Feature gate: DRAResourcePoolStatus (on kube-apiserver and +# kube-controller-manager) +# API enabled: resource.k8s.io/v1alpha3 + +--- +apiVersion: v1 +kind: Namespace +metadata: + name: resource-pool-status + +--- +apiVersion: resource.k8s.io/v1 +kind: ResourceClaimTemplate +metadata: + namespace: resource-pool-status + name: single-gpu +spec: + spec: + devices: + requests: + - name: gpu + exactly: + deviceClassName: gpu.example.com + +--- +apiVersion: v1 +kind: Pod +metadata: + namespace: resource-pool-status + name: pod0 + labels: + app: pod +spec: + containers: + - name: ctr0 + image: ubuntu:22.04 + command: ["bash", "-c"] + args: ["export; trap 'exit 0' TERM; sleep 9999 & wait"] + resources: + claims: + - name: gpu + resourceClaims: + - name: gpu + resourceClaimTemplateName: single-gpu + +--- +apiVersion: resource.k8s.io/v1alpha3 +kind: ResourcePoolStatusRequest +metadata: + name: gpu-pool-status +spec: + driver: gpu.example.com + # Optional: filter to a specific pool (typically the node name). + # poolName: dra-example-driver-cluster-worker + # Optional: cap the number of pools returned. Defaults to 100, max 1000. 
+ # limit: 10 diff --git a/demo/scripts/kind-cluster-config.yaml b/demo/scripts/kind-cluster-config.yaml index 5f8dab5e..344c9026 100644 --- a/demo/scripts/kind-cluster-config.yaml +++ b/demo/scripts/kind-cluster-config.yaml @@ -7,6 +7,7 @@ featureGates: GangScheduling: true GenericWorkload: true DRAExtendedResource: true + DRAResourcePoolStatus: true containerdConfigPatches: # Enable CDI as described in # https://tags.cncf.io/container-device-interface#containerd-configuration @@ -20,7 +21,7 @@ nodes: kind: ClusterConfiguration apiServer: extraArgs: - runtime-config: "resource.k8s.io/v1beta1=true,scheduling.k8s.io/v1alpha2=true" + runtime-config: "resource.k8s.io/v1beta1=true,resource.k8s.io/v1alpha3=true,scheduling.k8s.io/v1alpha2=true" scheduler: extraArgs: v: "1" diff --git a/test/e2e/e2e_setup_test.go b/test/e2e/e2e_setup_test.go index ac6a28d8..18de8cf3 100644 --- a/test/e2e/e2e_setup_test.go +++ b/test/e2e/e2e_setup_test.go @@ -455,6 +455,55 @@ func verifyExtendedResourceClaimStatus(ctx context.Context, namespace, podName, }, checkPodLogsTimeout, checkPodLogsInterval).Should(Succeed()) } +// resourcePoolStatusRequestGVR identifies the cluster-scoped ResourcePoolStatusRequest resource (resource.k8s.io/v1alpha3). +var resourcePoolStatusRequestGVR = schema.GroupVersionResource{ + Group: "resource.k8s.io", + Version: "v1alpha3", + Resource: "resourcepoolstatusrequests", +} + +// verifyResourcePoolStatusComplete waits for the named ResourcePoolStatusRequest +// to reach the Complete condition and asserts its first pool entry references +// the expected driver name and has a non-empty poolName.
+func verifyResourcePoolStatusComplete(ctx context.Context, name, expectedDriverName string) {
+	GinkgoHelper()
+	// A single polling attempt; Eventually below re-runs it until it succeeds or times out.
+	check := func(g Gomega) {
+		req, err := dynamicClient.Resource(resourcePoolStatusRequestGVR).Get(ctx, name, metav1.GetOptions{})
+		g.Expect(err).NotTo(HaveOccurred(), "Failed to get ResourcePoolStatusRequest %s", name)
+
+		// Require a condition whose type is Complete and whose status is True.
+		conds, _, err := unstructured.NestedSlice(req.Object, "status", "conditions")
+		g.Expect(err).NotTo(HaveOccurred())
+		done := false
+		for _, raw := range conds {
+			cond, isMap := raw.(map[string]any)
+			if isMap && cond["type"] == "Complete" && cond["status"] == "True" {
+				done = true
+				break
+			}
+		}
+		g.Expect(done).To(BeTrue(),
+			"ResourcePoolStatusRequest %s has no Complete=True condition; conditions: %v", name, conds)
+
+		// Only the first reported pool is inspected.
+		poolList, _, err := unstructured.NestedSlice(req.Object, "status", "pools")
+		g.Expect(err).NotTo(HaveOccurred())
+		g.Expect(poolList).NotTo(BeEmpty(),
+			"ResourcePoolStatusRequest %s reported no pools for driver %s", name, expectedDriverName)
+		first, isMap := poolList[0].(map[string]any)
+		g.Expect(isMap).To(BeTrue(), "pool entry is not a map: %T", poolList[0])
+		g.Expect(first["driver"]).To(Equal(expectedDriverName),
+			"ResourcePoolStatusRequest %s pool driver mismatch", name)
+		g.Expect(first["poolName"]).NotTo(BeEmpty(),
+			"ResourcePoolStatusRequest %s pool has empty poolName", name)
+	}
+	Eventually(check).WithContext(ctx).
+		WithTimeout(30 * time.Second).
+		WithPolling(2 * time.Second).
+		Should(Succeed())
+}
+
 // claimNewGPU verifies that a GPU is unclaimed and adds it to observedGPUs.
func claimNewGPU(g Gomega, observedGPUs map[string]string, gpu, namespace, podName, containerName string) { GinkgoHelper() diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index d879d5d0..5d1fecbd 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -161,6 +161,15 @@ var _ = Describe("Test GPU allocation", func() { verifyDRAAdminAccess(ctx, namespace, pods[0], containerName, "true") }) + It("should publish a ResourcePoolStatusRequest snapshot for the driver", func(ctx SpecContext) { + drv := installDriver(ctx, DriverConfig{}) + namespace := "resource-pool-status" + + deployManifest(ctx, namespace, "resource-pool-status.yaml", drv) + checkPodsReadyAndRunning(ctx, namespace, []string{"pod0"}) + verifyResourcePoolStatusComplete(ctx, "gpu-pool-status", drv.DriverName) + }) + It("should allocate 1 GPU per pod for extended resource requests", func(ctx SpecContext) { // Each parallel test must advertise its DeviceClass under a unique // extended resource name so KEP-5004 reservations don't collide.