Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions demo/resource-pool-status.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Example: DRA Resource Availability Visibility (ResourcePoolStatusRequest)
#
# Demonstrates the
# [DRA resource pool status](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#resource-pool-status)
# feature.
# For more information see
# [KEP-5677](https://github.com/kubernetes/enhancements/issues/5677).
#
# This manifest is self-contained: it creates a Namespace, a
# `ResourceClaimTemplate`, one Pod that consumes 1 GPU through that template,
# and a cluster-scoped `ResourcePoolStatusRequest` that asks
# `kube-controller-manager` to publish a one-shot snapshot of every pool served
# by the example driver. The snapshot lists total / allocated / available /
# unavailable device counts per pool.
#
# `ResourcePoolStatusRequest` is particularly useful for non-admin users:
# `ResourceClaim`s are namespaced, so a user cannot ordinarily inspect claims
# in other namespaces. A cluster-scoped `ResourcePoolStatusRequest` lets them
# see aggregate consumption without that visibility. The driver itself needs
# no code changes; the aggregation is computed entirely by
# `kube-controller-manager` from existing `ResourceSlice`s and
# `ResourceClaim`s.
#
# Expected: once the controller reconciles the request, `.status` reports
# `allocatedDevices: 1` for the pool backing the consumer Pod. Inspect with:
# kubectl wait --for=condition=Complete \
# resourcepoolstatusrequest/gpu-pool-status --timeout=30s
# kubectl get resourcepoolstatusrequest/gpu-pool-status -o yaml
#
# `ResourcePoolStatusRequest` is one-shot. To refresh the snapshot after
# launching or tearing down workloads, delete and re-create the request:
# kubectl delete resourcepoolstatusrequest/gpu-pool-status
# kubectl apply --filename=demo/resource-pool-status.yaml
#
# Driver requirements:
# Profile: gpu
# GPUs: 1
#
# Cluster requirements:
# Kubernetes 1.36+
# Feature gate: DRAResourcePoolStatus (on kube-apiserver and
# kube-controller-manager)
# API enabled: resource.k8s.io/v1alpha3

---
# Namespace for the namespaced objects of this example: the
# ResourceClaimTemplate and the Pod in this file both set
# `namespace: resource-pool-status`.
apiVersion: v1
kind: Namespace
metadata:
  name: resource-pool-status

---
# Template referenced by the Pod in this file through
# `resourceClaimTemplateName: single-gpu`. It declares a single device request
# named `gpu` against the `gpu.example.com` DeviceClass.
apiVersion: resource.k8s.io/v1
kind: ResourceClaimTemplate
metadata:
  namespace: resource-pool-status
  name: single-gpu
spec:
  spec:
    devices:
      requests:
      - name: gpu
        exactly:
          # No count is set here; the file header describes this as 1 GPU.
          deviceClassName: gpu.example.com

---
# Consumer Pod: its single container `ctr0` uses the Pod-level claim `gpu`,
# which is generated from the `single-gpu` ResourceClaimTemplate.
apiVersion: v1
kind: Pod
metadata:
  namespace: resource-pool-status
  name: pod0
  labels:
    app: pod
spec:
  containers:
  - name: ctr0
    image: ubuntu:22.04
    command: ["bash", "-c"]
    # Prints the exported environment, then sleeps in the background and waits
    # so that the TERM trap can end the container with exit code 0.
    args: ["export; trap 'exit 0' TERM; sleep 9999 & wait"]
    resources:
      claims:
      # Must match an entry name under `spec.resourceClaims` below.
      - name: gpu
  resourceClaims:
  - name: gpu
    resourceClaimTemplateName: single-gpu

---
# Cluster-scoped request (no `metadata.namespace`) for a snapshot of the pools
# served by the `gpu.example.com` driver. Per the file header the request is
# one-shot: delete and re-create it to refresh the snapshot.
apiVersion: resource.k8s.io/v1alpha3
kind: ResourcePoolStatusRequest
metadata:
  name: gpu-pool-status
spec:
  driver: gpu.example.com
  # Optional: filter to a specific pool (typically the node name).
  # poolName: dra-example-driver-cluster-worker
  # Optional: cap the number of pools returned. Defaults to 100, max 1000.
  # limit: 10
3 changes: 2 additions & 1 deletion demo/scripts/kind-cluster-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ featureGates:
GangScheduling: true
GenericWorkload: true
DRAExtendedResource: true
DRAResourcePoolStatus: true
containerdConfigPatches:
# Enable CDI as described in
# https://tags.cncf.io/container-device-interface#containerd-configuration
Expand All @@ -20,7 +21,7 @@ nodes:
kind: ClusterConfiguration
apiServer:
extraArgs:
runtime-config: "resource.k8s.io/v1beta1=true,scheduling.k8s.io/v1alpha2=true"
runtime-config: "resource.k8s.io/v1beta1=true,resource.k8s.io/v1alpha3=true,scheduling.k8s.io/v1alpha2=true"
scheduler:
extraArgs:
v: "1"
Expand Down
49 changes: 49 additions & 0 deletions test/e2e/e2e_setup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,55 @@ func verifyExtendedResourceClaimStatus(ctx context.Context, namespace, podName,
}, checkPodLogsTimeout, checkPodLogsInterval).Should(Succeed())
}

// resourcePoolStatusRequestGVR is the group/version/resource triple
// (resource.k8s.io, v1alpha3, resourcepoolstatusrequests) used to address the
// cluster-scoped ResourcePoolStatusRequest resource through the dynamic client.
var resourcePoolStatusRequestGVR = schema.GroupVersion{
	Group:   "resource.k8s.io",
	Version: "v1alpha3",
}.WithResource("resourcepoolstatusrequests")

// verifyResourcePoolStatusComplete polls the cluster-scoped
// ResourcePoolStatusRequest called name every 2 seconds, for at most 30
// seconds, until a single fetch satisfies all of the following:
//
//   - status.conditions holds an entry with type "Complete" and status "True";
//   - status.pools is not empty;
//   - the first pool entry's driver equals expectedDriverName and its poolName
//     is not empty.
//
// Only the first pool entry is inspected; any further entries are ignored.
func verifyResourcePoolStatusComplete(ctx context.Context, name, expectedDriverName string) {
	GinkgoHelper()

	// One polling attempt. All expectations go through the per-attempt Gomega
	// handed in by Eventually.
	attempt := func(g Gomega) {
		request, err := dynamicClient.Resource(resourcePoolStatusRequestGVR).Get(ctx, name, metav1.GetOptions{})
		g.Expect(err).NotTo(HaveOccurred(),
			"Failed to get ResourcePoolStatusRequest %s", name)

		conds, _, err := unstructured.NestedSlice(request.Object, "status", "conditions")
		g.Expect(err).NotTo(HaveOccurred())

		// Stop scanning at the first Complete=True condition; entries that
		// are not maps are skipped rather than treated as failures.
		isComplete := false
		for i := 0; i < len(conds) && !isComplete; i++ {
			if entry, isMap := conds[i].(map[string]any); isMap {
				isComplete = entry["type"] == "Complete" && entry["status"] == "True"
			}
		}
		g.Expect(isComplete).To(BeTrue(),
			"ResourcePoolStatusRequest %s has no Complete=True condition; conditions: %v",
			name, conds)

		poolEntries, _, err := unstructured.NestedSlice(request.Object, "status", "pools")
		g.Expect(err).NotTo(HaveOccurred())
		g.Expect(poolEntries).NotTo(BeEmpty(),
			"ResourcePoolStatusRequest %s reported no pools for driver %s",
			name, expectedDriverName)

		firstPool, isMap := poolEntries[0].(map[string]any)
		g.Expect(isMap).To(BeTrue(), "pool entry is not a map: %T", poolEntries[0])
		g.Expect(firstPool["driver"]).To(Equal(expectedDriverName),
			"ResourcePoolStatusRequest %s pool driver mismatch", name)
		g.Expect(firstPool["poolName"]).NotTo(BeEmpty(),
			"ResourcePoolStatusRequest %s pool has empty poolName", name)
	}

	Eventually(attempt).
		WithContext(ctx).
		WithTimeout(30 * time.Second).
		WithPolling(2 * time.Second).
		Should(Succeed())
}

// claimNewGPU verifies that a GPU is unclaimed and adds it to observedGPUs.
func claimNewGPU(g Gomega, observedGPUs map[string]string, gpu, namespace, podName, containerName string) {
GinkgoHelper()
Expand Down
9 changes: 9 additions & 0 deletions test/e2e/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,15 @@ var _ = Describe("Test GPU allocation", func() {
verifyDRAAdminAccess(ctx, namespace, pods[0], containerName, "true")
})

It("should publish a ResourcePoolStatusRequest snapshot for the driver", func(ctx SpecContext) {
drv := installDriver(ctx, DriverConfig{})
namespace := "resource-pool-status"

deployManifest(ctx, namespace, "resource-pool-status.yaml", drv)
checkPodsReadyAndRunning(ctx, namespace, []string{"pod0"})
verifyResourcePoolStatusComplete(ctx, "gpu-pool-status", drv.DriverName)
})

It("should allocate 1 GPU per pod for extended resource requests", func(ctx SpecContext) {
// Each parallel test must advertise its DeviceClass under a unique
// extended resource name so KEP-5004 reservations don't collide.
Expand Down