From 8dd88805c7259655c13168b2eda657d7cc1dadb2 Mon Sep 17 00:00:00 2001 From: Matt Boersma Date: Tue, 10 Mar 2026 16:29:31 +0000 Subject: [PATCH] Add ManagedGPUExperiencePreview nodepool and tests --- ...cluster-template-prow-aks-aso-kuberay.yaml | 45 +++ .../prow-aks-aso-kuberay/kustomization.yaml | 11 + .../patches/aks-gpu-pool.yaml | 42 +++ test/e2e/azure_kuberay.go | 270 ++++++++++++++++++ test/e2e/azure_test.go | 37 +++ test/e2e/config/azure-dev.yaml | 3 + 6 files changed, 408 insertions(+) create mode 100644 templates/test/ci/prow-aks-aso-kuberay/patches/aks-gpu-pool.yaml diff --git a/templates/test/ci/cluster-template-prow-aks-aso-kuberay.yaml b/templates/test/ci/cluster-template-prow-aks-aso-kuberay.yaml index aa9b56b618c..0d00eea3415 100644 --- a/templates/test/ci/cluster-template-prow-aks-aso-kuberay.yaml +++ b/templates/test/ci/cluster-template-prow-aks-aso-kuberay.yaml @@ -147,3 +147,48 @@ spec: name: ${CLUSTER_NAME} type: VirtualMachineScaleSets vmSize: ${AZURE_AKS_NODE_MACHINE_TYPE:=Standard_D2s_v3} +--- +apiVersion: cluster.x-k8s.io/v1beta1 +kind: MachinePool +metadata: + name: ${CLUSTER_NAME}-gpupool + namespace: default +spec: + clusterName: ${CLUSTER_NAME} + replicas: 1 + template: + metadata: {} + spec: + bootstrap: + dataSecretName: "" + clusterName: ${CLUSTER_NAME} + infrastructureRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: AzureASOManagedMachinePool + name: ${CLUSTER_NAME}-gpupool + version: ${KUBERNETES_VERSION} +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: AzureASOManagedMachinePool +metadata: + name: ${CLUSTER_NAME}-gpupool + namespace: default +spec: + resources: + - apiVersion: containerservice.azure.com/v1api20240901 + kind: ManagedClustersAgentPool + metadata: + annotations: + serviceoperator.azure.com/credential-from: ${ASO_CREDENTIAL_SECRET_NAME} + name: ${CLUSTER_NAME}-gpupool + spec: + azureName: gpupool + mode: User + nodeTaints: + - sku=gpu:NoSchedule + owner: + name: ${CLUSTER_NAME} + tags: + EnableManagedGPUExperience: "true" + type: VirtualMachineScaleSets + vmSize: ${AZURE_GPU_NODE_MACHINE_TYPE:=Standard_NC6s_v3} diff --git a/templates/test/ci/prow-aks-aso-kuberay/kustomization.yaml b/templates/test/ci/prow-aks-aso-kuberay/kustomization.yaml index a803e4ab052..70f0d768b80 100644 --- a/templates/test/ci/prow-aks-aso-kuberay/kustomization.yaml +++ b/templates/test/ci/prow-aks-aso-kuberay/kustomization.yaml @@ -3,6 +3,7 @@ kind: Kustomization namespace: default resources: - ../../../flavors/aks-aso +- patches/aks-gpu-pool.yaml patches: - patch: |- @@ -43,6 +44,16 @@ patches: value: "${AZURE_AKS_NODE_MACHINE_TYPE:=Standard_D2s_v3}" target: kind: AzureASOManagedMachinePool +- patch: |- + - op: test + path: /spec/resources/0/kind + value: ManagedClustersAgentPool + - op: replace + path: /spec/resources/0/spec/vmSize + value: "${AZURE_GPU_NODE_MACHINE_TYPE:=Standard_NC6s_v3}" + target: + kind: AzureASOManagedMachinePool + name: ".*-gpupool" sortOptions: order: fifo diff --git a/templates/test/ci/prow-aks-aso-kuberay/patches/aks-gpu-pool.yaml b/templates/test/ci/prow-aks-aso-kuberay/patches/aks-gpu-pool.yaml new file mode 100644 index 00000000000..e20eb095aa4 --- /dev/null +++ b/templates/test/ci/prow-aks-aso-kuberay/patches/aks-gpu-pool.yaml @@ -0,0 +1,42 @@ +apiVersion: cluster.x-k8s.io/v1beta1 +kind: MachinePool +metadata: + name: "${CLUSTER_NAME}-gpupool" +spec: + clusterName: "${CLUSTER_NAME}" + replicas: 1 + template: + metadata: {} + spec: + bootstrap: + dataSecretName: "" + clusterName: "${CLUSTER_NAME}" + infrastructureRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: AzureASOManagedMachinePool + name: "${CLUSTER_NAME}-gpupool" + version: "${KUBERNETES_VERSION}" +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: AzureASOManagedMachinePool +metadata: + name: "${CLUSTER_NAME}-gpupool" +spec: + resources: + - apiVersion: "containerservice.azure.com/v1api20240901" + kind: ManagedClustersAgentPool + metadata: + name: ${CLUSTER_NAME}-gpupool + annotations: + serviceoperator.azure.com/credential-from: ${ASO_CREDENTIAL_SECRET_NAME} + spec: + azureName: gpupool + owner: + name: ${CLUSTER_NAME} + mode: User + type: VirtualMachineScaleSets + vmSize: "${AZURE_GPU_NODE_MACHINE_TYPE:=Standard_NC6s_v3}" + tags: + EnableManagedGPUExperience: "true" + nodeTaints: + - "sku=gpu:NoSchedule" diff --git a/test/e2e/azure_kuberay.go b/test/e2e/azure_kuberay.go index 8e5041b3114..57c16ea3169 100644 --- a/test/e2e/azure_kuberay.go +++ b/test/e2e/azure_kuberay.go @@ -505,3 +505,273 @@ func describeRayJobStatus(ctx context.Context, dynamicClient dynamic.Interface, b.WriteString(describeKubeRayOperatorLogs(ctx, clientset)) return b.String() } + +// KubeRayGPUClusterSpecInput is the input for KubeRayGPUClusterSpec. +type KubeRayGPUClusterSpecInput struct { + BootstrapClusterProxy framework.ClusterProxy + Namespace *corev1.Namespace + ClusterName string + SkipCleanup bool +} + +// KubeRayGPUClusterSpec implements a test that verifies the KubeRay operator can schedule +// Ray workers on AKS GPU nodes provisioned with ManagedGPUExperiencePreview. +func KubeRayGPUClusterSpec(ctx context.Context, inputGetter func() KubeRayGPUClusterSpecInput) { + var ( + specName = "kuberay-gpu" + input KubeRayGPUClusterSpecInput + ) + + input = inputGetter() + Expect(input.BootstrapClusterProxy).NotTo(BeNil(), "Invalid argument. input.BootstrapClusterProxy can't be nil when calling %s spec", specName) + Expect(input.Namespace).NotTo(BeNil(), "Invalid argument. input.Namespace can't be nil when calling %s spec", specName) + Expect(input.ClusterName).NotTo(BeEmpty(), "Invalid argument. input.ClusterName can't be empty when calling %s spec", specName) + + By("creating a Kubernetes client to the workload cluster") + clusterProxy := input.BootstrapClusterProxy.GetWorkloadCluster(ctx, input.Namespace.Name, input.ClusterName) + Expect(clusterProxy).NotTo(BeNil()) + clientset := clusterProxy.GetClientSet() + Expect(clientset).NotTo(BeNil()) + + By("installing the KubeRay operator via Helm") + InstallKubeRayOperator(ctx, clusterProxy, specName) + + By("creating a GPU-enabled RayCluster") + dynamicClient := newDynamicClient(clusterProxy) + rayCluster := newGPURayClusterUnstructured("raycluster-gpu-e2e", corev1.NamespaceDefault) + _, err := dynamicClient.Resource(rayClusterGVR).Namespace(corev1.NamespaceDefault).Create(ctx, rayCluster, metav1.CreateOptions{}) + Expect(err).NotTo(HaveOccurred()) + + By("waiting for the GPU RayCluster to become ready") + Eventually(func() bool { + rc, err := dynamicClient.Resource(rayClusterGVR).Namespace(corev1.NamespaceDefault).Get(ctx, "raycluster-gpu-e2e", metav1.GetOptions{}) + if err != nil { + return false + } + state, found, err := unstructured.NestedString(rc.Object, "status", "state") + if err != nil || !found { + return false + } + return state == "ready" + }, e2eConfig.GetIntervals(specName, "wait-raycluster-ready")...).Should(BeTrue(), func() string { + return describeKubeRayOperatorLogs(ctx, clientset) + }) + + By("verifying the GPU worker pod is running on a GPU node") + Eventually(func() bool { + pods, err := clientset.CoreV1().Pods(corev1.NamespaceDefault).List(ctx, metav1.ListOptions{ + LabelSelector: "ray.io/node-type=worker", + }) + if err != nil || len(pods.Items) == 0 { + return false + } + for _, pod := range pods.Items { + if pod.Status.Phase != corev1.PodRunning { + continue + } + // Verify the pod was scheduled on the GPU pool + if pod.Spec.NodeName == "" { + continue + } + node, err := clientset.CoreV1().Nodes().Get(ctx, pod.Spec.NodeName, metav1.GetOptions{}) + if err != nil { + continue + } + if pool, ok := node.Labels["agentpool"]; ok && pool == "gpupool" { + return true + } + } + return false + }, e2eConfig.GetIntervals(specName, "wait-deployment")...).Should(BeTrue(), "GPU worker pod did not reach Running state on a GPU node") + + By("running a RayJob that verifies GPU access") + rayJob := newGPURayJobUnstructured("rayjob-gpu-e2e", corev1.NamespaceDefault) + _, err = dynamicClient.Resource(rayJobGVR).Namespace(corev1.NamespaceDefault).Create(ctx, rayJob, metav1.CreateOptions{}) + Expect(err).NotTo(HaveOccurred()) + + By("waiting for the GPU RayJob to complete") + Eventually(func() bool { + rj, err := dynamicClient.Resource(rayJobGVR).Namespace(corev1.NamespaceDefault).Get(ctx, "rayjob-gpu-e2e", metav1.GetOptions{}) + if err != nil { + return false + } + deploymentStatus, found, err := unstructured.NestedString(rj.Object, "status", "jobDeploymentStatus") + if err != nil || !found { + return false + } + return deploymentStatus == "Complete" + }, e2eConfig.GetIntervals(specName, "wait-rayjob-complete")...).Should(BeTrue(), func() string { + return describeRayJobStatus(ctx, dynamicClient, "rayjob-gpu-e2e", corev1.NamespaceDefault, clientset) + }) + + By("verifying the GPU RayJob completed with SUCCEEDED status") + rj, err := dynamicClient.Resource(rayJobGVR).Namespace(corev1.NamespaceDefault).Get(ctx, "rayjob-gpu-e2e", metav1.GetOptions{}) + Expect(err).NotTo(HaveOccurred()) + jobStatus, _, _ := unstructured.NestedString(rj.Object, "status", "jobStatus") + Expect(jobStatus).To(Equal("SUCCEEDED"), "expected GPU RayJob status to be SUCCEEDED but got %s", jobStatus) + + if !input.SkipCleanup { + By("deleting the GPU RayJob") + err = dynamicClient.Resource(rayJobGVR).Namespace(corev1.NamespaceDefault).Delete(ctx, "rayjob-gpu-e2e", metav1.DeleteOptions{}) + Expect(err).NotTo(HaveOccurred()) + + By("deleting the GPU RayCluster") + err = dynamicClient.Resource(rayClusterGVR).Namespace(corev1.NamespaceDefault).Delete(ctx, "raycluster-gpu-e2e", metav1.DeleteOptions{}) + Expect(err).NotTo(HaveOccurred()) + } +} + +// gpuToleration returns the toleration needed to schedule pods on GPU-tainted nodes. +func gpuToleration() map[string]interface{} { + return map[string]interface{}{ + "key": "sku", + "operator": "Equal", + "value": "gpu", + "effect": "NoSchedule", + } +} + +// newGPURayClusterUnstructured creates a RayCluster with GPU worker nodes that tolerate +// the GPU taint and request nvidia.com/gpu resources. +func newGPURayClusterUnstructured(name, namespace string) *unstructured.Unstructured { + return &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "ray.io/v1", + "kind": "RayCluster", + "metadata": map[string]interface{}{ + "name": name, + "namespace": namespace, + }, + "spec": map[string]interface{}{ + "rayVersion": "2.41.0", + "headGroupSpec": map[string]interface{}{ + "rayStartParams": map[string]interface{}{ + "dashboard-host": "0.0.0.0", + }, + "template": map[string]interface{}{ + "spec": map[string]interface{}{ + "nodeSelector": map[string]interface{}{ + "kubernetes.io/os": "linux", + }, + "containers": []interface{}{ + map[string]interface{}{ + "name": "ray-head", + "image": rayImage, + "ports": []interface{}{ + map[string]interface{}{ + "containerPort": int64(6379), + "name": "gcs-server", + }, + map[string]interface{}{ + "containerPort": int64(8265), + "name": "dashboard", + }, + map[string]interface{}{ + "containerPort": int64(10001), + "name": "client", + }, + }, + "resources": map[string]interface{}{ + "requests": map[string]interface{}{ + "cpu": "300m", + "memory": "1Gi", + }, + "limits": map[string]interface{}{ + "cpu": "500m", + "memory": "2Gi", + }, + }, + }, + }, + }, + }, + }, + "workerGroupSpecs": []interface{}{ + map[string]interface{}{ + "replicas": int64(1), + "minReplicas": int64(1), + "maxReplicas": int64(1), + "groupName": "gpu-group", + "rayStartParams": map[string]interface{}{ + "num-gpus": "1", + }, + "template": map[string]interface{}{ + "spec": map[string]interface{}{ + "nodeSelector": map[string]interface{}{ + "kubernetes.io/os": "linux", + "agentpool": "gpupool", + }, + "tolerations": []interface{}{ + gpuToleration(), + }, + "containers": []interface{}{ + map[string]interface{}{ + "name": "ray-worker", + "image": rayImage, + "resources": map[string]interface{}{ + "requests": map[string]interface{}{ + "cpu": "300m", + "memory": "1Gi", + "nvidia.com/gpu": "1", + }, + "limits": map[string]interface{}{ + "cpu": "500m", + "memory": "2Gi", + "nvidia.com/gpu": "1", + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + } +} + +// newGPURayJobUnstructured creates a RayJob that verifies GPU access via nvidia-smi, +// using the existing GPU RayCluster rather than creating an inline one. +func newGPURayJobUnstructured(name, namespace string) *unstructured.Unstructured { + return &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "ray.io/v1", + "kind": "RayJob", + "metadata": map[string]interface{}{ + "name": name, + "namespace": namespace, + }, + "spec": map[string]interface{}{ + "entrypoint": "python -c \"import ray; ray.init(); print('GPU IDs:', ray.get_gpu_ids()); assert len(ray.get_gpu_ids()) > 0, 'No GPUs found'; print('GPU test passed'); ray.shutdown()\"", + "clusterSelector": map[string]interface{}{ + "ray.io/cluster": "raycluster-gpu-e2e", + }, + "submitterPodTemplate": map[string]interface{}{ + "spec": map[string]interface{}{ + "restartPolicy": "Never", + "nodeSelector": map[string]interface{}{ + "kubernetes.io/os": "linux", + }, + "containers": []interface{}{ + map[string]interface{}{ + "name": "ray-job-submitter", + "image": rayImage, + "resources": map[string]interface{}{ + "requests": map[string]interface{}{ + "cpu": "200m", + "memory": "200Mi", + }, + "limits": map[string]interface{}{ + "cpu": "500m", + "memory": "500Mi", + }, + }, + }, + }, + }, + }, + }, + }, + } +} diff --git a/test/e2e/azure_test.go b/test/e2e/azure_test.go index fb64d5a5113..9242974de5f 100644 --- a/test/e2e/azure_test.go +++ b/test/e2e/azure_test.go @@ -1530,5 +1530,42 @@ spec: By("PASSED!") }) + + // This test requires the ManagedGPUExperiencePreview feature flag to be registered + // on the Azure subscription, and GPU quota for the configured SKU (default: Standard_NC6s_v3). + // Override the SKU via AZURE_GPU_NODE_MACHINE_TYPE. The aks-aso-kuberay flavor includes + // a dedicated GPU node pool with the EnableManagedGPUExperience=true tag. + It("Creates a RayCluster with GPU workers using ManagedGPUExperiencePreview [GPU]", func() { + clusterName = getClusterName(clusterNamePrefix, "kuberay-gpu") + kubernetesVersion, err := GetAKSKubernetesVersion(ctx, e2eConfig, AKSKubernetesVersion) + Expect(err).NotTo(HaveOccurred()) + + clusterctl.ApplyClusterTemplateAndWait(ctx, createApplyClusterTemplateInput( + specName, + withFlavor("aks-aso-kuberay"), + withNamespace(namespace.Name), + withClusterName(clusterName), + withKubernetesVersion(kubernetesVersion), + withWorkerMachineCount(1), + withMachinePoolInterval(specName, "wait-gpu-nodes"), + withControlPlaneWaiters(clusterctl.ControlPlaneWaiters{ + WaitForControlPlaneInitialized: WaitForAKSControlPlaneInitialized, + WaitForControlPlaneMachinesReady: WaitForAKSControlPlaneReady, + }), + ), result) + + By("Running the KubeRay GPU spec", func() { + KubeRayGPUClusterSpec(ctx, func() KubeRayGPUClusterSpecInput { + return KubeRayGPUClusterSpecInput{ + BootstrapClusterProxy: bootstrapClusterProxy, + Namespace: namespace, + ClusterName: clusterName, + SkipCleanup: skipCleanup, + } + }) + }) + + By("PASSED!") + }) }) }) diff --git a/test/e2e/config/azure-dev.yaml b/test/e2e/config/azure-dev.yaml index 827b595a003..8fbe4b3002f 100644 --- a/test/e2e/config/azure-dev.yaml +++ b/test/e2e/config/azure-dev.yaml @@ -299,6 +299,9 @@ intervals: kuberay-cluster/wait-raycluster-ready: ["10m", "10s"] kuberay-job/wait-deployment: ["15m", "10s"] kuberay-job/wait-rayjob-complete: ["15m", "10s"] + kuberay-gpu/wait-deployment: ["20m", "10s"] + kuberay-gpu/wait-raycluster-ready: ["15m", "10s"] + kuberay-gpu/wait-rayjob-complete: ["15m", "10s"] csi-migration/wait-controlplane-upgrade: ["60m", "10s"] csi-migration/wait-worker-nodes: ["60m", "10s"] csi-migration/wait-control-plane: ["60m", "10s"]