From b5faa64380fd5038c5e73739db874198076d92ca Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Tue, 21 Apr 2026 11:50:26 +0000 Subject: [PATCH 01/68] feat: support mount Azure File with user-provided managed identity token Add mountWithManagedIdentityToken parameter to support cross-tenant scenarios where users provide their own managed identity OAuth token via a Kubernetes Secret. This initial version supports the first mount flow: - New volume context parameter: mountwithmanagedidentitytoken - Secret field: azurestoragemanagedidentitytoken - Uses azfilesauthmanager to set credential cache with direct token - Mutually exclusive with mountWithManagedIdentity and mountWithWorkloadIdentityToken - Linux SMB only (sec=krb5 mount) Token refresh will be added in a follow-up PR. Design: https://github.com/andyzhangx/demo/blob/master/linux/azurefile/mount-with-managed-identity-token.md Fixes: https://github.com/kubernetes-sigs/azurefile-csi-driver/issues/3099 --- pkg/azurefile/azurefile.go | 45 +++++++++++++++----- pkg/azurefile/nodeserver.go | 71 +++++++++++++++++++++++++++++--- pkg/azurefile/nodeserver_test.go | 52 ++++++++++++++++++++++- pkg/azurefile/utils.go | 30 ++++++++++---- pkg/azurefile/utils_test.go | 41 +++++++++++++++++- 5 files changed, 211 insertions(+), 28 deletions(-) diff --git a/pkg/azurefile/azurefile.go b/pkg/azurefile/azurefile.go index f1051b665b..34c5dc0c82 100644 --- a/pkg/azurefile/azurefile.go +++ b/pkg/azurefile/azurefile.go @@ -173,8 +173,11 @@ const ( runtimeClassHandlerField = "runtimeclasshandler" defaultRuntimeClassHandler = "kata-cc" mountWithManagedIdentityField = "mountwithmanagedidentity" + mountWithOAuthTokenField = "mountwithoauthtoken" mountWithWITokenField = "mountwithworkloadidentitytoken" + defaultSecretOAuthToken = "oauthtoken" + accountNotProvisioned = "StorageAccountIsNotProvisioned" // this is a workaround fix for 429 throttling issue, will update cloud provider for better fix later tooManyRequests = "TooManyRequests" @@ -824,7 +827,7 @@ func (d *Driver) GetAccountInfo(ctx context.Context, volumeID string, secrets, r var protocol, accountKey, secretName, pvcNamespace string // getAccountKeyFromSecret indicates whether get account key only from k8s secret - var getAccountKeyFromSecret, getLatestAccountKey, mountWithManagedIdentity, mountWithWIToken bool + var getAccountKeyFromSecret, getLatestAccountKey, mountWithManagedIdentity, mountWithOAuthToken, mountWithWIToken bool var clientID, tenantID, tokenFilePath, serviceAccountToken string for k, v := range reqContext { @@ -861,6 +864,10 @@ func (d *Driver) GetAccountInfo(ctx context.Context, volumeID string, secrets, r if mountWithManagedIdentity, err = strconv.ParseBool(v); err != nil { return rgName, accountName, accountKey, fileShareName, diskName, subsID, tenantID, tokenFilePath, fmt.Errorf("invalid %s: %s in volume context", mountWithManagedIdentityField, v) } + case mountWithOAuthTokenField: + if mountWithOAuthToken, err = strconv.ParseBool(v); err != nil { + return rgName, accountName, accountKey, fileShareName, diskName, subsID, tenantID, tokenFilePath, fmt.Errorf("invalid %s: %s in volume context", mountWithOAuthTokenField, v) + } case mountWithWITokenField: if mountWithWIToken, err = strconv.ParseBool(v); err != nil { return rgName, accountName, accountKey, fileShareName, diskName, subsID, tenantID, tokenFilePath, fmt.Errorf("invalid %s: %s in volume context", mountWithWITokenField, v) @@ -905,6 +912,21 @@ func (d *Driver) GetAccountInfo(ctx context.Context, volumeID string, secrets, r return rgName, accountName, accountKey, fileShareName, diskName, subsID, tenantID, tokenFilePath, nil } + if mountWithOAuthToken { + klog.V(2).Infof("mountWithOAuthToken is true, use OAuth token from secret for mount") + // Read accountName from secret if not already set + if accountName == "" && secretName != "" { + name, _, _, err := d.GetStorageAccountFromSecret(ctx, secretName, secretNamespace) + if err != nil { + return rgName, accountName, accountKey, fileShareName, diskName, subsID, tenantID, tokenFilePath, fmt.Errorf("failed to get account name from secret for mountWithOAuthToken: %v", err) + } + if name != "" { + accountName = name + } + } + return rgName, accountName, accountKey, fileShareName, diskName, subsID, tenantID, tokenFilePath, nil + } + if mountWithWIToken { if clientID == "" { clientID = d.cloud.Config.AzureAuthConfig.UserAssignedIdentityID @@ -957,7 +979,7 @@ func (d *Driver) GetAccountInfo(ctx context.Context, volumeID string, secrets, r if secretName != "" { var name string // 2. if not found in cache, get account key from kubernetes secret - name, accountKey, err = d.GetStorageAccountFromSecret(ctx, secretName, secretNamespace) + name, accountKey, _, err = d.GetStorageAccountFromSecret(ctx, secretName, secretNamespace) if name != "" { accountName = name } @@ -1354,7 +1376,7 @@ func (d *Driver) GetStorageAccesskey(ctx context.Context, accountOptions *storag if secretName == "" { secretName = fmt.Sprintf(secretNameTemplate, accountName) } - _, accountKey, err := d.GetStorageAccountFromSecret(ctx, secretName, secretNamespace) + _, accountKey, _, err := d.GetStorageAccountFromSecret(ctx, secretName, secretNamespace) if err != nil { klog.V(2).Infof("could not get account(%s) key from secret(%s), error: %v, use cluster identity to get account key instead", accountOptions.Name, secretName, err) accountKey, err = d.GetStorageAccesskeyWithSubsID(ctx, accountOptions.SubscriptionID, accountOptions.Name, accountOptions.ResourceGroup, accountOptions.GetLatestAccountKey) @@ -1378,21 +1400,22 @@ func (d *Driver) GetStorageAccesskeyWithSubsID(ctx context.Context, subsID, acco return d.cloud.GetStorageAccesskey(ctx, accountClient, account, resourceGroup, getLatestAccountKey) } -// GetStorageAccountFromSecret get storage account key from k8s secret -// return -func (d *Driver) GetStorageAccountFromSecret(ctx context.Context, secretName, secretNamespace string) (string, string, error) { +// GetStorageAccountFromSecret get storage account key and OAuth token from k8s secret +// return +func (d *Driver) GetStorageAccountFromSecret(ctx context.Context, secretName, secretNamespace string) (string, string, string, error) { if d.kubeClient == nil { - return "", "", fmt.Errorf("could not get account key from secret(%s): KubeClient is nil", secretName) + return "", "", "", fmt.Errorf("could not get account key from secret(%s): KubeClient is nil", secretName) } secret, err := d.kubeClient.CoreV1().Secrets(secretNamespace).Get(ctx, secretName, metav1.GetOptions{}) if err != nil { - return "", "", fmt.Errorf("could not get secret(%v): %v", secretName, err) + return "", "", "", fmt.Errorf("could not get secret(%v): %v", secretName, err) } - accountName := strings.TrimSpace(string(secret.Data[defaultSecretAccountName][:])) - accountKey := strings.TrimSpace(string(secret.Data[defaultSecretAccountKey][:])) - return accountName, accountKey, nil + accountName := strings.TrimSpace(string(secret.Data[defaultSecretAccountName])) + accountKey := strings.TrimSpace(string(secret.Data[defaultSecretAccountKey])) + oauthToken := strings.TrimSpace(string(secret.Data[defaultSecretOAuthToken])) + return accountName, accountKey, oauthToken, nil } // getSubnetResourceID get default subnet resource ID from cloud provider config diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 46527d3271..c5073793cd 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -293,7 +293,7 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe // don't respect fsType from req.GetVolumeCapability().GetMount().GetFsType() // since it's ext4 by default on Linux var fsType, server, protocol, ephemeralVolMountOptions, storageEndpointSuffix, folderName, clientID string - var ephemeralVol, createFolderIfNotExist, encryptInTransit, mountWithManagedIdentity, mountWithWIToken bool + var ephemeralVol, createFolderIfNotExist, encryptInTransit, mountWithManagedIdentity, mountWithOAuthToken, mountWithWIToken bool volumeMetadataReplaceMap := map[string]string{} mountPermissions := d.mountPermissions @@ -354,6 +354,11 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe if err != nil { return nil, status.Error(codes.InvalidArgument, fmt.Sprintf("Volume context property %q must be a boolean value: %v", k, err)) } + case mountWithOAuthTokenField: + mountWithOAuthToken, err = strconv.ParseBool(v) + if err != nil { + return nil, status.Error(codes.InvalidArgument, fmt.Sprintf("Volume context property %q must be a boolean value: %v", k, err)) + } case mountWithWITokenField: mountWithWIToken, err = strconv.ParseBool(v) if err != nil { @@ -380,8 +385,23 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe return nil, status.Errorf(codes.InvalidArgument, "fsGroupChangePolicy(%s) is not supported, supported fsGroupChangePolicy list: %v", fsGroupChangePolicy, supportedFSGroupChangePolicyList) } - if mountWithManagedIdentity && mountWithWIToken { - return nil, status.Error(codes.InvalidArgument, "mountWithManagedIdentity and mountWithWIToken cannot be both true") + if (mountWithManagedIdentity && mountWithWIToken) || (mountWithManagedIdentity && mountWithOAuthToken) || (mountWithWIToken && mountWithOAuthToken) { + return nil, status.Error(codes.InvalidArgument, "only one of mountWithManagedIdentity, mountWithOAuthToken, and mountWithWorkloadIdentityToken can be true") + } + + if mountWithOAuthToken { + if runtime.GOOS == "windows" { + return nil, status.Error(codes.InvalidArgument, "mountWithOAuthToken is not supported on Windows") + } + if protocol == nfs { + return nil, status.Error(codes.InvalidArgument, "mountWithOAuthToken is not supported with NFS protocol") + } + if strings.TrimSpace(getValueInMap(context, secretNameField)) == "" { + return nil, status.Error(codes.InvalidArgument, "secretName is required when mountWithOAuthToken is true") + } + if strings.EqualFold(getValueInMap(context, createFolderIfNotExistField), trueValue) { + return nil, status.Error(codes.InvalidArgument, "createFolderIfNotExist is not supported with mountWithOAuthToken") + } } lockKey := fmt.Sprintf("%s-%s", volumeID, targetPath) @@ -451,10 +471,13 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe klog.V(2).Infof("using workload identity token for volume %s with mount options: %v", volumeID, sensitiveMountOptions) if tokenFilePath != "" { // always set credential cache when token file is provided even mount does not happen - if out, err := setCredentialCache(server, clientID, tenantID, tokenFilePath); err != nil { + if out, err := setCredentialCache(server, clientID, tenantID, tokenFilePath, ""); err != nil { return nil, status.Errorf(codes.Internal, "setCredentialCache failed for %s with error: %v, output: %s", server, err, out) } } + } else if mountWithOAuthToken && runtime.GOOS != "windows" { + sensitiveMountOptions = []string{"sec=krb5,cruid=0,upcall_target=mount"} + klog.V(2).Infof("using OAuth token from secret for volume %s", volumeID) } else { if accountName == "" || accountKey == "" { return nil, status.Errorf(codes.Internal, "accountName(%s) or accountKey is empty", accountName) @@ -516,10 +539,15 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe } else { execFunc := func() error { if mountWithManagedIdentity && protocol != nfs && runtime.GOOS != "windows" { - if out, err := setCredentialCache(server, clientID, tenantID, tokenFilePath); err != nil { + if out, err := setCredentialCache(server, clientID, tenantID, tokenFilePath, ""); err != nil { return fmt.Errorf("setCredentialCache failed for %s with error: %v, output: %s", server, err, out) } } + if mountWithOAuthToken && protocol != nfs && runtime.GOOS != "windows" { + if err := d.setCredentialCacheWithOAuthToken(ctx, server, context); err != nil { + return fmt.Errorf("setCredentialCacheWithOAuthToken failed for %s with error: %v", server, err) + } + } return SMBMount(d.mounter, source, cifsMountPath, mountFsType, mountOptions, sensitiveMountOptions) } timeoutFunc := func() error { @@ -834,6 +862,39 @@ func (d *Driver) ensureMountPoint(target string, perm os.FileMode) (bool, error) return !notMnt, nil } +// setCredentialCacheWithOAuthToken reads the OAuth token from the referenced +// Kubernetes Secret via GetStorageAccountFromSecret and calls setCredentialCache +// to update the credential cache. +func (d *Driver) setCredentialCacheWithOAuthToken(ctx context.Context, server string, volumeContext map[string]string) error { + secretName := getValueInMap(volumeContext, secretNameField) + secretNamespace := getValueInMap(volumeContext, secretNamespaceField) + if secretNamespace == "" { + secretNamespace = getValueInMap(volumeContext, pvcNamespaceKey) + if secretNamespace == "" { + secretNamespace = defaultNamespace + } + } + if secretName == "" { + return fmt.Errorf("secretName is required when %s is true", mountWithOAuthTokenField) + } + + _, _, oauthToken, err := d.GetStorageAccountFromSecret(ctx, secretName, secretNamespace) + if err != nil { + return fmt.Errorf("failed to get secret %s/%s: %v", secretNamespace, secretName, err) + } + + if oauthToken == "" { + return fmt.Errorf("%s not found in secret %s/%s", defaultSecretOAuthToken, secretNamespace, secretName) + } + + if out, err := setCredentialCache(server, "", "", "", oauthToken); err != nil { + return fmt.Errorf("setCredentialCache failed for %s: %v, output: %s", server, err, out) + } + + klog.V(2).Infof("setCredentialCacheWithOAuthToken: refreshed credential cache for server %s using secret %s/%s", server, secretNamespace, secretName) + return nil +} + func (d *Driver) mountWithProxy(ctx context.Context, source, target, fsType string, options, sensitiveMountOptions []string) error { conn, err := grpc.NewClient(d.azurefileProxyEndpoint, grpc.WithTransportCredentials(insecure.NewCredentials())) if err != nil { diff --git a/pkg/azurefile/nodeserver_test.go b/pkg/azurefile/nodeserver_test.go index f12c5cd32c..cfbdb41fea 100644 --- a/pkg/azurefile/nodeserver_test.go +++ b/pkg/azurefile/nodeserver_test.go @@ -856,7 +856,7 @@ func TestNodeStageVolume(t *testing.T) { }, }, { - desc: "[Error] mountWithManagedIdentity and mountWithWIToken cannot be both true", + desc: "[Error] only one of mountWithManagedIdentity, mountWithOAuthToken, and mountWithWorkloadIdentityToken can be true", req: &csi.NodeStageVolumeRequest{VolumeId: "vol_1##", StagingTargetPath: sourceTest, VolumeCapability: &stdVolCap, VolumeContext: map[string]string{ @@ -868,7 +868,55 @@ func TestNodeStageVolume(t *testing.T) { }, Secrets: secrets}, expectedErr: testutil.TestError{ - DefaultError: status.Error(codes.InvalidArgument, "mountWithManagedIdentity and mountWithWIToken cannot be both true"), + DefaultError: status.Error(codes.InvalidArgument, "only one of mountWithManagedIdentity, mountWithOAuthToken, and mountWithWorkloadIdentityToken can be true"), + }, + }, + { + desc: "[Error] mountWithOAuthToken not supported with NFS", + req: &csi.NodeStageVolumeRequest{VolumeId: "vol_1##", StagingTargetPath: sourceTest, + VolumeCapability: &stdVolCap, + VolumeContext: map[string]string{ + shareNameField: "test_sharename", + storageAccountField: "test_accountname", + mountWithOAuthTokenField: "true", + protocolField: "nfs", + }, + Secrets: secrets}, + expectedErr: testutil.TestError{ + DefaultError: status.Error(codes.InvalidArgument, "mountWithOAuthToken is not supported with NFS protocol"), + WindowsError: status.Error(codes.InvalidArgument, "mountWithOAuthToken is not supported on Windows"), + }, + }, + { + desc: "[Error] mountWithOAuthToken missing secretName", + req: &csi.NodeStageVolumeRequest{VolumeId: "vol_1##", StagingTargetPath: sourceTest, + VolumeCapability: &stdVolCap, + VolumeContext: map[string]string{ + shareNameField: "test_sharename", + storageAccountField: "test_accountname", + mountWithOAuthTokenField: "true", + }, + Secrets: secrets}, + expectedErr: testutil.TestError{ + DefaultError: status.Error(codes.InvalidArgument, "secretName is required when mountWithOAuthToken is true"), + WindowsError: status.Error(codes.InvalidArgument, "mountWithOAuthToken is not supported on Windows"), + }, + }, + { + desc: "[Error] mountWithOAuthToken with createFolderIfNotExist", + req: &csi.NodeStageVolumeRequest{VolumeId: "vol_1##", StagingTargetPath: sourceTest, + VolumeCapability: &stdVolCap, + VolumeContext: map[string]string{ + shareNameField: "test_sharename", + storageAccountField: "test_accountname", + mountWithOAuthTokenField: "true", + secretNameField: "test-secret", + createFolderIfNotExistField: "true", + }, + Secrets: secrets}, + expectedErr: testutil.TestError{ + DefaultError: status.Error(codes.InvalidArgument, "createFolderIfNotExist is not supported with mountWithOAuthToken"), + WindowsError: status.Error(codes.InvalidArgument, "mountWithOAuthToken is not supported on Windows"), }, }, { diff --git a/pkg/azurefile/utils.go b/pkg/azurefile/utils.go index 61f915c41b..76ba74f423 100644 --- a/pkg/azurefile/utils.go +++ b/pkg/azurefile/utils.go @@ -419,27 +419,39 @@ func getDefaultBandwidth(requestGiB int, storageAccountType string) *int32 { return &bandwidth } -func setCredentialCache(server, clientID, tenantID, tokenFile string) ([]byte, error) { +func setCredentialCache(server, clientID, tenantID, tokenFile, token string) ([]byte, error) { if server == "" { return nil, fmt.Errorf("server must be provided") } - if clientID == "" { - return nil, fmt.Errorf("clientID must be provided") - } + serverURL := "https://" + server var args []string - if tokenFile != "" { + switch { + case token != "": + // direct token mode: azfilesauthmanager set https:// + args = []string{"set", serverURL, token} + case tokenFile != "": + if clientID == "" { + return nil, fmt.Errorf("clientID must be provided") + } if tenantID == "" { return nil, fmt.Errorf("tenantID must be provided when tokenFile is provided") } - args = []string{"set", "https://" + server, "--workload-identity", "--tenant-id", tenantID, "--client-id", clientID, "--token-file", tokenFile} - } else { - args = []string{"set", "https://" + server, "--imds-client-id", clientID} + args = []string{"set", serverURL, "--workload-identity", "--tenant-id", tenantID, "--client-id", clientID, "--token-file", tokenFile} + default: + if clientID == "" { + return nil, fmt.Errorf("clientID must be provided") + } + args = []string{"set", serverURL, "--imds-client-id", clientID} } cmd := exec.Command("azfilesauthmanager", args...) cmd.Env = append(os.Environ(), cmd.Env...) - klog.V(2).Infof("Executing command: %q", cmd.String()) + if token != "" { + klog.V(2).Infof("Executing command: azfilesauthmanager set %s ", serverURL) + } else { + klog.V(2).Infof("Executing command: %q", cmd.String()) + } return cmd.CombinedOutput() } diff --git a/pkg/azurefile/utils_test.go b/pkg/azurefile/utils_test.go index 3e579d848f..3a05ece3c4 100644 --- a/pkg/azurefile/utils_test.go +++ b/pkg/azurefile/utils_test.go @@ -1480,7 +1480,7 @@ func TestSetCredentialCache(t *testing.T) { } for _, test := range tests { - _, err := setCredentialCache(test.server, test.clientID, test.tenantID, test.tokenFile) + _, err := setCredentialCache(test.server, test.clientID, test.tenantID, test.tokenFile, "") if test.expectedError != "" { if err == nil { t.Errorf("test[%s]: expected error containing %q, got nil", test.desc, test.expectedError) @@ -1491,6 +1491,45 @@ func TestSetCredentialCache(t *testing.T) { // Note: We don't test successful execution as it requires azfilesauthmanager binary // The actual command execution will fail, but we've validated the argument construction } + + // Test direct token mode + tokenTests := []struct { + desc string + server string + token string + expectedError string + }{ + { + desc: "token mode: empty server", + server: "", + token: "test-oauth-token", + expectedError: "server must be provided", + }, + { + desc: "token mode: valid token bypasses clientID check", + server: "test.file.core.windows.net", + token: "test-oauth-token", + expectedError: "", // Will fail due to missing azfilesauthmanager, but must NOT fail with clientID/tenantID error + }, + } + + for _, test := range tokenTests { + _, err := setCredentialCache(test.server, "", "", "", test.token) + if test.expectedError != "" { + if err == nil { + t.Errorf("test[%s]: expected error containing %q, got nil", test.desc, test.expectedError) + } else if !strings.Contains(err.Error(), test.expectedError) { + t.Errorf("test[%s]: expected error containing %q, got %v", test.desc, test.expectedError, err) + } + } else if err != nil { + // Token mode should not return clientID/tenantID validation errors + errMsg := err.Error() + if strings.Contains(errMsg, "clientID must be provided") || strings.Contains(errMsg, "tenantID must be provided") { + t.Errorf("test[%s]: token mode should bypass clientID/tenantID validation, got: %v", test.desc, err) + } + // Other errors (e.g., azfilesauthmanager not found) are expected in test environment + } + } } func int32Ptr(i int32) *int32 { From 44c7c2fb85ba2b08dd895bfab233b569c28a94e8 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Wed, 22 Apr 2026 02:54:43 +0000 Subject: [PATCH 02/68] fix: redact command output in setCredentialCache error to avoid token leak --- pkg/azurefile/nodeserver.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index c5073793cd..44a255dafd 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -887,8 +887,8 @@ func (d *Driver) setCredentialCacheWithOAuthToken(ctx context.Context, server st return fmt.Errorf("%s not found in secret %s/%s", defaultSecretOAuthToken, secretNamespace, secretName) } - if out, err := setCredentialCache(server, "", "", "", oauthToken); err != nil { - return fmt.Errorf("setCredentialCache failed for %s: %v, output: %s", server, err, out) + if _, err := setCredentialCache(server, "", "", "", oauthToken); err != nil { + return fmt.Errorf("setCredentialCache failed for %s: %v", server, err) } klog.V(2).Infof("setCredentialCacheWithOAuthToken: refreshed credential cache for server %s using secret %s/%s", server, secretNamespace, secretName) From 5d26a0d76b855bc5a31f9515d05fe55a4d9a1aca Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Wed, 22 Apr 2026 02:58:33 +0000 Subject: [PATCH 03/68] feat: ignore mountWithOAuthToken parameter in controller server --- pkg/azurefile/controllerserver.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/azurefile/controllerserver.go b/pkg/azurefile/controllerserver.go index 0e07c13f3b..6b0ce5cdfd 100644 --- a/pkg/azurefile/controllerserver.go +++ b/pkg/azurefile/controllerserver.go @@ -327,6 +327,8 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) if err != nil { return nil, status.Errorf(codes.InvalidArgument, "invalid %s: %s in storage class", mountWithWITokenField, v) } + case mountWithOAuthTokenField: + // ignored in controller, handled in NodeStageVolume default: return nil, status.Errorf(codes.InvalidArgument, "invalid parameter %q in storage class", k) } From bbfa76f2918d43d8923f9f335783475d6905b32a Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Wed, 22 Apr 2026 03:06:55 +0000 Subject: [PATCH 04/68] fix: generalize error message in GetStorageAccountFromSecret --- pkg/azurefile/azurefile.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/azurefile/azurefile.go b/pkg/azurefile/azurefile.go index 34c5dc0c82..948f9ee5a4 100644 --- a/pkg/azurefile/azurefile.go +++ b/pkg/azurefile/azurefile.go @@ -1404,7 +1404,7 @@ func (d *Driver) GetStorageAccesskeyWithSubsID(ctx context.Context, subsID, acco // return func (d *Driver) GetStorageAccountFromSecret(ctx context.Context, secretName, secretNamespace string) (string, string, string, error) { if d.kubeClient == nil { - return "", "", "", fmt.Errorf("could not get account key from secret(%s): KubeClient is nil", secretName) + return "", "", "", fmt.Errorf("could not get credentials from secret(%s): KubeClient is nil", secretName) } secret, err := d.kubeClient.CoreV1().Secrets(secretNamespace).Get(ctx, secretName, metav1.GetOptions{}) From 9b5f2eecb3f0cc6a5923c86c665ca1d0371d9223 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 25 Apr 2026 03:55:41 +0000 Subject: [PATCH 05/68] feat: support mountWithOAuthToken in NodePublishVolume for token refresh from secret When mountWithOAuthToken is set, NodePublishVolume now delegates to NodeStageVolume (same pattern as mountWithWIToken). This ensures the OAuth token is re-read from the Kubernetes Secret on every publish, enabling token refresh without pod restart. --- pkg/azurefile/nodeserver.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 44a255dafd..2caf87841d 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -94,6 +94,18 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu return &csi.NodePublishVolumeResponse{}, err } + // mountWithOAuthToken: delegate to NodeStageVolume to refresh OAuth token from secret + if strings.EqualFold(getValueInMap(context, mountWithOAuthTokenField), trueValue) { + klog.V(2).Infof("NodePublishVolume: volume(%s) mount on %s with OAuth token from secret", volumeID, target) + _, err := d.NodeStageVolume(ctx, &csi.NodeStageVolumeRequest{ + StagingTargetPath: target, + VolumeContext: context, + VolumeCapability: volCap, + VolumeId: volumeID, + }) + return &csi.NodePublishVolumeResponse{}, err + } + // ephemeral volume if strings.EqualFold(context[ephemeralField], trueValue) { setKeyValueInMap(context, secretNamespaceField, context[podNamespaceField]) From bf769f2555862e973ee39fef225df5c709544845 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 25 Apr 2026 05:18:48 +0000 Subject: [PATCH 06/68] Revert "feat: support mountWithOAuthToken in NodePublishVolume for token refresh from secret" This reverts commit 18a1c8e263f4102d12d2ef87098a45c5cd240fd6. --- pkg/azurefile/nodeserver.go | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 2caf87841d..44a255dafd 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -94,18 +94,6 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu return &csi.NodePublishVolumeResponse{}, err } - // mountWithOAuthToken: delegate to NodeStageVolume to refresh OAuth token from secret - if strings.EqualFold(getValueInMap(context, mountWithOAuthTokenField), trueValue) { - klog.V(2).Infof("NodePublishVolume: volume(%s) mount on %s with OAuth token from secret", volumeID, target) - _, err := d.NodeStageVolume(ctx, &csi.NodeStageVolumeRequest{ - StagingTargetPath: target, - VolumeContext: context, - VolumeCapability: volCap, - VolumeId: volumeID, - }) - return &csi.NodePublishVolumeResponse{}, err - } - // ephemeral volume if strings.EqualFold(context[ephemeralField], trueValue) { setKeyValueInMap(context, secretNamespaceField, context[podNamespaceField]) From 93826827188426554713e7130be1ab17022aca95 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 25 Apr 2026 05:20:17 +0000 Subject: [PATCH 07/68] fix: NodePublishVolume should only refresh OAuth token credential cache, not re-mount When mountWithOAuthToken is set, NodePublishVolume now only refreshes the credential cache by reading the OAuth token from the Kubernetes Secret, without delegating to NodeStageVolume for a full mount. The actual mount was already done during NodeStageVolume; NodePublishVolume just needs to ensure the token is fresh before the bind mount proceeds. --- pkg/azurefile/nodeserver.go | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 44a255dafd..c65d1dfde8 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -94,6 +94,28 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu return &csi.NodePublishVolumeResponse{}, err } + // mountWithOAuthToken: refresh OAuth token from secret without re-mounting + if strings.EqualFold(getValueInMap(context, mountWithOAuthTokenField), trueValue) { + server := getValueInMap(context, serverNameField) + if server == "" { + accountName := getValueInMap(context, storageAccountField) + storageEndpointSuffix := getValueInMap(context, storageEndpointSuffixField) + if storageEndpointSuffix == "" { + storageEndpointSuffix = "core.windows.net" + } + if accountName != "" { + server = fmt.Sprintf("%s.file.%s", accountName, storageEndpointSuffix) + } + } + if server == "" { + return nil, status.Errorf(codes.InvalidArgument, "NodePublishVolume: server is empty for volume(%s) with mountWithOAuthToken", volumeID) + } + if err := d.setCredentialCacheWithOAuthToken(ctx, server, context); err != nil { + return nil, status.Errorf(codes.Internal, "NodePublishVolume: failed to refresh OAuth token for volume(%s): %v", volumeID, err) + } + klog.V(2).Infof("NodePublishVolume: refreshed OAuth token credential cache for volume(%s) server(%s)", volumeID, server) + } + // ephemeral volume if strings.EqualFold(context[ephemeralField], trueValue) { setKeyValueInMap(context, secretNamespaceField, context[podNamespaceField]) From 2197965dfdcb7552862b2f1dcb6d19bf9273a318 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 25 Apr 2026 05:38:16 +0000 Subject: [PATCH 08/68] address review: fix OAuth token refresh and error handling - Move setCredentialCacheWithOAuthToken outside execFunc in NodeStageVolume so token refresh happens even when staging dir is already mounted (same pattern as workload identity token refresh) - Remove duplicate setCredentialCacheWithOAuthToken call from inside execFunc - Use d.getStorageEndPointSuffix() instead of hardcoded core.windows.net in NodePublishVolume to support sovereign clouds - Capture and redact command output in setCredentialCacheWithOAuthToken error for easier debugging without leaking tokens --- pkg/azurefile/nodeserver.go | 58 ++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index c65d1dfde8..70dd703ce0 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -94,28 +94,6 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu return &csi.NodePublishVolumeResponse{}, err } - // mountWithOAuthToken: refresh OAuth token from secret without re-mounting - if strings.EqualFold(getValueInMap(context, mountWithOAuthTokenField), trueValue) { - server := getValueInMap(context, serverNameField) - if server == "" { - accountName := getValueInMap(context, storageAccountField) - storageEndpointSuffix := getValueInMap(context, storageEndpointSuffixField) - if storageEndpointSuffix == "" { - storageEndpointSuffix = "core.windows.net" - } - if accountName != "" { - server = fmt.Sprintf("%s.file.%s", accountName, storageEndpointSuffix) - } - } - if server == "" { - return nil, status.Errorf(codes.InvalidArgument, "NodePublishVolume: server is empty for volume(%s) with mountWithOAuthToken", volumeID) - } - if err := d.setCredentialCacheWithOAuthToken(ctx, server, context); err != nil { - return nil, status.Errorf(codes.Internal, "NodePublishVolume: failed to refresh OAuth token for volume(%s): %v", volumeID, err) - } - klog.V(2).Infof("NodePublishVolume: refreshed OAuth token credential cache for volume(%s) server(%s)", volumeID, server) - } - // ephemeral volume if strings.EqualFold(context[ephemeralField], trueValue) { setKeyValueInMap(context, secretNamespaceField, context[podNamespaceField]) @@ -140,6 +118,28 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu return &csi.NodePublishVolumeResponse{}, err } + // mountWithOAuthToken: refresh OAuth token from secret without re-mounting + if strings.EqualFold(getValueInMap(context, mountWithOAuthTokenField), trueValue) { + server := getValueInMap(context, serverNameField) + if server == "" { + accountName := getValueInMap(context, storageAccountField) + storageEndpointSuffix := getValueInMap(context, storageEndpointSuffixField) + if storageEndpointSuffix == "" { + storageEndpointSuffix = d.getStorageEndPointSuffix() + } + if accountName != "" { + server = fmt.Sprintf("%s.file.%s", accountName, storageEndpointSuffix) + } + } + if server == "" { + return nil, status.Errorf(codes.InvalidArgument, "NodePublishVolume: server is empty for volume(%s) with mountWithOAuthToken", volumeID) + } + if err := d.setCredentialCacheWithOAuthToken(ctx, server, context); err != nil { + return nil, status.Errorf(codes.Internal, "NodePublishVolume: failed to refresh OAuth token for volume(%s): %v", volumeID, err) + } + klog.V(2).Infof("NodePublishVolume: refreshed OAuth token credential cache for volume(%s) server(%s)", volumeID, server) + } + if perm := getValueInMap(context, mountPermissionsField); perm != "" { var err error if mountPermissions, err = strconv.ParseUint(perm, 8, 32); err != nil { @@ -500,6 +500,10 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe } else if mountWithOAuthToken && runtime.GOOS != "windows" { sensitiveMountOptions = []string{"sec=krb5,cruid=0,upcall_target=mount"} klog.V(2).Infof("using OAuth token from secret for volume %s", volumeID) + // always refresh credential cache when mountWithOAuthToken is set, even if mount does not happen + if err := d.setCredentialCacheWithOAuthToken(ctx, server, context); err != nil { + return nil, status.Errorf(codes.Internal, "setCredentialCacheWithOAuthToken failed for %s with error: %v", server, err) + } } else { if accountName == "" || accountKey == "" { return nil, status.Errorf(codes.Internal, "accountName(%s) or accountKey is empty", accountName) @@ -565,11 +569,6 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe return fmt.Errorf("setCredentialCache failed for %s with error: %v, output: %s", server, err, out) } } - if mountWithOAuthToken && protocol != nfs && runtime.GOOS != "windows" { - if err := d.setCredentialCacheWithOAuthToken(ctx, server, context); err != nil { - return fmt.Errorf("setCredentialCacheWithOAuthToken failed for %s with error: %v", server, err) - } - } return SMBMount(d.mounter, source, cifsMountPath, mountFsType, mountOptions, sensitiveMountOptions) } timeoutFunc := func() error { @@ -909,8 +908,9 @@ func (d *Driver) setCredentialCacheWithOAuthToken(ctx context.Context, server st return fmt.Errorf("%s not found in secret %s/%s", defaultSecretOAuthToken, secretNamespace, secretName) } - if _, err := setCredentialCache(server, "", "", "", oauthToken); err != nil { - return fmt.Errorf("setCredentialCache failed for %s: %v", server, err) + if output, err := setCredentialCache(server, "", "", "", oauthToken); err != nil { + redactedOutput := strings.ReplaceAll(string(output), oauthToken, "") + return fmt.Errorf("setCredentialCache failed for %s with output: %s, error: %v", server, redactedOutput, err) } klog.V(2).Infof("setCredentialCacheWithOAuthToken: refreshed credential cache for server %s using secret %s/%s", server, secretNamespace, secretName) From f050aa7a6d3f58bb0e7698780eda449a6657d4f5 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 25 Apr 2026 07:04:49 +0000 Subject: [PATCH 09/68] address review: move OAuth refresh after idempotency check, resolve accountName from secret - Move mountWithOAuthToken credential refresh in NodePublishVolume to after the already-mounted idempotency check, so transient secret/API failures don't break already-published volumes - When storageAccount is empty in volume context, resolve accountName from the referenced Secret (same as GetAccountInfo does) before constructing the server FQDN - Token refresh still runs even when target is already mounted, ensuring credential cache stays fresh on republish --- pkg/azurefile/nodeserver.go | 41 +++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 70dd703ce0..40947e525f 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -212,6 +212,47 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu } if mnt { klog.V(2).Infof("NodePublishVolume: %s is already mounted", target) + } + + // mountWithOAuthToken: refresh OAuth token from secret after idempotency check + if context != nil && strings.EqualFold(getValueInMap(context, mountWithOAuthTokenField), trueValue) { + server := getValueInMap(context, serverNameField) + if server == "" { + accountName := getValueInMap(context, storageAccountField) + if accountName == "" { + // resolve accountName from secret if not in volume context + secretName := getValueInMap(context, secretNameField) + secretNamespace := getValueInMap(context, secretNamespaceField) + if secretNamespace == "" { + secretNamespace = getValueInMap(context, pvcNamespaceKey) + if secretNamespace == "" { + secretNamespace = defaultNamespace + } + } + if secretName != "" { + if name, _, _, err := d.GetStorageAccountFromSecret(ctx, secretName, secretNamespace); err == nil && name != "" { + accountName = name + } + } + } + storageEndpointSuffix := getValueInMap(context, storageEndpointSuffixField) + if storageEndpointSuffix == "" { + storageEndpointSuffix = d.getStorageEndPointSuffix() + } + if accountName != "" { + server = fmt.Sprintf("%s.file.%s", accountName, storageEndpointSuffix) + } + } + if server == "" { + return nil, status.Errorf(codes.InvalidArgument, "NodePublishVolume: server is empty for volume(%s) with mountWithOAuthToken", volumeID) + } + if err := d.setCredentialCacheWithOAuthToken(ctx, server, context); err != nil { + return nil, status.Errorf(codes.Internal, "NodePublishVolume: failed to refresh OAuth token for volume(%s): %v", volumeID, err) + } + klog.V(2).Infof("NodePublishVolume: refreshed OAuth token credential cache for volume(%s) server(%s)", volumeID, server) + } + + if mnt { return &csi.NodePublishVolumeResponse{}, nil } From 344c2b7ad11a306b0e652388500069e531a998c0 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 25 Apr 2026 11:07:58 +0000 Subject: [PATCH 10/68] perf: skip OAuth token credential cache refresh when token is unchanged Cache SHA256 hash of OAuth token per server in oauthTokenSHAMap. On subsequent calls to setCredentialCacheWithOAuthToken, compare the current token SHA with the cached value and skip azfilesauthmanager refresh if the token has not changed. This avoids unnecessary credential cache operations on every NodePublishVolume/NodeStageVolume call. --- pkg/azurefile/azurefile.go | 2 ++ pkg/azurefile/nodeserver.go | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/pkg/azurefile/azurefile.go b/pkg/azurefile/azurefile.go index 948f9ee5a4..7830274ca8 100644 --- a/pkg/azurefile/azurefile.go +++ b/pkg/azurefile/azurefile.go @@ -295,6 +295,8 @@ type Driver struct { secretCacheMap azcache.Resource // a map storing all volumes using data plane API dataPlaneAPIVolMap sync.Map + // a map storing OAuth token SHA per server to avoid unnecessary credential cache refresh + oauthTokenSHAMap sync.Map // a timed cache storing all storage accounts that are using data plane API temporarily dataPlaneAPIAccountCache azcache.Resource // a timed cache storing account search history (solve account list throttling issue) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 40947e525f..e7ff9aea45 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -17,6 +17,7 @@ limitations under the License. package azurefile import ( + "crypto/sha256" "encoding/json" "fmt" "io" @@ -949,11 +950,19 @@ func (d *Driver) setCredentialCacheWithOAuthToken(ctx context.Context, server st return fmt.Errorf("%s not found in secret %s/%s", defaultSecretOAuthToken, secretNamespace, secretName) } + // check if token has changed by comparing SHA256 hash + tokenSHA := fmt.Sprintf("%x", sha256.Sum256([]byte(oauthToken))) + if cachedSHA, ok := d.oauthTokenSHAMap.Load(server); ok && cachedSHA.(string) == tokenSHA { + klog.V(4).Infof("setCredentialCacheWithOAuthToken: OAuth token unchanged for server %s, skipping refresh", server) + return nil + } + if output, err := setCredentialCache(server, "", "", "", oauthToken); err != nil { redactedOutput := strings.ReplaceAll(string(output), oauthToken, "") return fmt.Errorf("setCredentialCache failed for %s with output: %s, error: %v", server, redactedOutput, err) } + d.oauthTokenSHAMap.Store(server, tokenSHA) klog.V(2).Infof("setCredentialCacheWithOAuthToken: refreshed credential cache for server %s using secret %s/%s", server, secretNamespace, secretName) return nil } From 24708e2d557a4f373e572fddf03725a34803b913 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 25 Apr 2026 11:15:47 +0000 Subject: [PATCH 11/68] fix: remove duplicate OAuth token refresh block in NodePublishVolume Consolidate to a single refresh block after the idempotency check. The old block before ensureMountPoint was left over from earlier edits and caused double secret reads and incorrect early failures when storageAccount was omitted. --- pkg/azurefile/nodeserver.go | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index e7ff9aea45..abd2ef2f86 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -119,28 +119,6 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu return &csi.NodePublishVolumeResponse{}, err } - // mountWithOAuthToken: refresh OAuth token from secret without re-mounting - if strings.EqualFold(getValueInMap(context, mountWithOAuthTokenField), trueValue) { - server := getValueInMap(context, serverNameField) - if server == "" { - accountName := getValueInMap(context, storageAccountField) - storageEndpointSuffix := getValueInMap(context, storageEndpointSuffixField) - if storageEndpointSuffix == "" { - storageEndpointSuffix = d.getStorageEndPointSuffix() - } - if accountName != "" { - server = fmt.Sprintf("%s.file.%s", accountName, storageEndpointSuffix) - } - } - if server == "" { - return nil, status.Errorf(codes.InvalidArgument, "NodePublishVolume: server is empty for volume(%s) with mountWithOAuthToken", volumeID) - } - if err := d.setCredentialCacheWithOAuthToken(ctx, server, context); err != nil { - return nil, status.Errorf(codes.Internal, "NodePublishVolume: failed to refresh OAuth token for volume(%s): %v", volumeID, err) - } - klog.V(2).Infof("NodePublishVolume: refreshed OAuth token credential cache for volume(%s) server(%s)", volumeID, server) - } - if perm := getValueInMap(context, mountPermissionsField); perm != "" { var err error if mountPermissions, err = strconv.ParseUint(perm, 8, 32); err != nil { From 0b2ebd325bc8cbb84e50c68ebbb9a10608b47c13 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 25 Apr 2026 11:18:38 +0000 Subject: [PATCH 12/68] refactor: extract getSecretNamespace helper to deduplicate namespace resolution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract repeated secretNamespace fallback logic (secretNamespaceField → pvcNamespaceKey → defaultNamespace) into a shared getSecretNamespace() function in utils.go. --- pkg/azurefile/nodeserver.go | 16 ++-------------- pkg/azurefile/utils.go | 12 ++++++++++++ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index abd2ef2f86..4b897b97a8 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -201,13 +201,7 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu if accountName == "" { // resolve accountName from secret if not in volume context secretName := getValueInMap(context, secretNameField) - secretNamespace := getValueInMap(context, secretNamespaceField) - if secretNamespace == "" { - secretNamespace = getValueInMap(context, pvcNamespaceKey) - if secretNamespace == "" { - secretNamespace = defaultNamespace - } - } + secretNamespace := getSecretNamespace(context) if secretName != "" { if name, _, _, err := d.GetStorageAccountFromSecret(ctx, secretName, secretNamespace); err == nil && name != "" { accountName = name @@ -908,13 +902,7 @@ func (d *Driver) ensureMountPoint(target string, perm os.FileMode) (bool, error) // to update the credential cache. func (d *Driver) setCredentialCacheWithOAuthToken(ctx context.Context, server string, volumeContext map[string]string) error { secretName := getValueInMap(volumeContext, secretNameField) - secretNamespace := getValueInMap(volumeContext, secretNamespaceField) - if secretNamespace == "" { - secretNamespace = getValueInMap(volumeContext, pvcNamespaceKey) - if secretNamespace == "" { - secretNamespace = defaultNamespace - } - } + secretNamespace := getSecretNamespace(volumeContext) if secretName == "" { return fmt.Errorf("secretName is required when %s is true", mountWithOAuthTokenField) } diff --git a/pkg/azurefile/utils.go b/pkg/azurefile/utils.go index 76ba74f423..a6a6bb0f29 100644 --- a/pkg/azurefile/utils.go +++ b/pkg/azurefile/utils.go @@ -304,6 +304,18 @@ func getValueInMap(m map[string]string, key string) string { return "" } +// getSecretNamespace resolves the secret namespace from volume context, +// falling back to PVC namespace, then to "default". +func getSecretNamespace(volumeContext map[string]string) string { + if ns := getValueInMap(volumeContext, secretNamespaceField); ns != "" { + return ns + } + if ns := getValueInMap(volumeContext, pvcNamespaceKey); ns != "" { + return ns + } + return defaultNamespace +} + // replaceWithMap replace key with value for str func replaceWithMap(str string, m map[string]string) string { for k, v := range m { From ac27467a6576e5d34b721fa462340b9029ee2d22 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 25 Apr 2026 11:43:15 +0000 Subject: [PATCH 13/68] test: add NodePublishVolume unit tests for mountWithOAuthToken path - Test empty server error when storageAccount/serverName are missing - Test secret fetch failure when kubeClient is nil --- pkg/azurefile/nodeserver_test.go | 34 ++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/pkg/azurefile/nodeserver_test.go b/pkg/azurefile/nodeserver_test.go index cfbdb41fea..9686596e24 100644 --- a/pkg/azurefile/nodeserver_test.go +++ b/pkg/azurefile/nodeserver_test.go @@ -324,6 +324,40 @@ func TestNodePublishVolume(t *testing.T) { cleanup: func() { }, }, + { + desc: "[Error] mountWithOAuthToken with empty server", + req: &csi.NodePublishVolumeRequest{VolumeCapability: &csi.VolumeCapability{AccessMode: &volumeCap}, + VolumeId: "vol_1", + TargetPath: targetTest, + StagingTargetPath: sourceTest, + VolumeContext: map[string]string{ + mountWithOAuthTokenField: "true", + secretNameField: "test-secret", + }, + }, + expectedErr: testutil.TestError{ + DefaultError: status.Errorf(codes.InvalidArgument, "NodePublishVolume: server is empty for volume(%s) with mountWithOAuthToken", "vol_1"), + }, + }, + { + desc: "[Error] mountWithOAuthToken with server but secret fetch fails", + req: &csi.NodePublishVolumeRequest{VolumeCapability: &csi.VolumeCapability{AccessMode: &volumeCap}, + VolumeId: "vol_1", + TargetPath: targetTest, + StagingTargetPath: sourceTest, + VolumeContext: map[string]string{ + mountWithOAuthTokenField: "true", + serverNameField: "testaccount.file.core.windows.net", + secretNameField: "test-secret", + }, + }, + expectedErr: testutil.TestError{ + DefaultError: status.Errorf(codes.Internal, "NodePublishVolume: failed to refresh OAuth token for volume(%s): %v", "vol_1", fmt.Errorf("failed to get secret %s/%s: %v", "default", "test-secret", fmt.Errorf("could not get credentials from secret(%s): KubeClient is nil", "test-secret"))), + }, + setup: func() { + d.kubeClient = nil + }, + }, } // Setup From e09bb5e18b4e6cb7cbb70851e3ce4a3a20699c5c Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 25 Apr 2026 11:51:33 +0000 Subject: [PATCH 14/68] fix: validate secretName in NodePublishVolume before OAuth refresh Return codes.InvalidArgument when secretName is missing, instead of wrapping it as codes.Internal from setCredentialCacheWithOAuthToken. This gives clearer feedback and correct retry behavior for input errors. --- pkg/azurefile/nodeserver.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 4b897b97a8..de044af820 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -219,6 +219,9 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu if server == "" { return nil, status.Errorf(codes.InvalidArgument, "NodePublishVolume: server is empty for volume(%s) with mountWithOAuthToken", volumeID) } + if strings.TrimSpace(getValueInMap(context, secretNameField)) == "" { + return nil, status.Errorf(codes.InvalidArgument, "NodePublishVolume: secretName is required for volume(%s) with mountWithOAuthToken", volumeID) + } if err := d.setCredentialCacheWithOAuthToken(ctx, server, context); err != nil { return nil, status.Errorf(codes.Internal, "NodePublishVolume: failed to refresh OAuth token for volume(%s): %v", volumeID, err) } From f02febf81e3109860e03b2548c1909ad92c3c234 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 25 Apr 2026 13:17:35 +0000 Subject: [PATCH 15/68] fix: surface secret lookup errors and add OAuth token SHA dedup tests - NodePublishVolume: return explicit error when GetStorageAccountFromSecret fails during accountName resolution, instead of silently ignoring - Add TestSetCredentialCacheWithOAuthToken covering: missing secretName, nil kubeClient, missing oauthtoken in secret, SHA256 skip path --- pkg/azurefile/nodeserver.go | 6 ++- pkg/azurefile/nodeserver_test.go | 89 ++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 2 deletions(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index de044af820..1c4f8bc3f6 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -203,9 +203,11 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu secretName := getValueInMap(context, secretNameField) secretNamespace := getSecretNamespace(context) if secretName != "" { - if name, _, _, err := d.GetStorageAccountFromSecret(ctx, secretName, secretNamespace); err == nil && name != "" { - accountName = name + name, _, _, secretErr := d.GetStorageAccountFromSecret(ctx, secretName, secretNamespace) + if secretErr != nil { + return nil, status.Errorf(codes.Internal, "NodePublishVolume: failed to get account from secret %s/%s: %v", secretNamespace, secretName, secretErr) } + accountName = name } } storageEndpointSuffix := getValueInMap(context, storageEndpointSuffixField) diff --git a/pkg/azurefile/nodeserver_test.go b/pkg/azurefile/nodeserver_test.go index 9686596e24..9d31336612 100644 --- a/pkg/azurefile/nodeserver_test.go +++ b/pkg/azurefile/nodeserver_test.go @@ -18,6 +18,7 @@ package azurefile import ( "context" + "crypto/sha256" "errors" "fmt" "net" @@ -35,6 +36,9 @@ import ( "go.uber.org/mock/gomock" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" clientset "k8s.io/client-go/kubernetes" mount "k8s.io/mount-utils" "k8s.io/utils/exec" @@ -1427,3 +1431,88 @@ func makeFakeOutput(output string, err error) testingexec.FakeAction { return []byte(o), nil, err } } + +func TestSetCredentialCacheWithOAuthToken(t *testing.T) { + d := NewFakeDriver() + + tests := []struct { + desc string + server string + volumeContext map[string]string + setupDriver func() + expectedErr string + expectSkip bool + }{ + { + desc: "missing secretName", + server: "testaccount.file.core.windows.net", + volumeContext: map[string]string{ + mountWithOAuthTokenField: "true", + }, + expectedErr: "secretName is required", + }, + { + desc: "kubeClient is nil", + server: "testaccount.file.core.windows.net", + volumeContext: map[string]string{ + secretNameField: "test-secret", + }, + setupDriver: func() { + d.kubeClient = nil + }, + expectedErr: "KubeClient is nil", + }, + { + desc: "oauthtoken missing in secret", + server: "testaccount.file.core.windows.net", + volumeContext: map[string]string{ + secretNameField: "test-secret", + }, + setupDriver: func() { + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "test-secret", Namespace: "default"}, + Data: map[string][]byte{defaultSecretAccountName: []byte("testaccount")}, + } + d.kubeClient = fake.NewSimpleClientset(secret) + }, + expectedErr: fmt.Sprintf("%s not found in secret", defaultSecretOAuthToken), + }, + { + desc: "skip refresh when token SHA unchanged", + server: "testaccount.file.core.windows.net", + volumeContext: map[string]string{ + secretNameField: "test-secret", + }, + setupDriver: func() { + token := "test-oauth-token-value" + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "test-secret", Namespace: "default"}, + Data: map[string][]byte{ + defaultSecretAccountName: []byte("testaccount"), + defaultSecretOAuthToken: []byte(token), + }, + } + d.kubeClient = fake.NewSimpleClientset(secret) + // pre-populate SHA cache + tokenSHA := fmt.Sprintf("%x", sha256.Sum256([]byte(token))) + d.oauthTokenSHAMap.Store("testaccount.file.core.windows.net", tokenSHA) + }, + expectSkip: true, + }, + } + + for _, test := range tests { + t.Run(test.desc, func(t *testing.T) { + if test.setupDriver != nil { + test.setupDriver() + } + err := d.setCredentialCacheWithOAuthToken(context.Background(), test.server, test.volumeContext) + if test.expectSkip { + assert.NoError(t, err) + } else if test.expectedErr != "" { + assert.Error(t, err) + assert.Contains(t, err.Error(), test.expectedErr) + } + }) + } +} From cbc92254eaad8a1b33394582f4a903ddd387ec63 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 25 Apr 2026 13:20:59 +0000 Subject: [PATCH 16/68] fix: gofmt import ordering --- pkg/azurefile/nodeserver_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/azurefile/nodeserver_test.go b/pkg/azurefile/nodeserver_test.go index 9d31336612..d4477006f0 100644 --- a/pkg/azurefile/nodeserver_test.go +++ b/pkg/azurefile/nodeserver_test.go @@ -38,8 +38,8 @@ import ( "google.golang.org/grpc/status" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/fake" clientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/fake" mount "k8s.io/mount-utils" "k8s.io/utils/exec" testingexec "k8s.io/utils/exec/testing" From 84090c33c34004bcdc269c11fa162511752d8304 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 25 Apr 2026 13:56:43 +0000 Subject: [PATCH 17/68] fix: remove secretName from empty server test to avoid secret lookup error Without secretName, the code skips GetStorageAccountFromSecret and correctly reaches the empty server check. --- pkg/azurefile/nodeserver_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/azurefile/nodeserver_test.go b/pkg/azurefile/nodeserver_test.go index d4477006f0..3c0322614a 100644 --- a/pkg/azurefile/nodeserver_test.go +++ b/pkg/azurefile/nodeserver_test.go @@ -336,7 +336,6 @@ func TestNodePublishVolume(t *testing.T) { StagingTargetPath: sourceTest, VolumeContext: map[string]string{ mountWithOAuthTokenField: "true", - secretNameField: "test-secret", }, }, expectedErr: testutil.TestError{ From cae74bcba776430201cd55cdf1689c2a25f82d28 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 25 Apr 2026 14:21:18 +0000 Subject: [PATCH 18/68] fix: validate mountWithOAuthToken in controller and move OAuth validation before ensureMountPoint - Controller: parse mountWithOAuthToken boolean and validate mutual exclusion with mountWithManagedIdentity/mountWithWIToken early - NodePublishVolume: split OAuth logic into validation (before ensureMountPoint) and refresh (after idempotency check) to avoid leaving behind target directories on input validation failures --- pkg/azurefile/controllerserver.go | 11 ++++--- pkg/azurefile/controllerserver_test.go | 2 +- pkg/azurefile/nodeserver.go | 45 ++++++++++++++------------ 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/pkg/azurefile/controllerserver.go b/pkg/azurefile/controllerserver.go index 6b0ce5cdfd..f2fdd2662a 100644 --- a/pkg/azurefile/controllerserver.go +++ b/pkg/azurefile/controllerserver.go @@ -126,7 +126,7 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) } var sku, subsID, resourceGroup, location, account, fileShareName, diskName, fsType, secretName string var secretNamespace, pvcNamespace, protocol, customTags, storageEndpointSuffix, networkEndpointType, shareAccessTier, accountAccessTier, rootSquashType, tagValueDelimiter string - var createAccount, useSeretCache, matchTags, selectRandomMatchingAccount, getLatestAccountKey, encryptInTransit, mountWithManagedIdentity, mountWithWIToken bool + var createAccount, useSeretCache, matchTags, selectRandomMatchingAccount, getLatestAccountKey, encryptInTransit, mountWithManagedIdentity, mountWithWIToken, mountWithOAuthToken bool var vnetResourceGroup, vnetName, vnetLinkName, publicNetworkAccess, subnetName, shareNamePrefix, fsGroupChangePolicy, useDataPlaneAPI, privateDNSZoneResourceGroup string var requireInfraEncryption, disableDeleteRetentionPolicy, enableLFS, isMultichannelEnabled, allowSharedKeyAccess, allowCrossTenantReplication *bool var provisionedBandwidthMibps, provisionedIops *int32 @@ -328,14 +328,17 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) return nil, status.Errorf(codes.InvalidArgument, "invalid %s: %s in storage class", mountWithWITokenField, v) } case mountWithOAuthTokenField: - // ignored in controller, handled in NodeStageVolume + mountWithOAuthToken, err = strconv.ParseBool(v) + if err != nil { + return nil, status.Errorf(codes.InvalidArgument, "invalid %s: %s in storage class", mountWithOAuthTokenField, v) + } default: return nil, status.Errorf(codes.InvalidArgument, "invalid parameter %q in storage class", k) } } - if mountWithManagedIdentity && mountWithWIToken { - return nil, status.Error(codes.InvalidArgument, "mountwithmanagedidentity and mountwithworkloadidentitytoken cannot be both true in storage class") + if (mountWithManagedIdentity && mountWithWIToken) || (mountWithManagedIdentity && mountWithOAuthToken) || (mountWithWIToken && mountWithOAuthToken) { + return nil, status.Error(codes.InvalidArgument, "only one of mountWithManagedIdentity, mountWithOAuthToken, and mountWithWorkloadIdentityToken can be true in storage class") } // When using managed identity or workload identity token for mount, diff --git a/pkg/azurefile/controllerserver_test.go b/pkg/azurefile/controllerserver_test.go index 02c0d2904f..7ae68a8852 100644 --- a/pkg/azurefile/controllerserver_test.go +++ b/pkg/azurefile/controllerserver_test.go @@ -1374,7 +1374,7 @@ var _ = ginkgo.Describe("TestCreateVolume", func() { }, } - expectedErr := status.Errorf(codes.InvalidArgument, "%s and %s cannot be both true in storage class", mountWithManagedIdentityField, mountWithWITokenField) + expectedErr := status.Errorf(codes.InvalidArgument, "only one of mountWithManagedIdentity, mountWithOAuthToken, and mountWithWorkloadIdentityToken can be true in storage class") _, err := d.CreateVolume(ctx, req) gomega.Expect(err).To(gomega.Equal(expectedErr)) }) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 1c4f8bc3f6..8472bffd70 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -185,21 +185,17 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu mountOptions = append(mountOptions, "ro") } - mnt, err := d.ensureMountPoint(target, os.FileMode(mountPermissions)) - if err != nil { - return nil, status.Errorf(codes.Internal, "Could not mount target %s: %v", target, err) - } - if mnt { - klog.V(2).Infof("NodePublishVolume: %s is already mounted", target) - } - - // mountWithOAuthToken: refresh OAuth token from secret after idempotency check - if context != nil && strings.EqualFold(getValueInMap(context, mountWithOAuthTokenField), trueValue) { - server := getValueInMap(context, serverNameField) - if server == "" { + // mountWithOAuthToken: validate inputs and resolve server before ensureMountPoint + var oauthServer string + isMountWithOAuthToken := context != nil && strings.EqualFold(getValueInMap(context, mountWithOAuthTokenField), trueValue) + if isMountWithOAuthToken { + if strings.TrimSpace(getValueInMap(context, secretNameField)) == "" { + return nil, status.Errorf(codes.InvalidArgument, "NodePublishVolume: secretName is required for volume(%s) with mountWithOAuthToken", volumeID) + } + oauthServer = getValueInMap(context, serverNameField) + if oauthServer == "" { accountName := getValueInMap(context, storageAccountField) if accountName == "" { - // resolve accountName from secret if not in volume context secretName := getValueInMap(context, secretNameField) secretNamespace := getSecretNamespace(context) if secretName != "" { @@ -215,19 +211,28 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu storageEndpointSuffix = d.getStorageEndPointSuffix() } if accountName != "" { - server = fmt.Sprintf("%s.file.%s", accountName, storageEndpointSuffix) + oauthServer = fmt.Sprintf("%s.file.%s", accountName, storageEndpointSuffix) } } - if server == "" { + if oauthServer == "" { return nil, status.Errorf(codes.InvalidArgument, "NodePublishVolume: server is empty for volume(%s) with mountWithOAuthToken", volumeID) } - if strings.TrimSpace(getValueInMap(context, secretNameField)) == "" { - return nil, status.Errorf(codes.InvalidArgument, "NodePublishVolume: secretName is required for volume(%s) with mountWithOAuthToken", volumeID) - } - if err := d.setCredentialCacheWithOAuthToken(ctx, server, context); err != nil { + } + + mnt, err := d.ensureMountPoint(target, os.FileMode(mountPermissions)) + if err != nil { + return nil, status.Errorf(codes.Internal, "Could not mount target %s: %v", target, err) + } + if mnt { + klog.V(2).Infof("NodePublishVolume: %s is already mounted", target) + } + + // mountWithOAuthToken: refresh credential cache after idempotency check + if isMountWithOAuthToken { + if err := d.setCredentialCacheWithOAuthToken(ctx, oauthServer, context); err != nil { return nil, status.Errorf(codes.Internal, "NodePublishVolume: failed to refresh OAuth token for volume(%s): %v", volumeID, err) } - klog.V(2).Infof("NodePublishVolume: refreshed OAuth token credential cache for volume(%s) server(%s)", volumeID, server) + klog.V(2).Infof("NodePublishVolume: refreshed OAuth token credential cache for volume(%s) server(%s)", volumeID, oauthServer) } if mnt { From 275c5ffe6a75248e5355da5e08dca1ff70ef014c Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 25 Apr 2026 14:36:16 +0000 Subject: [PATCH 19/68] fix: address review comments - mutual exclusion, redact output, fresh driver per test - setCredentialCache: return error when both token and tokenFile are set - setCredentialCacheWithOAuthToken: log output at error level but omit from returned error to prevent sensitive data leakage - TestSetCredentialCacheWithOAuthToken: create fresh Driver per subtest to avoid shared state and order-dependency - Fix NodePublishVolume empty server test: renamed to 'missing secretName' since secretName validation now runs before server resolution --- pkg/azurefile/nodeserver.go | 4 ++-- pkg/azurefile/nodeserver_test.go | 17 ++++++++--------- pkg/azurefile/utils.go | 3 +++ 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 8472bffd70..5a96d8831a 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -934,8 +934,8 @@ func (d *Driver) setCredentialCacheWithOAuthToken(ctx context.Context, server st } if output, err := setCredentialCache(server, "", "", "", oauthToken); err != nil { - redactedOutput := strings.ReplaceAll(string(output), oauthToken, "") - return fmt.Errorf("setCredentialCache failed for %s with output: %s, error: %v", server, redactedOutput, err) + klog.Errorf("setCredentialCache failed for %s with output: %s, error: %v", server, strings.ReplaceAll(string(output), oauthToken, ""), err) + return fmt.Errorf("setCredentialCache failed for %s: %v", server, err) } d.oauthTokenSHAMap.Store(server, tokenSHA) diff --git a/pkg/azurefile/nodeserver_test.go b/pkg/azurefile/nodeserver_test.go index 3c0322614a..508f02ff0e 100644 --- a/pkg/azurefile/nodeserver_test.go +++ b/pkg/azurefile/nodeserver_test.go @@ -329,7 +329,7 @@ func TestNodePublishVolume(t *testing.T) { }, }, { - desc: "[Error] mountWithOAuthToken with empty server", + desc: "[Error] mountWithOAuthToken with missing secretName", req: &csi.NodePublishVolumeRequest{VolumeCapability: &csi.VolumeCapability{AccessMode: &volumeCap}, VolumeId: "vol_1", TargetPath: targetTest, @@ -339,7 +339,7 @@ func TestNodePublishVolume(t *testing.T) { }, }, expectedErr: testutil.TestError{ - DefaultError: status.Errorf(codes.InvalidArgument, "NodePublishVolume: server is empty for volume(%s) with mountWithOAuthToken", "vol_1"), + DefaultError: status.Errorf(codes.InvalidArgument, "NodePublishVolume: secretName is required for volume(%s) with mountWithOAuthToken", "vol_1"), }, }, { @@ -1432,13 +1432,11 @@ func makeFakeOutput(output string, err error) testingexec.FakeAction { } func TestSetCredentialCacheWithOAuthToken(t *testing.T) { - d := NewFakeDriver() - tests := []struct { desc string server string volumeContext map[string]string - setupDriver func() + setupDriver func(d *Driver) expectedErr string expectSkip bool }{ @@ -1456,7 +1454,7 @@ func TestSetCredentialCacheWithOAuthToken(t *testing.T) { volumeContext: map[string]string{ secretNameField: "test-secret", }, - setupDriver: func() { + setupDriver: func(d *Driver) { d.kubeClient = nil }, expectedErr: "KubeClient is nil", @@ -1467,7 +1465,7 @@ func TestSetCredentialCacheWithOAuthToken(t *testing.T) { volumeContext: map[string]string{ secretNameField: "test-secret", }, - setupDriver: func() { + setupDriver: func(d *Driver) { secret := &corev1.Secret{ ObjectMeta: metav1.ObjectMeta{Name: "test-secret", Namespace: "default"}, Data: map[string][]byte{defaultSecretAccountName: []byte("testaccount")}, @@ -1482,7 +1480,7 @@ func TestSetCredentialCacheWithOAuthToken(t *testing.T) { volumeContext: map[string]string{ secretNameField: "test-secret", }, - setupDriver: func() { + setupDriver: func(d *Driver) { token := "test-oauth-token-value" secret := &corev1.Secret{ ObjectMeta: metav1.ObjectMeta{Name: "test-secret", Namespace: "default"}, @@ -1502,8 +1500,9 @@ func TestSetCredentialCacheWithOAuthToken(t *testing.T) { for _, test := range tests { t.Run(test.desc, func(t *testing.T) { + d := NewFakeDriver() if test.setupDriver != nil { - test.setupDriver() + test.setupDriver(d) } err := d.setCredentialCacheWithOAuthToken(context.Background(), test.server, test.volumeContext) if test.expectSkip { diff --git a/pkg/azurefile/utils.go b/pkg/azurefile/utils.go index a6a6bb0f29..320d652f0c 100644 --- a/pkg/azurefile/utils.go +++ b/pkg/azurefile/utils.go @@ -435,6 +435,9 @@ func setCredentialCache(server, clientID, tenantID, tokenFile, token string) ([] if server == "" { return nil, fmt.Errorf("server must be provided") } + if token != "" && tokenFile != "" { + return nil, fmt.Errorf("token and tokenFile are mutually exclusive, only one can be provided") + } serverURL := "https://" + server var args []string From fa6b4a4cc5e56cbe385b59809959a118c6638519 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Tue, 28 Apr 2026 07:38:36 +0000 Subject: [PATCH 20/68] fix: refresh OAuth token before ensureMountPoint to fix stale mount recovery When an OAuth token expires, the kernel mount becomes stale and ReadDir fails with 'required key not available'. ensureMountPoint then tries to unmount but fails with 'target is busy' because the pod is still using the mount. This causes the entire NodePublishVolume to fail before reaching the token refresh logic. Fix: move setCredentialCacheWithOAuthToken before ensureMountPoint so the kernel credential cache is updated with a fresh token before ReadDir validates the mount. --- pkg/azurefile/nodeserver.go | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 5a96d8831a..92aaf56095 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -219,6 +219,19 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu } } + // mountWithOAuthToken: refresh credential cache BEFORE ensureMountPoint so that + // a stale mount (token expired → "required key not available") can recover. + // ensureMountPoint does ReadDir to validate the mount, which will fail if the + // kernel credential cache has an expired token, leading to an unmount attempt + // that fails with "target is busy". Refreshing first gives the kernel a valid + // token before the ReadDir check. + if isMountWithOAuthToken { + if err := d.setCredentialCacheWithOAuthToken(ctx, oauthServer, context); err != nil { + return nil, status.Errorf(codes.Internal, "NodePublishVolume: failed to refresh OAuth token for volume(%s): %v", volumeID, err) + } + klog.V(2).Infof("NodePublishVolume: refreshed OAuth token credential cache for volume(%s) server(%s)", volumeID, oauthServer) + } + mnt, err := d.ensureMountPoint(target, os.FileMode(mountPermissions)) if err != nil { return nil, status.Errorf(codes.Internal, "Could not mount target %s: %v", target, err) @@ -227,14 +240,6 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu klog.V(2).Infof("NodePublishVolume: %s is already mounted", target) } - // mountWithOAuthToken: refresh credential cache after idempotency check - if isMountWithOAuthToken { - if err := d.setCredentialCacheWithOAuthToken(ctx, oauthServer, context); err != nil { - return nil, status.Errorf(codes.Internal, "NodePublishVolume: failed to refresh OAuth token for volume(%s): %v", volumeID, err) - } - klog.V(2).Infof("NodePublishVolume: refreshed OAuth token credential cache for volume(%s) server(%s)", volumeID, oauthServer) - } - if mnt { return &csi.NodePublishVolumeResponse{}, nil } From a9e107891fc09d09542f8c8f650651ee83940797 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Tue, 28 Apr 2026 08:46:17 +0000 Subject: [PATCH 21/68] address review: use field constants in error messages, merge duplicate if-mnt blocks - Use mountWith*Field constants in mutual-exclusion error messages (nodeserver + controllerserver) - Merge two consecutive if-mnt blocks into single block in NodePublishVolume --- pkg/azurefile/controllerserver.go | 2 +- pkg/azurefile/nodeserver.go | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pkg/azurefile/controllerserver.go b/pkg/azurefile/controllerserver.go index f2fdd2662a..d1993fc6f6 100644 --- a/pkg/azurefile/controllerserver.go +++ b/pkg/azurefile/controllerserver.go @@ -338,7 +338,7 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) } if (mountWithManagedIdentity && mountWithWIToken) || (mountWithManagedIdentity && mountWithOAuthToken) || (mountWithWIToken && mountWithOAuthToken) { - return nil, status.Error(codes.InvalidArgument, "only one of mountWithManagedIdentity, mountWithOAuthToken, and mountWithWorkloadIdentityToken can be true in storage class") + return nil, status.Errorf(codes.InvalidArgument, "only one of %s, %s, and %s can be true in storage class", mountWithManagedIdentityField, mountWithOAuthTokenField, mountWithWITokenField) } // When using managed identity or workload identity token for mount, diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 92aaf56095..0cc1db6a1c 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -238,9 +238,6 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu } if mnt { klog.V(2).Infof("NodePublishVolume: %s is already mounted", target) - } - - if mnt { return &csi.NodePublishVolumeResponse{}, nil } @@ -437,7 +434,7 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe } if (mountWithManagedIdentity && mountWithWIToken) || (mountWithManagedIdentity && mountWithOAuthToken) || (mountWithWIToken && mountWithOAuthToken) { - return nil, status.Error(codes.InvalidArgument, "only one of mountWithManagedIdentity, mountWithOAuthToken, and mountWithWorkloadIdentityToken can be true") + return nil, status.Error(codes.InvalidArgument, fmt.Sprintf("only one of %q, %q, and %q can be true", mountWithManagedIdentityField, mountWithOAuthTokenField, mountWithWITokenField)) } if mountWithOAuthToken { From e3a43b6e1e3f71884625f6eb2fc9343b2bfcda7b Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Tue, 28 Apr 2026 09:00:29 +0000 Subject: [PATCH 22/68] fix: update tests to match new error message format using field constants --- pkg/azurefile/controllerserver_test.go | 2 +- pkg/azurefile/nodeserver_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/azurefile/controllerserver_test.go b/pkg/azurefile/controllerserver_test.go index 7ae68a8852..3bd93268d4 100644 --- a/pkg/azurefile/controllerserver_test.go +++ b/pkg/azurefile/controllerserver_test.go @@ -1374,7 +1374,7 @@ var _ = ginkgo.Describe("TestCreateVolume", func() { }, } - expectedErr := status.Errorf(codes.InvalidArgument, "only one of mountWithManagedIdentity, mountWithOAuthToken, and mountWithWorkloadIdentityToken can be true in storage class") + expectedErr := status.Errorf(codes.InvalidArgument, "only one of %s, %s, and %s can be true in storage class", mountWithManagedIdentityField, mountWithOAuthTokenField, mountWithWITokenField) _, err := d.CreateVolume(ctx, req) gomega.Expect(err).To(gomega.Equal(expectedErr)) }) diff --git a/pkg/azurefile/nodeserver_test.go b/pkg/azurefile/nodeserver_test.go index 508f02ff0e..4a7da8145a 100644 --- a/pkg/azurefile/nodeserver_test.go +++ b/pkg/azurefile/nodeserver_test.go @@ -905,7 +905,7 @@ func TestNodeStageVolume(t *testing.T) { }, Secrets: secrets}, expectedErr: testutil.TestError{ - DefaultError: status.Error(codes.InvalidArgument, "only one of mountWithManagedIdentity, mountWithOAuthToken, and mountWithWorkloadIdentityToken can be true"), + DefaultError: status.Error(codes.InvalidArgument, fmt.Sprintf("only one of %q, %q, and %q can be true", mountWithManagedIdentityField, mountWithOAuthTokenField, mountWithWITokenField)), }, }, { From 54da6083a0dc8be575607f0d816d1517a266622a Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Tue, 28 Apr 2026 12:37:02 +0000 Subject: [PATCH 23/68] add e2e test for mountWithOAuthToken dynamic provisioning Setup (CAPZ only): 1. Get node user-assigned managed identity (clientID + principalID) 2. Assign Storage File Data SMB MI Admin role to the identity 3. Obtain Azure Storage OAuth token via ManagedIdentityCredential 4. Create K8s Secret with the token Test: - Dynamic provisioning with Premium_LRS + mountWithOAuthToken=true - Mounts with sec=krb5 and verifies read/write Helpers added to azure_helpers.go: - GetNodeIdentityInfo: finds user-assigned identity on VMSS/VM - AssignRoleToIdentity: assigns Azure RBAC role - GetStorageOAuthToken: gets storage token via managed identity --- go.mod | 3 ++ test/e2e/dynamic_provisioning_test.go | 41 ++++++++++++++ test/e2e/suite_test.go | 78 +++++++++++++++++++++++++++ test/utils/azure/azure_helpers.go | 25 +++++++-- 4 files changed, 142 insertions(+), 5 deletions(-) diff --git a/go.mod b/go.mod index 8dff3f95fb..20c37d66b7 100644 --- a/go.mod +++ b/go.mod @@ -54,7 +54,10 @@ require ( require ( cel.dev/expr v0.25.1 // indirect cyphar.com/go-pathrs v0.2.1 // indirect + github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 github.com/Azure/azure-sdk-for-go/sdk/internal v1.12.0 // indirect + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v2 v2.2.0 + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7 v7.3.0 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerregistry/armcontainerregistry v1.2.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v6 v6.6.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/keyvault/armkeyvault v1.5.0 // indirect diff --git a/test/e2e/dynamic_provisioning_test.go b/test/e2e/dynamic_provisioning_test.go index 5590ceba6b..ff9af4e162 100644 --- a/test/e2e/dynamic_provisioning_test.go +++ b/test/e2e/dynamic_provisioning_test.go @@ -1913,6 +1913,7 @@ var _ = ginkgo.Describe("Dynamic Provisioning", func() { ginkgo.Skip("test case is only available for capz test") } gomega.Expect(miRoleSetupSucceeded).To(gomega.BeTrue(), "MI role assignment failed, cannot run managed identity mount test") + pods := []testsuites.PodDetails{ { Cmd: "echo 'hello world' > /mnt/test-1/data && grep 'hello world' /mnt/test-1/data", @@ -1939,6 +1940,46 @@ var _ = ginkgo.Describe("Dynamic Provisioning", func() { test.Run(ctx, cs, ns) }) + ginkgo.It("should create a volume on demand with mountWithOAuthToken [file.csi.azure.com]", func(ctx ginkgo.SpecContext) { + skipIfUsingInTreeVolumePlugin() + skipIfTestingInWindowsCluster() + if !isCapzTest { + ginkgo.Skip("mountWithOAuthToken test requires CAPZ environment") + } + gomega.Expect(oauthTokenSetupSucceeded).To(gomega.BeTrue(), "OAuth token setup did not succeed") + + pods := []testsuites.PodDetails{ + { + Cmd: "echo 'hello world' > /mnt/test-1/data && grep 'hello world' /mnt/test-1/data", + Volumes: []testsuites.VolumeDetails{ + { + ClaimSize: "100Gi", + MountOptions: []string{ + "sec=krb5", + "dir_mode=0777", + "file_mode=0777", + }, + VolumeMount: testsuites.VolumeMountDetails{ + NameGenerate: "test-volume-", + MountPathGenerate: "/mnt/test-", + }, + }, + }, + }, + } + test := testsuites.DynamicallyProvisionedCmdVolumeTest{ + CSIDriver: testDriver, + Pods: pods, + StorageClassParameters: map[string]string{ + "skuName": "Premium_LRS", + "mountWithOAuthToken": "true", + "secretName": "azure-oauth-token-secret", + "secretNamespace": "default", + }, + } + test.Run(ctx, cs, ns) + }) + ginkgo.It("should create a volume on demand with workload identity token mount [file.csi.azure.com]", ginkgo.Serial, func(ctx ginkgo.SpecContext) { skipIfUsingInTreeVolumePlugin() skipIfTestingInWindowsCluster() diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index 32c36fb94a..4ba31a3c25 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -48,6 +48,7 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" "k8s.io/kubernetes/test/e2e/framework" "k8s.io/kubernetes/test/e2e/framework/config" "sigs.k8s.io/azurefile-csi-driver/pkg/azurefile" @@ -82,6 +83,7 @@ var ( supportEncryptInTransitwithNFS bool miRoleSetupSucceeded bool wiSetupSucceeded bool + oauthTokenSetupSucceeded bool // wiClientID is set during BeforeSuite after WI configuration succeeds. wiClientID string @@ -237,6 +239,16 @@ var _ = ginkgo.BeforeSuite(func(ctx ginkgo.SpecContext) { err := azurefileDriver.Run(context.Background()) gomega.Expect(err).NotTo(gomega.HaveOccurred()) }() + + // Setup OAuth token for mountWithOAuthToken e2e test (CAPZ only) + if isCapzTest { + if setupErr := setupOAuthToken(ctx, creds, azureClient); setupErr != nil { + log.Printf("WARNING: OAuth token setup failed: %v. OAuth token tests will fail.", setupErr) + } else { + oauthTokenSetupSucceeded = true + log.Println("OAuth token setup succeeded") + } + } } }) @@ -905,3 +917,69 @@ func waitForAADTokenExchange(ctx context.Context, cs clientset.Interface, client } func int64Ptr(i int64) *int64 { return &i } + +const ( + oauthSecretName = "azure-oauth-token-secret" + oauthSecretNamespace = "default" +) + +// setupOAuthToken obtains an OAuth token from the node's managed identity and stores it +// in a Kubernetes Secret for use by mountWithOAuthToken e2e tests. +func setupOAuthToken(ctx context.Context, creds *credentials.Credentials, azureClient *azure.Client) error { + // Step 1: Get node managed identity info + identityInfo, err := azureClient.GetNodeIdentityInfo(ctx, creds.ResourceGroup) + if err != nil { + return fmt.Errorf("failed to get node identity info: %v", err) + } + log.Printf("Found node identity: clientID=%s, principalID=%s", identityInfo.ClientID, identityInfo.PrincipalID) + + // Step 2: Assign Storage File Data SMB MI Admin role to identity (reuse WI constant) + err = azureClient.AssignRoleToIdentity(ctx, creds.ResourceGroup, identityInfo.PrincipalID, wiStorageFileDataSMBMIAdmin) + if err != nil { + return fmt.Errorf("failed to assign Storage File Data SMB MI Admin role: %v", err) + } + log.Println("Storage File Data SMB MI Admin role assigned to node identity") + + // Step 3: Get OAuth token for Azure Storage using managed identity + token, err := azure.GetStorageOAuthToken(ctx, identityInfo.ClientID) + if err != nil { + return fmt.Errorf("failed to get storage OAuth token: %v", err) + } + log.Println("Obtained storage OAuth token from managed identity") + + // Step 4: Create Kubernetes Secret with the OAuth token + kubeconfig := os.Getenv(kubeconfigEnvVar) + restConfig, err := clientcmd.BuildConfigFromFlags("", kubeconfig) + if err != nil { + return fmt.Errorf("failed to build kubeconfig: %v", err) + } + cs, err := clientset.NewForConfig(restConfig) + if err != nil { + return fmt.Errorf("failed to create kubernetes client: %v", err) + } + + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: oauthSecretName, + Namespace: oauthSecretNamespace, + }, + Type: corev1.SecretTypeOpaque, + Data: map[string][]byte{ + "oauthtoken": []byte(token), + }, + } + _, err = cs.CoreV1().Secrets(oauthSecretNamespace).Create(ctx, secret, metav1.CreateOptions{}) + if err != nil { + if apierrors.IsAlreadyExists(err) { + _, err = cs.CoreV1().Secrets(oauthSecretNamespace).Update(ctx, secret, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to update OAuth token secret: %v", err) + } + } else { + return fmt.Errorf("failed to create OAuth token secret: %v", err) + } + } + log.Printf("Created/updated OAuth token secret %s/%s", oauthSecretNamespace, oauthSecretName) + + return nil +} diff --git a/test/utils/azure/azure_helpers.go b/test/utils/azure/azure_helpers.go index f787668b04..5e39b4ff88 100644 --- a/test/utils/azure/azure_helpers.go +++ b/test/utils/azure/azure_helpers.go @@ -27,7 +27,9 @@ import ( "time" "github.com/Azure/azure-sdk-for-go/sdk/azcore" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/policy" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" armauthorization "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v2" armcompute "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" armmsi "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/msi/armmsi" @@ -105,11 +107,6 @@ func GetAzureClient(cloud, subscriptionID, clientID, tenantID, clientSecret, aad if err != nil { return nil, fmt.Errorf("failed to create federated identity credentials client: %v", err) } - - return &Client{ - subscriptionID: subscriptionID, - groupsClient: factory.GetResourceGroupClient(), - accountsClient: factory.GetAccountClient(), filesharesClient: factory.GetFileShareClient(), vmssClient: vmssClient, vmClient: vmClient, @@ -182,6 +179,24 @@ func (az *Client) GetAccountNumByResourceGroup(ctx context.Context, groupName st return len(result), nil } +// GetStorageOAuthToken obtains an Azure Storage OAuth token using a managed identity. +// The token can be used for mounting Azure File shares with mountWithOAuthToken. +func GetStorageOAuthToken(ctx context.Context, clientID string) (string, error) { + cred, err := azidentity.NewManagedIdentityCredential(&azidentity.ManagedIdentityCredentialOptions{ + ID: azidentity.ClientID(clientID), + }) + if err != nil { + return "", fmt.Errorf("failed to create managed identity credential: %v", err) + } + token, err := cred.GetToken(ctx, policy.TokenRequestOptions{ + Scopes: []string{"https://storage.azure.com/.default"}, + }) + if err != nil { + return "", fmt.Errorf("failed to get storage OAuth token: %v", err) + } + return token.Token, nil +} + func stringPointer(s string) *string { return &s } From cb4b467741ab73df8fe2ca33cf1324460f9ae885 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Tue, 28 Apr 2026 12:55:07 +0000 Subject: [PATCH 24/68] e2e: fail fast if OAuth token setup fails in BeforeSuite --- test/e2e/suite_test.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index 4ba31a3c25..a08ad24ff2 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -242,11 +242,10 @@ var _ = ginkgo.BeforeSuite(func(ctx ginkgo.SpecContext) { // Setup OAuth token for mountWithOAuthToken e2e test (CAPZ only) if isCapzTest { - if setupErr := setupOAuthToken(ctx, creds, azureClient); setupErr != nil { - log.Printf("WARNING: OAuth token setup failed: %v. OAuth token tests will fail.", setupErr) - } else { - oauthTokenSetupSucceeded = true - log.Println("OAuth token setup succeeded") + err = setupOAuthToken(ctx, creds, azureClient) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), "OAuth token setup failed") + oauthTokenSetupSucceeded = true + log.Println("OAuth token setup succeeded") } } } From f8dc87eea13d5e9460f8845711fe935a2b02bc4c Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Tue, 28 Apr 2026 13:10:41 +0000 Subject: [PATCH 25/68] fix: syntax errors in suite_test.go and azure_helpers.go --- test/e2e/suite_test.go | 1 - test/utils/azure/azure_helpers.go | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index a08ad24ff2..521ee4d866 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -246,7 +246,6 @@ var _ = ginkgo.BeforeSuite(func(ctx ginkgo.SpecContext) { gomega.Expect(err).NotTo(gomega.HaveOccurred(), "OAuth token setup failed") oauthTokenSetupSucceeded = true log.Println("OAuth token setup succeeded") - } } } }) diff --git a/test/utils/azure/azure_helpers.go b/test/utils/azure/azure_helpers.go index 5e39b4ff88..9009587eb0 100644 --- a/test/utils/azure/azure_helpers.go +++ b/test/utils/azure/azure_helpers.go @@ -107,6 +107,11 @@ func GetAzureClient(cloud, subscriptionID, clientID, tenantID, clientSecret, aad if err != nil { return nil, fmt.Errorf("failed to create federated identity credentials client: %v", err) } + + return &Client{ + subscriptionID: subscriptionID, + groupsClient: factory.GetResourceGroupClient(), + accountsClient: factory.GetAccountClient(), filesharesClient: factory.GetFileShareClient(), vmssClient: vmssClient, vmClient: vmClient, From 08b99505f266034759fce97d88afd6644048099f Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Tue, 28 Apr 2026 13:53:35 +0000 Subject: [PATCH 26/68] fix: move test-only deps to indirect in go.mod armauthorization/v2, armcompute/v7, and azidentity are only used in test/utils/azure/azure_helpers.go. go mod vendor excludes test-only deps, so they must be marked as indirect. --- go.mod | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 20c37d66b7..ff5d550159 100644 --- a/go.mod +++ b/go.mod @@ -54,10 +54,10 @@ require ( require ( cel.dev/expr v0.25.1 // indirect cyphar.com/go-pathrs v0.2.1 // indirect - github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 + github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 // indirect github.com/Azure/azure-sdk-for-go/sdk/internal v1.12.0 // indirect - github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v2 v2.2.0 - github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7 v7.3.0 + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v2 v2.2.0 // indirect + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7 v7.3.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerregistry/armcontainerregistry v1.2.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v6 v6.6.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/keyvault/armkeyvault v1.5.0 // indirect From e89fc8b5f702fba736722950094153a726ad2783 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Tue, 28 Apr 2026 14:08:29 +0000 Subject: [PATCH 27/68] fix: move azidentity/armauthorization/armcompute to direct deps in go.mod These are used in test/utils/azure/azure_helpers.go which is compiled as part of the main module (not a separate go.mod). go mod vendor requires them as direct dependencies. --- go.mod | 3 --- 1 file changed, 3 deletions(-) diff --git a/go.mod b/go.mod index ff5d550159..8dff3f95fb 100644 --- a/go.mod +++ b/go.mod @@ -54,10 +54,7 @@ require ( require ( cel.dev/expr v0.25.1 // indirect cyphar.com/go-pathrs v0.2.1 // indirect - github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 // indirect github.com/Azure/azure-sdk-for-go/sdk/internal v1.12.0 // indirect - github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v2 v2.2.0 // indirect - github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7 v7.3.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerregistry/armcontainerregistry v1.2.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v6 v6.6.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/keyvault/armkeyvault v1.5.0 // indirect From 58069668c8fbc22227da6eaee1fc5321ec75dc44 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Wed, 29 Apr 2026 10:55:55 +0000 Subject: [PATCH 28/68] fix: get OAuth token from agent node via IMDS pod instead of Prow pod The test runner (Prow pod) runs outside Azure VMs and cannot access IMDS (169.254.169.254), causing ManagedIdentityCredential to fail with 401. Instead, deploy a short-lived pod on a workload cluster agent node that curls IMDS to obtain the storage OAuth token, then read it from pod logs. - Replace direct GetStorageOAuthToken() call with getOAuthTokenFromNode() - Schedule token-fetcher pod with hostNetwork on agent nodes - Read token from pod logs after completion - Remove unused GetStorageOAuthToken and azidentity/policy imports --- test/e2e/suite_test.go | 127 +++++++++++++++++++++++++++--- test/utils/azure/azure_helpers.go | 20 ----- 2 files changed, 117 insertions(+), 30 deletions(-) diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index 521ee4d866..2ac573fa6d 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -924,28 +924,24 @@ const ( // setupOAuthToken obtains an OAuth token from the node's managed identity and stores it // in a Kubernetes Secret for use by mountWithOAuthToken e2e tests. func setupOAuthToken(ctx context.Context, creds *credentials.Credentials, azureClient *azure.Client) error { - // Step 1: Get node managed identity info + // Step 1: Get node managed identity info (uses Azure SDK with SP creds, works from Prow) identityInfo, err := azureClient.GetNodeIdentityInfo(ctx, creds.ResourceGroup) if err != nil { return fmt.Errorf("failed to get node identity info: %v", err) } log.Printf("Found node identity: clientID=%s, principalID=%s", identityInfo.ClientID, identityInfo.PrincipalID) - // Step 2: Assign Storage File Data SMB MI Admin role to identity (reuse WI constant) + // Step 2: Assign Storage File Data SMB MI Admin role to identity err = azureClient.AssignRoleToIdentity(ctx, creds.ResourceGroup, identityInfo.PrincipalID, wiStorageFileDataSMBMIAdmin) if err != nil { return fmt.Errorf("failed to assign Storage File Data SMB MI Admin role: %v", err) } log.Println("Storage File Data SMB MI Admin role assigned to node identity") - // Step 3: Get OAuth token for Azure Storage using managed identity - token, err := azure.GetStorageOAuthToken(ctx, identityInfo.ClientID) - if err != nil { - return fmt.Errorf("failed to get storage OAuth token: %v", err) - } - log.Println("Obtained storage OAuth token from managed identity") - - // Step 4: Create Kubernetes Secret with the OAuth token + // Step 3: Get OAuth token by running a pod on the workload cluster node. + // The test runner (Prow pod) cannot access IMDS (169.254.169.254) because it runs + // outside Azure VMs. Instead, we schedule a pod on an agent node that curls IMDS + // to obtain the token, then read it from the pod logs. kubeconfig := os.Getenv(kubeconfigEnvVar) restConfig, err := clientcmd.BuildConfigFromFlags("", kubeconfig) if err != nil { @@ -956,6 +952,13 @@ func setupOAuthToken(ctx context.Context, creds *credentials.Credentials, azureC return fmt.Errorf("failed to create kubernetes client: %v", err) } + token, err := getOAuthTokenFromNode(ctx, cs, identityInfo.ClientID) + if err != nil { + return fmt.Errorf("failed to get storage OAuth token from node: %v", err) + } + log.Println("Obtained storage OAuth token from agent node via IMDS") + + // Step 4: Create Kubernetes Secret with the OAuth token secret := &corev1.Secret{ ObjectMeta: metav1.ObjectMeta{ Name: oauthSecretName, @@ -981,3 +984,107 @@ func setupOAuthToken(ctx context.Context, creds *credentials.Credentials, azureC return nil } + +// getOAuthTokenFromNode deploys a pod on a workload cluster agent node that uses IMDS +// to obtain an Azure Storage OAuth token, then reads the token from the pod logs. +func getOAuthTokenFromNode(ctx context.Context, cs clientset.Interface, clientID string) (string, error) { + podName := "oauth-token-fetcher" + namespace := "default" + + // IMDS curl command that outputs only the access_token value + curlCmd := fmt.Sprintf( + `curl -s -H "Metadata: true" "http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&client_id=%s&resource=https://storage.azure.com" | grep -o '"access_token":"[^"]*"' | cut -d'"' -f4`, + clientID, + ) + + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: namespace, + }, + Spec: corev1.PodSpec{ + RestartPolicy: corev1.RestartPolicyNever, + // Use hostNetwork to ensure IMDS is accessible + HostNetwork: true, + Containers: []corev1.Container{ + { + Name: "token-fetcher", + Image: "mcr.microsoft.com/cbl-mariner/base/core:2.0", + Command: []string{"/bin/sh", "-c", curlCmd}, + }, + }, + // Schedule on agent nodes (not control plane) + NodeSelector: map[string]string{ + "kubernetes.io/os": "linux", + }, + Tolerations: []corev1.Toleration{ + { + Key: "node-role.kubernetes.io/control-plane", + Operator: corev1.TolerationOpExists, + Effect: corev1.TaintEffectNoSchedule, + }, + }, + }, + } + + // Clean up any leftover pod from a previous run + _ = cs.CoreV1().Pods(namespace).Delete(ctx, podName, metav1.DeleteOptions{}) + // Wait briefly for deletion + time.Sleep(5 * time.Second) + + _, err := cs.CoreV1().Pods(namespace).Create(ctx, pod, metav1.CreateOptions{}) + if err != nil { + return "", fmt.Errorf("failed to create token fetcher pod: %v", err) + } + defer func() { + _ = cs.CoreV1().Pods(namespace).Delete(context.Background(), podName, metav1.DeleteOptions{}) + }() + + // Wait for pod to complete (up to 5 minutes) + log.Printf("Waiting for token fetcher pod %s/%s to complete...", namespace, podName) + waitCtx, cancel := context.WithTimeout(ctx, 5*time.Minute) + defer cancel() + + err = waitForPodComplete(waitCtx, cs, namespace, podName) + if err != nil { + return "", fmt.Errorf("token fetcher pod did not complete: %v", err) + } + + // Read token from pod logs + logBytes, err := cs.CoreV1().Pods(namespace).GetLogs(podName, &corev1.PodLogOptions{}).Do(ctx).Raw() + if err != nil { + return "", fmt.Errorf("failed to get token fetcher pod logs: %v", err) + } + + token := strings.TrimSpace(string(logBytes)) + if token == "" { + return "", fmt.Errorf("token fetcher pod returned empty token") + } + + return token, nil +} + +// waitForPodComplete polls until the pod reaches Succeeded or Failed phase. +func waitForPodComplete(ctx context.Context, cs clientset.Interface, namespace, name string) error { + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + + pod, err := cs.CoreV1().Pods(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + return err + } + + switch pod.Status.Phase { + case corev1.PodSucceeded: + return nil + case corev1.PodFailed: + return fmt.Errorf("pod failed with reason: %s", pod.Status.Reason) + } + + time.Sleep(3 * time.Second) + } +} diff --git a/test/utils/azure/azure_helpers.go b/test/utils/azure/azure_helpers.go index 9009587eb0..f787668b04 100644 --- a/test/utils/azure/azure_helpers.go +++ b/test/utils/azure/azure_helpers.go @@ -27,9 +27,7 @@ import ( "time" "github.com/Azure/azure-sdk-for-go/sdk/azcore" - "github.com/Azure/azure-sdk-for-go/sdk/azcore/policy" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" - "github.com/Azure/azure-sdk-for-go/sdk/azidentity" armauthorization "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v2" armcompute "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" armmsi "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/msi/armmsi" @@ -184,24 +182,6 @@ func (az *Client) GetAccountNumByResourceGroup(ctx context.Context, groupName st return len(result), nil } -// GetStorageOAuthToken obtains an Azure Storage OAuth token using a managed identity. -// The token can be used for mounting Azure File shares with mountWithOAuthToken. -func GetStorageOAuthToken(ctx context.Context, clientID string) (string, error) { - cred, err := azidentity.NewManagedIdentityCredential(&azidentity.ManagedIdentityCredentialOptions{ - ID: azidentity.ClientID(clientID), - }) - if err != nil { - return "", fmt.Errorf("failed to create managed identity credential: %v", err) - } - token, err := cred.GetToken(ctx, policy.TokenRequestOptions{ - Scopes: []string{"https://storage.azure.com/.default"}, - }) - if err != nil { - return "", fmt.Errorf("failed to get storage OAuth token: %v", err) - } - return token.Token, nil -} - func stringPointer(s string) *string { return &s } From 90b9070207d777bb5a1e55951755f8f1ceb025e5 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Wed, 29 Apr 2026 11:13:55 +0000 Subject: [PATCH 29/68] fix: schedule oauth token-fetcher pod on agent nodes only Use node affinity with DoesNotExist on node-role.kubernetes.io/control-plane instead of tolerating control-plane taint. Control plane nodes don't have the managed identity needed for IMDS token retrieval. --- test/e2e/suite_test.go | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index 2ac573fa6d..7401a992f6 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -1013,15 +1013,24 @@ func getOAuthTokenFromNode(ctx context.Context, cs clientset.Interface, clientID Command: []string{"/bin/sh", "-c", curlCmd}, }, }, - // Schedule on agent nodes (not control plane) + // Schedule on agent nodes only (not control plane — no managed identity there) NodeSelector: map[string]string{ "kubernetes.io/os": "linux", }, - Tolerations: []corev1.Toleration{ - { - Key: "node-role.kubernetes.io/control-plane", - Operator: corev1.TolerationOpExists, - Effect: corev1.TaintEffectNoSchedule, + Affinity: &corev1.Affinity{ + NodeAffinity: &corev1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ + NodeSelectorTerms: []corev1.NodeSelectorTerm{ + { + MatchExpressions: []corev1.NodeSelectorRequirement{ + { + Key: "node-role.kubernetes.io/control-plane", + Operator: corev1.NodeSelectorOpDoesNotExist, + }, + }, + }, + }, + }, }, }, }, From e8fe3fa8fec5e0781fa11f2a9659a7c697b2401a Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Wed, 29 Apr 2026 11:20:07 +0000 Subject: [PATCH 30/68] chore: add --show-labels to kubectl get nodes in azurefile_log.sh --- test/utils/azurefile_log.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/utils/azurefile_log.sh b/test/utils/azurefile_log.sh index 94b3f0a3b7..6430c47a15 100755 --- a/test/utils/azurefile_log.sh +++ b/test/utils/azurefile_log.sh @@ -31,7 +31,7 @@ cleanup() { trap cleanup ERR echo "print out all nodes status ..." -kubectl get nodes -o wide +kubectl get nodes -o wide --show-labels echo "======================================================================================" echo "print out all default namespace pods status ..." From 2e171424064b9e0eb0e858df7bbb9b0af97e48ed Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Wed, 29 Apr 2026 11:23:34 +0000 Subject: [PATCH 31/68] fix: use GenerateName for token-fetcher pod to avoid collisions - Use GenerateName instead of fixed pod name to prevent cross-run collisions in parallel e2e runs - Remove delete+sleep cleanup of stale pods (no longer needed with unique names) --- test/e2e/suite_test.go | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index 7401a992f6..31a19f0768 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -988,7 +988,6 @@ func setupOAuthToken(ctx context.Context, creds *credentials.Credentials, azureC // getOAuthTokenFromNode deploys a pod on a workload cluster agent node that uses IMDS // to obtain an Azure Storage OAuth token, then reads the token from the pod logs. func getOAuthTokenFromNode(ctx context.Context, cs clientset.Interface, clientID string) (string, error) { - podName := "oauth-token-fetcher" namespace := "default" // IMDS curl command that outputs only the access_token value @@ -999,8 +998,8 @@ func getOAuthTokenFromNode(ctx context.Context, cs clientset.Interface, clientID pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ - Name: podName, - Namespace: namespace, + GenerateName: "oauth-token-fetcher-", + Namespace: namespace, }, Spec: corev1.PodSpec{ RestartPolicy: corev1.RestartPolicyNever, @@ -1036,15 +1035,11 @@ func getOAuthTokenFromNode(ctx context.Context, cs clientset.Interface, clientID }, } - // Clean up any leftover pod from a previous run - _ = cs.CoreV1().Pods(namespace).Delete(ctx, podName, metav1.DeleteOptions{}) - // Wait briefly for deletion - time.Sleep(5 * time.Second) - - _, err := cs.CoreV1().Pods(namespace).Create(ctx, pod, metav1.CreateOptions{}) + created, err := cs.CoreV1().Pods(namespace).Create(ctx, pod, metav1.CreateOptions{}) if err != nil { return "", fmt.Errorf("failed to create token fetcher pod: %v", err) } + podName := created.Name defer func() { _ = cs.CoreV1().Pods(namespace).Delete(context.Background(), podName, metav1.DeleteOptions{}) }() From 3ae08bd9b5ee0ee319293d9eb3f13a43df6450ec Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Wed, 29 Apr 2026 11:44:49 +0000 Subject: [PATCH 32/68] fix: move azidentity to indirect, fix secret update and IMDS resource URI - Move azidentity from direct to indirect dependency (no longer directly imported after removing GetStorageOAuthToken) - Fetch existing secret ResourceVersion before Update to avoid conflict - Add trailing slash to IMDS resource URI (https://storage.azure.com/) --- go.mod | 2 +- test/e2e/suite_test.go | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index 8dff3f95fb..ccee511a6d 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,6 @@ go 1.25.0 require ( github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.1 - github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v2 v2.2.0 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7 v7.3.0 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/msi/armmsi v1.3.0 @@ -54,6 +53,7 @@ require ( require ( cel.dev/expr v0.25.1 // indirect cyphar.com/go-pathrs v0.2.1 // indirect + github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 // indirect github.com/Azure/azure-sdk-for-go/sdk/internal v1.12.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerregistry/armcontainerregistry v1.2.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v6 v6.6.0 // indirect diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index 31a19f0768..a6b425c63d 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -972,6 +972,11 @@ func setupOAuthToken(ctx context.Context, creds *credentials.Credentials, azureC _, err = cs.CoreV1().Secrets(oauthSecretNamespace).Create(ctx, secret, metav1.CreateOptions{}) if err != nil { if apierrors.IsAlreadyExists(err) { + existing, getErr := cs.CoreV1().Secrets(oauthSecretNamespace).Get(ctx, oauthSecretName, metav1.GetOptions{}) + if getErr != nil { + return fmt.Errorf("failed to get existing OAuth token secret: %v", getErr) + } + secret.ResourceVersion = existing.ResourceVersion _, err = cs.CoreV1().Secrets(oauthSecretNamespace).Update(ctx, secret, metav1.UpdateOptions{}) if err != nil { return fmt.Errorf("failed to update OAuth token secret: %v", err) @@ -992,7 +997,7 @@ func getOAuthTokenFromNode(ctx context.Context, cs clientset.Interface, clientID // IMDS curl command that outputs only the access_token value curlCmd := fmt.Sprintf( - `curl -s -H "Metadata: true" "http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&client_id=%s&resource=https://storage.azure.com" | grep -o '"access_token":"[^"]*"' | cut -d'"' -f4`, + `curl -s -H "Metadata: true" "http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&client_id=%s&resource=https://storage.azure.com/" | grep -o '"access_token":"[^"]*"' | cut -d'"' -f4`, clientID, ) From ae73836853e0ebd253f35ce1f21388494c8284f1 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Wed, 29 Apr 2026 14:14:52 +0000 Subject: [PATCH 33/68] fix: skip storing account key when mountWithOAuthToken is true When mountWithOAuthToken=true, the OAuth token (fetched by the token fetcher pod from IMDS) is stored in the secret. CreateVolume was overwriting that secret with the storage account key, causing azfilesauthmanager to fail with exit status 1 when trying to set the Kerberos credential cache (it expects an OAuth token, not a key). Set storeAccountKey=false when mountWithOAuthToken is enabled. --- pkg/azurefile/controllerserver.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pkg/azurefile/controllerserver.go b/pkg/azurefile/controllerserver.go index d1993fc6f6..66be3c3161 100644 --- a/pkg/azurefile/controllerserver.go +++ b/pkg/azurefile/controllerserver.go @@ -341,10 +341,7 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) return nil, status.Errorf(codes.InvalidArgument, "only one of %s, %s, and %s can be true in storage class", mountWithManagedIdentityField, mountWithOAuthTokenField, mountWithWITokenField) } - // When using managed identity or workload identity token for mount, - // the account key should not be stored in the secret since mount - // authentication uses identity-based tokens, not account keys. - if mountWithManagedIdentity || mountWithWIToken { + if mountWithManagedIdentity || mountWithWIToken || mountWithOAuthToken { storeAccountKey = false } From 24664377e7e183a2e610f77236e56414e23f0e60 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Thu, 30 Apr 2026 02:33:18 +0000 Subject: [PATCH 34/68] test: log which node the oauth token fetcher pod ran on --- test/e2e/suite_test.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index a6b425c63d..89fdd0b630 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -1059,6 +1059,12 @@ func getOAuthTokenFromNode(ctx context.Context, cs clientset.Interface, clientID return "", fmt.Errorf("token fetcher pod did not complete: %v", err) } + // Print which node the token fetcher pod ran on + completedPod, getErr := cs.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{}) + if getErr == nil { + log.Printf("Token fetcher pod %s ran on node: %s", podName, completedPod.Spec.NodeName) + } + // Read token from pod logs logBytes, err := cs.CoreV1().Pods(namespace).GetLogs(podName, &corev1.PodLogOptions{}).Do(ctx).Raw() if err != nil { From 70f22e6bf902932baa194a75e8a410ac2ef5bce2 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Thu, 30 Apr 2026 05:15:35 +0000 Subject: [PATCH 35/68] move OAuth token setup from BeforeSuite to test case Ensures fresh token at mount time, avoiding potential token expiration when the mountWithOAuthToken test runs late in the suite. --- test/e2e/dynamic_provisioning_test.go | 4 +++- test/e2e/suite_test.go | 12 +++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/e2e/dynamic_provisioning_test.go b/test/e2e/dynamic_provisioning_test.go index ff9af4e162..ef3ef9d454 100644 --- a/test/e2e/dynamic_provisioning_test.go +++ b/test/e2e/dynamic_provisioning_test.go @@ -1946,7 +1946,9 @@ var _ = ginkgo.Describe("Dynamic Provisioning", func() { if !isCapzTest { ginkgo.Skip("mountWithOAuthToken test requires CAPZ environment") } - gomega.Expect(oauthTokenSetupSucceeded).To(gomega.BeTrue(), "OAuth token setup did not succeed") + // Setup OAuth token inline to ensure fresh token at mount time + err := setupOAuthToken(ctx, suiteCreds, suiteAzureClient) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), "OAuth token setup failed") pods := []testsuites.PodDetails{ { diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index 89fdd0b630..bb83d5cadd 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -83,7 +83,8 @@ var ( supportEncryptInTransitwithNFS bool miRoleSetupSucceeded bool wiSetupSucceeded bool - oauthTokenSetupSucceeded bool + suiteCreds *credentials.Credentials + suiteAzureClient *azure.Client // wiClientID is set during BeforeSuite after WI configuration succeeds. wiClientID string @@ -121,6 +122,8 @@ var _ = ginkgo.BeforeSuite(func(ctx ginkgo.SpecContext) { gomega.Expect(err).NotTo(gomega.HaveOccurred()) azureClient, err := azure.GetAzureClient(creds.Cloud, creds.SubscriptionID, creds.AADClientID, creds.TenantID, creds.AADClientSecret, creds.AADFederatedTokenFile) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + suiteCreds = creds + suiteAzureClient = azureClient _, err = azureClient.EnsureResourceGroup(ctx, creds.ResourceGroup, creds.Location, nil) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -241,12 +244,7 @@ var _ = ginkgo.BeforeSuite(func(ctx ginkgo.SpecContext) { }() // Setup OAuth token for mountWithOAuthToken e2e test (CAPZ only) - if isCapzTest { - err = setupOAuthToken(ctx, creds, azureClient) - gomega.Expect(err).NotTo(gomega.HaveOccurred(), "OAuth token setup failed") - oauthTokenSetupSucceeded = true - log.Println("OAuth token setup succeeded") - } + // Moved to test case itself to ensure fresh token at mount time } }) From 26d2e53b7ed7611908dd39ea9aaa889ed03320db Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Thu, 30 Apr 2026 05:33:52 +0000 Subject: [PATCH 36/68] fix: gofmt indentation --- test/e2e/suite_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index bb83d5cadd..0001f26a1f 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -244,7 +244,7 @@ var _ = ginkgo.BeforeSuite(func(ctx ginkgo.SpecContext) { }() // Setup OAuth token for mountWithOAuthToken e2e test (CAPZ only) - // Moved to test case itself to ensure fresh token at mount time + // Moved to test case itself to ensure fresh token at mount time } }) From 74888c4c3d7bc8615e54b0a60af32e267eb06674 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Thu, 30 Apr 2026 12:56:36 +0000 Subject: [PATCH 37/68] fix: remove trailing slash from storage.azure.com resource, log OAuth token info - Resource URL: https://storage.azure.com/ -> https://storage.azure.com - Token is already trimmed via strings.TrimSpace - Add debug logging showing token length and prefix/suffix --- test/e2e/suite_test.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index 0001f26a1f..3499aa9b5f 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -995,7 +995,7 @@ func getOAuthTokenFromNode(ctx context.Context, cs clientset.Interface, clientID // IMDS curl command that outputs only the access_token value curlCmd := fmt.Sprintf( - `curl -s -H "Metadata: true" "http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&client_id=%s&resource=https://storage.azure.com/" | grep -o '"access_token":"[^"]*"' | cut -d'"' -f4`, + `curl -s -H "Metadata: true" "http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&client_id=%s&resource=https://storage.azure.com" | grep -o '"access_token":"[^"]*"' | cut -d'"' -f4`, clientID, ) @@ -1074,6 +1074,13 @@ func getOAuthTokenFromNode(ctx context.Context, cs clientset.Interface, clientID return "", fmt.Errorf("token fetcher pod returned empty token") } + // Log token info for debugging (show length and first/last 10 chars) + if len(token) > 20 { + log.Printf("OAuth token obtained: length=%d, prefix=%s...suffix=%s", len(token), token[:10], token[len(token)-10:]) + } else { + log.Printf("OAuth token obtained: length=%d (token too short, possibly invalid)", len(token)) + } + return token, nil } From 39de2e8e0276a8db8d9d30948fcd30a5277a08ff Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Fri, 1 May 2026 03:29:05 +0000 Subject: [PATCH 38/68] fix: pin OAuth test pod to same node where token was fetched The CSI node plugin's setCredentialCache runs on the node where the test pod is scheduled. Previously the token fetcher pod could run on a different node than the test pod, causing azfilesauthmanager to fail if the Kerberos lib environment differs between nodes. Changes: - setupOAuthToken now returns the node name where token was obtained - getOAuthTokenFromNode returns (token, nodeName, error) - Test pod is pinned via kubernetes.io/hostname NodeSelector - DynamicallyProvisionedCmdVolumeTest gains NodeSelector field --- test/e2e/dynamic_provisioning_test.go | 14 +++++-- test/e2e/suite_test.go | 42 ++++++++++--------- ...namically_provisioned_cmd_volume_tester.go | 6 +++ 3 files changed, 40 insertions(+), 22 deletions(-) diff --git a/test/e2e/dynamic_provisioning_test.go b/test/e2e/dynamic_provisioning_test.go index ef3ef9d454..8361859327 100644 --- a/test/e2e/dynamic_provisioning_test.go +++ b/test/e2e/dynamic_provisioning_test.go @@ -1947,7 +1947,7 @@ var _ = ginkgo.Describe("Dynamic Provisioning", func() { ginkgo.Skip("mountWithOAuthToken test requires CAPZ environment") } // Setup OAuth token inline to ensure fresh token at mount time - err := setupOAuthToken(ctx, suiteCreds, suiteAzureClient) + tokenNodeName, err := setupOAuthToken(ctx, suiteCreds, suiteAzureClient) gomega.Expect(err).NotTo(gomega.HaveOccurred(), "OAuth token setup failed") pods := []testsuites.PodDetails{ @@ -1969,9 +1969,17 @@ var _ = ginkgo.Describe("Dynamic Provisioning", func() { }, }, } + // Pin test pod to the same node where the OAuth token was fetched, + // so that the CSI node plugin's setCredentialCache runs on the same node. + nodeSelector := map[string]string{"kubernetes.io/os": "linux"} + if tokenNodeName != "" { + nodeSelector["kubernetes.io/hostname"] = tokenNodeName + log.Printf("Pinning OAuth test pod to node: %s", tokenNodeName) + } test := testsuites.DynamicallyProvisionedCmdVolumeTest{ - CSIDriver: testDriver, - Pods: pods, + CSIDriver: testDriver, + Pods: pods, + NodeSelector: nodeSelector, StorageClassParameters: map[string]string{ "skuName": "Premium_LRS", "mountWithOAuthToken": "true", diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index 3499aa9b5f..a1f38c18cc 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -921,18 +921,19 @@ const ( // setupOAuthToken obtains an OAuth token from the node's managed identity and stores it // in a Kubernetes Secret for use by mountWithOAuthToken e2e tests. -func setupOAuthToken(ctx context.Context, creds *credentials.Credentials, azureClient *azure.Client) error { +// Returns the node name where the token was fetched from. +func setupOAuthToken(ctx context.Context, creds *credentials.Credentials, azureClient *azure.Client) (string, error) { // Step 1: Get node managed identity info (uses Azure SDK with SP creds, works from Prow) identityInfo, err := azureClient.GetNodeIdentityInfo(ctx, creds.ResourceGroup) if err != nil { - return fmt.Errorf("failed to get node identity info: %v", err) + return "", fmt.Errorf("failed to get node identity info: %v", err) } log.Printf("Found node identity: clientID=%s, principalID=%s", identityInfo.ClientID, identityInfo.PrincipalID) // Step 2: Assign Storage File Data SMB MI Admin role to identity err = azureClient.AssignRoleToIdentity(ctx, creds.ResourceGroup, identityInfo.PrincipalID, wiStorageFileDataSMBMIAdmin) if err != nil { - return fmt.Errorf("failed to assign Storage File Data SMB MI Admin role: %v", err) + return "", fmt.Errorf("failed to assign Storage File Data SMB MI Admin role: %v", err) } log.Println("Storage File Data SMB MI Admin role assigned to node identity") @@ -943,18 +944,18 @@ func setupOAuthToken(ctx context.Context, creds *credentials.Credentials, azureC kubeconfig := os.Getenv(kubeconfigEnvVar) restConfig, err := clientcmd.BuildConfigFromFlags("", kubeconfig) if err != nil { - return fmt.Errorf("failed to build kubeconfig: %v", err) + return "", fmt.Errorf("failed to build kubeconfig: %v", err) } cs, err := clientset.NewForConfig(restConfig) if err != nil { - return fmt.Errorf("failed to create kubernetes client: %v", err) + return "", fmt.Errorf("failed to create kubernetes client: %v", err) } - token, err := getOAuthTokenFromNode(ctx, cs, identityInfo.ClientID) + token, nodeName, err := getOAuthTokenFromNode(ctx, cs, identityInfo.ClientID) if err != nil { - return fmt.Errorf("failed to get storage OAuth token from node: %v", err) + return "", fmt.Errorf("failed to get storage OAuth token from node: %v", err) } - log.Println("Obtained storage OAuth token from agent node via IMDS") + log.Printf("Obtained storage OAuth token from agent node %s via IMDS", nodeName) // Step 4: Create Kubernetes Secret with the OAuth token secret := &corev1.Secret{ @@ -972,25 +973,26 @@ func setupOAuthToken(ctx context.Context, creds *credentials.Credentials, azureC if apierrors.IsAlreadyExists(err) { existing, getErr := cs.CoreV1().Secrets(oauthSecretNamespace).Get(ctx, oauthSecretName, metav1.GetOptions{}) if getErr != nil { - return fmt.Errorf("failed to get existing OAuth token secret: %v", getErr) + return "", fmt.Errorf("failed to get existing OAuth token secret: %v", getErr) } secret.ResourceVersion = existing.ResourceVersion _, err = cs.CoreV1().Secrets(oauthSecretNamespace).Update(ctx, secret, metav1.UpdateOptions{}) if err != nil { - return fmt.Errorf("failed to update OAuth token secret: %v", err) + return "", fmt.Errorf("failed to update OAuth token secret: %v", err) } } else { - return fmt.Errorf("failed to create OAuth token secret: %v", err) + return "", fmt.Errorf("failed to create OAuth token secret: %v", err) } } log.Printf("Created/updated OAuth token secret %s/%s", oauthSecretNamespace, oauthSecretName) - return nil + return nodeName, nil } // getOAuthTokenFromNode deploys a pod on a workload cluster agent node that uses IMDS // to obtain an Azure Storage OAuth token, then reads the token from the pod logs. -func getOAuthTokenFromNode(ctx context.Context, cs clientset.Interface, clientID string) (string, error) { +// Returns (token, nodeName, error). +func getOAuthTokenFromNode(ctx context.Context, cs clientset.Interface, clientID string) (string, string, error) { namespace := "default" // IMDS curl command that outputs only the access_token value @@ -1040,7 +1042,7 @@ func getOAuthTokenFromNode(ctx context.Context, cs clientset.Interface, clientID created, err := cs.CoreV1().Pods(namespace).Create(ctx, pod, metav1.CreateOptions{}) if err != nil { - return "", fmt.Errorf("failed to create token fetcher pod: %v", err) + return "", "", fmt.Errorf("failed to create token fetcher pod: %v", err) } podName := created.Name defer func() { @@ -1054,24 +1056,26 @@ func getOAuthTokenFromNode(ctx context.Context, cs clientset.Interface, clientID err = waitForPodComplete(waitCtx, cs, namespace, podName) if err != nil { - return "", fmt.Errorf("token fetcher pod did not complete: %v", err) + return "", "", fmt.Errorf("token fetcher pod did not complete: %v", err) } // Print which node the token fetcher pod ran on completedPod, getErr := cs.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{}) + nodeName := "" if getErr == nil { - log.Printf("Token fetcher pod %s ran on node: %s", podName, completedPod.Spec.NodeName) + nodeName = completedPod.Spec.NodeName + log.Printf("Token fetcher pod %s ran on node: %s", podName, nodeName) } // Read token from pod logs logBytes, err := cs.CoreV1().Pods(namespace).GetLogs(podName, &corev1.PodLogOptions{}).Do(ctx).Raw() if err != nil { - return "", fmt.Errorf("failed to get token fetcher pod logs: %v", err) + return "", "", fmt.Errorf("failed to get token fetcher pod logs: %v", err) } token := strings.TrimSpace(string(logBytes)) if token == "" { - return "", fmt.Errorf("token fetcher pod returned empty token") + return "", "", fmt.Errorf("token fetcher pod returned empty token") } // Log token info for debugging (show length and first/last 10 chars) @@ -1081,7 +1085,7 @@ func getOAuthTokenFromNode(ctx context.Context, cs clientset.Interface, clientID log.Printf("OAuth token obtained: length=%d (token too short, possibly invalid)", len(token)) } - return token, nil + return token, nodeName, nil } // waitForPodComplete polls until the pod reaches Succeeded or Failed phase. diff --git a/test/e2e/testsuites/dynamically_provisioned_cmd_volume_tester.go b/test/e2e/testsuites/dynamically_provisioned_cmd_volume_tester.go index 4657b0b31c..40fe0dfa25 100644 --- a/test/e2e/testsuites/dynamically_provisioned_cmd_volume_tester.go +++ b/test/e2e/testsuites/dynamically_provisioned_cmd_volume_tester.go @@ -37,6 +37,8 @@ type DynamicallyProvisionedCmdVolumeTest struct { Pods []PodDetails StorageClassParameters map[string]string ServiceAccountName string + // NodeSelector constrains which node the test pod runs on + NodeSelector map[string]string } func (t *DynamicallyProvisionedCmdVolumeTest) Run(ctx context.Context, client clientset.Interface, namespace *v1.Namespace) { @@ -54,6 +56,10 @@ func (t *DynamicallyProvisionedCmdVolumeTest) Run(ctx context.Context, client cl // pod's service account; no in-container token mount is needed. } + if len(t.NodeSelector) > 0 { + tpod.SetNodeSelector(t.NodeSelector) + } + ginkgo.By("deploying the pod") tpod.Create(ctx) defer tpod.Cleanup(ctx) From c71e94246774c04e732fd7391ce1066944e2cf4e Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Fri, 1 May 2026 15:56:49 +0000 Subject: [PATCH 39/68] fix: include mountWithOAuthToken in requiresSmbOAuth condition mountWithOAuthToken was missing from the requiresSmbOAuth check, causing SMB OAuth to not be enabled on the storage account when using OAuth token based mount. This would lead to mount failures. --- pkg/azurefile/controllerserver.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/azurefile/controllerserver.go b/pkg/azurefile/controllerserver.go index 66be3c3161..412316378c 100644 --- a/pkg/azurefile/controllerserver.go +++ b/pkg/azurefile/controllerserver.go @@ -575,8 +575,8 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) } var requiresSmbOAuth *bool - if mountWithManagedIdentity || mountWithWIToken { - klog.V(2).Info("enabling smb oauth for managed identity or work identity token based mount") + if mountWithManagedIdentity || mountWithWIToken || mountWithOAuthToken { + klog.V(2).Info("enabling smb oauth for managed identity, workload identity token, or oauth token based mount") requiresSmbOAuth = to.Ptr(true) } From b9fe412251dc3eba264930cb975f1baa739f3fc2 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 2 May 2026 00:58:30 +0000 Subject: [PATCH 40/68] fix: resolve accountName from volume ID in NodePublishVolume for OAuth token mount NodePublishVolume's OAuth server resolution only checked volume context fields (storageAccount, secretName) but the OAuth token secret doesn't contain the account name. Parse it from volume ID (which encodes it as the second field) before falling back to the secret lookup. This fixes: 'NodePublishVolume: server is empty for volume(...) with mountWithOAuthToken' --- pkg/azurefile/nodeserver.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 0cc1db6a1c..768bd36be2 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -195,6 +195,13 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu oauthServer = getValueInMap(context, serverNameField) if oauthServer == "" { accountName := getValueInMap(context, storageAccountField) + if accountName == "" { + // Try to parse account name from volume ID + _, parsedAccountName, _, _, _, _, parseErr := GetFileShareInfo(volumeID) + if parseErr == nil && parsedAccountName != "" { + accountName = parsedAccountName + } + } if accountName == "" { secretName := getValueInMap(context, secretNameField) secretNamespace := getSecretNamespace(context) From 374ac44e0a0d0cb3e28f9d2244b686f2d1846bc6 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 2 May 2026 14:36:42 +0000 Subject: [PATCH 41/68] revert: remove node logging, token info logging, and node-pinning for OAuth test Reverts: - test: log which node the oauth token fetcher pod ran on (a275045) - fix: remove trailing slash from storage.azure.com resource, log OAuth token info (354c8bd) - fix: pin OAuth test pod to same node where token was fetched (b4b6bd2) Keep the trailing slash fix (storage.azure.com/ -> storage.azure.com) as it is a functional bug fix, only revert the debug logging and node-pinning logic. --- test/e2e/dynamic_provisioning_test.go | 14 ++--- test/e2e/suite_test.go | 53 +++++++------------ ...namically_provisioned_cmd_volume_tester.go | 6 --- 3 files changed, 21 insertions(+), 52 deletions(-) diff --git a/test/e2e/dynamic_provisioning_test.go b/test/e2e/dynamic_provisioning_test.go index 8361859327..ef3ef9d454 100644 --- a/test/e2e/dynamic_provisioning_test.go +++ b/test/e2e/dynamic_provisioning_test.go @@ -1947,7 +1947,7 @@ var _ = ginkgo.Describe("Dynamic Provisioning", func() { ginkgo.Skip("mountWithOAuthToken test requires CAPZ environment") } // Setup OAuth token inline to ensure fresh token at mount time - tokenNodeName, err := setupOAuthToken(ctx, suiteCreds, suiteAzureClient) + err := setupOAuthToken(ctx, suiteCreds, suiteAzureClient) gomega.Expect(err).NotTo(gomega.HaveOccurred(), "OAuth token setup failed") pods := []testsuites.PodDetails{ @@ -1969,17 +1969,9 @@ var _ = ginkgo.Describe("Dynamic Provisioning", func() { }, }, } - // Pin test pod to the same node where the OAuth token was fetched, - // so that the CSI node plugin's setCredentialCache runs on the same node. - nodeSelector := map[string]string{"kubernetes.io/os": "linux"} - if tokenNodeName != "" { - nodeSelector["kubernetes.io/hostname"] = tokenNodeName - log.Printf("Pinning OAuth test pod to node: %s", tokenNodeName) - } test := testsuites.DynamicallyProvisionedCmdVolumeTest{ - CSIDriver: testDriver, - Pods: pods, - NodeSelector: nodeSelector, + CSIDriver: testDriver, + Pods: pods, StorageClassParameters: map[string]string{ "skuName": "Premium_LRS", "mountWithOAuthToken": "true", diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index a1f38c18cc..78ab961fcd 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -921,19 +921,18 @@ const ( // setupOAuthToken obtains an OAuth token from the node's managed identity and stores it // in a Kubernetes Secret for use by mountWithOAuthToken e2e tests. -// Returns the node name where the token was fetched from. -func setupOAuthToken(ctx context.Context, creds *credentials.Credentials, azureClient *azure.Client) (string, error) { +func setupOAuthToken(ctx context.Context, creds *credentials.Credentials, azureClient *azure.Client) error { // Step 1: Get node managed identity info (uses Azure SDK with SP creds, works from Prow) identityInfo, err := azureClient.GetNodeIdentityInfo(ctx, creds.ResourceGroup) if err != nil { - return "", fmt.Errorf("failed to get node identity info: %v", err) + return fmt.Errorf("failed to get node identity info: %v", err) } log.Printf("Found node identity: clientID=%s, principalID=%s", identityInfo.ClientID, identityInfo.PrincipalID) // Step 2: Assign Storage File Data SMB MI Admin role to identity err = azureClient.AssignRoleToIdentity(ctx, creds.ResourceGroup, identityInfo.PrincipalID, wiStorageFileDataSMBMIAdmin) if err != nil { - return "", fmt.Errorf("failed to assign Storage File Data SMB MI Admin role: %v", err) + return fmt.Errorf("failed to assign Storage File Data SMB MI Admin role: %v", err) } log.Println("Storage File Data SMB MI Admin role assigned to node identity") @@ -944,18 +943,18 @@ func setupOAuthToken(ctx context.Context, creds *credentials.Credentials, azureC kubeconfig := os.Getenv(kubeconfigEnvVar) restConfig, err := clientcmd.BuildConfigFromFlags("", kubeconfig) if err != nil { - return "", fmt.Errorf("failed to build kubeconfig: %v", err) + return fmt.Errorf("failed to build kubeconfig: %v", err) } cs, err := clientset.NewForConfig(restConfig) if err != nil { - return "", fmt.Errorf("failed to create kubernetes client: %v", err) + return fmt.Errorf("failed to create kubernetes client: %v", err) } - token, nodeName, err := getOAuthTokenFromNode(ctx, cs, identityInfo.ClientID) + token, err := getOAuthTokenFromNode(ctx, cs, identityInfo.ClientID) if err != nil { - return "", fmt.Errorf("failed to get storage OAuth token from node: %v", err) + return fmt.Errorf("failed to get storage OAuth token from node: %v", err) } - log.Printf("Obtained storage OAuth token from agent node %s via IMDS", nodeName) + log.Println("Obtained storage OAuth token from agent node via IMDS") // Step 4: Create Kubernetes Secret with the OAuth token secret := &corev1.Secret{ @@ -973,26 +972,25 @@ func setupOAuthToken(ctx context.Context, creds *credentials.Credentials, azureC if apierrors.IsAlreadyExists(err) { existing, getErr := cs.CoreV1().Secrets(oauthSecretNamespace).Get(ctx, oauthSecretName, metav1.GetOptions{}) if getErr != nil { - return "", fmt.Errorf("failed to get existing OAuth token secret: %v", getErr) + return fmt.Errorf("failed to get existing OAuth token secret: %v", getErr) } secret.ResourceVersion = existing.ResourceVersion _, err = cs.CoreV1().Secrets(oauthSecretNamespace).Update(ctx, secret, metav1.UpdateOptions{}) if err != nil { - return "", fmt.Errorf("failed to update OAuth token secret: %v", err) + return fmt.Errorf("failed to update OAuth token secret: %v", err) } } else { - return "", fmt.Errorf("failed to create OAuth token secret: %v", err) + return fmt.Errorf("failed to create OAuth token secret: %v", err) } } log.Printf("Created/updated OAuth token secret %s/%s", oauthSecretNamespace, oauthSecretName) - return nodeName, nil + return nil } // getOAuthTokenFromNode deploys a pod on a workload cluster agent node that uses IMDS // to obtain an Azure Storage OAuth token, then reads the token from the pod logs. -// Returns (token, nodeName, error). -func getOAuthTokenFromNode(ctx context.Context, cs clientset.Interface, clientID string) (string, string, error) { +func getOAuthTokenFromNode(ctx context.Context, cs clientset.Interface, clientID string) (string, error) { namespace := "default" // IMDS curl command that outputs only the access_token value @@ -1042,7 +1040,7 @@ func getOAuthTokenFromNode(ctx context.Context, cs clientset.Interface, clientID created, err := cs.CoreV1().Pods(namespace).Create(ctx, pod, metav1.CreateOptions{}) if err != nil { - return "", "", fmt.Errorf("failed to create token fetcher pod: %v", err) + return "", fmt.Errorf("failed to create token fetcher pod: %v", err) } podName := created.Name defer func() { @@ -1056,36 +1054,21 @@ func getOAuthTokenFromNode(ctx context.Context, cs clientset.Interface, clientID err = waitForPodComplete(waitCtx, cs, namespace, podName) if err != nil { - return "", "", fmt.Errorf("token fetcher pod did not complete: %v", err) - } - - // Print which node the token fetcher pod ran on - completedPod, getErr := cs.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{}) - nodeName := "" - if getErr == nil { - nodeName = completedPod.Spec.NodeName - log.Printf("Token fetcher pod %s ran on node: %s", podName, nodeName) + return "", fmt.Errorf("token fetcher pod did not complete: %v", err) } // Read token from pod logs logBytes, err := cs.CoreV1().Pods(namespace).GetLogs(podName, &corev1.PodLogOptions{}).Do(ctx).Raw() if err != nil { - return "", "", fmt.Errorf("failed to get token fetcher pod logs: %v", err) + return "", fmt.Errorf("failed to get token fetcher pod logs: %v", err) } token := strings.TrimSpace(string(logBytes)) if token == "" { - return "", "", fmt.Errorf("token fetcher pod returned empty token") - } - - // Log token info for debugging (show length and first/last 10 chars) - if len(token) > 20 { - log.Printf("OAuth token obtained: length=%d, prefix=%s...suffix=%s", len(token), token[:10], token[len(token)-10:]) - } else { - log.Printf("OAuth token obtained: length=%d (token too short, possibly invalid)", len(token)) + return "", fmt.Errorf("token fetcher pod returned empty token") } - return token, nodeName, nil + return token, nil } // waitForPodComplete polls until the pod reaches Succeeded or Failed phase. diff --git a/test/e2e/testsuites/dynamically_provisioned_cmd_volume_tester.go b/test/e2e/testsuites/dynamically_provisioned_cmd_volume_tester.go index 40fe0dfa25..4657b0b31c 100644 --- a/test/e2e/testsuites/dynamically_provisioned_cmd_volume_tester.go +++ b/test/e2e/testsuites/dynamically_provisioned_cmd_volume_tester.go @@ -37,8 +37,6 @@ type DynamicallyProvisionedCmdVolumeTest struct { Pods []PodDetails StorageClassParameters map[string]string ServiceAccountName string - // NodeSelector constrains which node the test pod runs on - NodeSelector map[string]string } func (t *DynamicallyProvisionedCmdVolumeTest) Run(ctx context.Context, client clientset.Interface, namespace *v1.Namespace) { @@ -56,10 +54,6 @@ func (t *DynamicallyProvisionedCmdVolumeTest) Run(ctx context.Context, client cl // pod's service account; no in-container token mount is needed. } - if len(t.NodeSelector) > 0 { - tpod.SetNodeSelector(t.NodeSelector) - } - ginkgo.By("deploying the pod") tpod.Create(ctx) defer tpod.Cleanup(ctx) From cc850cbe6a8d7cb746b26b1cdf00ed94fb79d252 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sun, 3 May 2026 02:08:33 +0000 Subject: [PATCH 42/68] refactor: consolidate storeAccountKey and requiresSmbOAuth logic - Merge storeAccountKey=false and requiresSmbOAuth into a single conditional block for mountWithManagedIdentity/mountWithWIToken/mountWithOAuthToken - Remove duplicate early secretName validation in NodePublishVolume (already validated in the mountWithOAuthToken block below) - Add mountWithOAuthToken to shouldUseServiceAccountToken function --- pkg/azurefile/controllerserver.go | 9 +++------ pkg/azurefile/nodeserver.go | 6 +++--- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/pkg/azurefile/controllerserver.go b/pkg/azurefile/controllerserver.go index 412316378c..b116e732cb 100644 --- a/pkg/azurefile/controllerserver.go +++ b/pkg/azurefile/controllerserver.go @@ -341,8 +341,11 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) return nil, status.Errorf(codes.InvalidArgument, "only one of %s, %s, and %s can be true in storage class", mountWithManagedIdentityField, mountWithOAuthTokenField, mountWithWITokenField) } + var requiresSmbOAuth *bool if mountWithManagedIdentity || mountWithWIToken || mountWithOAuthToken { storeAccountKey = false + klog.V(2).Info("enabling smb oauth for managed identity or workload identity token based mount") + requiresSmbOAuth = to.Ptr(true) } if matchTags && account != "" { @@ -574,12 +577,6 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) } } - var requiresSmbOAuth *bool - if mountWithManagedIdentity || mountWithWIToken || mountWithOAuthToken { - klog.V(2).Info("enabling smb oauth for managed identity, workload identity token, or oauth token based mount") - requiresSmbOAuth = to.Ptr(true) - } - accountOptions := &storage.AccountOptions{ Name: account, Type: sku, diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 768bd36be2..1513f60887 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -189,9 +189,6 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu var oauthServer string isMountWithOAuthToken := context != nil && strings.EqualFold(getValueInMap(context, mountWithOAuthTokenField), trueValue) if isMountWithOAuthToken { - if strings.TrimSpace(getValueInMap(context, secretNameField)) == "" { - return nil, status.Errorf(codes.InvalidArgument, "NodePublishVolume: secretName is required for volume(%s) with mountWithOAuthToken", volumeID) - } oauthServer = getValueInMap(context, serverNameField) if oauthServer == "" { accountName := getValueInMap(context, storageAccountField) @@ -1014,6 +1011,9 @@ func shouldUseServiceAccountToken(attrib map[string]string) bool { if getValueInMap(attrib, mountWithWITokenField) == trueValue { return true } + if getValueInMap(attrib, mountWithOAuthTokenField) == trueValue { + return true + } if getValueInMap(attrib, clientIDField) != "" && !strings.EqualFold(getValueInMap(attrib, mountWithManagedIdentityField), trueValue) { return true } From d34d66b2ea2b7b9ab9313cfa409a9feadf35c63d Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sun, 3 May 2026 02:23:24 +0000 Subject: [PATCH 43/68] refactor: consolidate OAuth token server resolution and validation into setCredentialCacheWithOAuthToken Move server resolution logic (from volumeContext, volumeID, or secret) and secretName validation into setCredentialCacheWithOAuthToken. The function now returns (server, error) and encapsulates: - secretName validation - server resolution (from context, volumeID parsing, or secret lookup) - oauthTokenSHAMap check (skip refresh if token unchanged) - credential cache refresh This removes duplicate server resolution code and makes both callers (pre-ensureMountPoint and mount path) simpler. --- pkg/azurefile/nodeserver.go | 99 +++++++++++++++----------------- pkg/azurefile/nodeserver_test.go | 4 +- 2 files changed, 48 insertions(+), 55 deletions(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 1513f60887..a476b99c3f 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -185,53 +185,19 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu mountOptions = append(mountOptions, "ro") } - // mountWithOAuthToken: validate inputs and resolve server before ensureMountPoint - var oauthServer string - isMountWithOAuthToken := context != nil && strings.EqualFold(getValueInMap(context, mountWithOAuthTokenField), trueValue) - if isMountWithOAuthToken { - oauthServer = getValueInMap(context, serverNameField) - if oauthServer == "" { - accountName := getValueInMap(context, storageAccountField) - if accountName == "" { - // Try to parse account name from volume ID - _, parsedAccountName, _, _, _, _, parseErr := GetFileShareInfo(volumeID) - if parseErr == nil && parsedAccountName != "" { - accountName = parsedAccountName - } - } - if accountName == "" { - secretName := getValueInMap(context, secretNameField) - secretNamespace := getSecretNamespace(context) - if secretName != "" { - name, _, _, secretErr := d.GetStorageAccountFromSecret(ctx, secretName, secretNamespace) - if secretErr != nil { - return nil, status.Errorf(codes.Internal, "NodePublishVolume: failed to get account from secret %s/%s: %v", secretNamespace, secretName, secretErr) - } - accountName = name - } - } - storageEndpointSuffix := getValueInMap(context, storageEndpointSuffixField) - if storageEndpointSuffix == "" { - storageEndpointSuffix = d.getStorageEndPointSuffix() - } - if accountName != "" { - oauthServer = fmt.Sprintf("%s.file.%s", accountName, storageEndpointSuffix) - } - } - if oauthServer == "" { - return nil, status.Errorf(codes.InvalidArgument, "NodePublishVolume: server is empty for volume(%s) with mountWithOAuthToken", volumeID) - } - } - - // mountWithOAuthToken: refresh credential cache BEFORE ensureMountPoint so that + // mountWithOAuthToken: validate and refresh credential cache BEFORE ensureMountPoint so that // a stale mount (token expired → "required key not available") can recover. // ensureMountPoint does ReadDir to validate the mount, which will fail if the // kernel credential cache has an expired token, leading to an unmount attempt // that fails with "target is busy". Refreshing first gives the kernel a valid // token before the ReadDir check. + var oauthServer string + isMountWithOAuthToken := context != nil && strings.EqualFold(getValueInMap(context, mountWithOAuthTokenField), trueValue) if isMountWithOAuthToken { - if err := d.setCredentialCacheWithOAuthToken(ctx, oauthServer, context); err != nil { - return nil, status.Errorf(codes.Internal, "NodePublishVolume: failed to refresh OAuth token for volume(%s): %v", volumeID, err) + var err error + oauthServer, err = d.setCredentialCacheWithOAuthToken(ctx, volumeID, context) + if err != nil { + return nil, status.Errorf(codes.InvalidArgument, "NodePublishVolume: %v", err) } klog.V(2).Infof("NodePublishVolume: refreshed OAuth token credential cache for volume(%s) server(%s)", volumeID, oauthServer) } @@ -448,9 +414,6 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe if protocol == nfs { return nil, status.Error(codes.InvalidArgument, "mountWithOAuthToken is not supported with NFS protocol") } - if strings.TrimSpace(getValueInMap(context, secretNameField)) == "" { - return nil, status.Error(codes.InvalidArgument, "secretName is required when mountWithOAuthToken is true") - } if strings.EqualFold(getValueInMap(context, createFolderIfNotExistField), trueValue) { return nil, status.Error(codes.InvalidArgument, "createFolderIfNotExist is not supported with mountWithOAuthToken") } @@ -531,7 +494,7 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe sensitiveMountOptions = []string{"sec=krb5,cruid=0,upcall_target=mount"} klog.V(2).Infof("using OAuth token from secret for volume %s", volumeID) // always refresh credential cache when mountWithOAuthToken is set, even if mount does not happen - if err := d.setCredentialCacheWithOAuthToken(ctx, server, context); err != nil { + if _, err := d.setCredentialCacheWithOAuthToken(ctx, volumeID, context); err != nil { return nil, status.Errorf(codes.Internal, "setCredentialCacheWithOAuthToken failed for %s with error: %v", server, err) } } else { @@ -916,37 +879,67 @@ func (d *Driver) ensureMountPoint(target string, perm os.FileMode) (bool, error) // setCredentialCacheWithOAuthToken reads the OAuth token from the referenced // Kubernetes Secret via GetStorageAccountFromSecret and calls setCredentialCache // to update the credential cache. -func (d *Driver) setCredentialCacheWithOAuthToken(ctx context.Context, server string, volumeContext map[string]string) error { +func (d *Driver) setCredentialCacheWithOAuthToken(ctx context.Context, volumeID string, volumeContext map[string]string) (string, error) { secretName := getValueInMap(volumeContext, secretNameField) - secretNamespace := getSecretNamespace(volumeContext) if secretName == "" { - return fmt.Errorf("secretName is required when %s is true", mountWithOAuthTokenField) + return "", fmt.Errorf("secretName is required when %s is true", mountWithOAuthTokenField) } + // Resolve server name + server := getValueInMap(volumeContext, serverNameField) + if server == "" { + accountName := getValueInMap(volumeContext, storageAccountField) + if accountName == "" { + _, parsedAccountName, _, _, _, _, parseErr := GetFileShareInfo(volumeID) + if parseErr == nil && parsedAccountName != "" { + accountName = parsedAccountName + } + } + if accountName == "" { + secretNamespace := getSecretNamespace(volumeContext) + name, _, _, secretErr := d.GetStorageAccountFromSecret(ctx, secretName, secretNamespace) + if secretErr != nil { + return "", fmt.Errorf("failed to get account from secret %s/%s: %v", secretNamespace, secretName, secretErr) + } + accountName = name + } + storageEndpointSuffix := getValueInMap(volumeContext, storageEndpointSuffixField) + if storageEndpointSuffix == "" { + storageEndpointSuffix = d.getStorageEndPointSuffix() + } + if accountName != "" { + server = fmt.Sprintf("%s.file.%s", accountName, storageEndpointSuffix) + } + } + if server == "" { + return "", fmt.Errorf("server is empty for volume(%s) with %s", volumeID, mountWithOAuthTokenField) + } + + secretNamespace := getSecretNamespace(volumeContext) _, _, oauthToken, err := d.GetStorageAccountFromSecret(ctx, secretName, secretNamespace) if err != nil { - return fmt.Errorf("failed to get secret %s/%s: %v", secretNamespace, secretName, err) + return "", fmt.Errorf("failed to get secret %s/%s: %v", secretNamespace, secretName, err) } if oauthToken == "" { - return fmt.Errorf("%s not found in secret %s/%s", defaultSecretOAuthToken, secretNamespace, secretName) + return "", fmt.Errorf("%s not found in secret %s/%s", defaultSecretOAuthToken, secretNamespace, secretName) } // check if token has changed by comparing SHA256 hash tokenSHA := fmt.Sprintf("%x", sha256.Sum256([]byte(oauthToken))) if cachedSHA, ok := d.oauthTokenSHAMap.Load(server); ok && cachedSHA.(string) == tokenSHA { klog.V(4).Infof("setCredentialCacheWithOAuthToken: OAuth token unchanged for server %s, skipping refresh", server) - return nil + return server, nil } if output, err := setCredentialCache(server, "", "", "", oauthToken); err != nil { klog.Errorf("setCredentialCache failed for %s with output: %s, error: %v", server, strings.ReplaceAll(string(output), oauthToken, ""), err) - return fmt.Errorf("setCredentialCache failed for %s: %v", server, err) + return "", fmt.Errorf("setCredentialCache failed for %s: %v", server, err) } d.oauthTokenSHAMap.Store(server, tokenSHA) klog.V(2).Infof("setCredentialCacheWithOAuthToken: refreshed credential cache for server %s using secret %s/%s", server, secretNamespace, secretName) - return nil + return server, nil } func (d *Driver) mountWithProxy(ctx context.Context, source, target, fsType string, options, sensitiveMountOptions []string) error { diff --git a/pkg/azurefile/nodeserver_test.go b/pkg/azurefile/nodeserver_test.go index 4a7da8145a..15f150b249 100644 --- a/pkg/azurefile/nodeserver_test.go +++ b/pkg/azurefile/nodeserver_test.go @@ -339,7 +339,7 @@ func TestNodePublishVolume(t *testing.T) { }, }, expectedErr: testutil.TestError{ - DefaultError: status.Errorf(codes.InvalidArgument, "NodePublishVolume: secretName is required for volume(%s) with mountWithOAuthToken", "vol_1"), + DefaultError: status.Errorf(codes.InvalidArgument, "NodePublishVolume: secretName is required when %s is true", mountWithOAuthTokenField), }, }, { @@ -355,7 +355,7 @@ func TestNodePublishVolume(t *testing.T) { }, }, expectedErr: testutil.TestError{ - DefaultError: status.Errorf(codes.Internal, "NodePublishVolume: failed to refresh OAuth token for volume(%s): %v", "vol_1", fmt.Errorf("failed to get secret %s/%s: %v", "default", "test-secret", fmt.Errorf("could not get credentials from secret(%s): KubeClient is nil", "test-secret"))), + DefaultError: status.Errorf(codes.InvalidArgument, "NodePublishVolume: failed to get secret %s/%s: %v", "default", "test-secret", fmt.Errorf("could not get credentials from secret(%s): KubeClient is nil", "test-secret")), }, setup: func() { d.kubeClient = nil From ec43ac0dba7f3eab80a1be5bc984fd8345bd0f0c Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sun, 3 May 2026 02:23:54 +0000 Subject: [PATCH 44/68] fix: update stale log message for identity-based mount --- pkg/azurefile/controllerserver.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/azurefile/controllerserver.go b/pkg/azurefile/controllerserver.go index b116e732cb..22c8bec3bc 100644 --- a/pkg/azurefile/controllerserver.go +++ b/pkg/azurefile/controllerserver.go @@ -344,7 +344,7 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) var requiresSmbOAuth *bool if mountWithManagedIdentity || mountWithWIToken || mountWithOAuthToken { storeAccountKey = false - klog.V(2).Info("enabling smb oauth for managed identity or workload identity token based mount") + klog.V(2).Info("enabling smb oauth for identity-based mount") requiresSmbOAuth = to.Ptr(true) } From 4bd685d84cf17c5e94743eba86b1ae8783dd4da3 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sun, 3 May 2026 02:27:29 +0000 Subject: [PATCH 45/68] fix: update setCredentialCacheWithOAuthToken test for new signature --- pkg/azurefile/nodeserver_test.go | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/pkg/azurefile/nodeserver_test.go b/pkg/azurefile/nodeserver_test.go index 15f150b249..a54b8c7105 100644 --- a/pkg/azurefile/nodeserver_test.go +++ b/pkg/azurefile/nodeserver_test.go @@ -1434,25 +1434,23 @@ func makeFakeOutput(output string, err error) testingexec.FakeAction { func TestSetCredentialCacheWithOAuthToken(t *testing.T) { tests := []struct { desc string - server string volumeContext map[string]string setupDriver func(d *Driver) expectedErr string expectSkip bool }{ { - desc: "missing secretName", - server: "testaccount.file.core.windows.net", + desc: "missing secretName", volumeContext: map[string]string{ mountWithOAuthTokenField: "true", }, expectedErr: "secretName is required", }, { - desc: "kubeClient is nil", - server: "testaccount.file.core.windows.net", + desc: "kubeClient is nil", volumeContext: map[string]string{ secretNameField: "test-secret", + serverNameField: "testaccount.file.core.windows.net", }, setupDriver: func(d *Driver) { d.kubeClient = nil @@ -1460,10 +1458,10 @@ func TestSetCredentialCacheWithOAuthToken(t *testing.T) { expectedErr: "KubeClient is nil", }, { - desc: "oauthtoken missing in secret", - server: "testaccount.file.core.windows.net", + desc: "oauthtoken missing in secret", volumeContext: map[string]string{ secretNameField: "test-secret", + serverNameField: "testaccount.file.core.windows.net", }, setupDriver: func(d *Driver) { secret := &corev1.Secret{ @@ -1475,10 +1473,10 @@ func TestSetCredentialCacheWithOAuthToken(t *testing.T) { expectedErr: fmt.Sprintf("%s not found in secret", defaultSecretOAuthToken), }, { - desc: "skip refresh when token SHA unchanged", - server: "testaccount.file.core.windows.net", + desc: "skip refresh when token SHA unchanged", volumeContext: map[string]string{ secretNameField: "test-secret", + serverNameField: "testaccount.file.core.windows.net", }, setupDriver: func(d *Driver) { token := "test-oauth-token-value" @@ -1504,7 +1502,7 @@ func TestSetCredentialCacheWithOAuthToken(t *testing.T) { if test.setupDriver != nil { test.setupDriver(d) } - err := d.setCredentialCacheWithOAuthToken(context.Background(), test.server, test.volumeContext) + _, err := d.setCredentialCacheWithOAuthToken(context.Background(), "vol_1", test.volumeContext) if test.expectSkip { assert.NoError(t, err) } else if test.expectedErr != "" { From c34b38fbd04b7040b9f20939950fb8a6bdaaec53 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sun, 3 May 2026 02:39:26 +0000 Subject: [PATCH 46/68] fix: restore secretName validation in NodeStageVolume, fix error message in mount path --- pkg/azurefile/nodeserver.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index a476b99c3f..9a2e559394 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -414,6 +414,9 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe if protocol == nfs { return nil, status.Error(codes.InvalidArgument, "mountWithOAuthToken is not supported with NFS protocol") } + if strings.TrimSpace(getValueInMap(context, secretNameField)) == "" { + return nil, status.Error(codes.InvalidArgument, "secretName is required when mountWithOAuthToken is true") + } if strings.EqualFold(getValueInMap(context, createFolderIfNotExistField), trueValue) { return nil, status.Error(codes.InvalidArgument, "createFolderIfNotExist is not supported with mountWithOAuthToken") } @@ -495,7 +498,7 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe klog.V(2).Infof("using OAuth token from secret for volume %s", volumeID) // always refresh credential cache when mountWithOAuthToken is set, even if mount does not happen if _, err := d.setCredentialCacheWithOAuthToken(ctx, volumeID, context); err != nil { - return nil, status.Errorf(codes.Internal, "setCredentialCacheWithOAuthToken failed for %s with error: %v", server, err) + return nil, status.Errorf(codes.Internal, "setCredentialCacheWithOAuthToken failed for volume(%s) with error: %v", volumeID, err) } } else { if accountName == "" || accountKey == "" { From 547c7152c53fa7cd59c31f7acf675c3d634fd002 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sun, 3 May 2026 02:52:13 +0000 Subject: [PATCH 47/68] fix: remove mountWithOAuthToken from shouldUseServiceAccountToken OAuth token authentication uses a Kubernetes Secret, not a service account token. Including it in shouldUseServiceAccountToken caused NodeStageVolume to skip all validation and return early when serviceAccountToken was empty, making the NFS/secretName/createFolder checks unreachable. --- pkg/azurefile/nodeserver.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 9a2e559394..8bbc989ce4 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -1007,9 +1007,6 @@ func shouldUseServiceAccountToken(attrib map[string]string) bool { if getValueInMap(attrib, mountWithWITokenField) == trueValue { return true } - if getValueInMap(attrib, mountWithOAuthTokenField) == trueValue { - return true - } if getValueInMap(attrib, clientIDField) != "" && !strings.EqualFold(getValueInMap(attrib, mountWithManagedIdentityField), trueValue) { return true } From 977cb9847bc747803d03c1c869183cc5ac76ae11 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sun, 3 May 2026 03:08:41 +0000 Subject: [PATCH 48/68] fix: use codes.Internal for setCredentialCacheWithOAuthToken errors in NodePublishVolume Validation errors (missing secretName) are caught in NodeStageVolume. By the time NodePublishVolume runs, failures from setCredentialCacheWithOAuthToken are server-side issues (secret fetch, credential cache), not input validation. --- pkg/azurefile/nodeserver.go | 2 +- pkg/azurefile/nodeserver_test.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 8bbc989ce4..bd25e87335 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -197,7 +197,7 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu var err error oauthServer, err = d.setCredentialCacheWithOAuthToken(ctx, volumeID, context) if err != nil { - return nil, status.Errorf(codes.InvalidArgument, "NodePublishVolume: %v", err) + return nil, status.Errorf(codes.Internal, "NodePublishVolume: %v", err) } klog.V(2).Infof("NodePublishVolume: refreshed OAuth token credential cache for volume(%s) server(%s)", volumeID, oauthServer) } diff --git a/pkg/azurefile/nodeserver_test.go b/pkg/azurefile/nodeserver_test.go index a54b8c7105..aad94524b0 100644 --- a/pkg/azurefile/nodeserver_test.go +++ b/pkg/azurefile/nodeserver_test.go @@ -339,7 +339,7 @@ func TestNodePublishVolume(t *testing.T) { }, }, expectedErr: testutil.TestError{ - DefaultError: status.Errorf(codes.InvalidArgument, "NodePublishVolume: secretName is required when %s is true", mountWithOAuthTokenField), + DefaultError: status.Errorf(codes.Internal, "NodePublishVolume: secretName is required when %s is true", mountWithOAuthTokenField), }, }, { @@ -355,7 +355,7 @@ func TestNodePublishVolume(t *testing.T) { }, }, expectedErr: testutil.TestError{ - DefaultError: status.Errorf(codes.InvalidArgument, "NodePublishVolume: failed to get secret %s/%s: %v", "default", "test-secret", fmt.Errorf("could not get credentials from secret(%s): KubeClient is nil", "test-secret")), + DefaultError: status.Errorf(codes.Internal, "NodePublishVolume: failed to get secret %s/%s: %v", "default", "test-secret", fmt.Errorf("could not get credentials from secret(%s): KubeClient is nil", "test-secret")), }, setup: func() { d.kubeClient = nil From 3fffe46f788015ddb3e49f40ab3b36a6b8ce7682 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sun, 3 May 2026 03:17:52 +0000 Subject: [PATCH 49/68] fix: improve error message when server cannot be resolved for mountWithOAuthToken --- pkg/azurefile/nodeserver.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index bd25e87335..e915518564 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -915,7 +915,7 @@ func (d *Driver) setCredentialCacheWithOAuthToken(ctx context.Context, volumeID } } if server == "" { - return "", fmt.Errorf("server is empty for volume(%s) with %s", volumeID, mountWithOAuthTokenField) + return "", fmt.Errorf("server is empty for volume(%s) with %s: set %q or %q in volume context, or provide account name in secret %q", volumeID, mountWithOAuthTokenField, serverNameField, storageAccountField, secretNameField) } secretNamespace := getSecretNamespace(volumeContext) From e1e72e26d7fe3d5cd2ceead0598d42cdad5e6392 Mon Sep 17 00:00:00 2001 From: Andy Zhang Date: Tue, 5 May 2026 04:24:16 +0000 Subject: [PATCH 50/68] fix: return InvalidArgument for missing secretName and consolidate secret fetch - Return codes.InvalidArgument instead of codes.Internal when secretName is missing in volume context (input validation error, not driver error). - Fetch secret once upfront in setCredentialCacheWithOAuthToken and reuse parsed values to avoid duplicate Kubernetes API calls. Addresses copilot review feedback. --- pkg/azurefile/nodeserver.go | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index e915518564..95247ecb65 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -197,6 +197,11 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu var err error oauthServer, err = d.setCredentialCacheWithOAuthToken(ctx, volumeID, context) if err != nil { + // Return InvalidArgument for input/volume-context problems + secretName := getValueInMap(context, secretNameField) + if secretName == "" { + return nil, status.Errorf(codes.InvalidArgument, "NodePublishVolume: %v", err) + } return nil, status.Errorf(codes.Internal, "NodePublishVolume: %v", err) } klog.V(2).Infof("NodePublishVolume: refreshed OAuth token credential cache for volume(%s) server(%s)", volumeID, oauthServer) @@ -888,6 +893,13 @@ func (d *Driver) setCredentialCacheWithOAuthToken(ctx context.Context, volumeID return "", fmt.Errorf("secretName is required when %s is true", mountWithOAuthTokenField) } + // Fetch the secret once upfront to avoid duplicate API calls + secretNamespace := getSecretNamespace(volumeContext) + secretAccountName, _, oauthToken, err := d.GetStorageAccountFromSecret(ctx, secretName, secretNamespace) + if err != nil { + return "", fmt.Errorf("failed to get secret %s/%s: %v", secretNamespace, secretName, err) + } + // Resolve server name server := getValueInMap(volumeContext, serverNameField) if server == "" { @@ -899,12 +911,7 @@ func (d *Driver) setCredentialCacheWithOAuthToken(ctx context.Context, volumeID } } if accountName == "" { - secretNamespace := getSecretNamespace(volumeContext) - name, _, _, secretErr := d.GetStorageAccountFromSecret(ctx, secretName, secretNamespace) - if secretErr != nil { - return "", fmt.Errorf("failed to get account from secret %s/%s: %v", secretNamespace, secretName, secretErr) - } - accountName = name + accountName = secretAccountName } storageEndpointSuffix := getValueInMap(volumeContext, storageEndpointSuffixField) if storageEndpointSuffix == "" { @@ -918,12 +925,6 @@ func (d *Driver) setCredentialCacheWithOAuthToken(ctx context.Context, volumeID return "", fmt.Errorf("server is empty for volume(%s) with %s: set %q or %q in volume context, or provide account name in secret %q", volumeID, mountWithOAuthTokenField, serverNameField, storageAccountField, secretNameField) } - secretNamespace := getSecretNamespace(volumeContext) - _, _, oauthToken, err := d.GetStorageAccountFromSecret(ctx, secretName, secretNamespace) - if err != nil { - return "", fmt.Errorf("failed to get secret %s/%s: %v", secretNamespace, secretName, err) - } - if oauthToken == "" { return "", fmt.Errorf("%s not found in secret %s/%s", defaultSecretOAuthToken, secretNamespace, secretName) } From 1918f6c43d04e10465388ad85ed71add7873ade8 Mon Sep 17 00:00:00 2001 From: Andy Zhang Date: Tue, 5 May 2026 06:06:17 +0000 Subject: [PATCH 51/68] test: update expected error code for missing secretName to InvalidArgument --- pkg/azurefile/nodeserver_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/azurefile/nodeserver_test.go b/pkg/azurefile/nodeserver_test.go index aad94524b0..08dea6d8d3 100644 --- a/pkg/azurefile/nodeserver_test.go +++ b/pkg/azurefile/nodeserver_test.go @@ -339,7 +339,7 @@ func TestNodePublishVolume(t *testing.T) { }, }, expectedErr: testutil.TestError{ - DefaultError: status.Errorf(codes.Internal, "NodePublishVolume: secretName is required when %s is true", mountWithOAuthTokenField), + DefaultError: status.Errorf(codes.InvalidArgument, "NodePublishVolume: secretName is required when %s is true", mountWithOAuthTokenField), }, }, { From 85f32be015f9d5d4dab8c2a534c5df7fdc718460 Mon Sep 17 00:00:00 2001 From: Andy Zhang Date: Tue, 5 May 2026 13:48:32 +0000 Subject: [PATCH 52/68] fix: return InvalidArgument for all validation errors in OAuth flow setCredentialCacheWithOAuthToken now returns proper gRPC status errors: - codes.InvalidArgument for missing secretName, empty server, missing token - plain errors for system failures (secret fetch, credential cache) The caller propagates the status code from the helper instead of only checking for empty secretName. --- pkg/azurefile/nodeserver.go | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 95247ecb65..0c955f13ef 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -197,12 +197,13 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu var err error oauthServer, err = d.setCredentialCacheWithOAuthToken(ctx, volumeID, context) if err != nil { - // Return InvalidArgument for input/volume-context problems - secretName := getValueInMap(context, secretNameField) - if secretName == "" { - return nil, status.Errorf(codes.InvalidArgument, "NodePublishVolume: %v", err) + // Propagate gRPC status code from helper (InvalidArgument for + // validation errors, Internal for system failures) + code := status.Code(err) + if code == codes.OK { + code = codes.Internal } - return nil, status.Errorf(codes.Internal, "NodePublishVolume: %v", err) + return nil, status.Errorf(code, "NodePublishVolume: %v", err) } klog.V(2).Infof("NodePublishVolume: refreshed OAuth token credential cache for volume(%s) server(%s)", volumeID, oauthServer) } @@ -890,7 +891,7 @@ func (d *Driver) ensureMountPoint(target string, perm os.FileMode) (bool, error) func (d *Driver) setCredentialCacheWithOAuthToken(ctx context.Context, volumeID string, volumeContext map[string]string) (string, error) { secretName := getValueInMap(volumeContext, secretNameField) if secretName == "" { - return "", fmt.Errorf("secretName is required when %s is true", mountWithOAuthTokenField) + return "", status.Errorf(codes.InvalidArgument, "secretName is required when %s is true", mountWithOAuthTokenField) } // Fetch the secret once upfront to avoid duplicate API calls @@ -922,11 +923,11 @@ func (d *Driver) setCredentialCacheWithOAuthToken(ctx context.Context, volumeID } } if server == "" { - return "", fmt.Errorf("server is empty for volume(%s) with %s: set %q or %q in volume context, or provide account name in secret %q", volumeID, mountWithOAuthTokenField, serverNameField, storageAccountField, secretNameField) + return "", status.Errorf(codes.InvalidArgument, "server is empty for volume(%s) with %s: set %q or %q in volume context, or provide account name in secret %q", volumeID, mountWithOAuthTokenField, serverNameField, storageAccountField, secretNameField) } if oauthToken == "" { - return "", fmt.Errorf("%s not found in secret %s/%s", defaultSecretOAuthToken, secretNamespace, secretName) + return "", status.Errorf(codes.InvalidArgument, "%s not found in secret %s/%s", defaultSecretOAuthToken, secretNamespace, secretName) } // check if token has changed by comparing SHA256 hash From 5565e324db1c32ce33069b584546268a70bea9d5 Mon Sep 17 00:00:00 2001 From: Andy Zhang Date: Tue, 5 May 2026 14:02:42 +0000 Subject: [PATCH 53/68] fix: extract status message to avoid double-wrapping, treat Unknown as Internal --- pkg/azurefile/nodeserver.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 0c955f13ef..61b65ef44e 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -200,10 +200,11 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu // Propagate gRPC status code from helper (InvalidArgument for // validation errors, Internal for system failures) code := status.Code(err) - if code == codes.OK { + if code == codes.OK || code == codes.Unknown { code = codes.Internal } - return nil, status.Errorf(code, "NodePublishVolume: %v", err) + msg := status.Convert(err).Message() + return nil, status.Errorf(code, "NodePublishVolume: %s", msg) } klog.V(2).Infof("NodePublishVolume: refreshed OAuth token credential cache for volume(%s) server(%s)", volumeID, oauthServer) } From 5346a2c1e85073091ef19cca28c3fa6bd5961436 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 9 May 2026 07:02:45 +0000 Subject: [PATCH 54/68] simplify error handling for setCredentialCacheWithOAuthToken No need to propagate gRPC status codes from the internal helper - kubelet treats all mount failures the same regardless of code. Just return codes.Internal with the error message. --- pkg/azurefile/nodeserver.go | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 61b65ef44e..bf7602b03d 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -197,14 +197,7 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu var err error oauthServer, err = d.setCredentialCacheWithOAuthToken(ctx, volumeID, context) if err != nil { - // Propagate gRPC status code from helper (InvalidArgument for - // validation errors, Internal for system failures) - code := status.Code(err) - if code == codes.OK || code == codes.Unknown { - code = codes.Internal - } - msg := status.Convert(err).Message() - return nil, status.Errorf(code, "NodePublishVolume: %s", msg) + return nil, status.Errorf(codes.Internal, "NodePublishVolume: %v", err) } klog.V(2).Infof("NodePublishVolume: refreshed OAuth token credential cache for volume(%s) server(%s)", volumeID, oauthServer) } From 89040191da3c95add8c77a912ffb957b3d35bc1f Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 9 May 2026 07:16:56 +0000 Subject: [PATCH 55/68] fix: preserve gRPC status code from setCredentialCacheWithOAuthToken Use status.Convert to unwrap the error's code and message cleanly, avoiding nested 'rpc error' in the message while preserving the original status code (e.g. InvalidArgument for validation errors). --- pkg/azurefile/nodeserver.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index bf7602b03d..ca683a6174 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -197,7 +197,8 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu var err error oauthServer, err = d.setCredentialCacheWithOAuthToken(ctx, volumeID, context) if err != nil { - return nil, status.Errorf(codes.Internal, "NodePublishVolume: %v", err) + st := status.Convert(err) + return nil, status.Errorf(st.Code(), "NodePublishVolume: %s", st.Message()) } klog.V(2).Infof("NodePublishVolume: refreshed OAuth token credential cache for volume(%s) server(%s)", volumeID, oauthServer) } From 11fd442faa067fd2dc287d8a12cae74e7e57a38b Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 9 May 2026 07:32:49 +0000 Subject: [PATCH 56/68] fix: return codes.Internal for setCredentialCache failures Use status.Errorf instead of fmt.Errorf so the error surfaces as codes.Internal rather than codes.Unknown through status.Convert. --- pkg/azurefile/nodeserver.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index ca683a6174..f469a6a564 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -934,7 +934,7 @@ func (d *Driver) setCredentialCacheWithOAuthToken(ctx context.Context, volumeID if output, err := setCredentialCache(server, "", "", "", oauthToken); err != nil { klog.Errorf("setCredentialCache failed for %s with output: %s, error: %v", server, strings.ReplaceAll(string(output), oauthToken, ""), err) - return "", fmt.Errorf("setCredentialCache failed for %s: %v", server, err) + return "", status.Errorf(codes.Internal, "setCredentialCache failed for %s: %v", server, err) } d.oauthTokenSHAMap.Store(server, tokenSHA) From 4f17e7ba6efb61f284a67e1e5ae19d7573fa41c3 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 9 May 2026 08:13:38 +0000 Subject: [PATCH 57/68] fix: return codes.Internal for secret fetch failures Use status.Errorf instead of fmt.Errorf so the error surfaces as codes.Internal rather than codes.Unknown through status.Convert. --- pkg/azurefile/nodeserver.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index f469a6a564..786d903045 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -893,7 +893,7 @@ func (d *Driver) setCredentialCacheWithOAuthToken(ctx context.Context, volumeID secretNamespace := getSecretNamespace(volumeContext) secretAccountName, _, oauthToken, err := d.GetStorageAccountFromSecret(ctx, secretName, secretNamespace) if err != nil { - return "", fmt.Errorf("failed to get secret %s/%s: %v", secretNamespace, secretName, err) + return "", status.Errorf(codes.Internal, "failed to get secret %s/%s: %v", secretNamespace, secretName, err) } // Resolve server name From 4250b648b6adab9d973ba2d29ecfc353c0e3c708 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 9 May 2026 08:32:31 +0000 Subject: [PATCH 58/68] fix: address review comments - CreateVolume: validate secretName is provided when mountWithOAuthToken is true, fail early with InvalidArgument instead of deferring failure to NodePublish - e2e: use shared oauthSecretName/oauthSecretNamespace constants instead of hardcoded strings in dynamic provisioning test - utils_test: add test case for mutually exclusive token and tokenFile --- pkg/azurefile/controllerserver.go | 4 ++++ pkg/azurefile/utils_test.go | 10 +++++++++- test/e2e/dynamic_provisioning_test.go | 4 ++-- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/pkg/azurefile/controllerserver.go b/pkg/azurefile/controllerserver.go index 22c8bec3bc..c3d8fce91e 100644 --- a/pkg/azurefile/controllerserver.go +++ b/pkg/azurefile/controllerserver.go @@ -341,6 +341,10 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) return nil, status.Errorf(codes.InvalidArgument, "only one of %s, %s, and %s can be true in storage class", mountWithManagedIdentityField, mountWithOAuthTokenField, mountWithWITokenField) } + if mountWithOAuthToken && secretName == "" { + return nil, status.Errorf(codes.InvalidArgument, "%s is required when %s is true", secretNameField, mountWithOAuthTokenField) + } + var requiresSmbOAuth *bool if mountWithManagedIdentity || mountWithWIToken || mountWithOAuthToken { storeAccountKey = false diff --git a/pkg/azurefile/utils_test.go b/pkg/azurefile/utils_test.go index 3a05ece3c4..a2f6ff2f0c 100644 --- a/pkg/azurefile/utils_test.go +++ b/pkg/azurefile/utils_test.go @@ -1497,6 +1497,7 @@ func TestSetCredentialCache(t *testing.T) { desc string server string token string + tokenFile string expectedError string }{ { @@ -1511,10 +1512,17 @@ func TestSetCredentialCache(t *testing.T) { token: "test-oauth-token", expectedError: "", // Will fail due to missing azfilesauthmanager, but must NOT fail with clientID/tenantID error }, + { + desc: "both token and tokenFile should fail", + server: "test.file.core.windows.net", + token: "test-oauth-token", + tokenFile: "/tmp/token", + expectedError: "token and tokenFile are mutually exclusive", + }, } for _, test := range tokenTests { - _, err := setCredentialCache(test.server, "", "", "", test.token) + _, err := setCredentialCache(test.server, "", "", test.tokenFile, test.token) if test.expectedError != "" { if err == nil { t.Errorf("test[%s]: expected error containing %q, got nil", test.desc, test.expectedError) diff --git a/test/e2e/dynamic_provisioning_test.go b/test/e2e/dynamic_provisioning_test.go index ef3ef9d454..4ec43473e2 100644 --- a/test/e2e/dynamic_provisioning_test.go +++ b/test/e2e/dynamic_provisioning_test.go @@ -1975,8 +1975,8 @@ var _ = ginkgo.Describe("Dynamic Provisioning", func() { StorageClassParameters: map[string]string{ "skuName": "Premium_LRS", "mountWithOAuthToken": "true", - "secretName": "azure-oauth-token-secret", - "secretNamespace": "default", + "secretName": oauthSecretName, + "secretNamespace": oauthSecretNamespace, }, } test.Run(ctx, cs, ns) From e4b42499e5b6354f6ef91c307e11d5590ed54838 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sun, 10 May 2026 00:37:26 +0000 Subject: [PATCH 59/68] fix: reject mountWithOAuthToken for NFS and fix IMDS resource URL - CreateVolume: return InvalidArgument when mountWithOAuthToken is combined with NFS protocol, preventing provisioning shares that will fail at mount time - e2e: add trailing slash to IMDS resource URL (https://storage.azure.com/) to avoid invalid_resource rejection in some environments --- pkg/azurefile/controllerserver.go | 4 ++++ test/e2e/suite_test.go | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/azurefile/controllerserver.go b/pkg/azurefile/controllerserver.go index c3d8fce91e..6ef6f742c6 100644 --- a/pkg/azurefile/controllerserver.go +++ b/pkg/azurefile/controllerserver.go @@ -345,6 +345,10 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) return nil, status.Errorf(codes.InvalidArgument, "%s is required when %s is true", secretNameField, mountWithOAuthTokenField) } + if mountWithOAuthToken && (protocol == nfs || fsType == nfs) { + return nil, status.Errorf(codes.InvalidArgument, "%s is not supported with NFS protocol", mountWithOAuthTokenField) + } + var requiresSmbOAuth *bool if mountWithManagedIdentity || mountWithWIToken || mountWithOAuthToken { storeAccountKey = false diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index 78ab961fcd..417b3ad021 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -995,7 +995,7 @@ func getOAuthTokenFromNode(ctx context.Context, cs clientset.Interface, clientID // IMDS curl command that outputs only the access_token value curlCmd := fmt.Sprintf( - `curl -s -H "Metadata: true" "http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&client_id=%s&resource=https://storage.azure.com" | grep -o '"access_token":"[^"]*"' | cut -d'"' -f4`, + `curl -s -H "Metadata: true" "http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&client_id=%s&resource=https://storage.azure.com/" | grep -o '"access_token":"[^"]*"' | cut -d'"' -f4`, clientID, ) From cc88f7dc7baefc6b00d6d85919ff1b153879753c Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Wed, 13 May 2026 13:33:53 +0000 Subject: [PATCH 60/68] fix: also reject mountWithOAuthToken when fsType is nfs NFS can be indicated via either protocol=nfs or fsType=nfs. The mountWithOAuthToken guard only checked protocol, so a PV with fsType: nfs could bypass the check. Reject both. --- pkg/azurefile/nodeserver.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 786d903045..75b1befce3 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -412,7 +412,7 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe if runtime.GOOS == "windows" { return nil, status.Error(codes.InvalidArgument, "mountWithOAuthToken is not supported on Windows") } - if protocol == nfs { + if protocol == nfs || fsType == nfs { return nil, status.Error(codes.InvalidArgument, "mountWithOAuthToken is not supported with NFS protocol") } if strings.TrimSpace(getValueInMap(context, secretNameField)) == "" { From f0802ca5c94397b1e936748443d22b22fa1781ab Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Thu, 14 May 2026 03:27:09 +0000 Subject: [PATCH 61/68] test: add OIDC JWKS readiness check for workload identity e2e test In CAPZ clusters, the OIDC JWKS document is served from Azure Blob Storage and may not be immediately available after cluster creation. When the CSI driver tries to mount with workload identity, it exchanges the SA token for an Azure OAuth token via AAD. If AAD cannot find the signing key in the OIDC issuer's JWKS metadata, it returns AADSTS7000272 / 400 Bad Request and the mount fails. Changes: - Add 120s wait for FIC and RBAC propagation after setup - Add waitForOIDCJWKS() that polls the issuer's JWKS endpoint until it returns at least one signing key (up to 5 min timeout) --- test/e2e/suite_test.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index 417b3ad021..576956ecc0 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -530,6 +530,20 @@ func setupWorkloadIdentity(ctx context.Context, cs clientset.Interface, azureCli } return identityInfo.ClientID, nil + // Wait for ARM eventual consistency (FIC + RBAC propagation) + log.Printf("Waiting 120s for FIC and RBAC propagation...") + time.Sleep(120 * time.Second) + + // Verify OIDC JWKS endpoint is accessible and contains signing keys. + // AAD validates SA tokens by fetching the JWKS from the OIDC issuer. + // In CAPZ clusters the JWKS is served from Azure Blob Storage and may + // not be immediately available after cluster creation, leading to + // AADSTS7000272 / 400 Bad Request errors during token exchange. + if err := waitForOIDCJWKS(oidcIssuerURL, 5*time.Minute); err != nil { + return fmt.Errorf("OIDC JWKS not ready: %v", err) + } + + return nil } // discoverOIDCIssuer retrieves the OIDC issuer URL from the kube-apiserver's @@ -555,6 +569,9 @@ func discoverOIDCIssuer(ctx context.Context, cs clientset.Interface) (string, er // waitForOIDCJWKS polls the OIDC issuer's JWKS endpoint until it returns a // valid response containing at least one signing key. +// valid response containing at least one signing key. This guards against the +// race where AAD tries to validate a SA token before the JWKS document is +// published (AADSTS7000272). func waitForOIDCJWKS(issuerURL string, timeout time.Duration) error { jwksURL := strings.TrimSuffix(issuerURL, "/") + "/openid/v1/jwks" log.Printf("Waiting up to %v for OIDC JWKS to be available at %s", timeout, jwksURL) @@ -570,6 +587,7 @@ func waitForOIDCJWKS(issuerURL string, timeout time.Duration) error { var lastErr error for time.Now().Before(deadline) { resp, err := httpClient.Get(jwksURL) //nolint:gosec + resp, err := httpClient.Get(jwksURL) //nolint:gosec // URL is constructed from cluster OIDC issuer, not user input if err != nil { lastErr = fmt.Errorf("GET %s: %v", jwksURL, err) log.Printf("JWKS not ready: %v, retrying...", lastErr) From ac1d9d28296a35f5accb6afeed11ae3abbb50a64 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Thu, 14 May 2026 05:45:09 +0000 Subject: [PATCH 62/68] fix: propagate original gRPC status code from setCredentialCacheWithOAuthToken Instead of wrapping the error with codes.Internal, return the error directly so that InvalidArgument and other specific status codes from setCredentialCacheWithOAuthToken are preserved. --- pkg/azurefile/nodeserver.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 75b1befce3..003dae5013 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -499,7 +499,7 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe klog.V(2).Infof("using OAuth token from secret for volume %s", volumeID) // always refresh credential cache when mountWithOAuthToken is set, even if mount does not happen if _, err := d.setCredentialCacheWithOAuthToken(ctx, volumeID, context); err != nil { - return nil, status.Errorf(codes.Internal, "setCredentialCacheWithOAuthToken failed for volume(%s) with error: %v", volumeID, err) + return nil, err } } else { if accountName == "" || accountKey == "" { From 9452ba5802ba08f5d402a6db8a7c2661b2dd760f Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Fri, 15 May 2026 10:01:39 +0000 Subject: [PATCH 63/68] address review: extract OAuth validation, improve logging and error messages - Extract mountWithOAuthToken validation to shared validateMountWithOAuthToken() - Log secret name and namespace when using OAuth token - Improve error message: clientID must be provided when tokenFile is set --- pkg/azurefile/nodeserver.go | 35 +++++++++++++++++++++++------------ pkg/azurefile/utils.go | 2 +- pkg/azurefile/utils_test.go | 4 ++-- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 003dae5013..29d8b6da0a 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -409,17 +409,8 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe } if mountWithOAuthToken { - if runtime.GOOS == "windows" { - return nil, status.Error(codes.InvalidArgument, "mountWithOAuthToken is not supported on Windows") - } - if protocol == nfs || fsType == nfs { - return nil, status.Error(codes.InvalidArgument, "mountWithOAuthToken is not supported with NFS protocol") - } - if strings.TrimSpace(getValueInMap(context, secretNameField)) == "" { - return nil, status.Error(codes.InvalidArgument, "secretName is required when mountWithOAuthToken is true") - } - if strings.EqualFold(getValueInMap(context, createFolderIfNotExistField), trueValue) { - return nil, status.Error(codes.InvalidArgument, "createFolderIfNotExist is not supported with mountWithOAuthToken") + if err := validateMountWithOAuthToken(protocol, fsType, context); err != nil { + return nil, err } } @@ -496,7 +487,9 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe } } else if mountWithOAuthToken && runtime.GOOS != "windows" { sensitiveMountOptions = []string{"sec=krb5,cruid=0,upcall_target=mount"} - klog.V(2).Infof("using OAuth token from secret for volume %s", volumeID) + secretName := getValueInMap(context, secretNameField) + secretNamespace := getSecretNamespace(context) + klog.V(2).Infof("using OAuth token from secret(%s/%s) for volume %s", secretNamespace, secretName, volumeID) // always refresh credential cache when mountWithOAuthToken is set, even if mount does not happen if _, err := d.setCredentialCacheWithOAuthToken(ctx, volumeID, context); err != nil { return nil, err @@ -883,6 +876,24 @@ func (d *Driver) ensureMountPoint(target string, perm os.FileMode) (bool, error) // setCredentialCacheWithOAuthToken reads the OAuth token from the referenced // Kubernetes Secret via GetStorageAccountFromSecret and calls setCredentialCache // to update the credential cache. +// validateMountWithOAuthToken validates that the volume context is compatible +// with mountWithOAuthToken. Shared by NodeStageVolume and NodePublishVolume. +func validateMountWithOAuthToken(protocol, fsType string, volumeContext map[string]string) error { + if runtime.GOOS == "windows" { + return status.Error(codes.InvalidArgument, "mountWithOAuthToken is not supported on Windows") + } + if protocol == nfs || fsType == nfs { + return status.Error(codes.InvalidArgument, "mountWithOAuthToken is not supported with NFS protocol") + } + if strings.TrimSpace(getValueInMap(volumeContext, secretNameField)) == "" { + return status.Error(codes.InvalidArgument, "secretName is required when mountWithOAuthToken is true") + } + if strings.EqualFold(getValueInMap(volumeContext, createFolderIfNotExistField), trueValue) { + return status.Error(codes.InvalidArgument, "createFolderIfNotExist is not supported with mountWithOAuthToken") + } + return nil +} + func (d *Driver) setCredentialCacheWithOAuthToken(ctx context.Context, volumeID string, volumeContext map[string]string) (string, error) { secretName := getValueInMap(volumeContext, secretNameField) if secretName == "" { diff --git a/pkg/azurefile/utils.go b/pkg/azurefile/utils.go index 320d652f0c..8ffcc01379 100644 --- a/pkg/azurefile/utils.go +++ b/pkg/azurefile/utils.go @@ -447,7 +447,7 @@ func setCredentialCache(server, clientID, tenantID, tokenFile, token string) ([] args = []string{"set", serverURL, token} case tokenFile != "": if clientID == "" { - return nil, fmt.Errorf("clientID must be provided") + return nil, fmt.Errorf("clientID must be provided when tokenFile is set") } if tenantID == "" { return nil, fmt.Errorf("tenantID must be provided when tokenFile is provided") diff --git a/pkg/azurefile/utils_test.go b/pkg/azurefile/utils_test.go index a2f6ff2f0c..8502252e39 100644 --- a/pkg/azurefile/utils_test.go +++ b/pkg/azurefile/utils_test.go @@ -1443,7 +1443,7 @@ func TestSetCredentialCache(t *testing.T) { clientID: "", tenantID: "test-tenant-id", tokenFile: "test-token-file", - expectedError: "clientID must be provided", + expectedError: "clientID must be provided when tokenFile is set", }, { desc: "empty tenantID with tokenFile", @@ -1532,7 +1532,7 @@ func TestSetCredentialCache(t *testing.T) { } else if err != nil { // Token mode should not return clientID/tenantID validation errors errMsg := err.Error() - if strings.Contains(errMsg, "clientID must be provided") || strings.Contains(errMsg, "tenantID must be provided") { + if strings.Contains(errMsg, "clientID must be provided when tokenFile is set") || strings.Contains(errMsg, "tenantID must be provided") { t.Errorf("test[%s]: token mode should bypass clientID/tenantID validation, got: %v", test.desc, err) } // Other errors (e.g., azfilesauthmanager not found) are expected in test environment From 44d26914f61c56a65b0b0ae1a9b2d699f195e3d9 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Fri, 15 May 2026 12:09:16 +0000 Subject: [PATCH 64/68] fix: remove stale comment above validateMountWithOAuthToken --- pkg/azurefile/nodeserver.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 29d8b6da0a..78c874c72b 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -873,9 +873,6 @@ func (d *Driver) ensureMountPoint(target string, perm os.FileMode) (bool, error) return !notMnt, nil } -// setCredentialCacheWithOAuthToken reads the OAuth token from the referenced -// Kubernetes Secret via GetStorageAccountFromSecret and calls setCredentialCache -// to update the credential cache. // validateMountWithOAuthToken validates that the volume context is compatible // with mountWithOAuthToken. Shared by NodeStageVolume and NodePublishVolume. func validateMountWithOAuthToken(protocol, fsType string, volumeContext map[string]string) error { From 46bff0994aab656754d8b62ec308011558cf90a9 Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 16 May 2026 14:34:07 +0000 Subject: [PATCH 65/68] fix: remove dead code and duplicate line from rebase --- test/e2e/suite_test.go | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index 576956ecc0..fa87054421 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -530,20 +530,6 @@ func setupWorkloadIdentity(ctx context.Context, cs clientset.Interface, azureCli } return identityInfo.ClientID, nil - // Wait for ARM eventual consistency (FIC + RBAC propagation) - log.Printf("Waiting 120s for FIC and RBAC propagation...") - time.Sleep(120 * time.Second) - - // Verify OIDC JWKS endpoint is accessible and contains signing keys. - // AAD validates SA tokens by fetching the JWKS from the OIDC issuer. - // In CAPZ clusters the JWKS is served from Azure Blob Storage and may - // not be immediately available after cluster creation, leading to - // AADSTS7000272 / 400 Bad Request errors during token exchange. - if err := waitForOIDCJWKS(oidcIssuerURL, 5*time.Minute); err != nil { - return fmt.Errorf("OIDC JWKS not ready: %v", err) - } - - return nil } // discoverOIDCIssuer retrieves the OIDC issuer URL from the kube-apiserver's @@ -586,7 +572,6 @@ func waitForOIDCJWKS(issuerURL string, timeout time.Duration) error { deadline := time.Now().Add(timeout) var lastErr error for time.Now().Before(deadline) { - resp, err := httpClient.Get(jwksURL) //nolint:gosec resp, err := httpClient.Get(jwksURL) //nolint:gosec // URL is constructed from cluster OIDC issuer, not user input if err != nil { lastErr = fmt.Errorf("GET %s: %v", jwksURL, err) From 7a4d55260ad1b8af01de9a73ac38e5a9e3bee4ac Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sat, 16 May 2026 14:54:11 +0000 Subject: [PATCH 66/68] fix: move azidentity from indirect to direct dependency in go.mod --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index ccee511a6d..8dff3f95fb 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.25.0 require ( github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.1 + github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v2 v2.2.0 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7 v7.3.0 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/msi/armmsi v1.3.0 @@ -53,7 +54,6 @@ require ( require ( cel.dev/expr v0.25.1 // indirect cyphar.com/go-pathrs v0.2.1 // indirect - github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 // indirect github.com/Azure/azure-sdk-for-go/sdk/internal v1.12.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerregistry/armcontainerregistry v1.2.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v6 v6.6.0 // indirect From 2e7cbc1bbc4f01a702dd1d24192ebdbbdf6b4feb Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Sun, 17 May 2026 01:57:28 +0000 Subject: [PATCH 67/68] test: increase storage account limit from 17 to 20 in AfterSuite check --- test/e2e/suite_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go index fa87054421..9cda503377 100644 --- a/test/e2e/suite_test.go +++ b/test/e2e/suite_test.go @@ -376,7 +376,7 @@ func checkAccountCreationLeak(_ context.Context) { } ginkgo.By(fmt.Sprintf("GetAccountNumByResourceGroup(%s) returns %d accounts", creds.ResourceGroup, accountNum)) - accountLimitInTest := 17 + accountLimitInTest := 20 gomega.Expect(accountNum >= accountLimitInTest).To(gomega.BeFalse()) } From 79bdbdbe1fcc4b7366f4639d70f11ebabb9d939c Mon Sep 17 00:00:00 2001 From: andyzhangx Date: Tue, 19 May 2026 02:07:39 +0000 Subject: [PATCH 68/68] fix: block mountWithOAuthToken for ephemeral inline volumes Explicitly reject OAuth token authentication for inline (ephemeral) volumes, similar to the existing managed identity check. Inline volumes should not use identity-based auth as it would allow arbitrary pods to access fileshares with the provided token's privileges. --- pkg/azurefile/nodeserver.go | 5 ++++- pkg/azurefile/nodeserver_test.go | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/pkg/azurefile/nodeserver.go b/pkg/azurefile/nodeserver.go index 78c874c72b..fa72c0650b 100644 --- a/pkg/azurefile/nodeserver.go +++ b/pkg/azurefile/nodeserver.go @@ -98,11 +98,14 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu // ephemeral volume if strings.EqualFold(context[ephemeralField], trueValue) { setKeyValueInMap(context, secretNamespaceField, context[podNamespaceField]) - // When Managed Identity is used for ephemeral volumes then reject the request. + // When Managed Identity or OAuth token is used for ephemeral volumes then reject the request. // Allowing access for inline volume with identity will open up risk of arbitrary pods accessing fileshares with node identity permissions. if strings.EqualFold(getValueInMap(context, mountWithManagedIdentityField), trueValue) { return nil, status.Error(codes.InvalidArgument, fmt.Sprintf("mountWithManagedIdentity cannot be used for ephemeral volumes, please use either %s or secret based authentication", mountWithWITokenField)) } + if strings.EqualFold(getValueInMap(context, mountWithOAuthTokenField), trueValue) { + return nil, status.Error(codes.InvalidArgument, "mountWithOAuthToken cannot be used for ephemeral volumes, please use secret based authentication") + } useWIToken := strings.EqualFold(getValueInMap(context, mountWithWITokenField), trueValue) if !d.allowInlineVolumeKeyAccessWithIdentity && !useWIToken { // only get storage account from secret when not using managed identity or workload identity diff --git a/pkg/azurefile/nodeserver_test.go b/pkg/azurefile/nodeserver_test.go index 08dea6d8d3..58f0ed3a78 100644 --- a/pkg/azurefile/nodeserver_test.go +++ b/pkg/azurefile/nodeserver_test.go @@ -238,6 +238,24 @@ func TestNodePublishVolume(t *testing.T) { WindowsError: status.Error(codes.InvalidArgument, fmt.Sprintf("mountWithManagedIdentity cannot be used for ephemeral volumes, please use either %s or secret based authentication", mountWithWITokenField)), }, }, + { + desc: "[Error] Ephemeral volume with mountWithOAuthToken should return error", + req: &csi.NodePublishVolumeRequest{VolumeCapability: &csi.VolumeCapability{AccessMode: &volumeCap}, + VolumeId: "csi-94637b24200724b604b0e2c92e0fcdfabb0e109f656857c5a3c9585777c8ed85", + TargetPath: targetTest, + Readonly: true, + VolumeContext: map[string]string{ + ephemeralField: "true", + storageAccountField: "teststorageaccount", + shareNameField: "testshare", + mountWithOAuthTokenField: "true", + }, + }, + expectedErr: testutil.TestError{ + DefaultError: status.Error(codes.InvalidArgument, "mountWithOAuthToken cannot be used for ephemeral volumes, please use secret based authentication"), + WindowsError: status.Error(codes.InvalidArgument, "mountWithOAuthToken cannot be used for ephemeral volumes, please use secret based authentication"), + }, + }, { desc: "[Error] Ephemeral volume with mountWithWIToken should preserve storageAccount", req: &csi.NodePublishVolumeRequest{VolumeCapability: &csi.VolumeCapability{AccessMode: &volumeCap},