diff --git a/pkg/azurefile/controllerserver.go b/pkg/azurefile/controllerserver.go index 0e07c13f3b..1539afc223 100644 --- a/pkg/azurefile/controllerserver.go +++ b/pkg/azurefile/controllerserver.go @@ -710,6 +710,11 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) } klog.V(2).Infof("begin to create file share(%s) on account(%s) type(%s) subID(%s) rg(%s) location(%s) size(%d) protocol(%s)", validFileShareName, accountName, sku, subsID, resourceGroup, location, fileShareSize, shareProtocol) + // shouldCleanupShare indicates whether the file share should be cleaned up on failure. + // When fileShareName is user-specified (non-empty), we skip cleanup since the share + // may be pre-existing and not owned by CSI. For auto-generated share names, CSI owns + // the lifecycle and should clean up on failure to avoid orphaned shares. + shouldCleanupShare := (fileShareName == "") if err := d.CreateFileShare(ctx, accountOptions, shareOptions, secret, useDataPlaneAPI); err != nil { if strings.Contains(err.Error(), accountLimitExceedManagementAPI) || strings.Contains(err.Error(), accountLimitExceedDataPlaneAPI) { klog.Warningf("create file share(%s) on account(%s) type(%s) subID(%s) rg(%s) location(%s) size(%d), error: %v, skip matching current account", validFileShareName, accountName, sku, subsID, resourceGroup, location, fileShareSize, err) @@ -739,6 +744,7 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) if req.GetVolumeContentSource() != nil { accountSASToken, authAzcopyEnv, err := d.getAzcopyAuth(ctx, accountName, accountKey, storageEndpointSuffix, accountOptions, secret, secretName, secretNamespace, false) if err != nil { + d.cleanupShareOnFailure(shouldCleanupShare, accountName, validFileShareName, subsID, resourceGroup, secret, useDataPlaneAPI, "getAzcopyAuth failure") return nil, status.Errorf(codes.Internal, "failed to getAzcopyAuth on account(%s) rg(%s), error: %v", accountOptions.Name, accountOptions.ResourceGroup, err) } copyErr := d.copyVolume(ctx, req, accountName, accountSASToken, authAzcopyEnv, secretNamespace, shareOptions, accountOptions, storageEndpointSuffix) @@ -746,11 +752,13 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) klog.Warningf("azcopy copy failed with AuthorizationPermissionMismatch error, should assign \"Storage File Data Privileged Contributor\" role to controller identity, fall back to use sas token, original error: %v", copyErr) accountSASToken, authAzcopyEnv, err := d.getAzcopyAuth(ctx, accountName, accountKey, storageEndpointSuffix, accountOptions, secret, secretName, secretNamespace, true) if err != nil { + d.cleanupShareOnFailure(shouldCleanupShare, accountName, validFileShareName, subsID, resourceGroup, secret, useDataPlaneAPI, "fallback getAzcopyAuth failure") return nil, status.Errorf(codes.Internal, "failed to getAzcopyAuth on account(%s) rg(%s), error: %v", accountOptions.Name, accountOptions.ResourceGroup, err) } copyErr = d.copyVolume(ctx, req, accountName, accountSASToken, authAzcopyEnv, secretNamespace, shareOptions, accountOptions, storageEndpointSuffix) } if copyErr != nil { + d.cleanupShareOnFailure(shouldCleanupShare, accountName, validFileShareName, subsID, resourceGroup, secret, useDataPlaneAPI, fmt.Sprintf("copyVolume(%s) failure", validFileShareName)) return nil, copyErr } // storeAccountKey is not needed here since copy volume is only using SAS token @@ -841,6 +849,30 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) }, nil } +// cleanupShareOnFailure is a best-effort rollback helper that deletes the file share +// when shouldCleanupShare is true (i.e., the share name was auto-generated by CSI). +// It checks for a running azcopy job first — if a job is still in progress, the share +// is preserved so retries can resume rather than starting from zero. +func (d *Driver) cleanupShareOnFailure(shouldCleanupShare bool, accountName, shareName, subsID, resourceGroup string, secret map[string]string, useDataPlaneAPI, reason string) { + if shouldCleanupShare { + // Check if an azcopy job is still running for this share — if so, + // skip cleanup to avoid orphaning the job and losing partial progress. + jobState, _, err := d.azcopy.GetAzcopyJob(shareName, []string{}) + if err == nil && jobState == util.AzcopyJobRunning { + klog.V(2).Infof("skip cleanup of file share(%s) on account(%s): azcopy job is still running", shareName, accountName) + return + } + klog.V(2).Infof("%s on account(%s), cleaning up file share(%s)", reason, accountName, shareName) + // Use a background context for cleanup to avoid inheriting a cancelled/expired + // context from the original CreateVolume request (e.g., after azcopy timeout). + cleanupCtx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) + defer cancel() + if cleanupErr := d.DeleteFileShare(cleanupCtx, subsID, resourceGroup, accountName, shareName, secret, useDataPlaneAPI); cleanupErr != nil { + klog.Warningf("failed to clean up file share(%s) on account(%s) rg(%s) after %s: %v", shareName, accountName, resourceGroup, reason, cleanupErr) + } + } +} + // DeleteVolume delete an azure file func (d *Driver) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest) (resp *csi.DeleteVolumeResponse, returnedErr error) { volumeID := req.GetVolumeId()