Skip to content
Open
32 changes: 32 additions & 0 deletions pkg/azurefile/controllerserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -710,6 +710,11 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
}

klog.V(2).Infof("begin to create file share(%s) on account(%s) type(%s) subID(%s) rg(%s) location(%s) size(%d) protocol(%s)", validFileShareName, accountName, sku, subsID, resourceGroup, location, fileShareSize, shareProtocol)
// shouldCleanupShare indicates whether the file share should be cleaned up on failure.
// When fileShareName is user-specified (non-empty), we skip cleanup since the share
// may be pre-existing and not owned by CSI. For auto-generated share names, CSI owns
// the lifecycle and should clean up on failure to avoid orphaned shares.
shouldCleanupShare := (fileShareName == "")
if err := d.CreateFileShare(ctx, accountOptions, shareOptions, secret, useDataPlaneAPI); err != nil {
if strings.Contains(err.Error(), accountLimitExceedManagementAPI) || strings.Contains(err.Error(), accountLimitExceedDataPlaneAPI) {
Comment thread
andyzhangx marked this conversation as resolved.
klog.Warningf("create file share(%s) on account(%s) type(%s) subID(%s) rg(%s) location(%s) size(%d), error: %v, skip matching current account", validFileShareName, accountName, sku, subsID, resourceGroup, location, fileShareSize, err)
Expand Down Expand Up @@ -739,18 +744,21 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
if req.GetVolumeContentSource() != nil {
accountSASToken, authAzcopyEnv, err := d.getAzcopyAuth(ctx, accountName, accountKey, storageEndpointSuffix, accountOptions, secret, secretName, secretNamespace, false)
if err != nil {
d.cleanupShareOnFailure(shouldCleanupShare, accountName, validFileShareName, subsID, resourceGroup, secret, useDataPlaneAPI, "getAzcopyAuth failure")
return nil, status.Errorf(codes.Internal, "failed to getAzcopyAuth on account(%s) rg(%s), error: %v", accountOptions.Name, accountOptions.ResourceGroup, err)
Comment thread
andyzhangx marked this conversation as resolved.
}
copyErr := d.copyVolume(ctx, req, accountName, accountSASToken, authAzcopyEnv, secretNamespace, shareOptions, accountOptions, storageEndpointSuffix)
if accountSASToken == "" && copyErr != nil && strings.Contains(copyErr.Error(), authorizationPermissionMismatch) {
klog.Warningf("azcopy copy failed with AuthorizationPermissionMismatch error, should assign \"Storage File Data Privileged Contributor\" role to controller identity, fall back to use sas token, original error: %v", copyErr)
accountSASToken, authAzcopyEnv, err := d.getAzcopyAuth(ctx, accountName, accountKey, storageEndpointSuffix, accountOptions, secret, secretName, secretNamespace, true)
if err != nil {
d.cleanupShareOnFailure(shouldCleanupShare, accountName, validFileShareName, subsID, resourceGroup, secret, useDataPlaneAPI, "fallback getAzcopyAuth failure")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:

Suggested change
d.cleanupShareOnFailure(shouldCleanupShare, accountName, validFileShareName, subsID, resourceGroup, secret, useDataPlaneAPI, "fallback getAzcopyAuth failure")
d.cleanupShareOnFailure(shouldCleanupShare, accountName, validFileShareName, subsID, resourceGroup, secret, useDataPlaneAPI, "sas token fallback getAzcopyAuth failure")

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe we should also check status of ongoing azcopy job (might be triggered by previous reconciliations) before triggering cleanup.

return nil, status.Errorf(codes.Internal, "failed to getAzcopyAuth on account(%s) rg(%s), error: %v", accountOptions.Name, accountOptions.ResourceGroup, err)
}
copyErr = d.copyVolume(ctx, req, accountName, accountSASToken, authAzcopyEnv, secretNamespace, shareOptions, accountOptions, storageEndpointSuffix)
}
if copyErr != nil {
d.cleanupShareOnFailure(shouldCleanupShare, accountName, validFileShareName, subsID, resourceGroup, secret, useDataPlaneAPI, fmt.Sprintf("copyVolume(%s) failure", validFileShareName))
return nil, copyErr
}
// storeAccountKey is not needed here since copy volume is only using SAS token
Expand Down Expand Up @@ -841,6 +849,30 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
}, nil
}

// cleanupShareOnFailure is a best-effort rollback helper that deletes the file share
// when shouldCleanupShare is true (i.e., the share name was auto-generated by CSI).
// It checks for a running azcopy job first — if a job is still in progress, the share
// is preserved so retries can resume rather than starting from zero.
func (d *Driver) cleanupShareOnFailure(shouldCleanupShare bool, accountName, shareName, subsID, resourceGroup string, secret map[string]string, useDataPlaneAPI, reason string) {
if shouldCleanupShare {
// Check if an azcopy job is still running for this share — if so,
// skip cleanup to avoid orphaning the job and losing partial progress.
jobState, _, err := d.azcopy.GetAzcopyJob(shareName, []string{})
if err == nil && jobState == util.AzcopyJobRunning {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is also good to skip deletion when job is succeeded (Helps to avoid deletion if there is race b/w original timeout and job completion.

klog.V(2).Infof("skip cleanup of file share(%s) on account(%s): azcopy job is still running", shareName, accountName)
return
}
klog.V(2).Infof("%s on account(%s), cleaning up file share(%s)", reason, accountName, shareName)
// Use a background context for cleanup to avoid inheriting a cancelled/expired
// context from the original CreateVolume request (e.g., after azcopy timeout).
cleanupCtx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
Comment thread
andyzhangx marked this conversation as resolved.
defer cancel()
if cleanupErr := d.DeleteFileShare(cleanupCtx, subsID, resourceGroup, accountName, shareName, secret, useDataPlaneAPI); cleanupErr != nil {
klog.Warningf("failed to clean up file share(%s) on account(%s) rg(%s) after %s: %v", shareName, accountName, resourceGroup, reason, cleanupErr)
}
Comment thread
andyzhangx marked this conversation as resolved.
}
}

// DeleteVolume delete an azure file
func (d *Driver) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest) (resp *csi.DeleteVolumeResponse, returnedErr error) {
volumeID := req.GetVolumeId()
Expand Down
Loading