Skip to content
Open
23 changes: 23 additions & 0 deletions pkg/azurefile/controllerserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -710,6 +710,11 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
}

klog.V(2).Infof("begin to create file share(%s) on account(%s) type(%s) subID(%s) rg(%s) location(%s) size(%d) protocol(%s)", validFileShareName, accountName, sku, subsID, resourceGroup, location, fileShareSize, shareProtocol)
// shouldCleanupShare indicates whether the file share should be cleaned up on failure.
// When fileShareName is user-specified (non-empty), we skip cleanup since the share
// may be pre-existing and not owned by CSI. For auto-generated share names, CSI owns
// the lifecycle and should clean up on failure to avoid orphaned shares.
shouldCleanupShare := (fileShareName == "")
if err := d.CreateFileShare(ctx, accountOptions, shareOptions, secret, useDataPlaneAPI); err != nil {
if strings.Contains(err.Error(), accountLimitExceedManagementAPI) || strings.Contains(err.Error(), accountLimitExceedDataPlaneAPI) {
Comment thread
andyzhangx marked this conversation as resolved.
klog.Warningf("create file share(%s) on account(%s) type(%s) subID(%s) rg(%s) location(%s) size(%d), error: %v, skip matching current account", validFileShareName, accountName, sku, subsID, resourceGroup, location, fileShareSize, err)
Expand Down Expand Up @@ -739,18 +744,21 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
if req.GetVolumeContentSource() != nil {
accountSASToken, authAzcopyEnv, err := d.getAzcopyAuth(ctx, accountName, accountKey, storageEndpointSuffix, accountOptions, secret, secretName, secretNamespace, false)
if err != nil {
d.cleanupShareOnFailure(shouldCleanupShare, accountName, validFileShareName, subsID, resourceGroup, secret, useDataPlaneAPI, "getAzcopyAuth failure")
return nil, status.Errorf(codes.Internal, "failed to getAzcopyAuth on account(%s) rg(%s), error: %v", accountOptions.Name, accountOptions.ResourceGroup, err)
Comment thread
andyzhangx marked this conversation as resolved.
}
copyErr := d.copyVolume(ctx, req, accountName, accountSASToken, authAzcopyEnv, secretNamespace, shareOptions, accountOptions, storageEndpointSuffix)
if accountSASToken == "" && copyErr != nil && strings.Contains(copyErr.Error(), authorizationPermissionMismatch) {
klog.Warningf("azcopy copy failed with AuthorizationPermissionMismatch error, should assign \"Storage File Data Privileged Contributor\" role to controller identity, fall back to use sas token, original error: %v", copyErr)
accountSASToken, authAzcopyEnv, err := d.getAzcopyAuth(ctx, accountName, accountKey, storageEndpointSuffix, accountOptions, secret, secretName, secretNamespace, true)
if err != nil {
d.cleanupShareOnFailure(shouldCleanupShare, accountName, validFileShareName, subsID, resourceGroup, secret, useDataPlaneAPI, "fallback getAzcopyAuth failure")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:

Suggested change
d.cleanupShareOnFailure(shouldCleanupShare, accountName, validFileShareName, subsID, resourceGroup, secret, useDataPlaneAPI, "fallback getAzcopyAuth failure")
d.cleanupShareOnFailure(shouldCleanupShare, accountName, validFileShareName, subsID, resourceGroup, secret, useDataPlaneAPI, "sas token fallback getAzcopyAuth failure")

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe we should also check the status of any ongoing azcopy job (which might have been triggered by a previous reconciliation) before triggering cleanup.

return nil, status.Errorf(codes.Internal, "failed to getAzcopyAuth on account(%s) rg(%s), error: %v", accountOptions.Name, accountOptions.ResourceGroup, err)
}
copyErr = d.copyVolume(ctx, req, accountName, accountSASToken, authAzcopyEnv, secretNamespace, shareOptions, accountOptions, storageEndpointSuffix)
}
if copyErr != nil {
d.cleanupShareOnFailure(shouldCleanupShare, accountName, validFileShareName, subsID, resourceGroup, secret, useDataPlaneAPI, fmt.Sprintf("copyVolume(%s) failure", validFileShareName))
Comment thread
andyzhangx marked this conversation as resolved.
return nil, copyErr
}
// storeAccountKey is not needed here since copy volume is only using SAS token
Expand Down Expand Up @@ -841,6 +849,21 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
}, nil
}

// cleanupShareOnFailure is a best-effort rollback helper that deletes the file share
// when shouldCleanupShare is true (i.e., the share name was auto-generated by CSI).
func (d *Driver) cleanupShareOnFailure(shouldCleanupShare bool, accountName, shareName, subsID, resourceGroup string, secret map[string]string, useDataPlaneAPI, reason string) {
if shouldCleanupShare {
klog.V(2).Infof("%s on account(%s), cleaning up file share(%s)", reason, accountName, shareName)
// Use a background context for cleanup to avoid inheriting a cancelled/expired
// context from the original CreateVolume request (e.g., after azcopy timeout).
cleanupCtx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
Comment thread
andyzhangx marked this conversation as resolved.
defer cancel()
if cleanupErr := d.DeleteFileShare(cleanupCtx, subsID, resourceGroup, accountName, shareName, secret, useDataPlaneAPI); cleanupErr != nil {
klog.Warningf("failed to clean up file share(%s) on account(%s) rg(%s) after %s: %v", shareName, accountName, resourceGroup, reason, cleanupErr)
}
Comment thread
andyzhangx marked this conversation as resolved.
}
}

// DeleteVolume deletes an azure file
func (d *Driver) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest) (resp *csi.DeleteVolumeResponse, returnedErr error) {
volumeID := req.GetVolumeId()
Expand Down
Loading