-
Notifications
You must be signed in to change notification settings - Fork 170
[WIP] fix: clean up orphaned file share on CreateVolume failure #3151
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
1613d0e
2eaf4d7
9425103
713523c
0cbce99
f6fa674
7597928
8f87d0b
98bf8ae
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -710,6 +710,11 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) | |||||
| } | ||||||
|
|
||||||
| klog.V(2).Infof("begin to create file share(%s) on account(%s) type(%s) subID(%s) rg(%s) location(%s) size(%d) protocol(%s)", validFileShareName, accountName, sku, subsID, resourceGroup, location, fileShareSize, shareProtocol) | ||||||
| // shouldCleanupShare indicates whether the file share should be cleaned up on failure. | ||||||
| // When fileShareName is user-specified (non-empty), we skip cleanup since the share | ||||||
| // may be pre-existing and not owned by CSI. For auto-generated share names, CSI owns | ||||||
| // the lifecycle and should clean up on failure to avoid orphaned shares. | ||||||
| shouldCleanupShare := (fileShareName == "") | ||||||
| if err := d.CreateFileShare(ctx, accountOptions, shareOptions, secret, useDataPlaneAPI); err != nil { | ||||||
| if strings.Contains(err.Error(), accountLimitExceedManagementAPI) || strings.Contains(err.Error(), accountLimitExceedDataPlaneAPI) { | ||||||
| klog.Warningf("create file share(%s) on account(%s) type(%s) subID(%s) rg(%s) location(%s) size(%d), error: %v, skip matching current account", validFileShareName, accountName, sku, subsID, resourceGroup, location, fileShareSize, err) | ||||||
|
|
@@ -739,18 +744,21 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) | |||||
| if req.GetVolumeContentSource() != nil { | ||||||
| accountSASToken, authAzcopyEnv, err := d.getAzcopyAuth(ctx, accountName, accountKey, storageEndpointSuffix, accountOptions, secret, secretName, secretNamespace, false) | ||||||
| if err != nil { | ||||||
| d.cleanupShareOnFailure(shouldCleanupShare, accountName, validFileShareName, subsID, resourceGroup, secret, useDataPlaneAPI, "getAzcopyAuth failure") | ||||||
| return nil, status.Errorf(codes.Internal, "failed to getAzcopyAuth on account(%s) rg(%s), error: %v", accountOptions.Name, accountOptions.ResourceGroup, err) | ||||||
|
andyzhangx marked this conversation as resolved.
|
||||||
| } | ||||||
| copyErr := d.copyVolume(ctx, req, accountName, accountSASToken, authAzcopyEnv, secretNamespace, shareOptions, accountOptions, storageEndpointSuffix) | ||||||
| if accountSASToken == "" && copyErr != nil && strings.Contains(copyErr.Error(), authorizationPermissionMismatch) { | ||||||
| klog.Warningf("azcopy copy failed with AuthorizationPermissionMismatch error, should assign \"Storage File Data Privileged Contributor\" role to controller identity, fall back to use sas token, original error: %v", copyErr) | ||||||
| accountSASToken, authAzcopyEnv, err := d.getAzcopyAuth(ctx, accountName, accountKey, storageEndpointSuffix, accountOptions, secret, secretName, secretNamespace, true) | ||||||
| if err != nil { | ||||||
| d.cleanupShareOnFailure(shouldCleanupShare, accountName, validFileShareName, subsID, resourceGroup, secret, useDataPlaneAPI, "fallback getAzcopyAuth failure") | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit:
Suggested change
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe we should also check status of ongoing azcopy job (might be triggered by previous reconciliations) before triggering cleanup. |
||||||
| return nil, status.Errorf(codes.Internal, "failed to getAzcopyAuth on account(%s) rg(%s), error: %v", accountOptions.Name, accountOptions.ResourceGroup, err) | ||||||
| } | ||||||
| copyErr = d.copyVolume(ctx, req, accountName, accountSASToken, authAzcopyEnv, secretNamespace, shareOptions, accountOptions, storageEndpointSuffix) | ||||||
| } | ||||||
| if copyErr != nil { | ||||||
| d.cleanupShareOnFailure(shouldCleanupShare, accountName, validFileShareName, subsID, resourceGroup, secret, useDataPlaneAPI, fmt.Sprintf("copyVolume(%s) failure", validFileShareName)) | ||||||
| return nil, copyErr | ||||||
| } | ||||||
| // storeAccountKey is not needed here since copy volume is only using SAS token | ||||||
|
|
@@ -841,6 +849,30 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) | |||||
| }, nil | ||||||
| } | ||||||
|
|
||||||
| // cleanupShareOnFailure is a best-effort rollback helper that deletes the file share | ||||||
| // when shouldCleanupShare is true (i.e., the share name was auto-generated by CSI). | ||||||
| // It checks for a running azcopy job first — if a job is still in progress, the share | ||||||
| // is preserved so retries can resume rather than starting from zero. | ||||||
| func (d *Driver) cleanupShareOnFailure(shouldCleanupShare bool, accountName, shareName, subsID, resourceGroup string, secret map[string]string, useDataPlaneAPI, reason string) { | ||||||
| if shouldCleanupShare { | ||||||
| // Check if an azcopy job is still running for this share — if so, | ||||||
| // skip cleanup to avoid orphaning the job and losing partial progress. | ||||||
| jobState, _, err := d.azcopy.GetAzcopyJob(shareName, []string{}) | ||||||
| if err == nil && jobState == util.AzcopyJobRunning { | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is also good to skip deletion when job is succeeded (Helps to avoid deletion if there is race b/w original timeout and job completion. |
||||||
| klog.V(2).Infof("skip cleanup of file share(%s) on account(%s): azcopy job is still running", shareName, accountName) | ||||||
| return | ||||||
| } | ||||||
| klog.V(2).Infof("%s on account(%s), cleaning up file share(%s)", reason, accountName, shareName) | ||||||
| // Use a background context for cleanup to avoid inheriting a cancelled/expired | ||||||
| // context from the original CreateVolume request (e.g., after azcopy timeout). | ||||||
| cleanupCtx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) | ||||||
|
andyzhangx marked this conversation as resolved.
|
||||||
| defer cancel() | ||||||
| if cleanupErr := d.DeleteFileShare(cleanupCtx, subsID, resourceGroup, accountName, shareName, secret, useDataPlaneAPI); cleanupErr != nil { | ||||||
| klog.Warningf("failed to clean up file share(%s) on account(%s) rg(%s) after %s: %v", shareName, accountName, resourceGroup, reason, cleanupErr) | ||||||
| } | ||||||
|
andyzhangx marked this conversation as resolved.
|
||||||
| } | ||||||
| } | ||||||
|
|
||||||
| // DeleteVolume delete an azure file | ||||||
| func (d *Driver) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest) (resp *csi.DeleteVolumeResponse, returnedErr error) { | ||||||
| volumeID := req.GetVolumeId() | ||||||
|
|
||||||
Uh oh!
There was an error while loading. Please reload this page.