-
Notifications
You must be signed in to change notification settings - Fork 170
[WIP] fix: clean up orphaned file share on CreateVolume failure #3151
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
andyzhangx
wants to merge
9
commits into
kubernetes-sigs:master
Choose a base branch
from
andyzhangx:fix-orphan-fileshare-cleanup
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+32
−0
Open
Changes from 8 commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
1613d0e
fix: clean up orphaned file share on CreateVolume failure
andyzhangx 2eaf4d7
fix: only skip cleanup for user-specified share names on ShareAlready…
andyzhangx 9425103
fix: derive shareCreatedByCSI from fileShareName to avoid deleting pr…
andyzhangx 713523c
refactor: extract cleanupShareOnFailure helper to reduce duplication
andyzhangx 0cbce99
refactor: extract cleanupShareOnFailure as Driver method with backgro…
andyzhangx f6fa674
cleanup: remove redundant comment in ShareAlreadyExists branch
andyzhangx 7597928
fix: move DeleteVolume doc comment to correct function
andyzhangx 8f87d0b
fix: remove unused ctx parameter from cleanupShareOnFailure
andyzhangx 98bf8ae
fix: skip share cleanup when azcopy job is still running
andyzhangx File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -710,6 +710,11 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) | |||||
| } | ||||||
|
|
||||||
| klog.V(2).Infof("begin to create file share(%s) on account(%s) type(%s) subID(%s) rg(%s) location(%s) size(%d) protocol(%s)", validFileShareName, accountName, sku, subsID, resourceGroup, location, fileShareSize, shareProtocol) | ||||||
| // shouldCleanupShare indicates whether the file share should be cleaned up on failure. | ||||||
| // When fileShareName is user-specified (non-empty), we skip cleanup since the share | ||||||
| // may be pre-existing and not owned by CSI. For auto-generated share names, CSI owns | ||||||
| // the lifecycle and should clean up on failure to avoid orphaned shares. | ||||||
| shouldCleanupShare := (fileShareName == "") | ||||||
| if err := d.CreateFileShare(ctx, accountOptions, shareOptions, secret, useDataPlaneAPI); err != nil { | ||||||
| if strings.Contains(err.Error(), accountLimitExceedManagementAPI) || strings.Contains(err.Error(), accountLimitExceedDataPlaneAPI) { | ||||||
| klog.Warningf("create file share(%s) on account(%s) type(%s) subID(%s) rg(%s) location(%s) size(%d), error: %v, skip matching current account", validFileShareName, accountName, sku, subsID, resourceGroup, location, fileShareSize, err) | ||||||
|
|
@@ -739,18 +744,21 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) | |||||
| if req.GetVolumeContentSource() != nil { | ||||||
| accountSASToken, authAzcopyEnv, err := d.getAzcopyAuth(ctx, accountName, accountKey, storageEndpointSuffix, accountOptions, secret, secretName, secretNamespace, false) | ||||||
| if err != nil { | ||||||
| d.cleanupShareOnFailure(shouldCleanupShare, accountName, validFileShareName, subsID, resourceGroup, secret, useDataPlaneAPI, "getAzcopyAuth failure") | ||||||
| return nil, status.Errorf(codes.Internal, "failed to getAzcopyAuth on account(%s) rg(%s), error: %v", accountOptions.Name, accountOptions.ResourceGroup, err) | ||||||
|
andyzhangx marked this conversation as resolved.
|
||||||
| } | ||||||
| copyErr := d.copyVolume(ctx, req, accountName, accountSASToken, authAzcopyEnv, secretNamespace, shareOptions, accountOptions, storageEndpointSuffix) | ||||||
| if accountSASToken == "" && copyErr != nil && strings.Contains(copyErr.Error(), authorizationPermissionMismatch) { | ||||||
| klog.Warningf("azcopy copy failed with AuthorizationPermissionMismatch error, should assign \"Storage File Data Privileged Contributor\" role to controller identity, fall back to use sas token, original error: %v", copyErr) | ||||||
| accountSASToken, authAzcopyEnv, err := d.getAzcopyAuth(ctx, accountName, accountKey, storageEndpointSuffix, accountOptions, secret, secretName, secretNamespace, true) | ||||||
| if err != nil { | ||||||
| d.cleanupShareOnFailure(shouldCleanupShare, accountName, validFileShareName, subsID, resourceGroup, secret, useDataPlaneAPI, "fallback getAzcopyAuth failure") | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit:
Suggested change
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe we should also check status of ongoing azcopy job (might be triggered by previous reconciliations) before triggering cleanup. |
||||||
| return nil, status.Errorf(codes.Internal, "failed to getAzcopyAuth on account(%s) rg(%s), error: %v", accountOptions.Name, accountOptions.ResourceGroup, err) | ||||||
| } | ||||||
| copyErr = d.copyVolume(ctx, req, accountName, accountSASToken, authAzcopyEnv, secretNamespace, shareOptions, accountOptions, storageEndpointSuffix) | ||||||
| } | ||||||
| if copyErr != nil { | ||||||
| d.cleanupShareOnFailure(shouldCleanupShare, accountName, validFileShareName, subsID, resourceGroup, secret, useDataPlaneAPI, fmt.Sprintf("copyVolume(%s) failure", validFileShareName)) | ||||||
|
andyzhangx marked this conversation as resolved.
|
||||||
| return nil, copyErr | ||||||
| } | ||||||
| // storeAccountKey is not needed here since copy volume is only using SAS token | ||||||
|
|
@@ -841,6 +849,21 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) | |||||
| }, nil | ||||||
| } | ||||||
|
|
||||||
| // cleanupShareOnFailure is a best-effort rollback helper that deletes the file share | ||||||
| // when shouldCleanupShare is true (i.e., the share name was auto-generated by CSI). | ||||||
| func (d *Driver) cleanupShareOnFailure(shouldCleanupShare bool, accountName, shareName, subsID, resourceGroup string, secret map[string]string, useDataPlaneAPI, reason string) { | ||||||
| if shouldCleanupShare { | ||||||
| klog.V(2).Infof("%s on account(%s), cleaning up file share(%s)", reason, accountName, shareName) | ||||||
| // Use a background context for cleanup to avoid inheriting a cancelled/expired | ||||||
| // context from the original CreateVolume request (e.g., after azcopy timeout). | ||||||
| cleanupCtx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) | ||||||
|
andyzhangx marked this conversation as resolved.
|
||||||
| defer cancel() | ||||||
| if cleanupErr := d.DeleteFileShare(cleanupCtx, subsID, resourceGroup, accountName, shareName, secret, useDataPlaneAPI); cleanupErr != nil { | ||||||
| klog.Warningf("failed to clean up file share(%s) on account(%s) rg(%s) after %s: %v", shareName, accountName, resourceGroup, reason, cleanupErr) | ||||||
| } | ||||||
|
andyzhangx marked this conversation as resolved.
|
||||||
| } | ||||||
| } | ||||||
|
|
||||||
| // DeleteVolume delete an azure file | ||||||
| func (d *Driver) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest) (resp *csi.DeleteVolumeResponse, returnedErr error) { | ||||||
| volumeID := req.GetVolumeId() | ||||||
|
|
||||||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.