Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions pkg/provider/azure_loadbalancer_backendpool.go
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,11 @@ func (bc *backendPoolTypeNodeIPConfig) ReconcileBackendPools(
}
}

if nodeName == "" {
logger.V(2).Info("empty nodeName, skipping ipConfID", "serviceName", serviceName, "ipConfID", ipConfID)
continue
}

// If a node is not supposed to be included in the LB, it
// would not be in the `nodes` slice. We need to check the nodes that
// have been added to the LB's backendpool, find the unwanted ones and
Expand Down Expand Up @@ -363,6 +368,10 @@ func (bc *backendPoolTypeNodeIPConfig) GetBackendPrivateIPs(ctx context.Context,
logger.Error(err, "failed: GetNodeNameByIPConfigurationID", "service", serviceName)
continue
}
if nodeName == "" {
logger.V(2).Info("empty nodeName, skipping ipConfigID", "serviceName", serviceName, "ipConfigID", ipConfigID)
continue
}
privateIPsSet, ok := bc.nodePrivateIPs[strings.ToLower(nodeName)]
if !ok {
klog.Warningf("bc.GetBackendPrivateIPs for service (%s): failed to get private IPs of node %s", serviceName, nodeName)
Expand Down
9 changes: 9 additions & 0 deletions pkg/provider/azure_vmss.go
Original file line number Diff line number Diff line change
Expand Up @@ -1697,6 +1697,11 @@ func (ss *ScaleSet) GetNodeNameByIPConfigurationID(ctx context.Context, ipConfig
return "", "", err
}

if vmManagementType == ManagedByNoVM {
logger.V(2).Info("No VM attached, skipping node", "ipConfigurationID", ipConfigurationID)
return "", "", nil
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMHO: Some logger.Info(...) will be useful here.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added logs aptly

}

if vmManagementType == ManagedByAvSet {
// vm is managed by availability set.
return ss.availabilitySet.GetNodeNameByIPConfigurationID(ctx, ipConfigurationID)
Expand Down Expand Up @@ -1963,6 +1968,10 @@ func (ss *ScaleSet) ensureBackendPoolDeleted(ctx context.Context, service *v1.Se
allErrs = append(allErrs, err)
continue
}
if nodeName == "" {
logger.V(2).Info("Empty nodeName, skipping node", "service", getServiceName(service), "ipConfigurationID", ipConfigurationID)
continue
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMHO: Some logger.Info(...) will be useful here.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added logs aptly

}

nodeResourceGroup, nodeVMSS, nodeInstanceID, nodeVMSSVM, err := ss.ensureBackendPoolDeletedFromNode(ctx, nodeName, backendPoolIDs)
if err != nil {
Expand Down
10 changes: 9 additions & 1 deletion pkg/provider/azure_vmss_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ const (
ManagedByVmssFlex VMManagementType = "ManagedByVmssFlex"
ManagedByAvSet VMManagementType = "ManagedByAvSet"
ManagedByUnknownVMSet VMManagementType = "ManagedByUnknownVMSet"
ManagedByNoVM VMManagementType = "ManagedByNoVM"
)

func (ss *ScaleSet) newVMSSCache() (azcache.Resource, error) {
Expand Down Expand Up @@ -530,6 +531,9 @@ func (ss *ScaleSet) getVMManagementTypeByIPConfigurationID(ctx context.Context,
if err != nil {
return ManagedByUnknownVMSet, fmt.Errorf("failed to get vm name by ip config ID %s: %w", ipConfigurationID, err)
}
if vmName == "" {
return ManagedByNoVM, nil
}
if cachedAvSetVMs.Has(vmName) {
return ManagedByAvSet, nil
}
Expand All @@ -542,10 +546,14 @@ func (az *Cloud) GetVMNameByIPConfigurationName(ctx context.Context, nicResource
if rerr != nil {
return "", fmt.Errorf("failed to get interface of name %s: %w", nicName, rerr)
}
// Return empty when VM association is missing
if nic.Properties == nil || nic.Properties.VirtualMachine == nil || nic.Properties.VirtualMachine.ID == nil {
return "", fmt.Errorf("failed to get vm ID of nic %s", ptr.Deref(nic.Name, ""))
return "", nil
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you create the NIC manually? When would a NIC have no VM association? If this were a common problem we would have received a lot of tickets, but we haven't. May I know if there is a special use case on your side?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A customer had reported that when creating a Kubernetes service of type LoadBalancer, the operation fails if any VMs in the node pool are still provisioning or stuck in creation. The expectation is that the load balancer should skip VMs that are not yet ready and add them later once they complete provisioning, rather than failing the entire operation.

SyncLoadBalancerFailed

Error syncing load balancer: failed to ensure load balancer: failed to get vm name by ip config ID /subscriptions/.../xxxx-xxxx-xxxx-nic/ipConfigurations/pipConfig: failed to get vm ID of xxxx-xxxx-xxxx-nic

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Steps to Reproduce

  1. Create an ARO-HCP cluster
  2. Create a node pool that requests VMs exceeding existing quota (to simulate delayed/stuck provisioning)
  3. Wait until node provisioning gets stuck or delayed
  4. Attempt to provision a service of type LoadBalancer

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A customer had reported that when creating a Kubernetes service of type LoadBalancer, the operation fails if any VMs in the node pool are still provisioning or stuck in creation. The expectation is that the load balancer should skip VMs that are not yet ready and add them later once they complete provisioning, rather than failing the entire operation.

SyncLoadBalancerFailed

Error syncing load balancer: failed to ensure load balancer: failed to get vm name by ip config ID /subscriptions/.../xxxx-xxxx-xxxx-nic/ipConfigurations/pipConfig: failed to get vm ID of xxxx-xxxx-xxxx-nic

If this is the case, it should be fixed automatically in the next reconcile, once the VM is ready.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, what is an ARO-HCP cluster? Does this issue occur only in ARO-HCP clusters?

Copy link
Copy Markdown
Author

@cgiradkar cgiradkar May 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, what is ARO-HCP cluster? Is this issue only in ARO-HCP clusters?

ARO-HCP: https://github.com/Azure/ARO-HCP : uses standard OpenShift cloud-controller-manager (which is based on cloud-provider-azure)
This issue will occur in any cluster using cloud-provider-azure

Copy link
Copy Markdown
Author

@cgiradkar cgiradkar May 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this is the case, in the next reconcile when the vm has been ready, it should auto fixed.

In this specific case, the whole array of VMs is never going to get provisioned (due to said constraints), so waiting for the next cycle won't help: the state doesn't change, so no further processing would follow from this file.
This PR makes the handling of VM state transitions (among an array of VMs to be provisioned) more granular, even if the re-sync duration is bearable.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But if there is a genuine issue from the NRP/NIC side, this change will silently hide the error.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be an acceptable solution if we log it and proceed? This use case is an edge case, but it is still encountered in production.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be more reasonable to solve the root cause, so that the CCM reconcile is unblocked. This change introduces a behavior change to solve an edge case, and I'm not sure it's worth it, as it may introduce other issues.

}
vmID := ptr.Deref(nic.Properties.VirtualMachine.ID, "")
if vmID == "" {
return "", nil
}
matches := vmIDRE.FindStringSubmatch(vmID)
if len(matches) != 2 {
return "", fmt.Errorf("invalid virtual machine ID %s", vmID)
Expand Down
7 changes: 3 additions & 4 deletions pkg/provider/azure_vmss_cache_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ package provider

import (
"context"
"errors"
"fmt"
"net/http"
"testing"
Expand Down Expand Up @@ -382,11 +381,11 @@ func TestGetVMManagementTypeByIPConfigurationID(t *testing.T) {
expectedVMManagementType: ManagedByAvSet,
},
{
description: "getVMManagementTypeByIPConfigurationID should return an error if nic.VirtualMachine.ID is empty",
description: "getVMManagementTypeByIPConfigurationID should return ManagedByNoVM if nic.VirtualMachine.ID is empty",
ipConfigurationID: "/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Network/networkInterfaces/testvm3-interface/ipConfigurations/pipConfig",
expectedNIC: "testvm3",
expectedVMManagementType: ManagedByUnknownVMSet,
expectedErr: fmt.Errorf("failed to get vm name by ip config ID /subscriptions/sub/resourceGroups/rg/providers/Microsoft.Network/networkInterfaces/testvm3-interface/ipConfigurations/pipConfig: %w", errors.New("failed to get vm ID of nic testvm3")),
expectedVMManagementType: ManagedByNoVM,
expectedErr: nil,
},
{
description: "getVMManagementTypeByIPConfigurationID should return an error if failed to get nic",
Expand Down
54 changes: 54 additions & 0 deletions pkg/provider/azure_vmss_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -688,6 +688,7 @@ func TestGetNodeNameByIPConfigurationID(t *testing.T) {
expectedNodeName string
expectedScaleSetName string
expectError bool
nicWithoutVM bool
}{
{
description: "GetNodeNameByIPConfigurationID should get node's Name when the node is existing",
Expand All @@ -711,6 +712,15 @@ func TestGetNodeNameByIPConfigurationID(t *testing.T) {
vmList: []string{"vmssee6c2000004", "vmssee6c2000005"},
expectError: true,
},
{
description: "GetNodeNameByIPConfigurationID should return empty strings for NIC without VM attachment",
scaleSet: "scaleset4",
ipConfigurationID: "/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Network/networkInterfaces/orphaned-nic/ipConfigurations/ipconfig1",
vmList: []string{"vmssee6c2000006", "vmssee6c2000007"},
expectedNodeName: "",
expectedScaleSetName: "",
nicWithoutVM: true,
},
}

for _, test := range testCases {
Expand Down Expand Up @@ -738,6 +748,17 @@ func TestGetNodeNameByIPConfigurationID(t *testing.T) {
}
mockVMsClient.EXPECT().List(gomock.Any(), gomock.Any()).Return([]*armcompute.VirtualMachine{}, nil).AnyTimes()

if test.nicWithoutVM {
mockNICClient := ss.ComputeClientFactory.GetInterfaceClient().(*mock_interfaceclient.MockInterface)
nicWithoutVM := &armnetwork.Interface{
Name: ptr.To("orphaned-nic"),
Properties: &armnetwork.InterfacePropertiesFormat{
VirtualMachine: nil,
},
}
mockNICClient.EXPECT().Get(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nicWithoutVM, nil).AnyTimes()
}

nodeName, scalesetName, err := ss.GetNodeNameByIPConfigurationID(context.TODO(), test.ipConfigurationID)
if test.expectError {
assert.Error(t, err, test.description)
Expand Down Expand Up @@ -3667,6 +3688,31 @@ func TestEnsureBackendPoolDeleted(t *testing.T) {
},
},
},
{
description: "EnsureBackendPoolDeleted should skip IP configuration with no attached VM",
backendpoolID: testLBBackendpoolID0,
backendAddressPools: []*armnetwork.BackendAddressPool{
{
ID: ptr.To(testLBBackendpoolID0),
Properties: &armnetwork.BackendAddressPoolPropertiesFormat{
BackendIPConfigurations: []*armnetwork.InterfaceIPConfiguration{
{
Name: ptr.To("ip-vmss"),
ID: ptr.To("/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/vmss/virtualMachines/0/networkInterfaces/nic"),
},
{
Name: ptr.To("ip-orphan"),
ID: ptr.To("/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Network/networkInterfaces/orphan-nic/ipConfigurations/ipconfig1"),
},
},
},
},
{
ID: ptr.To(testLBBackendpoolID1),
},
},
expectedVMSSVMPutTimes: 1,
},
}

for _, test := range testCases {
Expand Down Expand Up @@ -3699,6 +3745,14 @@ func TestEnsureBackendPoolDeleted(t *testing.T) {
mockVMsClient := ss.ComputeClientFactory.GetVirtualMachineClient().(*mock_virtualmachineclient.MockInterface)
mockVMsClient.EXPECT().List(gomock.Any(), gomock.Any()).Return([]*armcompute.VirtualMachine{}, nil).AnyTimes()

mockInterfaceClient := ss.ComputeClientFactory.GetInterfaceClient().(*mock_interfaceclient.MockInterface)
orphanNIC := &armnetwork.Interface{
Properties: &armnetwork.InterfacePropertiesFormat{
VirtualMachine: nil,
},
}
mockInterfaceClient.EXPECT().Get(gomock.Any(), "rg", "orphan-nic", nil).Return(orphanNIC, nil).AnyTimes()

updated, err := ss.EnsureBackendPoolDeleted(context.TODO(), &v1.Service{}, []string{test.backendpoolID}, testVMSSName, test.backendAddressPools, true)
assert.Equal(t, test.expectedErr, err != nil, test.description+errMsgSuffix)
if !test.expectedErr && test.expectedVMSSVMPutTimes > 0 {
Expand Down
5 changes: 5 additions & 0 deletions pkg/provider/azure_vmssflex.go
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,11 @@ func (fs *FlexScaleSet) getNodeInformationByIPConfigurationID(ctx context.Contex
if err != nil {
return "", "", "", fmt.Errorf("failed to get vm name of ip config ID %s: %w", ipConfigurationID, err)
}
if vmName == "" {
// skip this node
logger.Info("Empty vmName, skipping node.", "ipConfigurationID", ipConfigurationID)
return "", "", "", nil
}

nodeName, err := fs.getNodeNameByVMName(ctx, vmName)
if err != nil {
Expand Down
15 changes: 15 additions & 0 deletions pkg/provider/azure_vmssflex_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -929,6 +929,21 @@ func TestGetNodeNameByIPConfigurationIDVmssFlex(t *testing.T) {
expectedVMSetName: "",
expectedErr: fmt.Errorf("failed to get resource group and name from ip config ID /subscriptions/sub/resourceGroups/rg/providers/Microsoft.Network/networkInterfaces//ipConfigurations/pipConfig: %w", errors.New("invalid ip config ID /subscriptions/sub/resourceGroups/rg/providers/Microsoft.Network/networkInterfaces//ipConfigurations/pipConfig")),
},
{
description: "GetNodeNameByIPConfigurationID should return empty strings when NIC has no attached VM",
ipConfigurationID: "/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Network/networkInterfaces/orphan-nic/ipConfigurations/pipConfig",
testVMListWithoutInstanceView: testVMListWithoutInstanceView,
testVMListWithOnlyInstanceView: testVMListWithOnlyInstanceView,
vmListErr: nil,
nic: func() *armnetwork.Interface {
nic := generateTestNic("orphan-nic", false, to.Ptr(armnetwork.ProvisioningStateSucceeded), "")
nic.Properties.VirtualMachine = nil
return nic
}(),
expectedNodeName: "",
expectedVMSetName: "",
expectedErr: nil,
},
}

for _, tc := range testCases {
Expand Down
Loading