Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions cloudmock/aws/mockautoscaling/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ type MockAutoscaling struct {
Groups map[string]*autoscalingtypes.AutoScalingGroup
WarmPoolInstances map[string][]autoscalingtypes.Instance
LifecycleHooks map[string]*autoscalingtypes.LifecycleHook
// ScalingActivities is the canned response for DescribeScalingActivities,
// keyed by Auto Scaling group name.
ScalingActivities map[string][]autoscalingtypes.Activity
// DescribeScalingActivitiesCalls counts the number of paginated calls made.
DescribeScalingActivitiesCalls int
}

var _ awsinterfaces.AutoScalingAPI = &MockAutoscaling{}
65 changes: 65 additions & 0 deletions cloudmock/aws/mockautoscaling/scaling_activities.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/*
Copyright 2026 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package mockautoscaling

import (
"context"
"fmt"
"strconv"

"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/service/autoscaling"
)

// DescribeScalingActivities returns the canned activities for the named group.
// To exercise pagination, the mock returns one activity per page when the input
// has MaxRecords == 1.
func (m *MockAutoscaling) DescribeScalingActivities(ctx context.Context, input *autoscaling.DescribeScalingActivitiesInput, optFns ...func(*autoscaling.Options)) (*autoscaling.DescribeScalingActivitiesOutput, error) {
m.mutex.Lock()
defer m.mutex.Unlock()

m.DescribeScalingActivitiesCalls++

name := aws.ToString(input.AutoScalingGroupName)
activities := m.ScalingActivities[name]

start := 0
if input.NextToken != nil {
s, err := strconv.Atoi(aws.ToString(input.NextToken))
if err != nil {
return nil, fmt.Errorf("invalid NextToken %q: %w", aws.ToString(input.NextToken), err)
}
start = s
}

pageSize := len(activities) - start
if input.MaxRecords != nil && int(*input.MaxRecords) > 0 && int(*input.MaxRecords) < pageSize {
pageSize = int(*input.MaxRecords)
}
end := start + pageSize
if end > len(activities) {
end = len(activities)
}

out := &autoscaling.DescribeScalingActivitiesOutput{
Activities: activities[start:end],
}
if end < len(activities) {
out.NextToken = aws.String(strconv.Itoa(end))
}
return out, nil
}
4 changes: 4 additions & 0 deletions cloudmock/gce/mockcompute/instance_group_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,10 @@ func (c *instanceGroupManagerClient) ListManagedInstances(ctx context.Context, p
return l, nil
}

func (c *instanceGroupManagerClient) ListErrors(ctx context.Context, project, zone, name string) ([]*compute.InstanceManagedByIgmError, error) {
return nil, nil
}

func (c *instanceGroupManagerClient) RecreateInstances(project, zone, name, id string) (*compute.Operation, error) {
return doneOperation(), nil
}
Expand Down
9 changes: 8 additions & 1 deletion pkg/cloudinstances/cloud_instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@ limitations under the License.

package cloudinstances

import v1 "k8s.io/api/core/v1"
import (
"time"

v1 "k8s.io/api/core/v1"
)

// CloudInstanceStatusDetached means the instance needs update and has been detached.
const CloudInstanceStatusDetached = "Detached"
Expand Down Expand Up @@ -52,4 +56,7 @@ type CloudInstance struct {
ExternalIP string
// State indicates if the instance has joined the cluster and if it needs any updates.
State State
// CreationTimestamp is when the cloud provider accepted the request to create
// the instance. Used as a watermark when surfacing cloud-side provisioning errors.
CreationTimestamp time.Time
}
49 changes: 49 additions & 0 deletions pkg/cloudinstances/errors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
Copyright 2026 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cloudinstances

import (
"context"
"time"
)

// CloudGroupError describes a failure the cloud provider encountered while
// trying to provision an instance in a CloudInstanceGroup. Identical errors
// (same Code + Message) are aggregated into a single entry.
type CloudGroupError struct {
// Code is the cloud provider's structured error code (e.g.
// "ZONE_RESOURCE_POOL_EXHAUSTED_WITH_DETAILS").
Code string
// Message is the human-readable error message.
Message string
// Instance is the name of the most recent affected instance, if known.
Instance string
// Count is the number of times an identical error was observed.
Count int
// FirstSeen and LastSeen bracket the observed occurrences.
FirstSeen time.Time
LastSeen time.Time
}

// CloudGroupErrorReporter is an optional capability exposed by cloud
// implementations to surface provisioning errors during cluster validation.
//
// Validation calls this only for groups that have fewer instances than their
// target size, so the implementation may assume something is already wrong.
type CloudGroupErrorReporter interface {
GetCloudGroupErrors(ctx context.Context, group *CloudInstanceGroup) ([]CloudGroupError, error)
}
39 changes: 37 additions & 2 deletions pkg/validation/validate_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"net/url"
"sort"
"strings"
"time"

"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/rest"
Expand Down Expand Up @@ -188,7 +189,11 @@ func (v *clusterValidatorImpl) Validate(ctx context.Context) (*ValidationCluster
return nil, err
}

readyNodes, nodeInstanceGroupMapping := validation.validateNodes(cloudGroups, v.allInstanceGroups, v.filterInstanceGroups)
var errorReporter cloudinstances.CloudGroupErrorReporter
if r, ok := v.cloud.(cloudinstances.CloudGroupErrorReporter); ok {
errorReporter = r
}
readyNodes, nodeInstanceGroupMapping := validation.validateNodes(ctx, cloudGroups, v.allInstanceGroups, v.filterInstanceGroups, errorReporter)

if err := validation.collectPodFailures(ctx, v.k8sClient, readyNodes, nodeInstanceGroupMapping, v.filterPodsForValidation); err != nil {
return nil, fmt.Errorf("cannot get pod health for %q: %v", v.cluster.Name, err)
Expand Down Expand Up @@ -307,7 +312,36 @@ func (v *ValidationCluster) collectPodFailures(ctx context.Context, client kuber
return nil
}

func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances.CloudInstanceGroup, groups []*kops.InstanceGroup, shouldValidateInstanceGroup func(ig *kops.InstanceGroup) bool) ([]v1.Node, map[string]*kops.InstanceGroup) {
// reportCloudGroupErrors queries the cloud provider for provisioning errors on
// a group that's short on instances and appends them as validation failures.
// errorReporter may be nil if the cloud does not support this capability.
func (v *ValidationCluster) reportCloudGroupErrors(ctx context.Context, errorReporter cloudinstances.CloudGroupErrorReporter, cloudGroup *cloudinstances.CloudInstanceGroup) {
if errorReporter == nil {
return
}
cloudErrors, err := errorReporter.GetCloudGroupErrors(ctx, cloudGroup)
if err != nil {
klog.Warningf("error getting cloud provider errors for InstanceGroup %q: %v", cloudGroup.InstanceGroup.Name, err)
return
}
for _, e := range cloudErrors {
message := fmt.Sprintf("cloud provider error %s: %s", e.Code, e.Message)
if e.Instance != "" {
message = fmt.Sprintf("cloud provider error %s for instance %s: %s", e.Code, e.Instance, e.Message)
}
if e.Count > 1 {
message = fmt.Sprintf("%s (observed %d times, most recent %s)", message, e.Count, e.LastSeen.Format(time.RFC3339))
}
v.addError(&ValidationError{
Kind: "InstanceGroup",
Name: cloudGroup.InstanceGroup.Name,
Message: message,
InstanceGroup: cloudGroup.InstanceGroup,
})
}
}

func (v *ValidationCluster) validateNodes(ctx context.Context, cloudGroups map[string]*cloudinstances.CloudInstanceGroup, groups []*kops.InstanceGroup, shouldValidateInstanceGroup func(ig *kops.InstanceGroup) bool, errorReporter cloudinstances.CloudGroupErrorReporter) ([]v1.Node, map[string]*kops.InstanceGroup) {
var readyNodes []v1.Node
groupsSeen := map[string]bool{}
nodeInstanceGroupMapping := map[string]*kops.InstanceGroup{}
Expand Down Expand Up @@ -338,6 +372,7 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances
cloudGroup.TargetSize),
InstanceGroup: cloudGroup.InstanceGroup,
})
v.reportCloudGroupErrors(ctx, errorReporter, cloudGroup)
}

for _, member := range allMembers {
Expand Down
65 changes: 65 additions & 0 deletions pkg/validation/validate_cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"fmt"
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
Expand All @@ -41,6 +42,10 @@ type MockCloud struct {
t *testing.T
expectedCluster *kopsapi.Cluster
expectedInstanceGroups []kopsapi.InstanceGroup

// cloudGroupErrors is keyed by InstanceGroup name. When non-nil, MockCloud
// implements cloudinstances.CloudGroupErrorReporter and returns these errors.
cloudGroupErrors map[string][]cloudinstances.CloudGroupError
}

var _ fi.Cloud = (*MockCloud)(nil)
Expand Down Expand Up @@ -913,3 +918,63 @@ func Test_ValidateDetachedNodesNotValidated(t *testing.T) {
printDebug(t, v)
}
}

// errorReportingMockCloud wraps MockCloud to implement CloudGroupErrorReporter.
type errorReportingMockCloud struct {
*MockCloud
}

func (c *errorReportingMockCloud) GetCloudGroupErrors(_ context.Context, group *cloudinstances.CloudInstanceGroup) ([]cloudinstances.CloudGroupError, error) {
return c.cloudGroupErrors[group.InstanceGroup.Name], nil
}

func Test_ValidateCloudGroupErrorsSurfaced(t *testing.T) {
ctx := context.TODO()

cluster := &kopsapi.Cluster{
ObjectMeta: metav1.ObjectMeta{Name: "testcluster.k8s.local"},
Spec: kopsapi.ClusterSpec{
ExternalDNS: &kopsapi.ExternalDNSConfig{
Provider: kopsapi.ExternalDNSProviderDNSController,
},
},
}

ig := kopsapi.InstanceGroup{
ObjectMeta: metav1.ObjectMeta{Name: "node-1"},
Spec: kopsapi.InstanceGroupSpec{Role: kopsapi.InstanceGroupRoleNode},
}
groups := map[string]*cloudinstances.CloudInstanceGroup{
"node-1": {
InstanceGroup: &ig,
MinSize: 1,
TargetSize: 2,
},
}

mockcloud := BuildMockCloud(t, groups, cluster, []kopsapi.InstanceGroup{ig})
mockcloud.cloudGroupErrors = map[string][]cloudinstances.CloudGroupError{
"node-1": {
{
Code: "ZONE_RESOURCE_POOL_EXHAUSTED_WITH_DETAILS",
Message: "The zone does not have enough resources available",
Instance: "node-1a",
Count: 3,
LastSeen: time.Date(2026, 5, 12, 20, 50, 0, 0, time.UTC),
},
},
}
reporter := &errorReportingMockCloud{MockCloud: mockcloud}

restConfig := &rest.Config{Host: "https://api.testcluster.k8s.local"}
validator, err := NewClusterValidator(cluster, reporter, &kopsapi.InstanceGroupList{Items: []kopsapi.InstanceGroup{ig}}, nil, nil, restConfig, fake.NewClientset())
require.NoError(t, err)
v, err := validator.Validate(ctx)
require.NoError(t, err)

require.Len(t, v.Failures, 2)
assert.Equal(t, "InstanceGroup \"node-1\" did not have enough nodes 0 vs 2", v.Failures[0].Message)
assert.Contains(t, v.Failures[1].Message, "ZONE_RESOURCE_POOL_EXHAUSTED_WITH_DETAILS")
assert.Contains(t, v.Failures[1].Message, "node-1a")
assert.Contains(t, v.Failures[1].Message, "observed 3 times")
}
Loading
Loading