diff --git a/cloudmock/aws/mockautoscaling/api.go b/cloudmock/aws/mockautoscaling/api.go index 1efe249a05d33..e05b7aee1a342 100644 --- a/cloudmock/aws/mockautoscaling/api.go +++ b/cloudmock/aws/mockautoscaling/api.go @@ -31,6 +31,11 @@ type MockAutoscaling struct { Groups map[string]*autoscalingtypes.AutoScalingGroup WarmPoolInstances map[string][]autoscalingtypes.Instance LifecycleHooks map[string]*autoscalingtypes.LifecycleHook + // ScalingActivities is the canned response for DescribeScalingActivities, + // keyed by Auto Scaling group name. + ScalingActivities map[string][]autoscalingtypes.Activity + // DescribeScalingActivitiesCalls counts the number of paginated calls made. + DescribeScalingActivitiesCalls int } var _ awsinterfaces.AutoScalingAPI = &MockAutoscaling{} diff --git a/cloudmock/aws/mockautoscaling/scaling_activities.go b/cloudmock/aws/mockautoscaling/scaling_activities.go new file mode 100644 index 0000000000000..1b82de4aa2582 --- /dev/null +++ b/cloudmock/aws/mockautoscaling/scaling_activities.go @@ -0,0 +1,65 @@ +/* +Copyright 2026 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package mockautoscaling + +import ( + "context" + "fmt" + "strconv" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/autoscaling" +) + +// DescribeScalingActivities returns the canned activities for the named group. +// To exercise pagination, the mock returns one activity per page when the input +// has MaxRecords == 1. +func (m *MockAutoscaling) DescribeScalingActivities(ctx context.Context, input *autoscaling.DescribeScalingActivitiesInput, optFns ...func(*autoscaling.Options)) (*autoscaling.DescribeScalingActivitiesOutput, error) { + m.mutex.Lock() + defer m.mutex.Unlock() + + m.DescribeScalingActivitiesCalls++ + + name := aws.ToString(input.AutoScalingGroupName) + activities := m.ScalingActivities[name] + + start := 0 + if input.NextToken != nil { + s, err := strconv.Atoi(aws.ToString(input.NextToken)) + if err != nil { + return nil, fmt.Errorf("invalid NextToken %q: %w", aws.ToString(input.NextToken), err) + } + start = s + } + + pageSize := len(activities) - start + if input.MaxRecords != nil && int(*input.MaxRecords) > 0 && int(*input.MaxRecords) < pageSize { + pageSize = int(*input.MaxRecords) + } + end := start + pageSize + if end > len(activities) { + end = len(activities) + } + + out := &autoscaling.DescribeScalingActivitiesOutput{ + Activities: activities[start:end], + } + if end < len(activities) { + out.NextToken = aws.String(strconv.Itoa(end)) + } + return out, nil +} diff --git a/cloudmock/gce/mockcompute/instance_group_manager.go b/cloudmock/gce/mockcompute/instance_group_manager.go index 6309b34d4430d..8a8417dea497f 100644 --- a/cloudmock/gce/mockcompute/instance_group_manager.go +++ b/cloudmock/gce/mockcompute/instance_group_manager.go @@ -174,6 +174,10 @@ func (c *instanceGroupManagerClient) ListManagedInstances(ctx context.Context, p return l, nil } +func (c *instanceGroupManagerClient) ListErrors(ctx context.Context, project, zone, name string) ([]*compute.InstanceManagedByIgmError, error) { + return nil, nil +} + func (c *instanceGroupManagerClient) RecreateInstances(project, zone, name, id string) (*compute.Operation, error) { return doneOperation(), nil } diff --git a/pkg/cloudinstances/cloud_instance.go b/pkg/cloudinstances/cloud_instance.go index 8d46db307ea3e..0de191ae2cbc8 100644 --- a/pkg/cloudinstances/cloud_instance.go +++ b/pkg/cloudinstances/cloud_instance.go @@ -16,7 +16,11 @@ limitations under the License. package cloudinstances -import v1 "k8s.io/api/core/v1" +import ( + "time" + + v1 "k8s.io/api/core/v1" +) // CloudInstanceStatusDetached means the instance needs update and has been detached. const CloudInstanceStatusDetached = "Detached" @@ -52,4 +56,7 @@ type CloudInstance struct { ExternalIP string // State indicates if the instance has joined the cluster and if it needs any updates. State State + // CreationTimestamp is when the cloud provider accepted the request to create + // the instance. Used as a watermark when surfacing cloud-side provisioning errors. + CreationTimestamp time.Time } diff --git a/pkg/cloudinstances/errors.go b/pkg/cloudinstances/errors.go new file mode 100644 index 0000000000000..745ee924bb398 --- /dev/null +++ b/pkg/cloudinstances/errors.go @@ -0,0 +1,49 @@ +/* +Copyright 2026 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cloudinstances + +import ( + "context" + "time" +) + +// CloudGroupError describes a failure the cloud provider encountered while +// trying to provision an instance in a CloudInstanceGroup. Identical errors +// (same Code + Message) are aggregated into a single entry. +type CloudGroupError struct { + // Code is the cloud provider's structured error code (e.g. + // "ZONE_RESOURCE_POOL_EXHAUSTED_WITH_DETAILS"). + Code string + // Message is the human-readable error message. + Message string + // Instance is the name of the most recent affected instance, if known. + Instance string + // Count is the number of times an identical error was observed. + Count int + // FirstSeen and LastSeen bracket the observed occurrences. + FirstSeen time.Time + LastSeen time.Time +} + +// CloudGroupErrorReporter is an optional capability exposed by cloud +// implementations to surface provisioning errors during cluster validation. +// +// Validation calls this only for groups that have fewer instances than their +// target size, so the implementation may assume something is already wrong. +type CloudGroupErrorReporter interface { + GetCloudGroupErrors(ctx context.Context, group *CloudInstanceGroup) ([]CloudGroupError, error) +} diff --git a/pkg/validation/validate_cluster.go b/pkg/validation/validate_cluster.go index 395799bb4f831..7f43dc328fe36 100644 --- a/pkg/validation/validate_cluster.go +++ b/pkg/validation/validate_cluster.go @@ -23,6 +23,7 @@ import ( "net/url" "sort" "strings" + "time" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/rest" @@ -188,7 +189,11 @@ func (v *clusterValidatorImpl) Validate(ctx context.Context) (*ValidationCluster return nil, err } - readyNodes, nodeInstanceGroupMapping := validation.validateNodes(cloudGroups, v.allInstanceGroups, v.filterInstanceGroups) + var errorReporter cloudinstances.CloudGroupErrorReporter + if r, ok := v.cloud.(cloudinstances.CloudGroupErrorReporter); ok { + errorReporter = r + } + readyNodes, nodeInstanceGroupMapping := validation.validateNodes(ctx, cloudGroups, v.allInstanceGroups, v.filterInstanceGroups, errorReporter) if err := validation.collectPodFailures(ctx, v.k8sClient, readyNodes, nodeInstanceGroupMapping, v.filterPodsForValidation); err != nil { return nil, fmt.Errorf("cannot get pod health for %q: %v", v.cluster.Name, err) @@ -307,7 +312,36 @@ func (v *ValidationCluster) collectPodFailures(ctx context.Context, client kuber return nil } -func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances.CloudInstanceGroup, groups []*kops.InstanceGroup, shouldValidateInstanceGroup func(ig *kops.InstanceGroup) bool) ([]v1.Node, map[string]*kops.InstanceGroup) { +// reportCloudGroupErrors queries the cloud provider for provisioning errors on +// a group that's short on instances and appends them as validation failures. +// errorReporter may be nil if the cloud does not support this capability. +func (v *ValidationCluster) reportCloudGroupErrors(ctx context.Context, errorReporter cloudinstances.CloudGroupErrorReporter, cloudGroup *cloudinstances.CloudInstanceGroup) { + if errorReporter == nil { + return + } + cloudErrors, err := errorReporter.GetCloudGroupErrors(ctx, cloudGroup) + if err != nil { + klog.Warningf("error getting cloud provider errors for InstanceGroup %q: %v", cloudGroup.InstanceGroup.Name, err) + return + } + for _, e := range cloudErrors { + message := fmt.Sprintf("cloud provider error %s: %s", e.Code, e.Message) + if e.Instance != "" { + message = fmt.Sprintf("cloud provider error %s for instance %s: %s", e.Code, e.Instance, e.Message) + } + if e.Count > 1 { + message = fmt.Sprintf("%s (observed %d times, most recent %s)", message, e.Count, e.LastSeen.Format(time.RFC3339)) + } + v.addError(&ValidationError{ + Kind: "InstanceGroup", + Name: cloudGroup.InstanceGroup.Name, + Message: message, + InstanceGroup: cloudGroup.InstanceGroup, + }) + } +} + +func (v *ValidationCluster) validateNodes(ctx context.Context, cloudGroups map[string]*cloudinstances.CloudInstanceGroup, groups []*kops.InstanceGroup, shouldValidateInstanceGroup func(ig *kops.InstanceGroup) bool, errorReporter cloudinstances.CloudGroupErrorReporter) ([]v1.Node, map[string]*kops.InstanceGroup) { var readyNodes []v1.Node groupsSeen := map[string]bool{} nodeInstanceGroupMapping := map[string]*kops.InstanceGroup{} @@ -338,6 +372,7 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances cloudGroup.TargetSize), InstanceGroup: cloudGroup.InstanceGroup, }) + v.reportCloudGroupErrors(ctx, errorReporter, cloudGroup) } for _, member := range allMembers { diff --git a/pkg/validation/validate_cluster_test.go b/pkg/validation/validate_cluster_test.go index 9bfb86c8609ed..d1fa9e99390fa 100644 --- a/pkg/validation/validate_cluster_test.go +++ b/pkg/validation/validate_cluster_test.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -41,6 +42,10 @@ type MockCloud struct { t *testing.T expectedCluster *kopsapi.Cluster expectedInstanceGroups []kopsapi.InstanceGroup + + // cloudGroupErrors is keyed by InstanceGroup name. When non-nil, MockCloud + // implements cloudinstances.CloudGroupErrorReporter and returns these errors. + cloudGroupErrors map[string][]cloudinstances.CloudGroupError } var _ fi.Cloud = (*MockCloud)(nil) @@ -913,3 +918,63 @@ func Test_ValidateDetachedNodesNotValidated(t *testing.T) { printDebug(t, v) } } + +// errorReportingMockCloud wraps MockCloud to implement CloudGroupErrorReporter. +type errorReportingMockCloud struct { + *MockCloud +} + +func (c *errorReportingMockCloud) GetCloudGroupErrors(_ context.Context, group *cloudinstances.CloudInstanceGroup) ([]cloudinstances.CloudGroupError, error) { + return c.cloudGroupErrors[group.InstanceGroup.Name], nil +} + +func Test_ValidateCloudGroupErrorsSurfaced(t *testing.T) { + ctx := context.TODO() + + cluster := &kopsapi.Cluster{ + ObjectMeta: metav1.ObjectMeta{Name: "testcluster.k8s.local"}, + Spec: kopsapi.ClusterSpec{ + ExternalDNS: &kopsapi.ExternalDNSConfig{ + Provider: kopsapi.ExternalDNSProviderDNSController, + }, + }, + } + + ig := kopsapi.InstanceGroup{ + ObjectMeta: metav1.ObjectMeta{Name: "node-1"}, + Spec: kopsapi.InstanceGroupSpec{Role: kopsapi.InstanceGroupRoleNode}, + } + groups := map[string]*cloudinstances.CloudInstanceGroup{ + "node-1": { + InstanceGroup: &ig, + MinSize: 1, + TargetSize: 2, + }, + } + + mockcloud := BuildMockCloud(t, groups, cluster, []kopsapi.InstanceGroup{ig}) + mockcloud.cloudGroupErrors = map[string][]cloudinstances.CloudGroupError{ + "node-1": { + { + Code: "ZONE_RESOURCE_POOL_EXHAUSTED_WITH_DETAILS", + Message: "The zone does not have enough resources available", + Instance: "node-1a", + Count: 3, + LastSeen: time.Date(2026, 5, 12, 20, 50, 0, 0, time.UTC), + }, + }, + } + reporter := &errorReportingMockCloud{MockCloud: mockcloud} + + restConfig := &rest.Config{Host: "https://api.testcluster.k8s.local"} + validator, err := NewClusterValidator(cluster, reporter, &kopsapi.InstanceGroupList{Items: []kopsapi.InstanceGroup{ig}}, nil, nil, restConfig, fake.NewClientset()) + require.NoError(t, err) + v, err := validator.Validate(ctx) + require.NoError(t, err) + + require.Len(t, v.Failures, 2) + assert.Equal(t, "InstanceGroup \"node-1\" did not have enough nodes 0 vs 2", v.Failures[0].Message) + assert.Contains(t, v.Failures[1].Message, "ZONE_RESOURCE_POOL_EXHAUSTED_WITH_DETAILS") + assert.Contains(t, v.Failures[1].Message, "node-1a") + assert.Contains(t, v.Failures[1].Message, "observed 3 times") +} diff --git a/upup/pkg/fi/cloudup/awsup/aws_cloud.go b/upup/pkg/fi/cloudup/awsup/aws_cloud.go index c17af21dd8432..a50736730d845 100644 --- a/upup/pkg/fi/cloudup/awsup/aws_cloud.go +++ b/upup/pkg/fi/cloudup/awsup/aws_cloud.go @@ -20,6 +20,8 @@ import ( "context" "fmt" "os" + "regexp" + "sort" "strconv" "strings" "sync" @@ -1084,8 +1086,121 @@ func buildCloudInstance(i autoscalingtypes.Instance, instances map[string]*ec2ty return nil } +// asgInstanceIDRegex extracts the first EC2 instance ID (i-xxxxxxxx) found in a +// scaling activity's StatusMessage or Description. The capture is best-effort: +// if no ID is found, CloudGroupError.Instance is left blank. +var asgInstanceIDRegex = regexp.MustCompile(`i-[0-9a-f]{8,17}`) + +// GetCloudGroupErrors returns recent provisioning failures observed by the AWS +// Auto Scaling group backing this CloudInstanceGroup. See getCloudGroupErrors +// for the filtering and aggregation logic. +func (c *awsCloudImplementation) GetCloudGroupErrors(ctx context.Context, group *cloudinstances.CloudInstanceGroup) ([]cloudinstances.CloudGroupError, error) { + return getCloudGroupErrors(ctx, c, group) +} + +// getCloudGroupErrors returns recent failed scaling activities for the ASG +// backing the given group. +// +// Results are filtered to errors timestamped after the most recent successful +// instance creation in the group (the LaunchTime of an existing instance). If +// the group has no instances at all, no filter is applied so the full retention +// window is surfaced — that is precisely the failure mode this exists to +// diagnose. Pagination short-circuits once activities pre-date the watermark, +// since DescribeScalingActivities returns most-recent-first. +func getCloudGroupErrors(ctx context.Context, c AWSCloud, group *cloudinstances.CloudInstanceGroup) ([]cloudinstances.CloudGroupError, error) { + asg, ok := group.Raw.(*autoscalingtypes.AutoScalingGroup) + if !ok || asg == nil { + return nil, nil + } + asgName := aws.ToString(asg.AutoScalingGroupName) + if asgName == "" { + return nil, nil + } + + var watermark time.Time + for _, m := range group.Ready { + if m.CreationTimestamp.After(watermark) { + watermark = m.CreationTimestamp + } + } + for _, m := range group.NeedUpdate { + if m.CreationTimestamp.After(watermark) { + watermark = m.CreationTimestamp + } + } + + type key struct{ code, message string } + agg := map[key]*cloudinstances.CloudGroupError{} + + paginator := autoscaling.NewDescribeScalingActivitiesPaginator(c.Autoscaling(), &autoscaling.DescribeScalingActivitiesInput{ + AutoScalingGroupName: aws.String(asgName), + }) +paginate: + for paginator.HasMorePages() { + page, err := paginator.NextPage(ctx) + if err != nil { + return nil, fmt.Errorf("describing scaling activities for ASG %q: %w", asgName, err) + } + for _, a := range page.Activities { + if a.StatusCode != autoscalingtypes.ScalingActivityStatusCodeFailed && a.StatusCode != autoscalingtypes.ScalingActivityStatusCodeCancelled { + continue + } + var ts time.Time + if a.StartTime != nil { + ts = *a.StartTime + } + if !watermark.IsZero() && !ts.IsZero() && ts.Before(watermark) { + // Activities are returned most-recent-first; everything after + // this is older, so stop paginating. + break paginate + } + message := aws.ToString(a.StatusMessage) + if message == "" { + message = aws.ToString(a.Cause) + } + k := key{string(a.StatusCode), message} + e, ok := agg[k] + if !ok { + instance := asgInstanceIDRegex.FindString(message) + if instance == "" { + instance = asgInstanceIDRegex.FindString(aws.ToString(a.Description)) + } + e = &cloudinstances.CloudGroupError{ + Code: string(a.StatusCode), + Message: message, + Instance: instance, + FirstSeen: ts, + LastSeen: ts, + } + agg[k] = e + } + e.Count++ + if !ts.IsZero() { + if e.FirstSeen.IsZero() || ts.Before(e.FirstSeen) { + e.FirstSeen = ts + } + if ts.After(e.LastSeen) { + e.LastSeen = ts + } + } + } + } + + out := make([]cloudinstances.CloudGroupError, 0, len(agg)) + for _, e := range agg { + out = append(out, *e) + } + sort.Slice(out, func(i, j int) bool { + return out[i].LastSeen.After(out[j].LastSeen) + }) + return out, nil +} + func addCloudInstanceData(cm *cloudinstances.CloudInstance, instance *ec2types.Instance) { cm.MachineType = string(instance.InstanceType) + if instance.LaunchTime != nil { + cm.CreationTimestamp = *instance.LaunchTime + } for _, tag := range instance.Tags { key := aws.ToString(tag.Key) if !strings.HasPrefix(key, TagNameRolePrefix) { diff --git a/upup/pkg/fi/cloudup/awsup/aws_cloud_test.go b/upup/pkg/fi/cloudup/awsup/aws_cloud_test.go new file mode 100644 index 0000000000000..c546365ddc7c2 --- /dev/null +++ b/upup/pkg/fi/cloudup/awsup/aws_cloud_test.go @@ -0,0 +1,218 @@ +/* +Copyright 2026 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package awsup + +import ( + "context" + "testing" + "time" + + "github.com/aws/aws-sdk-go-v2/aws" + autoscalingtypes "github.com/aws/aws-sdk-go-v2/service/autoscaling/types" + + "k8s.io/kops/cloudmock/aws/mockautoscaling" + "k8s.io/kops/pkg/cloudinstances" +) + +func TestGetCloudGroupErrors(t *testing.T) { + asgName := "test-asg" + asg := &autoscalingtypes.AutoScalingGroup{AutoScalingGroupName: aws.String(asgName)} + + t0 := time.Date(2026, 5, 12, 20, 35, 0, 0, time.UTC) // before watermark + t1 := time.Date(2026, 5, 12, 20, 45, 0, 0, time.UTC) // watermark (matches a LaunchTime) + t2 := time.Date(2026, 5, 12, 20, 50, 0, 0, time.UTC) // after watermark + t3 := time.Date(2026, 5, 12, 20, 55, 0, 0, time.UTC) // after watermark + + insufficientCapacity := "Launching a new EC2 instance: i-0abc1234. Status Reason: Insufficient capacity." + + tests := []struct { + name string + group *cloudinstances.CloudInstanceGroup + activities []autoscalingtypes.Activity + wantErrors []cloudinstances.CloudGroupError + wantInstanceMatch string + }{ + { + name: "empty group surfaces all failed activities", + group: &cloudinstances.CloudInstanceGroup{ + Raw: asg, + TargetSize: 2, + }, + activities: []autoscalingtypes.Activity{ + {StartTime: &t3, StatusCode: autoscalingtypes.ScalingActivityStatusCodeFailed, StatusMessage: aws.String(insufficientCapacity)}, + {StartTime: &t0, StatusCode: autoscalingtypes.ScalingActivityStatusCodeFailed, StatusMessage: aws.String(insufficientCapacity)}, + }, + wantErrors: []cloudinstances.CloudGroupError{ + { + Code: string(autoscalingtypes.ScalingActivityStatusCodeFailed), + Message: insufficientCapacity, + Instance: "i-0abc1234", + Count: 2, + }, + }, + }, + { + name: "watermark filters out activities older than newest instance", + group: &cloudinstances.CloudInstanceGroup{ + Raw: asg, + TargetSize: 2, + Ready: []*cloudinstances.CloudInstance{ + {ID: "i-existing", CreationTimestamp: t1}, + }, + }, + activities: []autoscalingtypes.Activity{ + {StartTime: &t2, StatusCode: autoscalingtypes.ScalingActivityStatusCodeFailed, StatusMessage: aws.String(insufficientCapacity)}, + {StartTime: &t0, StatusCode: autoscalingtypes.ScalingActivityStatusCodeFailed, StatusMessage: aws.String("ancient error")}, + }, + wantErrors: []cloudinstances.CloudGroupError{ + { + Code: string(autoscalingtypes.ScalingActivityStatusCodeFailed), + Message: insufficientCapacity, + Instance: "i-0abc1234", + Count: 1, + }, + }, + }, + { + name: "non-failed activities are ignored", + group: &cloudinstances.CloudInstanceGroup{ + Raw: asg, + TargetSize: 2, + }, + activities: []autoscalingtypes.Activity{ + {StartTime: &t3, StatusCode: autoscalingtypes.ScalingActivityStatusCodeSuccessful, StatusMessage: aws.String("ok")}, + {StartTime: &t2, StatusCode: autoscalingtypes.ScalingActivityStatusCodeFailed, StatusMessage: aws.String(insufficientCapacity)}, + {StartTime: &t1, StatusCode: autoscalingtypes.ScalingActivityStatusCodeInProgress, StatusMessage: aws.String("still going")}, + }, + wantErrors: []cloudinstances.CloudGroupError{ + { + Code: string(autoscalingtypes.ScalingActivityStatusCodeFailed), + Message: insufficientCapacity, + Instance: "i-0abc1234", + Count: 1, + }, + }, + }, + { + name: "identical messages are aggregated", + group: &cloudinstances.CloudInstanceGroup{ + Raw: asg, + TargetSize: 3, + }, + activities: []autoscalingtypes.Activity{ + {StartTime: &t3, StatusCode: autoscalingtypes.ScalingActivityStatusCodeFailed, StatusMessage: aws.String("same error")}, + {StartTime: &t2, StatusCode: autoscalingtypes.ScalingActivityStatusCodeFailed, StatusMessage: aws.String("same error")}, + {StartTime: &t1, StatusCode: autoscalingtypes.ScalingActivityStatusCodeFailed, StatusMessage: aws.String("different error")}, + }, + wantErrors: []cloudinstances.CloudGroupError{ + {Code: "Failed", Message: "same error", Count: 2, FirstSeen: t2, LastSeen: t3}, + {Code: "Failed", Message: "different error", Count: 1, FirstSeen: t1, LastSeen: t1}, + }, + }, + { + name: "cancelled activities are surfaced", + group: &cloudinstances.CloudInstanceGroup{ + Raw: asg, + TargetSize: 1, + }, + activities: []autoscalingtypes.Activity{ + {StartTime: &t3, StatusCode: autoscalingtypes.ScalingActivityStatusCodeCancelled, StatusMessage: aws.String("cancelled")}, + }, + wantErrors: []cloudinstances.CloudGroupError{ + {Code: "Cancelled", Message: "cancelled", Count: 1}, + }, + }, + { + name: "non-ASG Raw returns nil", + group: &cloudinstances.CloudInstanceGroup{Raw: "not-an-asg"}, + activities: nil, + wantErrors: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cloud := BuildMockAWSCloud("us-east-1", "a") + cloud.MockAutoscaling = &mockautoscaling.MockAutoscaling{ + ScalingActivities: map[string][]autoscalingtypes.Activity{ + asgName: tt.activities, + }, + } + got, err := cloud.GetCloudGroupErrors(context.Background(), tt.group) + if err != nil { + t.Fatalf("GetCloudGroupErrors returned error: %v", err) + } + if len(got) != len(tt.wantErrors) { + t.Fatalf("got %d errors, want %d: %+v", len(got), len(tt.wantErrors), got) + } + for i, want := range tt.wantErrors { + if got[i].Code != want.Code || got[i].Message != want.Message || got[i].Count != want.Count { + t.Errorf("error %d: got %+v, want %+v", i, got[i], want) + } + if want.Instance != "" && got[i].Instance != want.Instance { + t.Errorf("error %d instance: got %q, want %q", i, got[i].Instance, want.Instance) + } + if !want.FirstSeen.IsZero() && !got[i].FirstSeen.Equal(want.FirstSeen) { + t.Errorf("error %d FirstSeen: got %v, want %v", i, got[i].FirstSeen, want.FirstSeen) + } + if !want.LastSeen.IsZero() && !got[i].LastSeen.Equal(want.LastSeen) { + t.Errorf("error %d LastSeen: got %v, want %v", i, got[i].LastSeen, want.LastSeen) + } + } + }) + } +} + +func TestGetCloudGroupErrors_PaginationShortCircuit(t *testing.T) { + asgName := "test-asg" + asg := &autoscalingtypes.AutoScalingGroup{AutoScalingGroupName: aws.String(asgName)} + + watermark := time.Date(2026, 5, 12, 20, 45, 0, 0, time.UTC) + tBefore := watermark.Add(-1 * time.Hour) + tAfter := watermark.Add(1 * time.Minute) + + // 3 recent failures + 5 older ones. The watermark must filter out everything + // at-or-before the existing instance's CreationTimestamp. + activities := []autoscalingtypes.Activity{ + {StartTime: &tAfter, StatusCode: autoscalingtypes.ScalingActivityStatusCodeFailed, StatusMessage: aws.String("err A")}, + {StartTime: &tAfter, StatusCode: autoscalingtypes.ScalingActivityStatusCodeFailed, StatusMessage: aws.String("err B")}, + {StartTime: &tAfter, StatusCode: autoscalingtypes.ScalingActivityStatusCodeFailed, StatusMessage: aws.String("err C")}, + {StartTime: &tBefore, StatusCode: autoscalingtypes.ScalingActivityStatusCodeFailed, StatusMessage: aws.String("old 1")}, + {StartTime: &tBefore, StatusCode: autoscalingtypes.ScalingActivityStatusCodeFailed, StatusMessage: aws.String("old 2")}, + {StartTime: &tBefore, StatusCode: autoscalingtypes.ScalingActivityStatusCodeFailed, StatusMessage: aws.String("old 3")}, + {StartTime: &tBefore, StatusCode: autoscalingtypes.ScalingActivityStatusCodeFailed, StatusMessage: aws.String("old 4")}, + {StartTime: &tBefore, StatusCode: autoscalingtypes.ScalingActivityStatusCodeFailed, StatusMessage: aws.String("old 5")}, + } + + cloud := BuildMockAWSCloud("us-east-1", "a") + cloud.MockAutoscaling = &mockautoscaling.MockAutoscaling{ + ScalingActivities: map[string][]autoscalingtypes.Activity{asgName: activities}, + } + + got, err := getCloudGroupErrors(context.Background(), cloud, &cloudinstances.CloudInstanceGroup{ + Raw: asg, + Ready: []*cloudinstances.CloudInstance{ + {ID: "i-existing", CreationTimestamp: watermark}, + }, + }) + if err != nil { + t.Fatalf("getCloudGroupErrors: %v", err) + } + if len(got) != 3 { + t.Fatalf("expected 3 errors after watermark filter, got %d: %+v", len(got), got) + } +} diff --git a/upup/pkg/fi/cloudup/awsup/mock_aws_cloud.go b/upup/pkg/fi/cloudup/awsup/mock_aws_cloud.go index 070251339638c..b0951045d03d3 100644 --- a/upup/pkg/fi/cloudup/awsup/mock_aws_cloud.go +++ b/upup/pkg/fi/cloudup/awsup/mock_aws_cloud.go @@ -106,6 +106,10 @@ func (c *MockAWSCloud) GetCloudGroups(cluster *kops.Cluster, instancegroups []*k return getCloudGroups(ctx, c, cluster, instancegroups, warnUnmatched, nodes) } +func (c *MockAWSCloud) GetCloudGroupErrors(ctx context.Context, group *cloudinstances.CloudInstanceGroup) ([]cloudinstances.CloudGroupError, error) { + return getCloudGroupErrors(ctx, c, group) +} + func (c *MockCloud) ProviderID() kops.CloudProviderID { return kops.CloudProviderAWS } diff --git a/upup/pkg/fi/cloudup/gce/compute.go b/upup/pkg/fi/cloudup/gce/compute.go index 92fcd1fb3e872..6af12e2b5b011 100644 --- a/upup/pkg/fi/cloudup/gce/compute.go +++ b/upup/pkg/fi/cloudup/gce/compute.go @@ -674,6 +674,7 @@ type InstanceGroupManagerClient interface { Get(project, zone, name string) (*compute.InstanceGroupManager, error) List(ctx context.Context, project, zone string) ([]*compute.InstanceGroupManager, error) ListManagedInstances(ctx context.Context, project, zone, name string) ([]*compute.ManagedInstance, error) + ListErrors(ctx context.Context, project, zone, name string) ([]*compute.InstanceManagedByIgmError, error) RecreateInstances(project, zone, name, id string) (*compute.Operation, error) SetTargetPools(project, zone, name string, targetPools []string) (*compute.Operation, error) SetInstanceTemplate(project, zone, name, instanceTemplateURL string) (*compute.Operation, error) @@ -720,6 +721,17 @@ func (c *instanceGroupManagerClientImpl) ListManagedInstances(ctx context.Contex return instances, nil } +func (c *instanceGroupManagerClientImpl) ListErrors(ctx context.Context, project, zone, name string) ([]*compute.InstanceManagedByIgmError, error) { + var items []*compute.InstanceManagedByIgmError + if err := c.srv.ListErrors(project, zone, name).Pages(ctx, func(page *compute.InstanceGroupManagersListErrorsResponse) error { + items = append(items, page.Items...) + return nil + }); err != nil { + return nil, err + } + return items, nil +} + func (c *instanceGroupManagerClientImpl) RecreateInstances(project, zone, name, id string) (*compute.Operation, error) { req := &compute.InstanceGroupManagersRecreateInstancesRequest{ Instances: []string{ diff --git a/upup/pkg/fi/cloudup/gce/instancegroups.go b/upup/pkg/fi/cloudup/gce/instancegroups.go index b20f90c34e60e..3b89e481e6fb3 100644 --- a/upup/pkg/fi/cloudup/gce/instancegroups.go +++ b/upup/pkg/fi/cloudup/gce/instancegroups.go @@ -21,8 +21,8 @@ import ( "encoding/base32" "fmt" "hash/fnv" + "sort" "strings" - "time" compute "google.golang.org/api/compute/v1" @@ -298,9 +298,96 @@ func matchInstanceGroup(mig *compute.InstanceGroupManager, c *kops.Cluster, inst return matches[0], nil } +// GetCloudGroupErrors returns recent provisioning errors the GCE managed +// instance group has hit. To avoid surfacing errors from a previous, healthier +// incarnation of the group, results are filtered to only errors timestamped +// after the most recent successful instance creation in the group. If the +// group is empty (no instances were ever created), no filter is applied. +func (c *gceCloudImplementation) GetCloudGroupErrors(ctx context.Context, group *cloudinstances.CloudInstanceGroup) ([]cloudinstances.CloudGroupError, error) { + mig, ok := group.Raw.(*compute.InstanceGroupManager) + if !ok || mig == nil { + return nil, nil + } + u, err := ParseGoogleCloudURL(mig.SelfLink) + if err != nil { + return nil, fmt.Errorf("parsing MIG self link %q: %w", mig.SelfLink, err) + } + + var watermark time.Time + for _, m := range group.Ready { + if m.CreationTimestamp.After(watermark) { + watermark = m.CreationTimestamp + } + } + for _, m := range group.NeedUpdate { + if m.CreationTimestamp.After(watermark) { + watermark = m.CreationTimestamp + } + } + + items, err := c.Compute().InstanceGroupManagers().ListErrors(ctx, u.Project, u.Zone, u.Name) + if err != nil { + return nil, fmt.Errorf("listing errors for MIG %q: %w", mig.Name, err) + } + + type key struct{ code, message string } + agg := map[key]*cloudinstances.CloudGroupError{} + for _, it := range items { + if it == nil || it.Error == nil { + continue + } + var ts time.Time + if it.Timestamp != "" { + if t, err := time.Parse(time.RFC3339, it.Timestamp); err == nil { + ts = t + } + } + if !watermark.IsZero() && !ts.IsZero() && ts.Before(watermark) { + continue + } + k := key{it.Error.Code, it.Error.Message} + e, ok := agg[k] + if !ok { + e = &cloudinstances.CloudGroupError{ + Code: it.Error.Code, + Message: it.Error.Message, + FirstSeen: ts, + LastSeen: ts, + } + if it.InstanceActionDetails != nil { + e.Instance = LastComponent(it.InstanceActionDetails.Instance) + } + agg[k] = e + } + e.Count++ + if !ts.IsZero() { + if e.FirstSeen.IsZero() || ts.Before(e.FirstSeen) { + e.FirstSeen = ts + } + if ts.After(e.LastSeen) { + e.LastSeen = ts + } + } + } + + out := make([]cloudinstances.CloudGroupError, 0, len(agg)) + for _, e := range agg { + out = append(out, *e) + } + sort.Slice(out, func(i, j int) bool { + return out[i].LastSeen.After(out[j].LastSeen) + }) + return out, nil +} + func addCloudInstanceData(cm *cloudinstances.CloudInstance, instance *compute.Instance) { cm.MachineType = LastComponent(instance.MachineType) cm.Status = instance.Status + if instance.CreationTimestamp != "" { + if t, err := time.Parse(time.RFC3339, instance.CreationTimestamp); err == nil { + cm.CreationTimestamp = t + } + } if instance.Status == "RUNNING" { cm.State = cloudinstances.CloudInstanceStatusUpToDate } diff --git a/util/pkg/awsinterfaces/autoscaling.go b/util/pkg/awsinterfaces/autoscaling.go index 73480cd0030aa..7a54068caeabe 100644 --- a/util/pkg/awsinterfaces/autoscaling.go +++ b/util/pkg/awsinterfaces/autoscaling.go @@ -36,6 +36,7 @@ type AutoScalingAPI interface { DeleteWarmPool(ctx context.Context, params *autoscaling.DeleteWarmPoolInput, optFns ...func(*autoscaling.Options)) (*autoscaling.DeleteWarmPoolOutput, error) DescribeAutoScalingGroups(ctx context.Context, params *autoscaling.DescribeAutoScalingGroupsInput, optFns ...func(*autoscaling.Options)) (*autoscaling.DescribeAutoScalingGroupsOutput, error) DescribeLifecycleHooks(ctx context.Context, params *autoscaling.DescribeLifecycleHooksInput, optFns ...func(*autoscaling.Options)) (*autoscaling.DescribeLifecycleHooksOutput, error) + DescribeScalingActivities(ctx context.Context, params *autoscaling.DescribeScalingActivitiesInput, optFns ...func(*autoscaling.Options)) (*autoscaling.DescribeScalingActivitiesOutput, error) DescribeTags(ctx context.Context, params *autoscaling.DescribeTagsInput, optFns ...func(*autoscaling.Options)) (*autoscaling.DescribeTagsOutput, error) DescribeWarmPool(ctx context.Context, params *autoscaling.DescribeWarmPoolInput, optFns ...func(*autoscaling.Options)) (*autoscaling.DescribeWarmPoolOutput, error) DetachInstances(ctx context.Context, params *autoscaling.DetachInstancesInput, optFns ...func(*autoscaling.Options)) (*autoscaling.DetachInstancesOutput, error)