From 9685baf11110c235318e0c385c3fb2cfbd1d4706 Mon Sep 17 00:00:00 2001 From: Nick Carboni Date: Fri, 8 May 2026 16:41:04 -0400 Subject: [PATCH] Reset finalizing timeout on transition from installing-pending-user-action When a non-essential worker gets stuck in installing-pending-user-action the rest of the cluster can still progress to finalizing. The cluster-level Done finalization timeout (70 min) can then expire before the per-host timeout chain completes (40 min Rebooting + 60 min pending-user-action = 100 min). In this situation, when the stuck host moves from `installing-pending-user-action` to either `error` or `installing-in-progress` the entire cluster will fail instead of evicting the stuck host and completing with the remaining nodes. This commit changesthe cluster state transition so it will reset progress_finalizing_stage_started_at and progress_finalizing_stage_timed_out when transitioning FROM InstallingPendingUserAction TO Finalizing. This gives the cluster a fresh 70-minute timeout window, allowing sufficient time for the host to either finish installation (if the boot issue was fixed) or error (in which case the cluster will succeed without it). Co-Authored-By: Claude Sonnet 4.5 Resolves https://redhat.atlassian.net/browse/MGMT-24352 --- internal/cluster/transition.go | 7 ++ internal/cluster/transition_test.go | 149 ++++++++++++++++++++++++++++ 2 files changed, 156 insertions(+) diff --git a/internal/cluster/transition.go b/internal/cluster/transition.go index e2d6b0771442..f22d1f9acde5 100644 --- a/internal/cluster/transition.go +++ b/internal/cluster/transition.go @@ -884,6 +884,13 @@ func addExtraParams(cluster *common.Cluster, srcState string) ([]interface{}, er if srcState == models.ClusterStatusPreparingForInstallation { extra = append(extra, initProgressParamsInstallingStage()...) } + case models.ClusterStatusFinalizing: + // Reset the finalizing stage timeout when returning from pending-user-action + // to give the cluster a fresh timeout window + if srcState == models.ClusterStatusInstallingPendingUserAction { + extra = append(extra, "progress_finalizing_stage_started_at", strfmt.DateTime(time.Now())) + extra = append(extra, "progress_finalizing_stage_timed_out", false) + } } return extra, nil } diff --git a/internal/cluster/transition_test.go b/internal/cluster/transition_test.go index 681d4f08021b..8d5dd41606ec 100644 --- a/internal/cluster/transition_test.go +++ b/internal/cluster/transition_test.go @@ -6085,3 +6085,152 @@ func makeFreeNetworksAddressesStr(elems ...*models.FreeNetworkAddresses) string Expect(err).ToNot(HaveOccurred()) return string(b) } + +var _ = Describe("Finalizing stage timeout reset", func() { + var ( + ctx = context.Background() + db *gorm.DB + clusterApi *Manager + mockEvents *eventsapi.MockHandler + mockHostAPI *host.MockAPI + mockMetric *metrics.MockAPI + mockS3Api *s3wrapper.MockAPI + ctrl *gomock.Controller + dbName string + clusterId strfmt.UUID + operatorsApi operators.API + ) + + BeforeEach(func() { + db, dbName = common.PrepareTestDB() + ctrl = gomock.NewController(GinkgoT()) + mockEvents = eventsapi.NewMockHandler(ctrl) + mockHostAPI = host.NewMockAPI(ctrl) + mockMetric = metrics.NewMockAPI(ctrl) + mockS3Api = s3wrapper.NewMockAPI(ctrl) + operatorsApi = operators.NewManager(common.GetTestLog(), nil, operators.Options{}, nil) + clusterApi = NewManager(getDefaultConfig(), common.GetTestLog().WithField("pkg", "cluster-monitor"), db, + commontesting.GetDummyNotificationStream(ctrl), mockEvents, nil, mockHostAPI, mockMetric, nil, nil, + operatorsApi, nil, mockS3Api, nil, nil, nil, false, nil) + clusterId = strfmt.UUID(uuid.New().String()) + }) + + AfterEach(func() { + ctrl.Finish() + common.DeleteTestDB(db, dbName) + }) + + It("should reset finalizing stage fields on transition from InstallingPendingUserAction to Finalizing", func() { + oldTimestamp := time.Now().Add(-2 * time.Hour) + cluster := common.Cluster{ + Cluster: models.Cluster{ + ID: &clusterId, + Status: swag.String(models.ClusterStatusInstallingPendingUserAction), + StatusInfo: swag.String("Waiting for user action"), + BaseDNSDomain: "test.com", + PullSecretSet: true, + OpenshiftVersion: common.TestDefaultConfig.OpenShiftVersion, + StatusUpdatedAt: strfmt.DateTime(time.Now()), + InstallStartedAt: strfmt.DateTime(time.Now().Add(-1 * time.Hour)), + MachineNetworks: common.TestIPv4Networking.MachineNetworks, + APIVips: common.TestIPv4Networking.APIVips, + IngressVips: common.TestIPv4Networking.IngressVips, + }, + } + Expect(db.Create(&cluster).Error).ShouldNot(HaveOccurred()) + + Expect(db.Model(&common.Cluster{}).Where("id = ?", clusterId.String()).Updates(map[string]interface{}{ + "progress_finalizing_stage_started_at": oldTimestamp, + "progress_finalizing_stage_timed_out": true, + }).Error).ShouldNot(HaveOccurred()) + + hid1, hid2, hid3 := strfmt.UUID(uuid.New().String()), strfmt.UUID(uuid.New().String()), strfmt.UUID(uuid.New().String()) + hid4, hid5, hid6 := strfmt.UUID(uuid.New().String()), strfmt.UUID(uuid.New().String()), strfmt.UUID(uuid.New().String()) + + hosts := []models.Host{ + {ID: &hid1, InfraEnvID: clusterId, ClusterID: &clusterId, Status: swag.String(models.HostStatusInstalled), Inventory: common.GenerateTestDefaultInventory(), Role: models.HostRoleMaster}, + {ID: &hid2, InfraEnvID: clusterId, ClusterID: &clusterId, Status: swag.String(models.HostStatusInstalled), Inventory: common.GenerateTestDefaultInventory(), Role: models.HostRoleMaster}, + {ID: &hid3, InfraEnvID: clusterId, ClusterID: &clusterId, Status: swag.String(models.HostStatusInstalled), Inventory: common.GenerateTestDefaultInventory(), Role: models.HostRoleMaster}, + {ID: &hid4, InfraEnvID: clusterId, ClusterID: &clusterId, Status: swag.String(models.HostStatusInstalled), Inventory: common.GenerateTestDefaultInventory(), Role: models.HostRoleWorker}, + {ID: &hid5, InfraEnvID: clusterId, ClusterID: &clusterId, Status: swag.String(models.HostStatusInstalled), Inventory: common.GenerateTestDefaultInventory(), Role: models.HostRoleWorker}, + {ID: &hid6, InfraEnvID: clusterId, ClusterID: &clusterId, Status: swag.String(models.HostStatusInstallingInProgress), Inventory: common.GenerateTestDefaultInventory(), Role: models.HostRoleWorker}, + } + + for i := range hosts { + Expect(db.Create(&hosts[i]).Error).ShouldNot(HaveOccurred()) + } + + mockEvents.EXPECT().SendClusterEvent(gomock.Any(), eventstest.NewEventMatcher( + eventstest.WithNameMatcher(eventgen.ClusterStatusUpdatedEventName), + eventstest.WithClusterIdMatcher(clusterId.String()))).AnyTimes() + + cluster = getClusterFromDB(clusterId, db) + clusterAfterRefresh, err := clusterApi.RefreshStatus(ctx, &cluster, db) + Expect(err).ShouldNot(HaveOccurred()) + Expect(clusterAfterRefresh).ToNot(BeNil()) + Expect(swag.StringValue(clusterAfterRefresh.Status)).To(Equal(models.ClusterStatusFinalizing)) + + var progressStartedAt time.Time + var progressTimedOut bool + row := db.Raw("SELECT progress_finalizing_stage_started_at, progress_finalizing_stage_timed_out FROM clusters WHERE id = ?", clusterId.String()).Row() + Expect(row.Scan(&progressStartedAt, &progressTimedOut)).ShouldNot(HaveOccurred()) + + Expect(progressStartedAt).To(BeTemporally("~", time.Now(), 5*time.Second)) + Expect(progressStartedAt).NotTo(BeTemporally("~", oldTimestamp, 1*time.Minute)) + Expect(progressTimedOut).To(BeFalse()) + }) + + It("should NOT reset finalizing stage fields on transition from Installing to Finalizing", func() { + cluster := common.Cluster{ + Cluster: models.Cluster{ + ID: &clusterId, + Status: swag.String(models.ClusterStatusInstalling), + StatusInfo: swag.String("Installing"), + BaseDNSDomain: "test.com", + PullSecretSet: true, + OpenshiftVersion: common.TestDefaultConfig.OpenShiftVersion, + StatusUpdatedAt: strfmt.DateTime(time.Now()), + InstallStartedAt: strfmt.DateTime(time.Now().Add(-1 * time.Hour)), + MachineNetworks: common.TestIPv4Networking.MachineNetworks, + APIVips: common.TestIPv4Networking.APIVips, + IngressVips: common.TestIPv4Networking.IngressVips, + }, + } + Expect(db.Create(&cluster).Error).ShouldNot(HaveOccurred()) + + hid1, hid2, hid3 := strfmt.UUID(uuid.New().String()), strfmt.UUID(uuid.New().String()), strfmt.UUID(uuid.New().String()) + hid4, hid5 := strfmt.UUID(uuid.New().String()), strfmt.UUID(uuid.New().String()) + + hosts := []models.Host{ + {ID: &hid1, InfraEnvID: clusterId, ClusterID: &clusterId, Status: swag.String(models.HostStatusInstalled), Inventory: common.GenerateTestDefaultInventory(), Role: models.HostRoleMaster}, + {ID: &hid2, InfraEnvID: clusterId, ClusterID: &clusterId, Status: swag.String(models.HostStatusInstalled), Inventory: common.GenerateTestDefaultInventory(), Role: models.HostRoleMaster}, + {ID: &hid3, InfraEnvID: clusterId, ClusterID: &clusterId, Status: swag.String(models.HostStatusInstalled), Inventory: common.GenerateTestDefaultInventory(), Role: models.HostRoleMaster}, + {ID: &hid4, InfraEnvID: clusterId, ClusterID: &clusterId, Status: swag.String(models.HostStatusInstalled), Inventory: common.GenerateTestDefaultInventory(), Role: models.HostRoleWorker}, + {ID: &hid5, InfraEnvID: clusterId, ClusterID: &clusterId, Status: swag.String(models.HostStatusInstalled), Inventory: common.GenerateTestDefaultInventory(), Role: models.HostRoleWorker}, + } + + for i := range hosts { + Expect(db.Create(&hosts[i]).Error).ShouldNot(HaveOccurred()) + } + + mockEvents.EXPECT().SendClusterEvent(gomock.Any(), eventstest.NewEventMatcher( + eventstest.WithNameMatcher(eventgen.ClusterStatusUpdatedEventName), + eventstest.WithClusterIdMatcher(clusterId.String()))).AnyTimes() + + cluster = getClusterFromDB(clusterId, db) + clusterAfterRefresh, err := clusterApi.RefreshStatus(ctx, &cluster, db) + Expect(err).ShouldNot(HaveOccurred()) + Expect(clusterAfterRefresh).ToNot(BeNil()) + Expect(swag.StringValue(clusterAfterRefresh.Status)).To(Equal(models.ClusterStatusFinalizing)) + + var progressStartedAt *time.Time + var progressTimedOut *bool + row := db.Raw("SELECT progress_finalizing_stage_started_at, progress_finalizing_stage_timed_out FROM clusters WHERE id = ?", clusterId.String()).Row() + err = row.Scan(&progressStartedAt, &progressTimedOut) + Expect(err).ShouldNot(HaveOccurred()) + + if progressStartedAt != nil { + Expect(*progressStartedAt).NotTo(BeTemporally("~", time.Now(), 5*time.Second)) + } + }) +})