From 5a08b6df24b03c368d69a05d73a8945f9a4729d8 Mon Sep 17 00:00:00 2001 From: Vincenzo Mauro Date: Fri, 8 May 2026 13:53:01 +0200 Subject: [PATCH] remove etcd preDrain hook strip workaround from node replacement test --- .../tnf_node_replacement_const.go | 8 -- .../tnf_node_replacement_finish.go | 97 ------------------- 2 files changed, 105 deletions(-) diff --git a/test/extended/edge_topologies/tnf_node_replacement_const.go b/test/extended/edge_topologies/tnf_node_replacement_const.go index 7418aa2f4b46..2efe9b4610e6 100644 --- a/test/extended/edge_topologies/tnf_node_replacement_const.go +++ b/test/extended/edge_topologies/tnf_node_replacement_const.go @@ -69,14 +69,6 @@ const ( bmhMachineDeleteWaitTimeout = 21 * time.Minute // bmhMachineDeletePollInterval is the sleep between polls while the object still exists (Delete is idempotent). bmhMachineDeletePollInterval = 30 * time.Second - // machinePreDrainHookStripPollInterval is how often the test strips CEO's EtcdQuorumOperator preDrain hook during - // Machine delete (workaround until CEO removes the hook when only a learner remains in member list). - machinePreDrainHookStripPollInterval = 5 * time.Second - - // etcdMachinePreDrainHookName / etcdMachinePreDrainHookOwner match cluster-etcd-operator's machine deletion hook - // (MachineDeletionHookName / MachineDeletionHookOwner in pkg/operator/ceohelpers). - etcdMachinePreDrainHookName = "EtcdQuorumOperator" - etcdMachinePreDrainHookOwner = "clusteroperator/etcd" // machineDeletionDiagnosticsLogTailLines is how many lines of each controller pod log to capture on BMH/Machine delete timeout. machineDeletionDiagnosticsLogTailLines = 500 // deleteGetTimeout caps the existence-check Get after each Delete; 20s is enough for a simple Get. diff --git a/test/extended/edge_topologies/tnf_node_replacement_finish.go b/test/extended/edge_topologies/tnf_node_replacement_finish.go index f6188790452b..5ca7c14dc5d7 100644 --- a/test/extended/edge_topologies/tnf_node_replacement_finish.go +++ b/test/extended/edge_topologies/tnf_node_replacement_finish.go @@ -123,85 +123,6 @@ func gvrForResourceType(resourceType string) (schema.GroupVersionResource, error } } -// runMachineEtcdPreDrainHookStripper removes CEO's EtcdQuorumOperator preDrain hook from the Machine on a ticker until ctx -// is cancelled. First strip runs immediately, then every machinePreDrainHookStripPollInterval. This avoids MAO blocking -// drain when CEO keeps the hook while etcd still lists the node IP as a learner (see strip comment in deleteOcResourceWithRetry). -func runMachineEtcdPreDrainHookStripper(ctx context.Context, oc *exutil.CLI, namespace, machineName string) { - dyn, err := dynamic.NewForConfig(oc.AdminConfig()) - if err != nil { - e2e.Logf("[Machine preDrain hook strip] dynamic client: %v", err) - return - } - stripOnce := func() { - c, cancel := context.WithTimeout(ctx, shortK8sClientTimeout) - defer cancel() - stripped, err := stripEtcdQuorumOperatorPreDrainHook(c, dyn, namespace, machineName) - if err != nil { - if !apierrors.IsNotFound(err) { - e2e.Logf("[Machine preDrain hook strip] strip attempt: %v", err) - } - return - } - if stripped { - e2e.Logf("[Machine preDrain hook strip] removed EtcdQuorumOperator preDrain hook from Machine %s/%s (workaround for CEO/learner deadlock)", namespace, machineName) - } - } - stripOnce() - ticker := time.NewTicker(machinePreDrainHookStripPollInterval) - defer ticker.Stop() - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - stripOnce() - } - } -} - -// stripEtcdQuorumOperatorPreDrainHook removes spec.lifecycleHooks.preDrain entries matching etcdMachinePreDrainHookName/Owner. -// Returns (true, nil) if the Machine was updated, (false, nil) if the hook was absent or preDrain missing. -func stripEtcdQuorumOperatorPreDrainHook(ctx context.Context, dyn dynamic.Interface, namespace, machineName string) (bool, error) { - u, err := dyn.Resource(apis.MachineGVR).Namespace(namespace).Get(ctx, machineName, metav1.GetOptions{}) - if err != nil { - return false, err - } - hooks, found, err := unstructured.NestedSlice(u.Object, "spec", "lifecycleHooks", "preDrain") - if err != nil { - return false, err - } - if !found { - return false, nil - } - newHooks := make([]interface{}, 0, len(hooks)) - removed := false - for _, h := range hooks { - m, ok := h.(map[string]interface{}) - if !ok { - newHooks = append(newHooks, h) - continue - } - name, _, _ := unstructured.NestedString(m, "name") - owner, _, _ := unstructured.NestedString(m, "owner") - if name == etcdMachinePreDrainHookName && owner == etcdMachinePreDrainHookOwner { - removed = true - continue - } - newHooks = append(newHooks, h) - } - if !removed { - return false, nil - } - if err := unstructured.SetNestedSlice(u.Object, newHooks, "spec", "lifecycleHooks", "preDrain"); err != nil { - return false, err - } - _, err = dyn.Resource(apis.MachineGVR).Namespace(namespace).Update(ctx, u, metav1.UpdateOptions{}) - if err != nil { - return false, err - } - return true, nil -} - // machineAPIDeleteDiagnosticPodPrefixes matches controller pods in openshift-machine-api whose logs explain stuck BMH/Machine deletes. var machineAPIDeleteDiagnosticPodPrefixes = []string{ "machine-api-controllers-", @@ -358,24 +279,6 @@ func deleteOcResourceWithRetry(oc *exutil.CLI, resourceType, resourceName, names return fmt.Errorf("create dynamic client: %w", err) } - // Workaround: cluster-etcd-operator may keep the EtcdQuorumOperator preDrain hook while etcd still lists the - // node's IP as a learner (podman-etcd RA can re-add the peer as a learner on the survivor). MAO then blocks - // drain forever. Strip the hook periodically during the delete window so machine-controller can proceed. - // Product fix: CEO should clear the hook when only a learner matches (see bug report / etcd BZ). - if resourceType == machineResourceType { - stripCtx, stripCancel := context.WithCancel(context.Background()) - var stripWG sync.WaitGroup - stripWG.Add(1) - go func() { - defer stripWG.Done() - runMachineEtcdPreDrainHookStripper(stripCtx, oc, namespace, resourceName) - }() - defer func() { - stripCancel() - stripWG.Wait() - }() - } - opName := fmt.Sprintf("delete %s %s", resourceType, resourceName) start := time.Now() deadline := start.Add(bmhMachineDeleteWaitTimeout)