From 1ecf9aad095ef01307eb31206ca10c4b2dae1bcc Mon Sep 17 00:00:00 2001 From: "ohotnikov.ivan" <49371933+IvanHunters@users.noreply.github.com> Date: Thu, 26 Mar 2026 18:24:03 +0300 Subject: [PATCH] fix(agent): remove reconnection delay on DuplicateServerError When clientsCount < serverCount after a client disconnects, the agent needs to establish a new connection quickly. The previous code added a ~1s delay (resetBackoff + jitter) on each DuplicateServerError, which occurs frequently due to DNS load balancing connecting to already-connected proxy pods. This delay causes slow reconnection (average ~5s per client) and cascading failures when multiple clients disconnect simultaneously, leaving nodes in a permanent NotReady state. By removing the else block, duration remains 0, enabling immediate retry and fast reconnection through DNS load balancing. Note that this also removes the backoff reset (cs.resetBackoff()) that the else block performed on this path. Signed-off-by: ohotnikov.ivan <49371933+IvanHunters@users.noreply.github.com> --- pkg/agent/clientset.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/agent/clientset.go b/pkg/agent/clientset.go index 886bf85b7..7a24cb3fa 100644 --- a/pkg/agent/clientset.go +++ b/pkg/agent/clientset.go @@ -226,10 +226,10 @@ func (cs *ClientSet) sync() { klog.V(4).InfoS("duplicate server", "serverID", dse.ServerID, "serverCount", serverCount, "clientsCount", clientsCount) if serverCount != 0 && clientsCount >= serverCount { duration = backoff.Step() - } else { - backoff = cs.resetBackoff() - duration = wait.Jitter(backoff.Duration, backoff.Jitter) } + // When clientsCount < serverCount, we need a new connection. + // Leave duration at 0 to retry immediately without delay, + // allowing fast reconnection via DNS load balancing. } else { klog.ErrorS(err, "cannot connect once") duration = backoff.Step()