From f70725c08c0c5ecc0745a43f660979b8e9311cc9 Mon Sep 17 00:00:00 2001
From: Matt Boersma <Matt.Boersma@microsoft.com>
Date: Thu, 2 Oct 2025 11:35:55 -0600
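Subject: [PATCH 1/2] Add xtrace debugging

Enable `set -o xtrace` in hack/version.sh so the shell traces each
command the script executes, and add the missing trailing newline at
the end of the file.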
---
 hack/version.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hack/version.sh b/hack/version.sh
index 7c6bd7b3f0a..54164195b5e 100755
--- a/hack/version.sh
+++ b/hack/version.sh
@@ -17,6 +17,8 @@ set -o errexit
 set -o nounset
 set -o pipefail
 
+set -o xtrace
+
 version::get_version_vars() {
     # shellcheck disable=SC1083
     GIT_COMMIT="$(git rev-parse HEAD^{commit})"
@@ -101,4 +103,4 @@ version::ldflags() {
   echo "${ldflags[*]-}"
 }
 
-version::ldflags
\ No newline at end of file
+version::ldflags

From 10049ce4fe90fef92d1eee33937089bf7120eb8f Mon Sep 17 00:00:00 2001
From: Matt Boersma <Matt.Boersma@microsoft.com>
Date: Fri, 1 May 2026 11:07:57 -0600
Subject: [PATCH 2/2] test/e2e: make collectNodes log dump best-effort

The collectNodes helper runs from [AfterEach] to dump per-node logs and
descriptions for the workload cluster. It currently uses
Expect(...).To(Succeed()) when listing nodes, which turns any transient
inability to reach the workload cluster API server into a hard spec
failure during teardown.

In practice the workload cluster's Azure load balancer / API server is
sometimes briefly unreachable while the spec is being torn down, which
has been causing otherwise-successful runs of the apiversion-upgrade job
to fail in [AfterEach] with i/o timeout against
*.cloudapp.azure.com:6443.

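Match the best-effort pattern already used a few lines above for
streaming pod logs: log the error instead of failing the spec. Here that
means collectNodes logs the failed node list call and returns early, so
no per-node logs or descriptions are collected for that run.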

Signed-off-by: Matt Boersma <Matt.Boersma@microsoft.com>
---
 test/e2e/azure_clusterproxy.go | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/test/e2e/azure_clusterproxy.go b/test/e2e/azure_clusterproxy.go
index 3aae66ffa85..da4703816a4 100644
--- a/test/e2e/azure_clusterproxy.go
+++ b/test/e2e/azure_clusterproxy.go
@@ -190,7 +190,14 @@ func (acp *AzureClusterProxy) collectNodes(ctx context.Context, namespace string
 	workload := acp.GetWorkloadCluster(ctx, namespace, name)
 	nodes := &corev1.NodeList{}
 
-	Expect(workload.GetClient().List(ctx, nodes)).To(Succeed())
+	// Failing to collect node logs should not cause the test to fail. The workload cluster
+	// API server may be unreachable during teardown (for example due to a transient Azure
+	// load balancer / DNS issue), and we should not turn an otherwise-successful spec into
+	// a failure during [AfterEach] log collection.
+	if err := workload.GetClient().List(ctx, nodes); err != nil {
+		Logf("Failed to list nodes for workload cluster %s/%s: %v", namespace, name, err)
+		return
+	}
 
 	var err error
 	var nodeDescribe string