From f70725c08c0c5ecc0745a43f660979b8e9311cc9 Mon Sep 17 00:00:00 2001
From: Matt Boersma
Date: Thu, 2 Oct 2025 11:35:55 -0600
Subject: [PATCH 1/2] Add xtrace debugging

---
 hack/version.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hack/version.sh b/hack/version.sh
index 7c6bd7b3f0a..54164195b5e 100755
--- a/hack/version.sh
+++ b/hack/version.sh
@@ -17,6 +17,8 @@ set -o errexit
 set -o nounset
 set -o pipefail
 
+set -o xtrace
+
 version::get_version_vars() {
     # shellcheck disable=SC1083
     GIT_COMMIT="$(git rev-parse HEAD^{commit})"
@@ -101,4 +103,4 @@ version::ldflags() {
     echo "${ldflags[*]-}"
 }
 
-version::ldflags
\ No newline at end of file
+version::ldflags

From 10049ce4fe90fef92d1eee33937089bf7120eb8f Mon Sep 17 00:00:00 2001
From: Matt Boersma
Date: Fri, 1 May 2026 11:07:57 -0600
Subject: [PATCH 2/2] test/e2e: make collectNodes log dump best-effort

The collectNodes helper runs from [AfterEach] to dump per-node logs and
descriptions for the workload cluster. It currently uses
Expect(...).To(Succeed()) when listing nodes, which turns any transient
inability to reach the workload cluster API server into a hard spec
failure during teardown.

In practice the workload cluster's Azure load balancer / API server is
sometimes briefly unreachable while the spec is being torn down, which
has been causing otherwise-successful runs of the apiversion-upgrade job
to fail in [AfterEach] with i/o timeout against
*.cloudapp.azure.com:6443.

Match the pattern already used a few lines above for streaming pod logs:
log the error and continue instead of failing the spec.

Signed-off-by: Matt Boersma
---
 test/e2e/azure_clusterproxy.go | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/test/e2e/azure_clusterproxy.go b/test/e2e/azure_clusterproxy.go
index 3aae66ffa85..da4703816a4 100644
--- a/test/e2e/azure_clusterproxy.go
+++ b/test/e2e/azure_clusterproxy.go
@@ -190,7 +190,14 @@ func (acp *AzureClusterProxy) collectNodes(ctx context.Context, namespace string
 	workload := acp.GetWorkloadCluster(ctx, namespace, name)
 	nodes := &corev1.NodeList{}
 
-	Expect(workload.GetClient().List(ctx, nodes)).To(Succeed())
+	// Failing to collect node logs should not cause the test to fail. The workload cluster
+	// API server may be unreachable during teardown (for example due to a transient Azure
+	// load balancer / DNS issue), and we should not turn an otherwise-successful spec into
+	// a failure during [AfterEach] log collection.
+	if err := workload.GetClient().List(ctx, nodes); err != nil {
+		Logf("Failed to list nodes for workload cluster %s/%s: %v", namespace, name, err)
+		return
+	}
 
 	var err error
 	var nodeDescribe string