From bf3f7e1c030bc306af3da64e5665ff94809f92e8 Mon Sep 17 00:00:00 2001 From: Ayoub Nasr Date: Fri, 15 May 2026 09:43:19 +0200 Subject: [PATCH 1/6] Flush mine during upgrade MK8S-217 The mine can be flushed and updated during a metalk8s upgrade to avoid errors where the mine gets corrupted. --- scripts/upgrade.sh.in | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/scripts/upgrade.sh.in b/scripts/upgrade.sh.in index 6c80ec0ff9..da535be24d 100755 --- a/scripts/upgrade.sh.in +++ b/scripts/upgrade.sh.in @@ -92,6 +92,18 @@ upgrade_bootstrap () { metalk8s.salt.master.installed saltenv="$SALTENV" } +flush_and_refresh_mine() { + # After upgrading salt-master, the mine cache files on the master may have + # been corrupted by a non-atomic write interrupted mid-SIGTERM (kubelet + # kills the container when the manifest changes). mine.update alone cannot + # fix this: it reads the corrupt file first, fails, and silently discards + # the new data. mine.flush deletes the file (no read needed), after which + # mine.update writes a clean cache from scratch. 
+ get_salt_container > /dev/null # wait for new salt-master to be running + "$SALT_CALL" mine.flush + "$SALT_CALL" mine.update +} + launch_pre_upgrade () { SALT_MASTER_CALL=("${EXEC_CONTAINER_COMMAND[@]}" "$(get_salt_container)") "${SALT_MASTER_CALL[@]}" salt-run saltutil.sync_all \ @@ -209,6 +221,7 @@ run "Performing Pre-Upgrade checks" precheck_upgrade "$BASE_DIR"/backup.sh --no-replication run "Upgrading bootstrap" upgrade_bootstrap +run "Refreshing Salt mine on bootstrap" flush_and_refresh_mine run "Setting cluster version to $DESTINATION_VERSION" patch_kubesystem_namespace run "Launching the pre-upgrade" launch_pre_upgrade run "Upgrading etcd cluster" upgrade_etcd From b809723b61035ceab968ddea0d4bb7ba1228eec6 Mon Sep 17 00:00:00 2001 From: Ayoub Nasr Date: Fri, 15 May 2026 09:48:30 +0200 Subject: [PATCH 2/6] CHANGELOG: add entry for fix mine cache corrupted --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f195f607f1..0a6e02c938 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,11 @@ [v0.0.1-beta.2](https://github.com/scality/disk-management-agent/releases/tag/v0.0.1-beta.2) (PR[#4933](https://github.com/scality/metalk8s/pull/4933)) +### Bug Fixes + +- Fix a bug where the salt mine fails silently during upgrades due to a corrupted mine cache. + (PR[#4934](https://github.com/scality/metalk8s/pull/4934)) + ## Release 133.0.4 ## Release 133.0.3 From 4112ab5c1bddf516c8ce717a45cb35bf3abc6f01 Mon Sep 17 00:00:00 2001 From: Ayoub Nasr Date: Fri, 15 May 2026 14:11:36 +0200 Subject: [PATCH 3/6] Gate dex_ca_b64 mine function behind dex.enabled MK8S-218 In deployments where dex is disabled, mine.update runs are flooded with warnings caused by the failing dex mine function. Fix this by gating the mine function behind the actual state of dex: the function is not declared when dex is disabled. 
--- pillar/metalk8s/roles/ca.sls | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pillar/metalk8s/roles/ca.sls b/pillar/metalk8s/roles/ca.sls index bb17740ada..f343084e30 100644 --- a/pillar/metalk8s/roles/ca.sls +++ b/pillar/metalk8s/roles/ca.sls @@ -15,9 +15,11 @@ mine_functions: - mine_function: hashutil.base64_encodefile - /etc/kubernetes/pki/sa.pub +{%- if pillar.addons.dex.enabled %} dex_ca_b64: - mine_function: hashutil.base64_encodefile - /etc/metalk8s/pki/dex/ca.crt +{%- endif %} ingress_ca_b64: - mine_function: hashutil.base64_encodefile From 2c8d871632a6b6779aad0555d0561c56c86d528d Mon Sep 17 00:00:00 2001 From: Ayoub Nasr Date: Fri, 15 May 2026 14:14:49 +0200 Subject: [PATCH 4/6] CHANGELOG: add entry for fix dex_ca_b64 mine function --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a6e02c938..ecd083d664 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,10 @@ - Fix a bug where the salt mine fails silently during upgrades due to a corrupted mine cache. (PR[#4934](https://github.com/scality/metalk8s/pull/4934)) +- Fix a bug where the salt mine fails prints many warnings when dex is disabled. + (PR[#4934](https://github.com/scality/metalk8s/pull/4934)) + + ## Release 133.0.4 ## Release 133.0.3 From e70fc16bf83ab321ae3fbc6081b25707f2593de9 Mon Sep 17 00:00:00 2001 From: Ayoub Nasr Date: Fri, 15 May 2026 14:19:44 +0200 Subject: [PATCH 5/6] Flush and Update mine on all minions during upgrade --- scripts/upgrade.sh.in | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/upgrade.sh.in b/scripts/upgrade.sh.in index da535be24d..a9ee7342a8 100755 --- a/scripts/upgrade.sh.in +++ b/scripts/upgrade.sh.in @@ -99,9 +99,11 @@ flush_and_refresh_mine() { # fix this: it reads the corrupt file first, fails, and silently discards # the new data. mine.flush deletes the file (no read needed), after which # mine.update writes a clean cache from scratch. 
- get_salt_container > /dev/null # wait for new salt-master to be running - "$SALT_CALL" mine.flush - "$SALT_CALL" mine.update + # Run on all minions so the master has a clean, up-to-date mine cache for + # the entire cluster before any upgrade state queries it. + SALT_MASTER_CALL=("${EXEC_CONTAINER_COMMAND[@]}" "$(get_salt_container)") + "${SALT_MASTER_CALL[@]}" salt '*' mine.flush + "${SALT_MASTER_CALL[@]}" salt '*' mine.update } launch_pre_upgrade () { From dc852dbd3d76bd0c6bf0af21293dddb8aebf770f Mon Sep 17 00:00:00 2001 From: Ayoub Nasr Date: Fri, 15 May 2026 15:14:12 +0200 Subject: [PATCH 6/6] small fixes --- CHANGELOG.md | 2 +- pillar/metalk8s/roles/ca.sls | 2 ++ scripts/upgrade.sh.in | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ecd083d664..1dc52d2663 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,7 @@ - Fix a bug where the salt mine fails silently during upgrades due to a corrupted mine cache. (PR[#4934](https://github.com/scality/metalk8s/pull/4934)) -- Fix a bug where the salt mine fails prints many warnings when dex is disabled. +- Fix a bug where the salt mine fails and prints many warnings when dex is disabled. 
(PR[#4934](https://github.com/scality/metalk8s/pull/4934)) diff --git a/pillar/metalk8s/roles/ca.sls b/pillar/metalk8s/roles/ca.sls index f343084e30..d2d5e9c269 100644 --- a/pillar/metalk8s/roles/ca.sls +++ b/pillar/metalk8s/roles/ca.sls @@ -72,6 +72,7 @@ x509_signing_policies: - keyUsage: critical digitalSignature, keyEncipherment - extendedKeyUsage: serverAuth - authorityKeyIdentifier: keyid +{%- if pillar.addons.dex.enabled %} dex_server_policy: - minions: '*' - signing_private_key: /etc/metalk8s/pki/dex/ca.key @@ -79,6 +80,7 @@ x509_signing_policies: - keyUsage: critical digitalSignature, keyEncipherment - extendedKeyUsage: serverAuth - authorityKeyIdentifier: keyid +{%- endif %} backup_server_policy: - minions: '*' - signing_private_key: /etc/metalk8s/pki/backup-server/ca.key diff --git a/scripts/upgrade.sh.in b/scripts/upgrade.sh.in index a9ee7342a8..4267fda745 100755 --- a/scripts/upgrade.sh.in +++ b/scripts/upgrade.sh.in @@ -223,7 +223,7 @@ run "Performing Pre-Upgrade checks" precheck_upgrade "$BASE_DIR"/backup.sh --no-replication run "Upgrading bootstrap" upgrade_bootstrap -run "Refreshing Salt mine on bootstrap" flush_and_refresh_mine +run "Refreshing Salt mine on nodes" flush_and_refresh_mine run "Setting cluster version to $DESTINATION_VERSION" patch_kubesystem_namespace run "Launching the pre-upgrade" launch_pre_upgrade run "Upgrading etcd cluster" upgrade_etcd