diff --git a/CHANGELOG.md b/CHANGELOG.md index f195f607f1..1dc52d2663 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,15 @@ [v0.0.1-beta.2](https://github.com/scality/disk-management-agent/releases/tag/v0.0.1-beta.2) (PR[#4933](https://github.com/scality/metalk8s/pull/4933)) +### Bug Fixes + +- Fix a bug where the salt mine fails silently during upgrades due to a corrupted mine cache. + (PR[#4934](https://github.com/scality/metalk8s/pull/4934)) + +- Fix a bug where the salt mine fails and prints many warnings when dex is disabled. + (PR[#4934](https://github.com/scality/metalk8s/pull/4934)) + + ## Release 133.0.4 ## Release 133.0.3 diff --git a/pillar/metalk8s/roles/ca.sls b/pillar/metalk8s/roles/ca.sls index bb17740ada..d2d5e9c269 100644 --- a/pillar/metalk8s/roles/ca.sls +++ b/pillar/metalk8s/roles/ca.sls @@ -15,9 +15,11 @@ mine_functions: - mine_function: hashutil.base64_encodefile - /etc/kubernetes/pki/sa.pub +{%- if pillar.addons.dex.enabled %} dex_ca_b64: - mine_function: hashutil.base64_encodefile - /etc/metalk8s/pki/dex/ca.crt +{%- endif %} ingress_ca_b64: - mine_function: hashutil.base64_encodefile @@ -70,6 +72,7 @@ x509_signing_policies: - keyUsage: critical digitalSignature, keyEncipherment - extendedKeyUsage: serverAuth - authorityKeyIdentifier: keyid +{%- if pillar.addons.dex.enabled %} dex_server_policy: - minions: '*' - signing_private_key: /etc/metalk8s/pki/dex/ca.key @@ -77,6 +80,7 @@ x509_signing_policies: - keyUsage: critical digitalSignature, keyEncipherment - extendedKeyUsage: serverAuth - authorityKeyIdentifier: keyid +{%- endif %} backup_server_policy: - minions: '*' - signing_private_key: /etc/metalk8s/pki/backup-server/ca.key diff --git a/scripts/upgrade.sh.in b/scripts/upgrade.sh.in index 6c80ec0ff9..4267fda745 100755 --- a/scripts/upgrade.sh.in +++ b/scripts/upgrade.sh.in @@ -92,6 +92,20 @@ upgrade_bootstrap () { metalk8s.salt.master.installed saltenv="$SALTENV" } +flush_and_refresh_mine() { + # After upgrading salt-master, the 
mine cache files on the master may have + # been corrupted by a non-atomic write interrupted by SIGTERM (kubelet + # kills the container when the manifest changes). mine.update alone cannot + # fix this: it reads the corrupt file first, fails, and silently discards + # the new data. mine.flush deletes the file (no read needed), after which + # mine.update writes a clean cache from scratch; hence flush runs first. + # Both calls target all minions ('*') so the master has a clean, up-to-date + # mine cache for the entire cluster before any upgrade state queries it. + SALT_MASTER_CALL=("${EXEC_CONTAINER_COMMAND[@]}" "$(get_salt_container)") + "${SALT_MASTER_CALL[@]}" salt '*' mine.flush + "${SALT_MASTER_CALL[@]}" salt '*' mine.update +} + launch_pre_upgrade () { SALT_MASTER_CALL=("${EXEC_CONTAINER_COMMAND[@]}" "$(get_salt_container)") "${SALT_MASTER_CALL[@]}" salt-run saltutil.sync_all \ @@ -209,6 +223,7 @@ run "Performing Pre-Upgrade checks" precheck_upgrade "$BASE_DIR"/backup.sh --no-replication run "Upgrading bootstrap" upgrade_bootstrap +run "Refreshing Salt mine on nodes" flush_and_refresh_mine run "Setting cluster version to $DESTINATION_VERSION" patch_kubesystem_namespace run "Launching the pre-upgrade" launch_pre_upgrade run "Upgrading etcd cluster" upgrade_etcd