diff --git a/packages/k8s/src/k8s/index.ts b/packages/k8s/src/k8s/index.ts index ae773da3..1de535a7 100644 --- a/packages/k8s/src/k8s/index.ts +++ b/packages/k8s/src/k8s/index.ts @@ -29,12 +29,121 @@ const kc = new k8s.KubeConfig() kc.loadFromDefault() -const k8sApi = kc.makeApiClient(k8s.CoreV1Api) -const k8sBatchV1Api = kc.makeApiClient(k8s.BatchV1Api) -const k8sAuthorizationV1Api = kc.makeApiClient(k8s.AuthorizationV1Api) - const DEFAULT_WAIT_FOR_POD_TIME_SECONDS = 10 * 60 // 10 min +const RETRYABLE_STATUS_CODES = new Set([408, 429, 500, 502, 503, 504]) +const RETRYABLE_NETWORK_CODES = new Set([ + 'ECONNREFUSED', + 'ECONNRESET', + 'ETIMEDOUT', + 'ENETUNREACH', + 'EAI_AGAIN', + 'ENOTFOUND' +]) +const MAX_RETRIES = 3 +const RETRY_BASE_DELAY_MS = 1000 + +export function isRetryableError(err: unknown): boolean { + if (err instanceof k8s.ApiException) { + return RETRYABLE_STATUS_CODES.has(err.code) + } + + let current: unknown = err + while (current instanceof Error) { + if ( + 'code' in current && + typeof current.code === 'string' && + RETRYABLE_NETWORK_CODES.has(current.code) + ) { + return true + } + current = (current as { cause?: unknown }).cause + } + + return false +} + +function retryDelay(attempt: number): number { + return RETRY_BASE_DELAY_MS * 2 ** attempt * (0.5 + Math.random()) +} + +export function retryAfterDelay(err: k8s.ApiException, attempt: number): number { + const headerRetrySeconds = err.headers?.['retry-after'] ?? err.headers?.['Retry-After'] + if (!headerRetrySeconds) { + return retryDelay(attempt) + } + + const seconds = Number(headerRetrySeconds) + if (!Number.isFinite(seconds) || seconds <= 0) { + return retryDelay(attempt) + } + + // Cap the delay to 30 seconds + const maxDelaySeconds = 30; + return Math.min(seconds * 1000, maxDelaySeconds * 1000) +} + +function describeError(err: unknown): string { + if (err instanceof k8s.ApiException) { + return `status ${err.code}` + } + let current: unknown = err + while (current instanceof Error) { + if ( + 'code' in current && + typeof current.code === 'string' && + RETRYABLE_NETWORK_CODES.has(current.code) + ) { + return current.code + } + current = (current as { cause?: unknown }).cause + } + return String(err) +} + +function withRetryClient(client: T): T { + const callWithRetry = async ( + fn: (...args: unknown[]) => unknown, + name: string, + args: unknown[] + ): Promise => { + for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) { + try { + return await fn(...args) + } catch (err) { + if (!isRetryableError(err) || attempt === MAX_RETRIES) { + throw err + } + const delay = + err instanceof k8s.ApiException && err.code === 429 + ? retryAfterDelay(err, attempt) + : retryDelay(attempt) + core.warning( + `K8s API call ${name} failed (${describeError(err)}), retrying in ${Math.round(delay)}ms (attempt ${attempt + 1}/${MAX_RETRIES})` + ) + await sleep(delay) + } + } + } + + return new Proxy(client, { + get(target, prop, receiver) { + const value = Reflect.get(target, prop, receiver) + if (typeof value !== 'function') { + return value + } + return async (...args: unknown[]) => + callWithRetry(value.bind(target), String(prop), args) + } + }) +} + +const k8sApi = withRetryClient(kc.makeApiClient(k8s.CoreV1Api)) +const k8sBatchV1Api = withRetryClient(kc.makeApiClient(k8s.BatchV1Api)) +const k8sAuthorizationV1Api = withRetryClient( + kc.makeApiClient(k8s.AuthorizationV1Api) +) + export const requiredPermissions = [ { group: '', @@ -176,10 +285,20 @@ export async function createJobPod( mergePodSpecWithOptions(appPod.spec, extension.spec) } - return await k8sApi.createNamespacedPod({ - namespace: namespace(), - body: appPod - }) + try { + return await k8sApi.createNamespacedPod({ + namespace: namespace(), + body: appPod + }) + } catch (err) { + if (err instanceof k8s.ApiException && err.code === 409) { + return await k8sApi.readNamespacedPod({ + name, + namespace: namespace() + }) + } + throw err + } } export async function createContainerStepPod( @@ -229,18 +348,35 @@ export async function createContainerStepPod( mergePodSpecWithOptions(appPod.spec, extension.spec) } - return await k8sApi.createNamespacedPod({ - namespace: namespace(), - body: appPod - }) + try { + return await k8sApi.createNamespacedPod({ + namespace: namespace(), + body: appPod + }) + } catch (err) { + if (err instanceof k8s.ApiException && err.code === 409) { + return await k8sApi.readNamespacedPod({ + name, + namespace: namespace() + }) + } + throw err + } } export async function deletePod(name: string): Promise { - await k8sApi.deleteNamespacedPod({ - name, - namespace: namespace(), - gracePeriodSeconds: 0 - }) + try { + await k8sApi.deleteNamespacedPod({ + name, + namespace: namespace(), + gracePeriodSeconds: 0 + }) + } catch (err) { + if (err instanceof k8s.ApiException && err.code === 404) { + return + } + throw err + } } export async function execPodStep( @@ -614,11 +750,20 @@ export async function createDockerSecret( 'base64' ) } - - return await k8sApi.createNamespacedSecret({ - namespace: namespace(), - body: secret - }) + try { + return await k8sApi.createNamespacedSecret({ + namespace: namespace(), + body: secret + }) + } catch (err) { + if (err instanceof k8s.ApiException && err.code === 409) { + return await k8sApi.readNamespacedSecret({ + name: secretName, + namespace: namespace() + }) + } + throw err + } } export async function createSecretForEnvs(envs: { @@ -642,18 +787,32 @@ export async function createSecretForEnvs(envs: { secret.data[key] = Buffer.from(value).toString('base64') } - await k8sApi.createNamespacedSecret({ - namespace: namespace(), - body: secret - }) + try { + await k8sApi.createNamespacedSecret({ + namespace: namespace(), + body: secret + }) + } catch (err) { + if (!(err instanceof k8s.ApiException && err.code === 409)) { + throw err + } + } + return secretName } export async function deleteSecret(name: string): Promise { - await k8sApi.deleteNamespacedSecret({ - name, - namespace: namespace() - }) + try { + await k8sApi.deleteNamespacedSecret({ + name, + namespace: namespace() + }) + } catch (err) { + if (err instanceof k8s.ApiException && err.code === 404) { + return + } + throw err + } } export async function pruneSecrets(): Promise { diff --git a/packages/k8s/tests/k8s-retry-test.ts b/packages/k8s/tests/k8s-retry-test.ts new file mode 100644 index 00000000..ed99461f --- /dev/null +++ b/packages/k8s/tests/k8s-retry-test.ts @@ -0,0 +1,165 @@ +import * as k8s from '@kubernetes/client-node' +import { isRetryableError, retryAfterDelay } from '../src/k8s' + +function apiException( + code: number, + headers: { [key: string]: string } = {} +): k8s.ApiException { + return new k8s.ApiException(code, `status ${code}`, {}, headers) +} + +function networkError(code: string, cause?: unknown): Error { + const err = new Error(`network error ${code}`) as Error & { + code: string + cause?: unknown + } + err.code = code + if (cause !== undefined) { + err.cause = cause + } + return err +} + +describe('isRetryableError', () => { + it.each([408, 429, 500, 502, 503, 504])( + 'returns true for ApiException with status %i', + code => { + expect(isRetryableError(apiException(code))).toBe(true) + } + ) + + it.each([400, 401, 403, 404, 409, 422])( + 'returns false for ApiException with status %i', + code => { + expect(isRetryableError(apiException(code))).toBe(false) + } + ) + + it('returns false for ApiException even if cause chain has a retryable network code', () => { + // The ApiException branch does not descend into the cause chain — + // if the API responded 400, retrying is pointless regardless of + // what happened underneath. Documents current precedence. + const inner = networkError('ECONNRESET') + const err = apiException(400) + ;(err as unknown as { cause: unknown }).cause = inner + expect(isRetryableError(err)).toBe(false) + }) + + it.each([ + 'ECONNREFUSED', + 'ECONNRESET', + 'ETIMEDOUT', + 'ENETUNREACH', + 'EAI_AGAIN', + 'ENOTFOUND' + ])('returns true for plain Error with network code %s', code => { + expect(isRetryableError(networkError(code))).toBe(true) + }) + + it('returns false for plain Error with non-retryable code', () => { + expect(isRetryableError(networkError('EACCES'))).toBe(false) + }) + + it('returns true when a retryable code is nested in the cause chain', () => { + const root = networkError('ECONNRESET') + const middle = new Error('middle') as Error & { cause: unknown } + middle.cause = root + const outer = new Error('outer') as Error & { cause: unknown } + outer.cause = middle + expect(isRetryableError(outer)).toBe(true) + }) + + it('returns false for null', () => { + expect(isRetryableError(null)).toBe(false) + }) + + it('returns false for undefined', () => { + expect(isRetryableError(undefined)).toBe(false) + }) + + it('returns false for a string', () => { + expect(isRetryableError('ECONNRESET')).toBe(false) + }) + + it('returns false for Error with no code property', () => { + expect(isRetryableError(new Error('boom'))).toBe(false) + }) + + it('returns false for Error whose code is non-string', () => { + const err = new Error('boom') as Error & { code: number } + err.code = 42 + expect(isRetryableError(err)).toBe(false) + }) +}) + +describe('retryAfterDelay', () => { + it('falls back to exponential backoff when Retry-After header is missing', () => { + const delay = retryAfterDelay(apiException(429), 0) + // retryDelay(0) = 1000 * 1 * (0.5..1.5) = 500..1500 + expect(delay).toBeGreaterThanOrEqual(500) + expect(delay).toBeLessThanOrEqual(1500) + }) + + it('uses lowercase retry-after header value in seconds', () => { + const delay = retryAfterDelay( + apiException(429, { 'retry-after': '5' }), + 0 + ) + expect(delay).toBe(5000) + }) + + it('uses capitalized Retry-After header', () => { + const delay = retryAfterDelay( + apiException(429, { 'Retry-After': '7' }), + 0 + ) + expect(delay).toBe(7000) + }) + + it('caps the delay at 30 seconds', () => { + const delay = retryAfterDelay( + apiException(429, { 'retry-after': '600' }), + 0 + ) + expect(delay).toBe(30_000) + }) + + it('falls back when Retry-After value is not numeric (HTTP-date format)', () => { + // Per RFC 7231 the header may be an HTTP-date; we only handle seconds. + const delay = retryAfterDelay( + apiException(429, { 'retry-after': 'Wed, 21 Oct 2026 07:28:00 GMT' }), + 1 + ) + // retryDelay(1) = 2000 * (0.5..1.5) = 1000..3000 + expect(delay).toBeGreaterThanOrEqual(1000) + expect(delay).toBeLessThanOrEqual(3000) + }) + + it('falls back when Retry-After is zero', () => { + const delay = retryAfterDelay( + apiException(429, { 'retry-after': '0' }), + 2 + ) + // retryDelay(2) = 4000 * (0.5..1.5) = 2000..6000 + expect(delay).toBeGreaterThanOrEqual(2000) + expect(delay).toBeLessThanOrEqual(6000) + }) + + it('falls back when Retry-After is negative', () => { + const delay = retryAfterDelay( + apiException(429, { 'retry-after': '-5' }), + 0 + ) + expect(delay).toBeGreaterThanOrEqual(500) + expect(delay).toBeLessThanOrEqual(1500) + }) + + it('falls back when Retry-After is empty', () => { + const delay = retryAfterDelay( + apiException(429, { 'retry-after': '' }), + 0 + ) + expect(delay).toBeGreaterThanOrEqual(500) + expect(delay).toBeLessThanOrEqual(1500) + }) +})