From c7ca53005e63a748089145a84446121c7efdf29c Mon Sep 17 00:00:00 2001 From: Alex Wu Date: Thu, 30 Apr 2026 12:18:20 +0800 Subject: [PATCH 1/7] add integration test Signed-off-by: Alex Wu --- .github/workflows/flyte-binary-v2.yml | 72 +++++++++++++++++++++++++++ runs/test/devbox/hello.py | 42 ++++++++++++++++ runs/test/devbox/start-devbox.sh | 55 ++++++++++++++++++++ 3 files changed, 169 insertions(+) create mode 100644 runs/test/devbox/hello.py create mode 100755 runs/test/devbox/start-devbox.sh diff --git a/.github/workflows/flyte-binary-v2.yml b/.github/workflows/flyte-binary-v2.yml index 8ae7f888ab5..745e8414639 100644 --- a/.github/workflows/flyte-binary-v2.yml +++ b/.github/workflows/flyte-binary-v2.yml @@ -185,6 +185,12 @@ jobs: run: | mkdir -p /tmp/cpu-oci tar -xf /tmp/cpu-oci.tar -C /tmp/cpu-oci + - name: Upload CPU OCI archive for integration test + uses: actions/upload-artifact@v4 + with: + name: devbox-cpu-oci + path: /tmp/cpu-oci.tar + retention-days: 1 - name: Push CPU multi-arch image if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }} uses: docker/build-push-action@v6 @@ -225,3 +231,69 @@ jobs: push: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }} cache-from: type=gha,scope=demo-gpu cache-to: type=gha,mode=max,scope=demo-gpu + + devbox-integration-test: + runs-on: ubuntu-latest + needs: [build-and-push-devbox-bundled-image] + timeout-minutes: 25 + env: + DEVBOX_IMAGE: flyte-devbox:ci + FLYTE_WORKER_IMAGE: ghcr.io/flyteorg/flyte:py3.11-v2.0.0b55 + FLYTE_SDK_VERSION: "2.0.0b55" + steps: + - uses: actions/checkout@v4 + - uses: actions/download-artifact@v4 + with: + name: devbox-cpu-oci + path: /tmp + - name: Load devbox image into Docker + run: | + sudo apt-get update + sudo apt-get install -y skopeo + mkdir -p /tmp/cpu-oci + tar -xf /tmp/cpu-oci.tar -C /tmp/cpu-oci + skopeo copy --override-os linux --override-arch amd64 \ + oci:/tmp/cpu-oci docker-daemon:${{ env.DEVBOX_IMAGE }} + docker images ${{ 
env.DEVBOX_IMAGE }} + - name: Start devbox cluster + run: ./runs/test/devbox/start-devbox.sh + - name: Pre-pull worker image into k3s containerd + # Avoids the in-cluster pull dominating the test budget when the run is submitted. + run: docker exec flyte-devbox crictl pull ${{ env.FLYTE_WORKER_IMAGE }} + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Install flyte SDK + run: pip install "flyte==${{ env.FLYTE_SDK_VERSION }}" + - name: Configure SDK + run: | + mkdir -p "$HOME/.flyte" + cat > "$HOME/.flyte/config.yaml" <<'EOF' + admin: + endpoint: dns:///localhost:30080 + insecure: true + task: + domain: development + project: flytesnacks + org: localhost + image: + builder: local + EOF + - name: Submit task and wait for SUCCEEDED + run: python runs/test/devbox/hello.py + - name: Diagnostics on failure + if: failure() + run: | + echo "::group::docker logs" + docker logs flyte-devbox 2>&1 | tail -200 || true + echo "::endgroup::" + echo "::group::pods" + kubectl get pods -A || true + echo "::endgroup::" + echo "::group::flyte-binary logs" + kubectl logs -n flyte deploy/flyte-binary --tail=500 || true + echo "::endgroup::" + echo "::group::describe failing pods" + kubectl get pods -A --no-headers | awk '$4 != "Running" && $4 != "Completed"' \ + | while read ns name _; do kubectl describe pod -n "$ns" "$name" || true; done + echo "::endgroup::" diff --git a/runs/test/devbox/hello.py b/runs/test/devbox/hello.py new file mode 100644 index 00000000000..a2eb1103271 --- /dev/null +++ b/runs/test/devbox/hello.py @@ -0,0 +1,42 @@ +"""Submits a tiny task to a running devbox and asserts it reaches SUCCEEDED. + +Used by .github/workflows/flyte-binary-v2.yml as the post-build integration +gate: if this script exits non-zero, the just-built devbox image is broken. + +The SDK reads connection info from $HOME/.flyte/config.yaml. 
Storage for +fast-registration uploads is handled server-side by the devbox's DataProxy + +rustfs, so the SDK doesn't need explicit S3 credentials. The worker image +comes from $FLYTE_WORKER_IMAGE so CI can pre-pull it into k3s before +submission and keep this script aligned with whatever tag the workflow +loaded. +""" +import os +import sys + +import flyte + +WORKER_IMAGE = os.environ["FLYTE_WORKER_IMAGE"] + +env = flyte.TaskEnvironment( + name="devbox_ci_smoke", + image=WORKER_IMAGE, +) + + +@env.task +def add_one(x: int) -> int: + return x + 1 + + +def main() -> int: + flyte.init_from_config() + run = flyte.run(add_one, x=41) + print(f"run.result={run.result!r}") + if run.result != 42: + print(f"FAIL: expected 42, got {run.result!r}", file=sys.stderr) + return 1 + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/runs/test/devbox/start-devbox.sh b/runs/test/devbox/start-devbox.sh new file mode 100755 index 00000000000..c9f769c7f55 --- /dev/null +++ b/runs/test/devbox/start-devbox.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +# Boots the bundled flyte-devbox image as a single-container k3s cluster +# suitable for integration tests. Mirrors `make start` in +# docker/devbox-bundled/Makefile but headless and CI-friendly: writes +# kubeconfig to $PWD/.kube/kubeconfig, exports KUBECONFIG via $GITHUB_ENV, +# and waits for the flyte-binary pod to report Ready before returning. 
+set -euo pipefail + +IMAGE="${DEVBOX_IMAGE:-flyte-devbox:ci}" +NAME="${DEVBOX_NAME:-flyte-devbox}" +KUBE_DIR="${KUBE_DIR:-$PWD/.kube}" +READY_TIMEOUT="${READY_TIMEOUT:-300}" + +mkdir -p "$KUBE_DIR" +rm -f "$KUBE_DIR/kubeconfig" + +docker run -d --rm --privileged --name "$NAME" \ + --add-host host.docker.internal:host-gateway \ + -e K3S_KUBECONFIG_OUTPUT=/.kube/kubeconfig \ + -v "$KUBE_DIR":/.kube \ + -p 6443:6443 \ + -p 30000:30000 \ + -p 30001:5432 \ + -p 30002:30002 \ + -p 30080:30080 \ + -p 30081:30081 \ + "$IMAGE" + +echo "Waiting for kubeconfig (timeout ${READY_TIMEOUT}s)..." +deadline=$(( $(date +%s) + READY_TIMEOUT )) +until [ -s "$KUBE_DIR/kubeconfig" ]; do + if [ "$(date +%s)" -gt "$deadline" ]; then + echo "ERROR: kubeconfig not written within ${READY_TIMEOUT}s" >&2 + docker logs "$NAME" >&2 || true + exit 1 + fi + sleep 1 +done +docker exec "$NAME" chown "$(id -u):$(id -g)" /.kube/kubeconfig + +KUBECONFIG="$KUBE_DIR/kubeconfig" +export KUBECONFIG +if [ -n "${GITHUB_ENV:-}" ]; then + echo "KUBECONFIG=$KUBECONFIG" >> "$GITHUB_ENV" +fi + +echo "Waiting for flyte-binary pod to be Ready..." +kubectl wait --for=condition=Ready pod -n flyte \ + -l app.kubernetes.io/name=flyte-binary \ + --timeout="${READY_TIMEOUT}s" + +echo "Devbox ready." 
+echo " Connect API: http://localhost:30080" +echo " rustfs S3: http://localhost:30002" +echo " Postgres: localhost:30001" From 2e3cb4277e746fc694855de12c77789de6abebe0 Mon Sep 17 00:00:00 2001 From: Alex Wu Date: Thu, 30 Apr 2026 12:48:16 +0800 Subject: [PATCH 2/7] fix timeout condition Signed-off-by: Alex Wu --- runs/test/devbox/start-devbox.sh | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/runs/test/devbox/start-devbox.sh b/runs/test/devbox/start-devbox.sh index c9f769c7f55..685e5d6b283 100755 --- a/runs/test/devbox/start-devbox.sh +++ b/runs/test/devbox/start-devbox.sh @@ -44,10 +44,30 @@ if [ -n "${GITHUB_ENV:-}" ]; then echo "KUBECONFIG=$KUBECONFIG" >> "$GITHUB_ENV" fi -echo "Waiting for flyte-binary pod to be Ready..." -kubectl wait --for=condition=Ready pod -n flyte \ - -l app.kubernetes.io/name=flyte-binary \ - --timeout="${READY_TIMEOUT}s" +echo "Waiting for flyte namespace..." +until kubectl get ns flyte >/dev/null 2>&1; do + if [ "$(date +%s)" -gt "$deadline" ]; then + echo "ERROR: flyte namespace not created within ${READY_TIMEOUT}s" >&2 + kubectl get ns >&2 || true + exit 1 + fi + sleep 2 +done + +echo "Waiting for flyte-binary deployment to exist..." +until kubectl get deploy -n flyte flyte-binary >/dev/null 2>&1; do + if [ "$(date +%s)" -gt "$deadline" ]; then + echo "ERROR: flyte-binary deployment not created within ${READY_TIMEOUT}s" >&2 + kubectl get all -A >&2 || true + exit 1 + fi + sleep 2 +done + +remaining=$(( deadline - $(date +%s) )) +[ "$remaining" -lt 30 ] && remaining=30 +echo "Waiting for flyte-binary rollout (timeout ${remaining}s)..." +kubectl rollout status deploy/flyte-binary -n flyte --timeout="${remaining}s" echo "Devbox ready." 
echo " Connect API: http://localhost:30080" From 7785420812bdee706c59f1775a7fee7214ee0dd5 Mon Sep 17 00:00:00 2001 From: Alex Wu Date: Thu, 30 Apr 2026 15:24:33 +0800 Subject: [PATCH 3/7] mark output Signed-off-by: Alex Wu --- .github/workflows/flyte-binary-v2.yml | 2 +- runs/test/devbox/hello.py | 14 ++++++++++++-- runs/test/devbox/start-devbox.sh | 21 +++++++++++++++++++++ 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/.github/workflows/flyte-binary-v2.yml b/.github/workflows/flyte-binary-v2.yml index 745e8414639..9257f2b9e6d 100644 --- a/.github/workflows/flyte-binary-v2.yml +++ b/.github/workflows/flyte-binary-v2.yml @@ -271,7 +271,7 @@ jobs: cat > "$HOME/.flyte/config.yaml" <<'EOF' admin: endpoint: dns:///localhost:30080 - insecure: true + insecure: True task: domain: development project: flytesnacks diff --git a/runs/test/devbox/hello.py b/runs/test/devbox/hello.py index a2eb1103271..b88ad39945f 100644 --- a/runs/test/devbox/hello.py +++ b/runs/test/devbox/hello.py @@ -29,8 +29,18 @@ def add_one(x: int) -> int: def main() -> int: - flyte.init_from_config() - run = flyte.run(add_one, x=41) + try: + flyte.init_from_config() + run = flyte.run(add_one, x=41) + except Exception as e: + # The SDK wraps storage errors with a generic message; walk the chain + # so CI logs show the real cause (network, signing, etc.). + cur, depth = e, 0 + while cur is not None and depth < 10: + print(f" [{depth}] {type(cur).__name__}: {cur}", file=sys.stderr) + cur = cur.__cause__ or cur.__context__ + depth += 1 + raise print(f"run.result={run.result!r}") if run.result != 42: print(f"FAIL: expected 42, got {run.result!r}", file=sys.stderr) diff --git a/runs/test/devbox/start-devbox.sh b/runs/test/devbox/start-devbox.sh index 685e5d6b283..f888c082ac3 100755 --- a/runs/test/devbox/start-devbox.sh +++ b/runs/test/devbox/start-devbox.sh @@ -69,6 +69,27 @@ remaining=$(( deadline - $(date +%s) )) echo "Waiting for flyte-binary rollout (timeout ${remaining}s)..." 
kubectl rollout status deploy/flyte-binary -n flyte --timeout="${remaining}s" +# Bridge rustfs.flyte:9000 -> localhost:30002 (the rustfs NodePort). +# DataProxy mints signed URLs whose host is the in-cluster storage endpoint +# (http://rustfs.flyte:9000), which is unreachable from the runner. We add a +# /etc/hosts entry and a TCP forwarder so the SDK's PUT to the signed URL +# resolves to the published NodePort and lands on the rustfs pod. +if ! grep -q '[[:space:]]rustfs\.flyte\b' /etc/hosts; then + echo "127.0.0.1 rustfs.flyte" | sudo tee -a /etc/hosts >/dev/null +fi +nohup socat TCP-LISTEN:9000,reuseaddr,fork TCP:127.0.0.1:30002 \ + >/tmp/rustfs-forward.log 2>&1 & +disown +forward_deadline=$(( $(date +%s) + 15 )) +until nc -z 127.0.0.1 9000 2>/dev/null; do + if [ "$(date +%s)" -gt "$forward_deadline" ]; then + echo "ERROR: rustfs.flyte:9000 forward did not open" >&2 + cat /tmp/rustfs-forward.log >&2 || true + exit 1 + fi + sleep 0.3 +done + echo "Devbox ready." echo " Connect API: http://localhost:30080" echo " rustfs S3: http://localhost:30002" From e19b1e74041090b3995fa52b1723730911de4314 Mon Sep 17 00:00:00 2001 From: Alex Wu Date: Thu, 30 Apr 2026 16:14:08 +0800 Subject: [PATCH 4/7] fix install Signed-off-by: Alex Wu --- runs/test/devbox/start-devbox.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/runs/test/devbox/start-devbox.sh b/runs/test/devbox/start-devbox.sh index f888c082ac3..4dec186d90b 100755 --- a/runs/test/devbox/start-devbox.sh +++ b/runs/test/devbox/start-devbox.sh @@ -77,6 +77,9 @@ kubectl rollout status deploy/flyte-binary -n flyte --timeout="${remaining}s" if ! grep -q '[[:space:]]rustfs\.flyte\b' /etc/hosts; then echo "127.0.0.1 rustfs.flyte" | sudo tee -a /etc/hosts >/dev/null fi +if ! 
command -v socat >/dev/null 2>&1; then + sudo apt-get update -qq && sudo apt-get install -y -qq socat +fi nohup socat TCP-LISTEN:9000,reuseaddr,fork TCP:127.0.0.1:30002 \ >/tmp/rustfs-forward.log 2>&1 & disown From a3338c4c5fbda2da402d225916dec6076e5c44c2 Mon Sep 17 00:00:00 2001 From: Alex Wu Date: Thu, 30 Apr 2026 16:45:50 +0800 Subject: [PATCH 5/7] fix setting: port-forward flyte-binary and point SDK at localhost:8090 Signed-off-by: Alex Wu --- .github/workflows/flyte-binary-v2.yml | 5 ++++- runs/test/devbox/start-devbox.sh | 21 +++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/.github/workflows/flyte-binary-v2.yml b/.github/workflows/flyte-binary-v2.yml index 9257f2b9e6d..c820bfee4f2 100644 --- a/.github/workflows/flyte-binary-v2.yml +++ b/.github/workflows/flyte-binary-v2.yml @@ -266,11 +266,14 @@ jobs: - name: Install flyte SDK run: pip install "flyte==${{ env.FLYTE_SDK_VERSION }}" - name: Configure SDK + # Talk directly to flyte-binary (port-forwarded by start-devbox.sh). + # The Traefik NodePort 30080 doesn't speak h2c reliably for gRPC, so + # SDK calls fall over with cryptic ValueErrors when routed through it. run: | mkdir -p "$HOME/.flyte" cat > "$HOME/.flyte/config.yaml" <<'EOF' admin: - endpoint: dns:///localhost:30080 + endpoint: dns:///localhost:8090 insecure: True task: domain: development diff --git a/runs/test/devbox/start-devbox.sh b/runs/test/devbox/start-devbox.sh index 4dec186d90b..7edfb337975 100755 --- a/runs/test/devbox/start-devbox.sh +++ b/runs/test/devbox/start-devbox.sh @@ -93,7 +93,28 @@ until nc -z 127.0.0.1 9000 2>/dev/null; do sleep 0.3 done +# Port-forward directly to the flyte-binary ClusterIP service. +# The bundled Traefik on NodePort 30080 doesn't reliably do h2c, so the +# Python SDK's gRPC client (HTTP/2 cleartext) fails through it. Talking +# directly to svc/flyte-binary:8090 sidesteps the proxy entirely. 
+nohup kubectl port-forward -n flyte svc/flyte-binary 8090:8090 \ + --address 127.0.0.1 \ + >/tmp/flyte-binary-pf.log 2>&1 & +disown +pf_deadline=$(( $(date +%s) + 15 )) +until nc -z 127.0.0.1 8090 2>/dev/null; do + if [ "$(date +%s)" -gt "$pf_deadline" ]; then + echo "ERROR: flyte-binary port-forward did not open" >&2 + cat /tmp/flyte-binary-pf.log >&2 || true + exit 1 + fi + sleep 0.3 +done + echo "Devbox ready." +echo " flyte-binary (direct): http://localhost:8090" +echo " flyte-binary (Traefik): http://localhost:30080" +echo " rustfs S3: http://localhost:30002 (also rustfs.flyte:9000)" echo " Connect API: http://localhost:30080" echo " rustfs S3: http://localhost:30002" echo " Postgres: localhost:30001" From 6983e9c3c946fb3a91cad0bf2aa56800d81130f3 Mon Sep 17 00:00:00 2001 From: Alex Wu Date: Thu, 30 Apr 2026 17:14:56 +0800 Subject: [PATCH 6/7] fix setting: port-forward svc/flyte-binary-http instead of svc/flyte-binary Signed-off-by: Alex Wu --- runs/test/devbox/start-devbox.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runs/test/devbox/start-devbox.sh b/runs/test/devbox/start-devbox.sh index 7edfb337975..8b5d620a415 100755 --- a/runs/test/devbox/start-devbox.sh +++ b/runs/test/devbox/start-devbox.sh @@ -97,7 +97,7 @@ done # The bundled Traefik on NodePort 30080 doesn't reliably do h2c, so the # Python SDK's gRPC client (HTTP/2 cleartext) fails through it. Talking # directly to svc/flyte-binary:8090 sidesteps the proxy entirely. 
-nohup kubectl port-forward -n flyte svc/flyte-binary 8090:8090 \ +nohup kubectl port-forward -n flyte svc/flyte-binary-http 8090:8090 \ --address 127.0.0.1 \ >/tmp/flyte-binary-pf.log 2>&1 & disown From 286589ae9cacc0c93876c89b472646793a649c64 Mon Sep 17 00:00:00 2001 From: Alex Wu Date: Thu, 30 Apr 2026 17:58:33 +0800 Subject: [PATCH 7/7] add network test Signed-off-by: Alex Wu --- .github/workflows/flyte-binary-v2.yml | 33 +++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/.github/workflows/flyte-binary-v2.yml b/.github/workflows/flyte-binary-v2.yml index c820bfee4f2..1ce5fb5f7bb 100644 --- a/.github/workflows/flyte-binary-v2.yml +++ b/.github/workflows/flyte-binary-v2.yml @@ -282,6 +282,39 @@ jobs: image: builder: local EOF + - name: Probe DataProxy reachability + # Diagnostic: hit DataProxy.CreateUploadLocation via plain HTTP/1.1 + # Connect protocol on both endpoints. This bypasses h2c questions and + # tells us (a) whether DataProxy is functional, (b) what host the + # signed URL points at, (c) whether the rustfs.flyte:9000 bridge is + # actually wired up correctly. 
+ run: | + set +e + set -x + payload='{"project":"flytesnacks","domain":"development","org":"localhost","filename":"probe.tar.gz","filename_root":"ci-probe","content_md5":"dGVzdC1jb250ZW50LWhhc2g=","expires_in":"1800s","content_length":1024}' + for ep in "http://localhost:8090" "http://localhost:30080"; do + echo "::group::CreateUploadLocation @ $ep" + curl -sS -i \ + -H 'Content-Type: application/json' \ + -X POST "$ep/flyteidl2.dataproxy.DataProxyService/CreateUploadLocation" \ + --data "$payload" + echo + echo "::endgroup::" + done + echo "::group::PUT to signed URL via rustfs.flyte:9000 bridge" + signed_url=$(curl -sS \ + -H 'Content-Type: application/json' \ + -X POST "http://localhost:8090/flyteidl2.dataproxy.DataProxyService/CreateUploadLocation" \ + --data "$payload" | python -c 'import sys,json; print(json.load(sys.stdin).get("signedUrl",""))') + echo "signed_url=$signed_url" + if [ -n "$signed_url" ]; then + curl -sS -i -X PUT --data-binary "hello" "$signed_url" || true + fi + echo "::endgroup::" + echo "::group::rustfs.flyte resolution + reachability" + getent hosts rustfs.flyte || true + curl -sS -i http://rustfs.flyte:9000/ || true + echo "::endgroup::" - name: Submit task and wait for SUCCEEDED run: python runs/test/devbox/hello.py - name: Diagnostics on failure