-
Notifications
You must be signed in to change notification settings - Fork 158
feat(interceptor): add SSE streaming callback for cold-start scaling #1592
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,13 +1,17 @@ | ||
| package middleware | ||
|
|
||
| import ( | ||
| "bytes" | ||
| "context" | ||
| "encoding/json" | ||
| "fmt" | ||
| "io" | ||
| "net/http" | ||
| "strconv" | ||
| "time" | ||
|
|
||
| "github.com/kedacore/http-add-on/interceptor/handler" | ||
| httpv1beta1 "github.com/kedacore/http-add-on/operator/apis/http/v1beta1" | ||
| kedahttp "github.com/kedacore/http-add-on/pkg/http" | ||
| "github.com/kedacore/http-add-on/pkg/k8s" | ||
| "github.com/kedacore/http-add-on/pkg/util" | ||
|
|
@@ -64,6 +68,22 @@ func (er *EndpointResolver) ServeHTTP(w http.ResponseWriter, r *http.Request) { | |
| } | ||
|
|
||
| serviceKey := ir.Namespace + "/" + ir.Spec.Target.Service | ||
|
|
||
| // Streaming callback: if the route has a StreamingCallback configured and | ||
| // the backend is not ready, check if this is a streaming request. If so, | ||
| // send SSE keepalive events while waiting for the backend. | ||
| hasStreamingCallback := ir.Spec.ColdStart != nil && ir.Spec.ColdStart.StreamingCallback != nil | ||
| if hasStreamingCallback && !er.readyCache.HasReadyEndpoints(serviceKey) { | ||
| streaming, err := isStreamingRequest(r) | ||
| if err != nil { | ||
| util.LoggerFromContext(ctx).Error(err, "failed to check streaming request") | ||
| } | ||
| if err == nil && streaming { | ||
| er.serveStreamingCallback(waitCtx, ctx, w, r, serviceKey, ir) | ||
| return | ||
| } | ||
| } | ||
|
|
||
| isColdStart, err := er.readyCache.WaitForReady(waitCtx, serviceKey) | ||
| if err != nil { | ||
| // No fallback, return an error | ||
|
|
@@ -100,3 +120,159 @@ func (er *EndpointResolver) ServeHTTP(w http.ResponseWriter, r *http.Request) { | |
|
|
||
| er.next.ServeHTTP(w, r) | ||
| } | ||
|
|
||
| // serveStreamingCallback handles cold-start waits for streaming requests by | ||
| // sending OpenAI-compatible SSE keepalive events until the backend is ready. | ||
| func (er *EndpointResolver) serveStreamingCallback( | ||
| waitCtx, parentCtx context.Context, | ||
| w http.ResponseWriter, | ||
| r *http.Request, | ||
| serviceKey string, | ||
| ir *httpv1beta1.InterceptorRoute, | ||
| ) { | ||
| logger := util.LoggerFromContext(parentCtx) | ||
| cb := ir.Spec.ColdStart.StreamingCallback | ||
|
|
||
| rc := http.NewResponseController(w) | ||
|
|
||
| // Write SSE headers — commits to a 200 response. | ||
| w.Header().Set("Content-Type", "text/event-stream") | ||
| w.Header().Set("Cache-Control", "no-cache") | ||
| w.Header().Set("Connection", "keep-alive") | ||
| w.Header().Set("X-Accel-Buffering", "no") | ||
| w.WriteHeader(http.StatusOK) | ||
|
|
||
| // Send initial loading message. | ||
| if err := writeSSEEvent(w, cb.Message); err != nil { | ||
| logger.Error(err, "failed to write initial streaming callback event") | ||
| return | ||
| } | ||
| if err := rc.Flush(); err != nil { | ||
| logger.Error(err, "failed to flush initial streaming callback event") | ||
| return | ||
| } | ||
|
|
||
| interval := cb.Interval.Duration | ||
| if interval <= 0 { | ||
| interval = 5 * time.Second | ||
| } | ||
|
|
||
| // Start keepalive goroutine. | ||
| callbackDone := make(chan struct{}) | ||
| callbackStopped := make(chan struct{}) | ||
| go func() { | ||
| defer close(callbackStopped) | ||
| ticker := time.NewTicker(interval) | ||
| defer ticker.Stop() | ||
| for { | ||
| select { | ||
| case <-ticker.C: | ||
| if err := writeSSEEvent(w, cb.KeepaliveMessage); err != nil { | ||
| logger.Error(err, "failed to write keepalive streaming callback event") | ||
| return | ||
| } | ||
| if err := rc.Flush(); err != nil { | ||
| logger.Error(err, "failed to flush keepalive streaming callback event") | ||
| return | ||
| } | ||
| case <-callbackDone: | ||
| return | ||
| } | ||
| } | ||
| }() | ||
|
|
||
| // Wait for the backend to become ready. | ||
| isColdStart, err := er.readyCache.WaitForReady(waitCtx, serviceKey) | ||
| close(callbackDone) | ||
| <-callbackStopped // ensure goroutine exits before touching w again | ||
|
|
||
| if err != nil { | ||
| // Already committed to 200, so send an SSE error event instead of HTTP error. | ||
| logger.Error(err, "backend not ready during streaming callback") | ||
| errMsg := fmt.Sprintf("Backend did not become ready: %v", err) | ||
| _ = writeSSEEvent(w, errMsg) | ||
| _, _ = fmt.Fprintf(w, "data: [DONE]\n\n") | ||
| _ = rc.Flush() | ||
| return | ||
| } | ||
|
|
||
| if er.cfg.EnableColdStartHeader { | ||
| w.Header().Set(kedahttp.HeaderColdStart, strconv.FormatBool(isColdStart)) | ||
| } | ||
|
|
||
| // Send a visual separator so the real model response starts on a fresh | ||
| // line, rather than being appended directly after the keepalive dots. | ||
| // The content ends with a zero-width space (U+200B) after the final | ||
| // newline because common shell-based clients use bash command | ||
| // substitution which strips trailing newlines. The zero-width space | ||
| // anchors the newline so it survives extraction while remaining | ||
| // invisible in the terminal output. | ||
| if err := writeSSEEvent(w, "\n\n---\n\u200B"); err != nil { | ||
| logger.Error(err, "failed to write separator streaming callback event") | ||
| } else { | ||
| _ = rc.Flush() | ||
| } | ||
|
|
||
| // Wrap the writer to suppress duplicate WriteHeader from the upstream proxy. | ||
| er.next.ServeHTTP(&headerSuppressingWriter{ | ||
| ResponseWriter: w, | ||
| headerWritten: true, | ||
| }, r) | ||
| } | ||
|
|
||
| // isStreamingRequest checks whether the JSON request body contains "stream": true. | ||
| // It restores the body so subsequent handlers can re-read it. | ||
| func isStreamingRequest(r *http.Request) (bool, error) { | ||
| if r.Body == nil { | ||
| return false, nil | ||
| } | ||
| body, err := io.ReadAll(r.Body) | ||
| r.Body = io.NopCloser(bytes.NewReader(body)) | ||
| if err != nil { | ||
| return false, err | ||
| } | ||
|
Comment on lines
+229
to
+233
|
||
| if len(body) == 0 { | ||
| return false, nil | ||
| } | ||
| var payload struct { | ||
| Stream bool `json:"stream"` | ||
| } | ||
| if err := json.Unmarshal(body, &payload); err != nil { | ||
| return false, nil //nolint:nilerr // non-JSON body is not an error | ||
| } | ||
| return payload.Stream, nil | ||
| } | ||
|
|
||
| // writeSSEEvent writes a single OpenAI-compatible chat.completion.chunk SSE event. | ||
| func writeSSEEvent(w http.ResponseWriter, content string) error { | ||
| contentJSON, _ := json.Marshal(content) | ||
| _, err := fmt.Fprintf(w, | ||
| "data: {\"id\":\"keda-cold-start\",\"object\":\"chat.completion.chunk\",\"created\":%d,\"model\":\"system\",\"choices\":[{\"index\":0,\"delta\":{\"content\":%s},\"finish_reason\":null}]}\n\n", | ||
| time.Now().Unix(), | ||
| contentJSON, | ||
| ) | ||
| return err | ||
| } | ||
|
|
||
| // headerSuppressingWriter wraps a ResponseWriter and silently ignores | ||
| // WriteHeader calls after the first one. This prevents the upstream | ||
| // reverse proxy from logging "superfluous response.WriteHeader call" | ||
| // when we have already committed to a 200 for SSE streaming. | ||
| type headerSuppressingWriter struct { | ||
| http.ResponseWriter | ||
| headerWritten bool | ||
| } | ||
|
|
||
| func (w *headerSuppressingWriter) WriteHeader(code int) { | ||
| if w.headerWritten { | ||
| return | ||
| } | ||
| w.headerWritten = true | ||
|
Comment on lines
+266
to
+270
|
||
| w.ResponseWriter.WriteHeader(code) | ||
| } | ||
|
|
||
| // Unwrap exposes the underlying ResponseWriter so that | ||
| // http.NewResponseController can find optional interfaces (Flusher, Hijacker, etc.). | ||
| func (w *headerSuppressingWriter) Unwrap() http.ResponseWriter { | ||
| return w.ResponseWriter | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In the streaming-callback path the response is already committed with WriteHeader(200), so setting X-KEDA-HTTP-Cold-Start here will not reach real clients (only httptest.ResponseRecorder). If this header is intended for clients, set it before WriteHeader (e.g., set it to "true" up-front for this path) or expose the value via an SSE event/trailer instead.