Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ This changelog keeps track of work items that have been completed and are ready
### New

- **General**: TODO ([#TODO](https://github.com/kedacore/http-add-on/issues/TODO))
- **Scaler**: Add OpenTelemetry metrics and distributed tracing to the external scaler ([#965](https://github.com/kedacore/http-add-on/issues/965))

### Improvements

Expand Down
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ require (
github.com/prometheus/client_golang v1.23.2
github.com/stretchr/testify v1.11.1
github.com/tsenart/vegeta/v12 v12.13.0
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.68.0
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0
go.opentelemetry.io/otel v1.43.0
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0
Expand Down Expand Up @@ -121,7 +122,7 @@ require (
golang.org/x/tools v0.44.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20260406210006-6f92a3bedf2d // indirect
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
Expand Down
6 changes: 4 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,8 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.68.0 h1:0Qx7VGBacMm9ZENQ7TnNObTYI4ShC+lHI16seduaxZo=
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.68.0/go.mod h1:Sje3i3MjSPKTSPvVWCaL8ugBzJwik3u4smCjUeuupqg=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0 h1:CqXxU8VOmDefoh0+ztfGaymYbhdB/tT3zs79QaZTNGY=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0/go.mod h1:BuhAPThV8PBHBvg8ZzZ/Ok3idOdhWIodywz2xEcRbJo=
go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I=
Expand Down Expand Up @@ -308,8 +310,8 @@ gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E
gonum.org/v1/netlib v0.0.0-20181029234149-ec6d1f5cefe6/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw=
google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA=
google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M=
google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg=
google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8=
google.golang.org/genproto/googleapis/rpc v0.0.0-20260406210006-6f92a3bedf2d h1:wT2n40TBqFY6wiwazVK9/iTWbsQrgk5ZfCSVFLO9LQA=
google.golang.org/genproto/googleapis/rpc v0.0.0-20260406210006-6f92a3bedf2d/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8=
google.golang.org/grpc v1.81.1 h1:VnnIIZ88UzOOKLukQi+ImGz8O1Wdp8nAGGnvOfEIWQQ=
google.golang.org/grpc v1.81.1/go.mod h1:xGH9GfzOyMTGIOXBJmXt+BX/V0kcdQbdcuwQ/zNw42I=
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
Expand Down
14 changes: 14 additions & 0 deletions scaler/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,20 @@ type config struct {
ProfilingAddr string `env:"PROFILING_BIND_ADDRESS" envDefault:""`
// StreamIntervalMS is the interval in milliseconds between stream ticks
StreamIntervalMS int `env:"KEDA_HTTP_SCALER_STREAM_INTERVAL_MS" envDefault:"200"`

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess we could also deduplicate the config to ensure it is consistent across components?

Metrics metricsConfig `envPrefix:""`
Tracing tracingConfig `envPrefix:""`
}

type metricsConfig struct {
OtelPrometheusExporterEnabled bool `env:"OTEL_PROM_EXPORTER_ENABLED" envDefault:"true"`
OtelPrometheusExporterPort int `env:"OTEL_PROM_EXPORTER_PORT" envDefault:"2224"`
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason to not use the same port we use for the interceptor metrics?

OtelHTTPExporterEnabled bool `env:"OTEL_EXPORTER_OTLP_METRICS_ENABLED" envDefault:"false"`
}

// tracingConfig holds the environment-driven tracing settings of the scaler.
type tracingConfig struct {
// Enabled is read from OTEL_EXPORTER_OTLP_TRACES_ENABLED (default "false").
// When true, main calls tracing.SetupOTelSDK and startGrpcServer installs
// the otelgrpc server stats handler; when false neither happens.
Enabled bool `env:"OTEL_EXPORTER_OTLP_TRACES_ENABLED" envDefault:"false"`
// Exporter is read from OTEL_EXPORTER_OTLP_TRACES_PROTOCOL (default
// "console") and is handed unchanged to tracing.SetupOTelSDK.
// NOTE(review): the set of accepted values is defined by that function and
// is not visible from this file — confirm before documenting it further.
Exporter string `env:"OTEL_EXPORTER_OTLP_TRACES_PROTOCOL" envDefault:"console"`
}

func mustParseConfig() config {
Expand Down
80 changes: 74 additions & 6 deletions scaler/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,13 @@ import (
"net/http"
_ "net/http/pprof" //nolint:gosec // G108: pprof intentionally exposed, gated by --profiling-addr
"os"
"runtime"
"time"

"github.com/go-logr/logr"
"github.com/kedacore/keda/v2/pkg/scalers/externalscaler"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
"golang.org/x/sync/errgroup"
"google.golang.org/grpc"
"google.golang.org/grpc/health"
Expand All @@ -29,8 +32,11 @@ import (

"github.com/kedacore/http-add-on/pkg/build"
kedacache "github.com/kedacore/http-add-on/pkg/cache"
kedahttp "github.com/kedacore/http-add-on/pkg/http"
"github.com/kedacore/http-add-on/pkg/k8s"
"github.com/kedacore/http-add-on/pkg/util"
"github.com/kedacore/http-add-on/scaler/metrics"
"github.com/kedacore/http-add-on/scaler/tracing"
)

var setupLog = ctrl.Log.WithName("setup")
Expand All @@ -53,10 +59,36 @@ func main() {

ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))

setupLog.Info(
"starting scaler",
"metricsConfig", cfg.Metrics,
"tracingConfig", cfg.Tracing,
)

provider, err := metrics.NewMeterProvider(
cfg.Metrics.OtelPrometheusExporterEnabled,
cfg.Metrics.OtelHTTPExporterEnabled,
)
if err != nil {
setupLog.Error(err, "failed to create meter provider")
os.Exit(1)
}
defer func() {
if err := provider.Shutdown(context.Background()); err != nil {
setupLog.Error(err, "error shutting down meter provider")
}
}()

instruments, err := metrics.NewInstruments(provider)
if err != nil {
setupLog.Error(err, "failed to create metric instruments")
runtime.Goexit()
}

k8sCfg, err := ctrl.GetConfig()
if err != nil {
setupLog.Error(err, "Kubernetes client config not found")
os.Exit(1)
runtime.Goexit()
Copy link
Copy Markdown
Member

@linkvt linkvt May 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason for using runtime.Goexit now? If this is intended we should probably also add the defer os.Exit(1) at top as in the interceptor to also stop the grpc server etc after runtime.Goexit has been called.

}

ctrlCache, err := cache.New(k8sCfg, cache.Options{
Expand All @@ -65,16 +97,29 @@ func main() {
})
if err != nil {
setupLog.Error(err, "creating cache")
os.Exit(1)
runtime.Goexit()
}

pinger := newQueuePinger(ctrl.Log, k8s.EndpointsFuncForControllerClient(ctrlCache), namespace, svcName, deplName, targetPortStr)
pinger := newQueuePinger(ctrl.Log, k8s.EndpointsFuncForControllerClient(ctrlCache), namespace, svcName, deplName, targetPortStr, instruments)

ctx := ctrl.SetupSignalHandler()
ctx = util.ContextWithLogger(ctx, setupLog)

eg, ctx := errgroup.WithContext(ctx)

if cfg.Tracing.Enabled {
shutdown, err := tracing.SetupOTelSDK(ctx, cfg.Tracing.Exporter)
if err != nil {
setupLog.Error(err, "error setting up tracer")
runtime.Goexit()
}
defer func() {
if shutdownErr := shutdown(context.Background()); shutdownErr != nil {
setupLog.Error(shutdownErr, "error during tracer shutdown")
}
}()
}

// start the controller-runtime cache
eg.Go(func() error {
setupLog.Info("starting the controller-runtime cache")
Expand All @@ -95,7 +140,7 @@ func main() {
// Wait for cache to sync before starting components that depend on it
if !ctrlCache.WaitForCacheSync(ctx) {
setupLog.Error(nil, "cache failed to sync")
os.Exit(1)
runtime.Goexit()
}

eg.Go(func() error {
Expand All @@ -120,11 +165,21 @@ func main() {
return nil
})

if cfg.Metrics.OtelPrometheusExporterEnabled {
eg.Go(func() error {
if err := runMetricsServer(ctx, ctrl.Log, cfg.Metrics); !util.IsIgnoredErr(err) {
setupLog.Error(err, "could not start the Prometheus metrics server")
return err
}
return nil
})
}

build.PrintComponentInfo(ctrl.Log, "Scaler")

if err := eg.Wait(); err != nil && !errors.Is(err, context.Canceled) {
setupLog.Error(err, "fatal error")
os.Exit(1)
runtime.Goexit()
}

setupLog.Info("Bye!")
Expand All @@ -139,7 +194,12 @@ func startGrpcServer(ctx context.Context, cfg config, lggr logr.Logger, pinger *
return err
}

grpcServer := grpc.NewServer()
var grpcOpts []grpc.ServerOption
if cfg.Tracing.Enabled {
grpcOpts = append(grpcOpts, grpc.StatsHandler(otelgrpc.NewServerHandler()))
}

grpcServer := grpc.NewServer(grpcOpts...)
reflection.Register(grpcServer)

hs := health.NewServer()
Expand Down Expand Up @@ -180,3 +240,11 @@ func startGrpcServer(ctx context.Context, cfg config, lggr logr.Logger, pinger *

return grpcServer.Serve(lis)
}

// runMetricsServer exposes the Prometheus scrape handler under /metrics on the
// port taken from metricsCfg, bound to every interface (0.0.0.0).
//
// The listen address, the mux and ctx are passed to kedahttp.ServeContext, and
// its result is returned to the caller unchanged.
func runMetricsServer(ctx context.Context, lggr logr.Logger, metricsCfg metricsConfig) error {
	const metricsPath = "/metrics"

	port := metricsCfg.OtelPrometheusExporterPort
	lggr.Info("starting the prometheus metrics server", "port", port, "path", metricsPath)

	router := http.NewServeMux()
	router.Handle(metricsPath, promhttp.Handler())

	listenAddr := fmt.Sprintf("0.0.0.0:%d", port)
	return kedahttp.ServeContext(ctx, listenAddr, router, nil)
}
81 changes: 81 additions & 0 deletions scaler/metrics/instruments.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package metrics
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably also add a test like the prometheus_test.go the interceptor has right now?


import (
"context"
"fmt"
"time"

"go.opentelemetry.io/otel/attribute"
api "go.opentelemetry.io/otel/metric"
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
)

const (
meterName = "keda-external-scaler"

// ServiceName is the OTEL service.name used for both metrics and tracing.
ServiceName = "keda-http-external-scaler"
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably also align the service name of the interceptor to use this naming scheme


MetricPingerFetchDuration = "scaler.pinger.fetch.duration"
MetricPingerFetchErrors = "scaler.pinger.fetch.errors"
MetricPingerEndpoints = "scaler.pinger.endpoints"

AttrNamespace = "namespace"
AttrService = "service"
Comment on lines +23 to +24
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unused vars

)

// Instruments holds all metric instruments for the external scaler.
// The fields are populated by NewInstruments and written to by RecordFetch.
type Instruments struct {
// pingerFetchDuration is the histogram created under
// MetricPingerFetchDuration; RecordFetch records the fetch duration in
// seconds on it.
pingerFetchDuration api.Float64Histogram
// pingerFetchErrors is the counter created under MetricPingerFetchErrors;
// RecordFetch adds 1 to it whenever it is given a non-nil error.
pingerFetchErrors api.Int64Counter
// pingerEndpoints is the gauge created under MetricPingerEndpoints;
// RecordFetch records the endpoint count passed to it.
pingerEndpoints api.Int64Gauge
}

// NewInstruments registers the scaler's pinger instruments on a meter obtained
// from provider (named by meterName) and bundles them into an Instruments.
//
// If any instrument cannot be created, it returns a nil *Instruments together
// with an error that wraps the underlying failure and names the instrument.
func NewInstruments(provider *sdkmetric.MeterProvider) (*Instruments, error) {
	var (
		m   = provider.Meter(meterName)
		set Instruments
		err error
	)

	// Fetch-cycle latency, in seconds, with explicit bucket boundaries.
	set.pingerFetchDuration, err = m.Float64Histogram(
		MetricPingerFetchDuration,
		api.WithDescription("Duration of a queue pinger fetch cycle across all interceptor pods"),
		api.WithUnit("s"),
		api.WithExplicitBucketBoundaries(
			0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2.5, 5,
		),
	)
	if err != nil {
		return nil, fmt.Errorf("creating pinger fetch duration histogram: %w", err)
	}

	// Running count of fetch cycles that ended in an error.
	set.pingerFetchErrors, err = m.Int64Counter(
		MetricPingerFetchErrors,
		api.WithDescription("Total failed queue pinger fetch cycles"),
	)
	if err != nil {
		return nil, fmt.Errorf("creating pinger fetch errors counter: %w", err)
	}

	// Most recently observed number of polled interceptor endpoints.
	set.pingerEndpoints, err = m.Int64Gauge(
		MetricPingerEndpoints,
		api.WithDescription("Number of interceptor endpoints the scaler is polling"),
	)
	if err != nil {
		return nil, fmt.Errorf("creating pinger endpoints gauge: %w", err)
	}

	return &set, nil
}

// RecordFetch records a completed pinger fetch cycle.
func (i *Instruments) RecordFetch(duration time.Duration, endpointCount int, fetchErr error) {
attrs := api.WithAttributeSet(attribute.NewSet())
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can be removed as there are no attributes

i.pingerFetchDuration.Record(context.Background(), duration.Seconds(), attrs)
i.pingerEndpoints.Record(context.Background(), int64(endpointCount), attrs)
if fetchErr != nil {
i.pingerFetchErrors.Add(context.Background(), 1, attrs)
}
}
51 changes: 51 additions & 0 deletions scaler/metrics/provider.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package metrics
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like a copy of interceptor/provider/metrics.go, we should deduplicate this code.


import (
"context"
"fmt"

"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
"go.opentelemetry.io/otel/exporters/prometheus"
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/resource"

"github.com/kedacore/http-add-on/pkg/build"
)

// NewMeterProvider creates a MeterProvider with Prometheus and/or OTLP readers.
// Without readers, all instrument operations become no-ops.
func NewMeterProvider(promEnabled, otlpEnabled bool, opts ...sdkmetric.Option) (*sdkmetric.MeterProvider, error) {
var options []sdkmetric.Option

if promEnabled {
promExporter, err := prometheus.New(
prometheus.WithoutScopeInfo(),
)
if err != nil {
return nil, fmt.Errorf("creating prometheus exporter: %w", err)
}
options = append(options, sdkmetric.WithReader(promExporter))
}

if otlpEnabled {
otlpExporter, err := otlpmetrichttp.New(context.Background())
if err != nil {
return nil, fmt.Errorf("creating OTLP exporter: %w", err)
}
options = append(options, sdkmetric.WithReader(
sdkmetric.NewPeriodicReader(otlpExporter),
))
}

options = append(options, sdkmetric.WithResource(
resource.NewSchemaless(
attribute.String("service.name", ServiceName),
attribute.String("service.version", build.Version()),
),
))
Comment thread
Fedosin marked this conversation as resolved.

options = append(options, opts...)

return sdkmetric.NewMeterProvider(options...), nil
}
Loading
Loading