diff --git a/.chloggen/awsecscontainermetrics-instance-level.yaml b/.chloggen/awsecscontainermetrics-instance-level.yaml new file mode 100644 index 0000000000000..11687f2ecd247 --- /dev/null +++ b/.chloggen/awsecscontainermetrics-instance-level.yaml @@ -0,0 +1,30 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement + +# The name of the component, or a single word describing the area of concern, (e.g. receiver/filelog) +component: receiver/awsecscontainermetrics + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Add instance_level_metrics config option to collect metrics from all tasks on an ECS Managed Instance + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [48396] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: | + When enabled, the receiver queries the /tasks and /tasks/stats endpoints available to + Managed Daemon Services, collecting container metrics for all tasks on the instance + instead of only the receiver's own task. + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. 
+# Default: '[user]' +change_logs: [user] diff --git a/receiver/awsecscontainermetricsreceiver/README.md b/receiver/awsecscontainermetricsreceiver/README.md index 1fa8136f2a998..905c2d2245092 100644 --- a/receiver/awsecscontainermetricsreceiver/README.md +++ b/receiver/awsecscontainermetricsreceiver/README.md @@ -35,6 +35,20 @@ This receiver collects task metadata and container stats at a fixed interval and default: `20s` +#### instance_level_metrics: + +When enabled, the receiver collects metrics for all tasks running on the instance by querying the `/tasks` and `/tasks/stats` endpoints. This requires the receiver to run as a [Managed Daemon Service](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-managed-instances-daemon.html) on ECS Managed Instances, which provides access to instance-wide task metadata and stats. + +default: `false` + +Example: + +```yaml +receivers: + awsecscontainermetrics: + collection_interval: 20s + instance_level_metrics: true +``` ## Enabling the AWS ECS Container Metrics Receiver diff --git a/receiver/awsecscontainermetricsreceiver/config.go b/receiver/awsecscontainermetricsreceiver/config.go index b64d8bc279181..f1f9e5bc7a855 100644 --- a/receiver/awsecscontainermetricsreceiver/config.go +++ b/receiver/awsecscontainermetricsreceiver/config.go @@ -12,6 +12,11 @@ type Config struct { // CollectionInterval is the interval at which metrics should be collected CollectionInterval time.Duration `mapstructure:"collection_interval"` + // InstanceLevelMetrics enables collection of metrics for all tasks on the instance. + // This requires the receiver to run as a Managed Daemon Service on ECS Managed Instances, + // which provides access to the /tasks and /tasks/stats endpoints. 
+ InstanceLevelMetrics bool `mapstructure:"instance_level_metrics"` + // prevent unkeyed literal initialization _ struct{} } diff --git a/receiver/awsecscontainermetricsreceiver/config.schema.yaml b/receiver/awsecscontainermetricsreceiver/config.schema.yaml index 6f7b1328291bd..ab3ef3b54b32d 100644 --- a/receiver/awsecscontainermetricsreceiver/config.schema.yaml +++ b/receiver/awsecscontainermetricsreceiver/config.schema.yaml @@ -5,3 +5,6 @@ properties: description: CollectionInterval is the interval at which metrics should be collected type: string format: duration + instance_level_metrics: + description: InstanceLevelMetrics enables collection of metrics for all tasks on the instance. This requires the receiver to run as a Managed Daemon Service on ECS Managed Instances, which provides access to the /tasks and /tasks/stats endpoints. + type: boolean diff --git a/receiver/awsecscontainermetricsreceiver/config_test.go b/receiver/awsecscontainermetricsreceiver/config_test.go index 741183fbbea96..bc743242ec05d 100644 --- a/receiver/awsecscontainermetricsreceiver/config_test.go +++ b/receiver/awsecscontainermetricsreceiver/config_test.go @@ -37,6 +37,13 @@ func TestLoadConfig(t *testing.T) { CollectionInterval: 10 * time.Second, }, }, + { + id: component.NewIDWithName(metadata.Type, "instance_level"), + expected: &Config{ + CollectionInterval: defaultCollectionInterval, + InstanceLevelMetrics: true, + }, + }, } for _, tt := range tests { diff --git a/receiver/awsecscontainermetricsreceiver/internal/awsecscontainermetrics/docker_stats.go b/receiver/awsecscontainermetricsreceiver/internal/awsecscontainermetrics/docker_stats.go index f5cf57a2759b5..204cb903a5959 100644 --- a/receiver/awsecscontainermetricsreceiver/internal/awsecscontainermetrics/docker_stats.go +++ b/receiver/awsecscontainermetricsreceiver/internal/awsecscontainermetrics/docker_stats.go @@ -7,6 +7,11 @@ import "time" const TaskStatsPath = "/task/stats" +const ( + InstanceStatsPath = "/tasks/stats" + 
InstanceMetadataPath = "/tasks" +) + // ContainerStats defines the structure for container stats type ContainerStats struct { Name string `json:"name"` diff --git a/receiver/awsecscontainermetricsreceiver/internal/awsecscontainermetrics/stats_provider.go b/receiver/awsecscontainermetricsreceiver/internal/awsecscontainermetrics/stats_provider.go index a9a176ff4ad7b..48e8e6228f14a 100644 --- a/receiver/awsecscontainermetricsreceiver/internal/awsecscontainermetrics/stats_provider.go +++ b/receiver/awsecscontainermetricsreceiver/internal/awsecscontainermetrics/stats_provider.go @@ -6,6 +6,7 @@ package awsecscontainermetrics // import "github.com/open-telemetry/opentelemetr import ( "encoding/json" "fmt" + "maps" "go.uber.org/zap" @@ -49,3 +50,59 @@ func (p *StatsProvider) GetStats() (map[string]*ContainerStats, ecsutil.TaskMeta return stats, metadata, nil } + +// GetInstanceStats fetches stats and metadata for all tasks on the instance. +// This uses the /tasks/stats and /tasks endpoints available to Managed Daemon Services. 
+func (p *StatsProvider) GetInstanceStats() ([]TaskStatsEntry, error) {
+	metadataResp, err := p.rc.GetResponse(InstanceMetadataPath)
+	if err != nil {
+		return nil, fmt.Errorf("cannot read data from instance metadata endpoint: %w", err)
+	}
+
+	var allTaskMetadata []ecsutil.TaskMetadata
+	if err = json.Unmarshal(metadataResp, &allTaskMetadata); err != nil {
+		return nil, fmt.Errorf("cannot unmarshal instance task metadata: %w", err)
+	}
+
+	statsResp, err := p.rc.GetResponse(InstanceStatsPath)
+	if err != nil {
+		return nil, fmt.Errorf("cannot read data from instance stats endpoint: %w", err)
+	}
+
+	var rawStats []map[string]*ContainerStats
+	if err = json.Unmarshal(statsResp, &rawStats); err != nil {
+		return nil, fmt.Errorf("cannot unmarshal instance stats: %w", err)
+	}
+
+	// Flatten the array of per-container maps into one lookup table keyed by container ID.
+	allStats := make(map[string]*ContainerStats, len(rawStats))
+	for _, entry := range rawStats {
+		maps.Copy(allStats, entry)
+	}
+
+	// Build per-task results pairing metadata with its container stats.
+	// Containers that are missing from the stats response are left out of the task's map.
+	results := make([]TaskStatsEntry, 0, len(allTaskMetadata))
+	for i := range allTaskMetadata {
+		containers := allTaskMetadata[i].Containers
+		taskStats := make(map[string]*ContainerStats, len(containers))
+		for j := range containers {
+			id := containers[j].DockerID
+			if s, ok := allStats[id]; ok {
+				taskStats[id] = s
+			}
+		}
+		results = append(results, TaskStatsEntry{
+			Stats:    taskStats,
+			Metadata: allTaskMetadata[i],
+		})
+	}
+
+	return results, nil
+}
+
+// TaskStatsEntry pairs a task's metadata with the container stats for that task.
+type TaskStatsEntry struct {
+	Stats    map[string]*ContainerStats // container stats belonging to this task, keyed by container ID
+	Metadata ecsutil.TaskMetadata       // metadata of the task the stats are paired with
+}
diff --git a/receiver/awsecscontainermetricsreceiver/internal/awsecscontainermetrics/stats_provider_test.go b/receiver/awsecscontainermetricsreceiver/internal/awsecscontainermetrics/stats_provider_test.go
index e5ce59c85942e..ea75ef24c4bfd 100644
--- a/receiver/awsecscontainermetricsreceiver/internal/awsecscontainermetrics/stats_provider_test.go
+++ b/receiver/awsecscontainermetricsreceiver/internal/awsecscontainermetrics/stats_provider_test.go
@@ -34,8 +34,13 @@ func (f testRestClient) GetResponse(path string) ([]byte, error) {
 		return []byte("wrong-json-body"), nil
 	}
 
-	if path == TaskStatsPath {
+	switch path {
+	case TaskStatsPath:
 		return os.ReadFile("../../testdata/task_stats.json")
+	case InstanceMetadataPath:
+		return os.ReadFile("../../testdata/instance_tasks_metadata.json")
+	case InstanceStatsPath:
+		return os.ReadFile("../../testdata/instance_tasks_stats.json")
 	}
 
 	return nil, nil
@@ -77,3 +82,42 @@ func TestGetStats(t *testing.T) {
 		})
 	}
 }
+
+func TestGetInstanceStats(t *testing.T) {
+	tests := []struct {
+		name      string
+		client    ecsutil.RestClient
+		wantError string
+	}{
+		{
+			name:      "success",
+			client:    &testRestClient{T: t},
+			wantError: "",
+		},
+		{
+			name:      "failure",
+			client:    &testRestClient{T: t, fail: true},
+			wantError: "cannot read data from instance metadata endpoint: failed",
+		},
+		{
+			name:      "invalid-json",
+			client:    &testRestClient{T: t, invalidJSON: true},
+			wantError: "cannot unmarshal instance task metadata: invalid character 'w' looking for beginning of value",
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			provider := NewStatsProvider(tt.client, zap.NewNop())
+			entries, err := provider.GetInstanceStats()
+			if tt.wantError == "" {
+				require.NoError(t, err)
+				require.Len(t, entries, 2)
+				assert.Equal(t, "arn:aws:ecs:eu-west-1:111222333444:cluster/mi-cluster", entries[0].Metadata.Cluster)
+				assert.NotEmpty(t,
entries[0].Stats)
+				assert.NotEmpty(t, entries[1].Stats)
+			} else {
+				require.EqualError(t, err, tt.wantError)
+			}
+		})
+	}
+}
diff --git a/receiver/awsecscontainermetricsreceiver/receiver.go b/receiver/awsecscontainermetricsreceiver/receiver.go
index 41056b371ef23..ccf8322132ca4 100644
--- a/receiver/awsecscontainermetricsreceiver/receiver.go
+++ b/receiver/awsecscontainermetricsreceiver/receiver.go
@@ -74,6 +74,16 @@ func (aecmr *awsEcsContainerMetricsReceiver) Shutdown(context.Context) error {
 // collectDataFromEndpoint collects container stats from Amazon ECS Task Metadata Endpoint
 func (aecmr *awsEcsContainerMetricsReceiver) collectDataFromEndpoint(ctx context.Context) error {
 	aecmr.provider = awsecscontainermetrics.NewStatsProvider(aecmr.restClient, aecmr.logger)
+
+	if aecmr.config.InstanceLevelMetrics {
+		return aecmr.collectInstanceLevelMetrics(ctx)
+	}
+	return aecmr.collectTaskLevelMetrics(ctx)
+}
+
+// collectTaskLevelMetrics collects stats through provider.GetStats. It is the
+// path taken when the InstanceLevelMetrics config option is disabled.
+func (aecmr *awsEcsContainerMetricsReceiver) collectTaskLevelMetrics(ctx context.Context) error {
 	stats, metadata, err := aecmr.provider.GetStats()
 	if err != nil {
 		aecmr.logger.Error("Failed to collect stats", zap.Error(err))
@@ -91,3 +101,26 @@ func (aecmr *awsEcsContainerMetricsReceiver) collectDataFromEndpoint(ctx context
 
 	return nil
 }
+
+// collectInstanceLevelMetrics collects the per-task entries returned by
+// provider.GetInstanceStats and forwards each task's metrics to the next consumer.
+// It stops at the first error, so a consumer failure for one task means the
+// remaining tasks are not exported in that collection cycle.
+func (aecmr *awsEcsContainerMetricsReceiver) collectInstanceLevelMetrics(ctx context.Context) error {
+	entries, err := aecmr.provider.GetInstanceStats()
+	if err != nil {
+		aecmr.logger.Error("Failed to collect instance-level stats", zap.Error(err))
+		return err
+	}
+
+	for i := range entries {
+		mds := awsecscontainermetrics.MetricsData(entries[i].Stats, entries[i].Metadata, aecmr.logger)
+		for _, md := range mds {
+			if err = aecmr.nextConsumer.ConsumeMetrics(ctx, md); err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
diff --git a/receiver/awsecscontainermetricsreceiver/receiver_test.go b/receiver/awsecscontainermetricsreceiver/receiver_test.go
index 49e88280db088..cb502e5f17db2 100644
---
a/receiver/awsecscontainermetricsreceiver/receiver_test.go
+++ b/receiver/awsecscontainermetricsreceiver/receiver_test.go
@@ -23,8 +23,13 @@ func (f fakeRestClient) GetResponse(path string) ([]byte, error) {
 	if body, err := ecsutiltest.GetTestdataResponseByPath(f.T, path); body != nil || err != nil {
 		return body, err
 	}
-	if path == awsecscontainermetrics.TaskStatsPath {
+	switch path {
+	case awsecscontainermetrics.TaskStatsPath:
 		return os.ReadFile("testdata/task_stats.json")
+	case awsecscontainermetrics.InstanceMetadataPath:
+		return os.ReadFile("testdata/instance_tasks_metadata.json")
+	case awsecscontainermetrics.InstanceStatsPath:
+		return os.ReadFile("testdata/instance_tasks_stats.json")
 	}
 	return nil, nil
 }
@@ -90,6 +95,22 @@ func TestCollectDataFromEndpointWithConsumerError(t *testing.T) {
 	require.EqualError(t, err, "Test Error for Metrics Consumer")
 }
 
+// TestCollectInstanceLevelMetrics checks that, with InstanceLevelMetrics enabled,
+// collectDataFromEndpoint succeeds against the instance fixtures and emits data points.
+func TestCollectInstanceLevelMetrics(t *testing.T) {
+	metricsSink := new(consumertest.MetricsSink)
+	instanceCfg := createDefaultConfig().(*Config)
+	instanceCfg.InstanceLevelMetrics = true
+
+	rcvr, err := newAWSECSContainermetrics(zap.NewNop(), instanceCfg, metricsSink, &fakeRestClient{t})
+	require.NoError(t, err)
+	require.NotNil(t, rcvr)
+
+	ecsReceiver := rcvr.(*awsEcsContainerMetricsReceiver)
+	require.NoError(t, ecsReceiver.collectDataFromEndpoint(t.Context()))
+	require.Positive(t, metricsSink.DataPointCount())
+}
+
 type invalidFakeClient struct{}
 
 func (invalidFakeClient) GetResponse(string) ([]byte, error) {
diff --git a/receiver/awsecscontainermetricsreceiver/testdata/config.yaml b/receiver/awsecscontainermetricsreceiver/testdata/config.yaml
index 41b3672b0a0bb..12c97ab78ab81 100644
--- a/receiver/awsecscontainermetricsreceiver/testdata/config.yaml
+++ b/receiver/awsecscontainermetricsreceiver/testdata/config.yaml
@@ -1,3 +1,5 @@
 awsecscontainermetrics:
 awsecscontainermetrics/collection_interval_settings:
   collection_interval: 10s
+awsecscontainermetrics/instance_level:
+
instance_level_metrics: true diff --git a/receiver/awsecscontainermetricsreceiver/testdata/instance_tasks_metadata.json b/receiver/awsecscontainermetricsreceiver/testdata/instance_tasks_metadata.json new file mode 100644 index 0000000000000..36b66f6500079 --- /dev/null +++ b/receiver/awsecscontainermetricsreceiver/testdata/instance_tasks_metadata.json @@ -0,0 +1,87 @@ +[ + { + "Cluster": "arn:aws:ecs:eu-west-1:111222333444:cluster/mi-cluster", + "TaskARN": "arn:aws:ecs:eu-west-1:111222333444:task/mi-cluster/081d2c75d3a2419d95ab6f8979fb61fb", + "Family": "daemon-app-monitor", + "Revision": "2", + "DesiredStatus": "RUNNING", + "KnownStatus": "RUNNING", + "Limits": { + "CPU": 0.25, + "Memory": 512 + }, + "PullStartedAt": "2026-05-14T17:56:14.38242749Z", + "PullStoppedAt": "2026-05-14T17:56:29.673031691Z", + "AvailabilityZone": "eu-west-1b", + "LaunchType": "MANAGED_INSTANCES", + "Containers": [ + { + "DockerId": "081d2c75d3a2419d95ab6f8979fb61fb-1547245599", + "Name": "monitor-agent", + "DockerName": "monitor-agent", + "Image": "111222333444.dkr.ecr.eu-west-1.amazonaws.com/app-monitor:latest", + "ImageID": "sha256:fb9bbacc4f8c120a057a64f84ab8c2dfde29bef164544f9fbcba7406c679e887", + "Labels": { + "com.amazonaws.ecs.cluster": "arn:aws:ecs:eu-west-1:111222333444:cluster/mi-cluster", + "com.amazonaws.ecs.container-name": "monitor-agent", + "com.amazonaws.ecs.task-arn": "arn:aws:ecs:eu-west-1:111222333444:task/mi-cluster/081d2c75d3a2419d95ab6f8979fb61fb", + "com.amazonaws.ecs.task-definition-family": "daemon-app-monitor", + "com.amazonaws.ecs.task-definition-version": "2" + }, + "DesiredStatus": "RUNNING", + "KnownStatus": "RUNNING", + "Limits": { + "CPU": 2 + }, + "CreatedAt": "2026-05-14T17:56:31.901628055Z", + "StartedAt": "2026-05-14T17:56:31.901628055Z", + "Type": "NORMAL", + "ContainerARN": "arn:aws:ecs:eu-west-1:111222333444:container/mi-cluster/081d2c75d3a2419d95ab6f8979fb61fb/be452a3e-7694-43fe-ad76-9d91f6a021e1" + } + ], + "Group": "daemon:app-monitor" + }, + { + 
"Cluster": "arn:aws:ecs:eu-west-1:111222333444:cluster/mi-cluster", + "TaskARN": "arn:aws:ecs:eu-west-1:111222333444:task/mi-cluster/45f36d5449924905a16c3d03c1fba67a", + "Family": "web-frontend", + "Revision": "1", + "DesiredStatus": "RUNNING", + "KnownStatus": "RUNNING", + "Limits": { + "CPU": 1, + "Memory": 2048 + }, + "PullStartedAt": "2026-05-14T17:56:45.700683062Z", + "PullStoppedAt": "2026-05-14T17:56:47.679100551Z", + "AvailabilityZone": "eu-west-1b", + "LaunchType": "MANAGED_INSTANCES", + "Containers": [ + { + "DockerId": "45f36d5449924905a16c3d03c1fba67a-1950351559", + "Name": "nginx-proxy", + "DockerName": "nginx-proxy", + "Image": "111222333444.dkr.ecr.eu-west-1.amazonaws.com/web-frontend:latest", + "ImageID": "sha256:fb9bbacc4f8c120a057a64f84ab8c2dfde29bef164544f9fbcba7406c679e887", + "Labels": { + "com.amazonaws.ecs.cluster": "arn:aws:ecs:eu-west-1:111222333444:cluster/mi-cluster", + "com.amazonaws.ecs.container-name": "nginx-proxy", + "com.amazonaws.ecs.task-arn": "arn:aws:ecs:eu-west-1:111222333444:task/mi-cluster/45f36d5449924905a16c3d03c1fba67a", + "com.amazonaws.ecs.task-definition-family": "web-frontend", + "com.amazonaws.ecs.task-definition-version": "1" + }, + "DesiredStatus": "RUNNING", + "KnownStatus": "RUNNING", + "Limits": { + "CPU": 2 + }, + "CreatedAt": "2026-05-14T17:56:47.93414642Z", + "StartedAt": "2026-05-14T17:56:47.93414642Z", + "Type": "NORMAL", + "ContainerARN": "arn:aws:ecs:eu-west-1:111222333444:container/mi-cluster/45f36d5449924905a16c3d03c1fba67a/d6b0a829-c58e-4e99-8069-57385c0ec17e" + } + ], + "ServiceName": "web-frontend-service", + "Group": "service:web-frontend-service" + } +] diff --git a/receiver/awsecscontainermetricsreceiver/testdata/instance_tasks_stats.json b/receiver/awsecscontainermetricsreceiver/testdata/instance_tasks_stats.json new file mode 100644 index 0000000000000..ed014b323c582 --- /dev/null +++ b/receiver/awsecscontainermetricsreceiver/testdata/instance_tasks_stats.json @@ -0,0 +1,122 @@ +[ + { + 
"081d2c75d3a2419d95ab6f8979fb61fb-1547245599": { + "read": "2026-05-15T09:57:51.915486061Z", + "preread": "2026-05-15T09:57:41.915132341Z", + "blkio_stats": { + "io_service_bytes_recursive": [ + {"major": 259, "minor": 0, "op": "read", "value": 0}, + {"major": 259, "minor": 0, "op": "write", "value": 520192} + ] + }, + "cpu_stats": { + "cpu_usage": { + "total_usage": 9097974000, + "usage_in_kernelmode": 3899124000, + "usage_in_usermode": 5198849000 + }, + "system_cpu_usage": 114725520000000, + "online_cpus": 2 + }, + "precpu_stats": { + "cpu_usage": { + "total_usage": 9093768000, + "usage_in_kernelmode": 3897322000, + "usage_in_usermode": 5196446000 + }, + "system_cpu_usage": 114705630000000, + "online_cpus": 2 + }, + "memory_stats": { + "usage": 66203648, + "stats": { + "active_anon": 16384, + "active_file": 479232, + "anon": 20643840, + "file": 43503616, + "pgfault": 47559, + "pgmajfault": 87 + }, + "limit": 18446744073709551615 + }, + "name": "monitor-agent", + "id": "081d2c75d3a2419d95ab6f8979fb61fb-1547245599", + "networks": { + "eth0": { + "rx_bytes": 5035147, + "rx_packets": 31058, + "rx_errors": 0, + "rx_dropped": 0, + "tx_bytes": 7564495, + "tx_packets": 34489, + "tx_errors": 0, + "tx_dropped": 0 + } + }, + "network_rate_stats": { + "rx_bytes_per_sec": 2300.018641440596, + "tx_bytes_per_sec": 1437.9491354252325 + } + } + }, + { + "45f36d5449924905a16c3d03c1fba67a-1950351559": { + "read": "2026-05-15T09:57:57.943163566Z", + "preread": "2026-05-15T09:57:47.943591104Z", + "blkio_stats": { + "io_service_bytes_recursive": [ + {"major": 259, "minor": 1, "op": "read", "value": 6328320}, + {"major": 259, "minor": 0, "op": "write", "value": 593920} + ] + }, + "cpu_stats": { + "cpu_usage": { + "total_usage": 18222054000, + "usage_in_kernelmode": 8914741000, + "usage_in_usermode": 9307313000 + }, + "system_cpu_usage": 114737540000000, + "online_cpus": 2 + }, + "precpu_stats": { + "cpu_usage": { + "total_usage": 18219339000, + "usage_in_kernelmode": 8914741000, + 
"usage_in_usermode": 9304598000 + }, + "system_cpu_usage": 114717640000000, + "online_cpus": 2 + }, + "memory_stats": { + "usage": 46559232, + "stats": { + "active_anon": 16384, + "active_file": 12488704, + "anon": 20848640, + "file": 23621632, + "pgfault": 108315, + "pgmajfault": 46 + }, + "limit": 18446744073709551615 + }, + "name": "nginx-proxy", + "id": "45f36d5449924905a16c3d03c1fba67a-1950351559", + "networks": { + "eth1": { + "rx_bytes": 1085576, + "rx_packets": 9127, + "rx_errors": 0, + "rx_dropped": 0, + "tx_bytes": 1355438, + "tx_packets": 10669, + "tx_errors": 0, + "tx_dropped": 0 + } + }, + "network_rate_stats": { + "rx_bytes_per_sec": 5.20022233966631, + "tx_bytes_per_sec": 6.600282200345702 + } + } + } +]