Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions packages/@aws-cdk/toolkit-lib/lib/actions/diagnose/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,25 @@ export interface TracedResourceError {
* (Not optional on purpose so we are not allowed to forget to call the code that should fill it)
*/
readonly sourceTrace: SourceTrace | undefined;

/**
* Additional context gathered from AWS service APIs to help diagnose the root cause.
*
* For example, CloudWatch Logs from an ECS service whose tasks failed to start.
*/
readonly additionalContext?: AdditionalDiagnosticContext[];
}

/**
 * One labelled group of diagnostic messages attached to a resource error.
 *
 * A resource error can carry several of these, one per place that was consulted
 * (for example one per log group).
 */
export interface AdditionalDiagnosticContext {
/**
 * A short, human-readable description of where this context came from.
 *
 * It is shown as the heading above the messages when the error is rendered.
 *
 * @example "CloudWatch Logs: /ecs/my-service (container: web)"
 */
readonly source: string;

/**
 * The log lines or messages retrieved, in the order they should be displayed.
 */
readonly messages: string[];
}
17 changes: 17 additions & 0 deletions packages/@aws-cdk/toolkit-lib/lib/api/aws-auth/sdk.ts
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,11 @@ import {
} from '@aws-sdk/client-ecr';
import type {
DescribeServicesCommandInput,
DescribeServicesCommandOutput,
DescribeTaskDefinitionCommandInput,
DescribeTaskDefinitionCommandOutput,
DescribeTasksCommandInput,
DescribeTasksCommandOutput,
RegisterTaskDefinitionCommandInput,
ListClustersCommandInput,
ListClustersCommandOutput,
Expand All @@ -261,6 +266,9 @@ import type {
UpdateServiceCommandOutput,
} from '@aws-sdk/client-ecs';
import {
DescribeServicesCommand,
DescribeTaskDefinitionCommand,
DescribeTasksCommand,
ECSClient,
ListClustersCommand,
RegisterTaskDefinitionCommand,
Expand Down Expand Up @@ -555,6 +563,9 @@ export interface IECRClient {
}

export interface IECSClient {
describeServices(input: DescribeServicesCommandInput): Promise<DescribeServicesCommandOutput>;
describeTaskDefinition(input: DescribeTaskDefinitionCommandInput): Promise<DescribeTaskDefinitionCommandOutput>;
describeTasks(input: DescribeTasksCommandInput): Promise<DescribeTasksCommandOutput>;
listClusters(input: ListClustersCommandInput): Promise<ListClustersCommandOutput>;
registerTaskDefinition(input: RegisterTaskDefinitionCommandInput): Promise<RegisterTaskDefinitionCommandOutput>;
updateService(input: UpdateServiceCommandInput): Promise<UpdateServiceCommandOutput>;
Expand Down Expand Up @@ -950,6 +961,12 @@ export class SDK {
public ecs(): IECSClient {
const client = new ECSClient(this.config);
return {
describeServices: (input: DescribeServicesCommandInput): Promise<DescribeServicesCommandOutput> =>
client.send(new DescribeServicesCommand(input)),
describeTaskDefinition: (input: DescribeTaskDefinitionCommandInput): Promise<DescribeTaskDefinitionCommandOutput> =>
client.send(new DescribeTaskDefinitionCommand(input)),
describeTasks: (input: DescribeTasksCommandInput): Promise<DescribeTasksCommandOutput> =>
client.send(new DescribeTasksCommand(input)),
listClusters: (input: ListClustersCommandInput): Promise<ListClustersCommandOutput> =>
client.send(new ListClustersCommand(input)),
registerTaskDefinition: (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,7 @@ export class Deployments {
sourceTracer: new StackArtifactSourceTracer(options.stack),
ioHelper: this.ioHelper,
topLevelStackHierarchicalId: options.stack.hierarchicalId,
additionalExplorationSdkProvider: async () => (await this.envs.accessStackForLookupBestEffort(options.stack)).sdk,
}),
}, this.ioHelper);
}
Expand Down Expand Up @@ -486,6 +487,7 @@ export class Deployments {
sourceTracer: new StackArtifactSourceTracer(stack),
ioHelper: this.ioHelper,
topLevelStackHierarchicalId: stack.hierarchicalId,
additionalExplorationSdkProvider: async () => (await this.envs.accessStackForLookupBestEffort(stack)).sdk,
}),
});
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,13 +125,32 @@ function formatResourceErrors(es: TracedResourceError[]) {
const nodeText = b.nodeText(p);
nodeText.header = [`${lastPart} ${addendum(' ', e.resourceType, e.logicalId)}`.trim()];
nodeText.body.push(...sideBySide(['🛑'], ' ', wrapText(120, e.message)));
nodeText.body.push(...formatAdditionalContext(e));
nodeText.footer = e.sourceTrace?.creationStackTrace
? sideBySide(['Source Location:'], ' ', e.sourceTrace?.creationStackTrace)
: [];
}
return b.render();
}

/**
 * Turn the additional diagnostic context of a resource error into display lines.
 *
 * Every context contributes a blank separator line, a `📋 <source>:` heading and
 * at most 20 of its messages. When a context holds more than 20 messages, one
 * trailing line states how many were left out.
 *
 * @param e the resource error whose `additionalContext` should be rendered
 * @returns the lines to append to the error body; empty when there is no context
 */
function formatAdditionalContext(e: TracedResourceError): string[] {
  const contexts = e.additionalContext ?? [];

  return contexts.flatMap((ctx) => {
    const shown = ctx.messages.slice(0, 20).map((msg) => ` ${msg}`);
    const omitted = ctx.messages.length - 20;
    const overflow = omitted > 0 ? [` ... (${omitted} more lines)`] : [];
    return ['', `📋 ${ctx.source}:`, ...shown, ...overflow];
  });
}

/**
* Return a /-separated construct path for the given error, or try to build as close a representation as possible if we don't have a construct path
*
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
import type { AdditionalDiagnosticContext } from '../../actions/diagnose';
import type { ICloudWatchLogsClient, IECSClient, SDK } from '../aws-auth/sdk';
import type { ResourceError } from '../stack-events/resource-errors';

/**
* Investigate a failed resource using AWS service APIs to gather additional root cause context.
*
* Returns additional diagnostic context (e.g. log lines) or an empty array if
* investigation is not possible or yields no results for this resource type.
*/
export async function investigateResource(
  err: ResourceError,
  sdk: SDK,
  debug: (msg: string) => Promise<void>,
): Promise<AdditionalDiagnosticContext[]> {
  // Only ECS services have a dedicated investigation; every other resource
  // type yields no additional context.
  if (err.resourceType === 'AWS::ECS::Service') {
    return investigateEcsService(err, sdk, debug);
  }

  return [];
}

/**
 * Gather additional context for a failed `AWS::ECS::Service` resource.
 *
 * Looks up the service by its physical ID, collects stopped-task reasons from
 * it, and then reads log events for every container of the service's task
 * definition that uses the `awslogs` driver.
 *
 * @param err the resource error; its `physicalId` identifies the service
 * @param sdk SDK used to create the ECS and CloudWatch Logs clients
 * @param debug sink for messages explaining why a step produced nothing
 * @returns the contexts that could be gathered, possibly empty
 */
async function investigateEcsService(
  err: ResourceError,
  sdk: SDK,
  debug: (msg: string) => Promise<void>,
): Promise<AdditionalDiagnosticContext[]> {
  const serviceId = err.physicalId;
  if (!serviceId) {
    await debug('ECS investigation: no physical ID available');
    return [];
  }

  const identifier = parseEcsServiceIdentifier(serviceId);
  if (!identifier.serviceName) {
    await debug(`ECS investigation: could not parse service identifier from "${serviceId}"`);
    return [];
  }

  const ecsClient = sdk.ecs();
  const logsClient = sdk.cloudWatchLogs();

  const service = await describeService(ecsClient, identifier.clusterArn, identifier.serviceName, debug);
  if (!service) {
    return [];
  }

  const stoppedTasks = await getStoppedTaskReasons(ecsClient, identifier.clusterArn, service, debug);
  const findings: AdditionalDiagnosticContext[] = stoppedTasks ? [stoppedTasks] : [];

  // Without a task definition there is no logging configuration to follow.
  if (!service.taskDefinition) {
    return findings;
  }

  const logConfigs = await getLogConfigsFromTaskDefinition(ecsClient, service.taskDefinition, debug);
  if (logConfigs.length === 0) {
    return findings;
  }

  // One lookup per container log configuration, all started together.
  // eslint-disable-next-line @cdklabs/promiseall-no-unbounded-parallelism
  const logContexts = await Promise.all(logConfigs.map(cfg => fetchRecentLogs(logsClient, cfg, debug)));
  findings.push(
    ...logContexts.filter((ctx): ctx is AdditionalDiagnosticContext => Boolean(ctx)),
  );

  return findings;
}

/**
 * Split the physical ID of an ECS service into a cluster and a service name.
 *
 * Recognized shapes:
 * - long ARN: `arn:<partition>:ecs:<region>:<account>:service/<cluster-name>/<service-name>`
 * - short ARN: `arn:<partition>:ecs:<region>:<account>:service/<service-name>`
 * - `<cluster>/<service>`
 * - anything else is returned unchanged as the service name
 *
 * @param physicalId the physical resource ID of the service
 * @returns the parsed parts. Despite its name, `clusterArn` holds the cluster
 *   segment exactly as it appears in the ID (a cluster name for ARNs); it is
 *   omitted when the ID does not carry a cluster.
 */
function parseEcsServiceIdentifier(physicalId: string): { clusterArn?: string; serviceName?: string } {
  const longArn = physicalId.match(/^arn:[^:]+:ecs:[^:]*:[^:]*:service\/([^/]+)\/(.+)$/);
  if (longArn) {
    return { clusterArn: longArn[1], serviceName: longArn[2] };
  }

  // A short ARN has no cluster segment. It must be handled before the generic
  // "<cluster>/<service>" split below, which would otherwise report the ARN
  // prefix ("arn:...:service") as the cluster.
  const shortArn = physicalId.match(/^arn:[^:]+:ecs:[^:]*:[^:]*:service\/([^/]+)$/);
  if (shortArn) {
    return { serviceName: shortArn[1] };
  }

  const parts = physicalId.split('/');
  if (parts.length === 2) {
    return { clusterArn: parts[0], serviceName: parts[1] };
  }

  return { serviceName: physicalId };
}

/**
 * Look up a single ECS service.
 *
 * Never rejects because of the lookup itself: a missing service or a failing
 * API call is reported through `debug` and results in `undefined`.
 *
 * @param ecs the ECS client
 * @param cluster the cluster to look in; passed to the API as given, including `undefined`
 * @param serviceName the service to describe
 * @param debug sink for the reason nothing was returned
 * @returns the first service in the response, or `undefined`
 */
async function describeService(
  ecs: IECSClient,
  cluster: string | undefined,
  serviceName: string,
  debug: (msg: string) => Promise<void>,
) {
  try {
    const response = await ecs.describeServices({ cluster, services: [serviceName] });
    const found = response.services?.[0];
    if (found) {
      return found;
    }

    await debug(`ECS investigation: service "${serviceName}" not found`);
    return found;
  } catch (e: any) {
    await debug(`ECS investigation: failed to describe service: ${e.message}`);
    return undefined;
  }
}

/**
 * Explain why tasks of an ECS service stopped.
 *
 * Only runs when at least one service event mentions "stopped" or "failed".
 * Task IDs found in the service events are looked up to obtain the task and
 * container stop reasons. When that yields nothing (no task IDs, no reasons, or
 * the lookup failed) the failure event messages themselves are returned instead.
 *
 * @param ecs the ECS client
 * @param cluster the cluster the service runs in, if known
 * @param service the described service; only its `events` are read
 * @param debug sink for the reason a lookup failed
 * @returns a context labelled "ECS Stopped Tasks", or `undefined` when there is nothing to report
 */
async function getStoppedTaskReasons(
  ecs: IECSClient,
  cluster: string | undefined,
  service: { events?: Array<{ message?: string }>; [key: string]: any },
  debug: (msg: string) => Promise<void>,
): Promise<AdditionalDiagnosticContext | undefined> {
  try {
    const events = service.events ?? [];

    const failureEvents = events
      .filter(e => e.message?.includes('stopped') || e.message?.includes('failed'))
      .slice(0, 5);

    if (failureEvents.length === 0) {
      return undefined;
    }

    const taskIds = extractTaskIds(events).slice(0, 3);
    let messages = await describeTaskStopReasons(ecs, cluster, taskIds, debug);

    // Fall back to the raw event text so a failed or empty task lookup does
    // not hide the failure events we already have.
    if (messages.length === 0) {
      messages = failureEvents
        .map(e => e.message)
        .filter((m): m is string => Boolean(m));
    }

    if (messages.length === 0) {
      return undefined;
    }

    return { source: 'ECS Stopped Tasks', messages };
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    await debug(`ECS investigation: failed to get stopped task reasons: ${reason}`);
    return undefined;
  }
}

/**
 * Collect the distinct task IDs mentioned in service event messages, in order of appearance.
 *
 * Only IDs shaped like a task ID are accepted (32 hex characters, or a
 * 36-character UUID). A looser pattern would also pick up ordinary words that
 * start with hex letters, such as "task because" or "task definition".
 */
function extractTaskIds(events: Array<{ message?: string }>): string[] {
  const taskIdPattern = /\btask ([a-f0-9]{32}|[a-f0-9]{8}(?:-[a-f0-9]{4}){3}-[a-f0-9]{12})\b/;
  const ids = events
    .map(e => e.message?.match(taskIdPattern)?.[1])
    .filter((id): id is string => id !== undefined);
  return [...new Set(ids)];
}

/**
 * Describe the given tasks and return their task-level and container-level stop reasons.
 *
 * Returns an empty array when no IDs are given or when the lookup fails; a
 * failure is reported through `debug` rather than thrown.
 */
async function describeTaskStopReasons(
  ecs: IECSClient,
  cluster: string | undefined,
  taskIds: string[],
  debug: (msg: string) => Promise<void>,
): Promise<string[]> {
  if (taskIds.length === 0) {
    return [];
  }

  try {
    const tasksResp = await ecs.describeTasks({ cluster, tasks: taskIds });
    const reasons: string[] = [];
    for (const task of tasksResp.tasks ?? []) {
      if (task.stoppedReason) {
        reasons.push(`Task stopped: ${task.stoppedReason}`);
      }
      for (const container of task.containers ?? []) {
        if (container.reason) {
          reasons.push(`Container "${container.name}": ${container.reason}`);
        }
      }
    }
    return reasons;
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    await debug(`ECS investigation: failed to describe stopped tasks: ${reason}`);
    return [];
  }
}

/**
 * The `awslogs` logging settings of one container in a task definition.
 */
interface AwsLogsConfig {
/** Log group name, taken from the `awslogs-group` log driver option. */
logGroup: string;
/** Value of the `awslogs-stream-prefix` log driver option, if set. */
streamPrefix?: string;
/** Name of the container definition these settings were read from. */
containerName?: string;
}

/**
 * Read the `awslogs` logging settings of every container in a task definition.
 *
 * Containers that use another log driver, or that have no `awslogs-group`
 * option, are left out. A failing API call is reported through `debug` and
 * yields an empty list.
 *
 * @param ecs the ECS client
 * @param taskDefinitionArn the task definition to describe
 * @param debug sink for the reason the lookup failed
 * @returns one entry per container that logs to a log group, in container order
 */
async function getLogConfigsFromTaskDefinition(
  ecs: IECSClient,
  taskDefinitionArn: string,
  debug: (msg: string) => Promise<void>,
): Promise<AwsLogsConfig[]> {
  try {
    const response = await ecs.describeTaskDefinition({ taskDefinition: taskDefinitionArn });
    const definitions = response.taskDefinition?.containerDefinitions ?? [];

    return definitions.flatMap((definition): AwsLogsConfig[] => {
      const logging = definition.logConfiguration;
      if (logging?.logDriver !== 'awslogs') {
        return [];
      }

      const group = logging.options?.['awslogs-group'];
      if (!group) {
        return [];
      }

      return [{
        logGroup: group,
        streamPrefix: logging.options?.['awslogs-stream-prefix'],
        containerName: definition.name,
      }];
    });
  } catch (e: any) {
    await debug(`ECS investigation: failed to describe task definition: ${e.message}`);
    return [];
  }
}

/**
 * Fetch log events from the last 30 minutes for one container's log configuration.
 *
 * When both a stream prefix and a container name are known, the query is
 * limited to that container's streams (`<prefix>/<container-name>/...`), so
 * containers sharing a log group and prefix do not show each other's output.
 *
 * @param cwl the CloudWatch Logs client
 * @param logConfig where the container logs to
 * @param debug sink for the reason nothing was returned
 * @returns a context labelled with the log group (and container, if known), or
 *   `undefined` when there are no usable messages or the lookup failed
 */
async function fetchRecentLogs(
  cwl: ICloudWatchLogsClient,
  logConfig: AwsLogsConfig,
  debug: (msg: string) => Promise<void>,
): Promise<AdditionalDiagnosticContext | undefined> {
  const lookbackMillis = 30 * 60 * 1000;
  const maxEvents = 20;

  try {
    const startTime = Date.now() - lookbackMillis;

    // The awslogs driver names streams "<prefix>/<container-name>/<task-id>".
    let streamPrefix: string | undefined;
    if (logConfig.streamPrefix) {
      streamPrefix = logConfig.containerName
        ? `${logConfig.streamPrefix}/${logConfig.containerName}/`
        : logConfig.streamPrefix;
    }

    // NOTE(review): FilterLogEvents appears to return events oldest-first, so
    // `limit` presumably keeps the earliest events in the window rather than
    // the latest — confirm this is the intended selection.
    const resp = await cwl.filterLogEvents({
      logGroupName: logConfig.logGroup,
      startTime,
      limit: maxEvents,
      ...(streamPrefix ? { logStreamNamePrefix: streamPrefix } : {}),
    });

    const messages = (resp.events ?? [])
      .map(e => e.message?.trimEnd())
      .filter((m): m is string => m != null);

    // Events without a message carry nothing to show, so treat them like no events.
    if (messages.length === 0) {
      await debug(`ECS investigation: no recent log events in ${logConfig.logGroup}`);
      return undefined;
    }

    const source = logConfig.containerName
      ? `CloudWatch Logs: ${logConfig.logGroup} (container: ${logConfig.containerName})`
      : `CloudWatch Logs: ${logConfig.logGroup}`;

    return { source, messages };
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    await debug(`ECS investigation: failed to fetch logs from ${logConfig.logGroup}: ${reason}`);
    return undefined;
  }
}
Loading
Loading