Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

ECS Container Health Checks Failing

I'm having trouble with the container healthchecks in my ECS deployment.

I have a NextJS application with a /health route that returns an HTTP 200 response with some JSON. It works as expected locally and for my ALB load balancer health checks.

My deployed application is accessible via its web address and works as expected there.

However, I cannot get my ECS container health checks to pass. I've tried almost every iteration of the healthcheck.command I could think of in the task definition and all of them fail after ~2 minutes.

screenshot of ECS task

Health check iterations I've tried:

  • CMD-SHELL,curl -f http://localhost/health || exit 1
  • CMD-SHELL,curl -f http://localhost:3000/health || exit 1
  • CMD-SHELL,curl -f http://127.0.0.1/health || exit 1
  • CMD-SHELL,curl -f http://127.0.0.1:3000/health || exit 1
  • CMD-SHELL, echo "OK"
  • echo "OK"
  • CMD, echo "OK"

I do not see any signs of health checks in my application logs (obviously I wouldn't see anything when the health check is an echo, but I also didn't see anything when it was an API endpoint). I've confirmed that echo and curl exist in the final container image.

My Task definition looks like this:

{
    "taskDefinitionArn": "[REDACTED]",
    "containerDefinitions": [
        {
            "name": "log_router",
            "image": "grafana/fluent-bit-plugin-loki",
            "cpu": 0,
            "memoryReservation": 50,
            "portMappings": [],
            "essential": true,
            "environment": [],
            "mountPoints": [],
            "volumesFrom": [],
            "user": "0",
            "logConfiguration": {
                "logDriver": "awslogs",
                "options": {
                    "awslogs-group": "firelens-container",
                    "awslogs-create-group": "true",
                    "awslogs-region": "us-east-2",
                    "awslogs-stream-prefix": "firelens"
                },
                "secretOptions": []
            },
            "systemControls": [],
            "firelensConfiguration": {
                "type": "fluentbit",
                "options": {
                    "enable-ecs-log-metadata": "true"
                }
            }
        },
        {
            "name": "[REDACTED]",
            "image": "[REDACTED]:latest",
            "cpu": 1024,
            "memory": 2048,
            "portMappings": [
                {
                    "name": "[REDACTED]-3000-tcp",
                    "containerPort": 3000,
                    "hostPort": 3000,
                    "protocol": "tcp"
                }
            ],
            "essential": true,
            "environment": [
                {
                    "name": "ENVIRONMENT",
                    "value": "development"
                }
            ],
            "mountPoints": [],
            "volumesFrom": [],
            "secrets": [],
            "ulimits": [
                {
                    "name": "nofile",
                    "softLimit": 65536,
                    "hardLimit": 65536
                }
            ],
            "logConfiguration": {
                "logDriver": "awsfirelens",
                "options": {
                    "RemoveKeys": "container_id,ecs_task_arn",
                    "LineFormat": "key_value",
                    "Labels": "{app=\"[REDACTED]\"}",
                    "LabelKeys": "container_name,ecs_task_definition,source,ecs_cluster",
                    "Url": "[REDACTED]",
                    "Name": "grafana-loki"
                },
                "secretOptions": []
            },
            "healthCheck": {
                "command": [
                    "CMD",
                    "echo \"ok\""
                ],
                "interval": 30,
                "timeout": 5,
                "retries": 3,
                "startPeriod": 30
            },
            "systemControls": []
        }
    ],
    "family": "[REDACTED]-development",
    "taskRoleArn": "[REDACTED]",
    "executionRoleArn": "[REDACTED]",
    "networkMode": "awsvpc",
    "revision": 22,
    "volumes": [],
    "status": "ACTIVE",
    "requiresAttributes": [
        {
            "name": "ecs.capability.execution-role-awslogs"
        },
        {
            "name": "com.amazonaws.ecs.capability.ecr-auth"
        },
        {
            "name": "com.amazonaws.ecs.capability.docker-remote-api.1.17"
        },
        {
            "name": "com.amazonaws.ecs.capability.docker-remote-api.1.21"
        },
        {
            "name": "com.amazonaws.ecs.capability.logging-driver.awsfirelens"
        },
        {
            "name": "com.amazonaws.ecs.capability.task-iam-role"
        },
        {
            "name": "ecs.capability.container-health-check"
        },
        {
            "name": "ecs.capability.execution-role-ecr-pull"
        },
        {
            "name": "ecs.capability.secrets.ssm.environment-variables"
        },
        {
            "name": "com.amazonaws.ecs.capability.docker-remote-api.1.18"
        },
        {
            "name": "ecs.capability.task-eni"
        },
        {
            "name": "com.amazonaws.ecs.capability.docker-remote-api.1.29"
        },
        {
            "name": "com.amazonaws.ecs.capability.logging-driver.awslogs"
        },
        {
            "name": "com.amazonaws.ecs.capability.docker-remote-api.1.19"
        },
        {
            "name": "ecs.capability.firelens.fluentbit"
        }
    ],
    "placementConstraints": [],
    "compatibilities": [
        "EC2",
        "FARGATE"
    ],
    "requiresCompatibilities": [
        "FARGATE"
    ],
    "cpu": "1024",
    "memory": "2048",
    "registeredAt": "2024-07-12T15:54:18.852Z",
    "registeredBy": "[REDACTED]",
    "tags": [
        {
            "key": "Environment",
            "value": "development"
        },
        {
            "key": "CostCenter",
            "value": "engineering"
        },
        {
            "key": "Application",
            "value": "[REDACTED]"
        },
        {
            "key": "Name",
            "value": "[REDACTED]"
        }
    ]
}
like image 678
Andy Avatar asked Oct 20 '25 11:10

Andy


1 Answer

It took a few things to fix this. Since I was using the :latest tag, the image which had curl installed was not being used when updating the task definition.

I should be using a unique image tag/SHA when updating task definitions.

I also found that localhost is not addressable, so 127.0.0.1 was needed instead. The final health check that worked looks like this:

Console --> CMD-SHELL,curl -f http://127.0.0.1:3000/ || exit 1

JSON -->

            "healthCheck": {
                "command": [
                    "CMD-SHELL",
                    "curl -f http://127.0.0.1:3000/ || exit 1"
                ],
                "interval": 30,
                "timeout": 5,
                "retries": 3,
                "startPeriod": 30
            },
like image 140
Andy Avatar answered Oct 22 '25 04:10

Andy



Donate For Us

If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow! Thank you!