Terraform AWS LB healthcheck failed


Question

I have Terraform code (below) that provisions a gateway service on AWS ECS Fargate for me. Services that are not behind the load balancer work fine on the private network, but the gateway with the LB attached keeps failing its health checks: every 2-3 minutes the task is deprovisioned and a new one is provisioned. The Dockerfile exposes the service on port 3000.

Here is the Terraform configuration that fails:

locals {
  gateway_version = "1.0.0"
  gateway_port    = 3000
}

## VPC

module "vpc" {
  source  = "terraform-aws-modules/vpc/aws"
  version = "3.11.0"

  name        = "${var.env}-vpc"
  cidr        = "20.0.0.0/16"
  enable_ipv6 = true

  azs                 = ["eu-central-1a", "eu-central-1b"]
  public_subnets      = ["20.0.1.0/24", "20.0.2.0/24"]
  private_subnets     = ["20.0.86.0/24", "20.0.172.0/24"]
  elasticache_subnets = ["20.0.31.0/24", "20.0.32.0/24"]

  enable_nat_gateway = true
  single_nat_gateway = true

  tags = {
    Terraform = "true"
  }
}

## Security Groups

module "sg" {
  source  = "terraform-aws-modules/security-group/aws"
  version = "~> 4.0"

  name        = "${var.env}-sg-default"
  description = "Default service security group"
  vpc_id      = module.vpc.vpc_id

  ingress_cidr_blocks = ["0.0.0.0/0"]
  ingress_rules = [
    "all-icmp",
    "http-80-tcp",
    "https-443-tcp",
    "mysql-tcp",
    "rabbitmq-4369-tcp",
    "rabbitmq-5671-tcp",
    "rabbitmq-5672-tcp",
    "rabbitmq-15672-tcp",
    "rabbitmq-25672-tcp",
    "redis-tcp"
  ]
  egress_rules = ["all-all"]
}

module "security_group" {
  source  = "terraform-aws-modules/security-group/aws"
  version = "~> 4.0"

  name        = "${var.env}-sg-lb"
  description = "Security group for ALB"
  vpc_id      = module.vpc.vpc_id

  ingress_cidr_blocks = ["0.0.0.0/0"]
  ingress_rules       = ["http-80-tcp", "all-icmp"]
  egress_rules        = ["all-all"]
}


resource "aws_security_group" "service_security_group" {
  name = "${var.env}-lb-connection"
  ingress {
    from_port = 0
    to_port   = 0
    protocol  = "-1"
    # Only allowing traffic in from the load balancer security group
    security_groups = [module.security_group.security_group_id]
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  vpc_id = module.vpc.vpc_id
}

## ECS Cluster

resource "aws_ecs_cluster" "default" {
  name = "${var.env}-cluster"
}

## ECR

data "aws_ecr_repository" "gateway_ecr" {
  name = "gateway-${var.env}"
}

## ECS Task Definition

resource "aws_ecs_task_definition" "gateway_task" {
  family                   = "${var.env}-gateway-task"
  container_definitions    = <<DEFINITION
  [
    {
      "name": "${var.env}-gateway-task",
      "image": "${data.aws_ecr_repository.gateway_ecr.repository_url}:${local.gateway_version}",
      "networkMode": "awsvpc",
      "essential": true,
      "logConfiguration": {
        "logDriver": "awslogs",
        "options": {
          "awslogs-group": "${aws_cloudwatch_log_group.gateway_logs.name}",
          "awslogs-stream-prefix": "ecs",
          "awslogs-region": "${var.aws-region}"
        }
      },
      "portMappings": [
        {
          "containerPort": ${local.gateway_port},
          "hostPort": ${local.gateway_port}
        }
      ],
      "environment": [
        {
          "name": "AWS_REGION",
          "value": "${var.aws-region}"
        },
        {
          "name": "PORT",
          "value": "${local.gateway_port}"
        },
        {
          "name": "STAGE",
          "value": "${var.env}"
        },
        {
          "name": "NODE_ENV",
          "value": "development"
        },
        {
          "name": "VERSION",
          "value": "${local.gateway_version}"
        }
      ],
      "memory": 512,
      "cpu": 256
    }
  ]
  DEFINITION
  requires_compatibilities = ["FARGATE"]
  network_mode             = "awsvpc"
  memory                   = 512
  cpu                      = 256
  task_role_arn            = aws_iam_role.gateway_task_definition_role.arn
  execution_role_arn       = aws_iam_role.gateway_task_execution_role.arn
}

## ECS Service

resource "aws_ecs_service" "gateway_service" {
  name            = "${var.env}-gateway-service"
  cluster         = aws_ecs_cluster.default.id
  task_definition = aws_ecs_task_definition.gateway_task.arn
  launch_type     = "FARGATE"
  desired_count   = 1

  force_new_deployment = true

  network_configuration {
    subnets = concat(
      module.vpc.public_subnets,
      module.vpc.private_subnets,
    )
    security_groups = [
      module.sg.security_group_id,
      aws_security_group.service_security_group.id
    ]
    assign_public_ip = true
  }

  lifecycle {
    ignore_changes = [desired_count]
  }

  load_balancer {
    target_group_arn = aws_lb_target_group.target_group.arn
    container_name   = aws_ecs_task_definition.gateway_task.family
    container_port   = local.gateway_port
  }
}

## Cloudwatch Log Group

resource "aws_cloudwatch_log_group" "gateway_logs" {
  name = "${var.env}-gateway-log-group"


  tags = {
    Name = "${var.env}-gateway-log-group"
  }
}

## IAM Roles

resource "aws_iam_role" "gateway_task_definition_role" {
  name               = "${var.env}-gateway-task-definition-role"
  assume_role_policy = data.aws_iam_policy_document.gateway_assume_role_policy.json

  tags = {
    Name = "${var.env}-gateway-task-definition-role"
  }
}

resource "aws_iam_role" "gateway_task_execution_role" {
  name               = "${var.env}-gateway-task-execution-role"
  assume_role_policy = data.aws_iam_policy_document.gateway_assume_role_policy.json

  tags = {
    Name = "${var.env}-gateway-task-execution-role"
  }
}

data "aws_iam_policy_document" "gateway_assume_role_policy" {
  statement {
    effect  = "Allow"
    actions = ["sts:AssumeRole"]

    principals {
      type        = "Service"
      identifiers = ["ecs-tasks.amazonaws.com"]
    }
  }
}

resource "aws_iam_role_policy" "gateway_exec" {
  name   = "${var.env}-gateway-execution-role-policy"
  role   = aws_iam_role.gateway_task_execution_role.id
  policy = data.aws_iam_policy_document.gateway_exec_policy.json
}

data "aws_iam_policy_document" "gateway_exec_policy" {
  statement {
    effect    = "Allow"
    resources = ["*"]

    actions = [
      "ecr:GetAuthorizationToken",
      "ecr:BatchCheckLayerAvailability",
      "ecr:GetDownloadUrlForLayer",
      "ecr:BatchGetImage",
      "logs:CreateLogStream",
      "logs:PutLogEvents",
    ]
  }
}

## ALB

resource "aws_lb" "alb" {
  name               = "${var.env}-lb"
  load_balancer_type = "application"
  subnets            = module.vpc.public_subnets
  security_groups    = [module.security_group.security_group_id]
}

resource "aws_lb_target_group" "target_group" {
  name        = "target-group"
  port        = 80
  protocol    = "HTTP"
  target_type = "ip"
  vpc_id      = module.vpc.vpc_id

  health_check {
    matcher  = "200,301,302"
    path     = "/health"
    interval = 120
    timeout  = 30
  }
}

resource "aws_lb_listener" "listener" {
  load_balancer_arn = aws_lb.alb.arn
  port              = 80
  protocol          = "HTTP"
  default_action {
    type             = "forward"
    target_group_arn = aws_lb_target_group.target_group.arn
  }
}

This is the error:

Task failed ELB health checks in (target-group arn:aws:elasticloadbalancing:eu-central-1:129228585726:targetgroup/target-group/5853904c0d3ad322)

After deploying, I can see that the ECS service starts and is running, but I never see any requests coming in to check its health.

Answer 1

Your target group uses port = 80, but your ECS task definition specifies port 3000. That is probably why your ALB cannot connect to your container.
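
Note that by default the target group health check probes the same port the target is registered on: the health_check block's port argument defaults to "traffic-port". A sketch spelling that default out explicitly (not a change you need, just the default made visible):

health_check {
  # "traffic-port" is the provider default: probe whatever port the
  # target is registered on rather than a hard-coded one
  port    = "traffic-port"
  matcher = "200,301,302"
  path    = "/health"
}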

Answer 2

The load balancer checks whether it can reach your application on the configured target port. In your case, that is 3000.

Replace your target group resource with one that uses the application port so the LB health check passes:

resource "aws_lb_target_group" "target_group" {
  name        = "target-group"
  port        = 3000
  protocol    = "HTTP"
  target_type = "ip"
  vpc_id      = module.vpc.vpc_id

  health_check {
    matcher  = "200,301,302"
    path     = "/health"
    interval = 120
    timeout  = 30
  }
}
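
If the container needs some time to boot before /health starts answering, it can also help to give the ECS service a startup grace period during which failed ELB health checks do not trigger task replacement. This is an optional addition, not part of the original answer; health_check_grace_period_seconds is a standard aws_ecs_service argument, and the 60 seconds below is an assumed value you should tune to your actual boot time:

resource "aws_ecs_service" "gateway_service" {
  # ... all existing arguments stay unchanged ...

  # Ignore ELB health check results for the first 60 seconds after a
  # task starts, so a slow-booting container is not deregistered
  # prematurely. (60 is an assumption; adjust to your boot time.)
  health_check_grace_period_seconds = 60
}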

Answer 3

The target group is not the problem -> the problem is a wrong security_group that does not allow traffic to reach port 3000.
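
A minimal sketch of what this answer describes, assuming the task's security group should admit the gateway port from the ALB's security group. The rule name lb_to_gateway is hypothetical; the referenced resources are the ones from the question:

resource "aws_security_group_rule" "lb_to_gateway" {
  type                     = "ingress"
  from_port                = 3000
  to_port                  = 3000
  protocol                 = "tcp"
  description              = "Allow ALB traffic and health checks to the gateway port"
  security_group_id        = aws_security_group.service_security_group.id
  source_security_group_id = module.security_group.security_group_id
}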
