我为 AWS ECS 横向扩展定义了 CloudWatch 警报。
通常它工作正常,但有时会因以下错误而失败。500 是横向扩展的阈值,指标周期(period)为 5 分钟,触发警报的数据点设置为 2 个中的 1 个(即 10 分钟内只要有一个值超过阈值就会触发横向扩展):
“错误”:“未找到度量值 [437.08774491907025, 516.9558339660845] 和违反阈值 500.0 的步进调整”
步进调整定义如下:
# Single step of the scale-up policy: applies when the metric is at or above
# the alarm threshold (difference >= 0) and requests an adjustment of +1.
# No upper bound is set, and no other step is defined, so there is no step
# that covers a metric value below the threshold (negative difference).
step_adjustment {
metric_interval_lower_bound = 0
scaling_adjustment = 1
}
报警配置:
# Excerpt of the CloudWatch alarm arguments (the enclosing resource is not shown).
# The alarm fires when 1 of the 2 most recent evaluation periods breaches the threshold.
datapoints_to_alarm = "1"
evaluation_periods = "2"
# Scale-out threshold for the metric.
threshold = "500"
用于创建警报的 Terraform 代码
# Scale-up step policy for the sqs_to_kinesis ECS service: raises the service's
# desired task count by one each time the policy is invoked with a metric value
# at or above the alarm threshold.
resource "aws_appautoscaling_policy" "task_count_up" {
  # Target: the DesiredCount dimension of the ECS service.
  service_namespace  = "ecs"
  scalable_dimension = "ecs:service:DesiredCount"
  resource_id        = "service/${aws_ecs_cluster.shared-elb-access-logs-processor.name}/${aws_ecs_service.sqs_to_kinesis.name}"

  name = "appScalingPolicy_${aws_ecs_service.sqs_to_kinesis.name}_ScaleUp"

  step_scaling_policy_configuration {
    # The adjustment is a delta on the current capacity, not an absolute value.
    adjustment_type         = "ChangeInCapacity"
    metric_aggregation_type = "Maximum"
    cooldown                = "${var.scale_up_cooldown_seconds}"

    # Only interval defined: [threshold + 0, +infinity) => add one task.
    # NOTE(review): no step covers a metric value below the threshold; confirm
    # this is intended given the alarm can fire on 1 of 2 datapoints.
    step_adjustment {
      scaling_adjustment          = 1
      metric_interval_lower_bound = 0
    }
  }

  # The scalable target must be registered before a policy can be attached to it.
  depends_on = ["aws_appautoscaling_target.main"]
}
# Scale-down step policy for the sqs_to_kinesis ECS service: lowers the
# service's desired task count by one each time the policy is invoked with a
# metric value at or below the alarm threshold.
resource "aws_appautoscaling_policy" "task_count_down" {
  # Target: the DesiredCount dimension of the ECS service.
  service_namespace  = "ecs"
  scalable_dimension = "ecs:service:DesiredCount"
  resource_id        = "service/${aws_ecs_cluster.shared-elb-access-logs-processor.name}/${aws_ecs_service.sqs_to_kinesis.name}"

  name = "appScalingPolicy_${aws_ecs_service.sqs_to_kinesis.name}_ScaleDown"

  step_scaling_policy_configuration {
    # The adjustment is a delta on the current capacity, not an absolute value.
    adjustment_type         = "ChangeInCapacity"
    metric_aggregation_type = "Minimum"
    cooldown                = "${var.scale_down_cooldown_seconds}"

    # Only interval defined: (-infinity, threshold + 0] => remove one task.
    step_adjustment {
      scaling_adjustment          = -1
      metric_interval_upper_bound = 0
    }
  }

  # The scalable target must be registered before a policy can be attached to it.
  depends_on = ["aws_appautoscaling_target.main"]
}