我使用以下 terraform 脚本配置了 GKE 私有集群
resource "google_container_cluster" "cluster" {
name = var.cluster_name
project = var.project
location = var.zone
network = google_compute_network.network.self_link
subnetwork = google_compute_subnetwork.subnetwork.self_link
logging_service = "logging.googleapis.com/kubernetes"
monitoring_service = "monitoring.googleapis.com/kubernetes"
remove_default_node_pool = "true"
initial_node_count = 1
addons_config {
network_policy_config {
disabled = false
}
}
workload_identity_config {
identity_namespace = format("%s.svc.id.goog", var.project)
}
master_auth {
username = ""
password = ""
client_certificate_config {
issue_client_certificate = "false"
}
}
network_policy {
enabled = "true"
}
ip_allocation_policy {
cluster_secondary_range_name = google_compute_subnetwork.subnetwork.secondary_ip_range.0.range_name
services_secondary_range_name = google_compute_subnetwork.subnetwork.secondary_ip_range.1.range_name
}
master_authorized_networks_config {
cidr_blocks {
display_name = "bastion"
cidr_block = format("%s/32", google_compute_instance.bastion.network_interface.0.network_ip)
}
}
private_cluster_config {
enable_private_endpoint = "true"
enable_private_nodes = "true"
master_ipv4_cidr_block = "172.16.0.16/28"
}
timeouts {
create = "30m"
update = "30m"
delete = "30m"
}
depends_on = [
google_project_service.service,
google_project_iam_member.service-account,
google_project_iam_member.service-account-custom,
google_compute_router_nat.nat,
]
}
resource "google_container_node_pool" "private-np-1" {
name = "private-np-1"
location = var.zone
cluster = google_container_cluster.cluster.name
node_count = "3"
management {
auto_repair = "true"
auto_upgrade = "false"
}
node_config {
machine_type = "e2-micro"
disk_type = "pd-standard"
disk_size_gb = 100
image_type = "COS"
service_account = google_service_account.gke-sa.email
oauth_scopes = [
"https://www.googleapis.com/auth/devstorage.read_only",
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
"https://www.googleapis.com/auth/servicecontrol",
"https://www.googleapis.com/auth/service.management.readonly",
"https://www.googleapis.com/auth/trace.append",
]
labels = {
cluster = var.cluster_name
}
workload_metadata_config {
node_metadata = "GKE_METADATA_SERVER"
}
metadata = {
google-compute-enable-virtio-rng = "true"
disable-legacy-endpoints = "true"
}
}
depends_on = [
google_container_cluster.cluster,
]
}
这是我的网络地形脚本:
resource "google_service_account" "gke-sa" {
account_id = format("%s-node-sa", var.cluster_name)
display_name = "GKE Security Service Account"
project = var.project
}
resource "google_project_iam_member" "service-account" {
count = length(var.service_account_iam_roles)
project = var.project
role = element(var.service_account_iam_roles, count.index)
member = format("serviceAccount:%s", google_service_account.gke-sa.email)
}
resource "google_project_service" "service" {
count = length(var.project_services)
project = var.project
service = element(var.project_services, count.index)
disable_on_destroy = false
}
resource "google_compute_network" "network" {
name = format("%s-network", var.cluster_name)
project = var.project
auto_create_subnetworks = false
depends_on = [
google_project_service.service,
]
}
resource "google_compute_subnetwork" "subnetwork" {
name = format("%s-subnet", var.cluster_name)
project = var.project
network = google_compute_network.network.self_link
region = var.region
ip_cidr_range = "10.0.0.0/24"
private_ip_google_access = true
secondary_ip_range {
range_name = format("%s-pod-range", var.cluster_name)
ip_cidr_range = "10.1.0.0/16"
}
secondary_ip_range {
range_name = format("%s-svc-range", var.cluster_name)
ip_cidr_range = "10.2.0.0/20"
}
}
resource "google_compute_address" "nat" {
name = format("%s-nat-ip", var.cluster_name)
project = var.project
region = var.region
depends_on = [
google_project_service.service,
]
}
resource "google_compute_router" "router" {
name = format("%s-cloud-router", var.cluster_name)
project = var.project
region = var.region
network = google_compute_network.network.self_link
bgp {
asn = 64514
}
}
resource "google_compute_router_nat" "nat" {
name = format("%s-cloud-nat", var.cluster_name)
project = var.project
router = google_compute_router.router.name
region = var.region
nat_ip_allocate_option = "MANUAL_ONLY"
nat_ips = [google_compute_address.nat.self_link]
source_subnetwork_ip_ranges_to_nat = "LIST_OF_SUBNETWORKS"
subnetwork {
name = google_compute_subnetwork.subnetwork.self_link
source_ip_ranges_to_nat = ["PRIMARY_IP_RANGE", "LIST_OF_SECONDARY_IP_RANGES"]
secondary_ip_range_names = [
google_compute_subnetwork.subnetwork.secondary_ip_range.0.range_name,
google_compute_subnetwork.subnetwork.secondary_ip_range.1.range_name,
]
}
}
locals {
hostname = format("%s-bastion", var.cluster_name)
}
resource "google_service_account" "bastion" {
account_id = format("%s-bastion-sa", var.cluster_name)
display_name = "GKE Bastion SA"
}
resource "google_compute_firewall" "bastion-ssh" {
name = format("%s-bastion-ssh", var.cluster_name)
network = google_compute_network.network.name
direction = "INGRESS"
project = var.project
source_ranges = ["0.0.0.0/0"]
allow {
protocol = "tcp"
ports = ["22"]
}
target_tags = ["bastion"]
}
data "template_file" "startup_script" {
template = <<-EOF
sudo apt-get update -y
sudo apt-get install -y tinyproxy
EOF
}
resource "google_compute_instance" "bastion" {
name = local.hostname
machine_type = "g1-small"
zone = var.zone
project = var.project
tags = ["bastion"]
boot_disk {
initialize_params {
image = "debian-cloud/debian-9"
}
}
metadata_startup_script = data.template_file.startup_script.rendered
network_interface {
subnetwork = google_compute_subnetwork.subnetwork.name
access_config {
// Ephemeral IP
}
}
allow_stopping_for_update = true
service_account {
email = google_service_account.bastion.email
scopes = ["cloud-platform"]
}
// This provider is used to block the subsequent providers until the instance
// is available.
provisioner "local-exec" {
command = <<EOF
READY=""
for i in $(seq 1 20); do
if gcloud compute ssh ${local.hostname} --project ${var.project} --zone ${var.region}-a --command uptime; then
READY="yes"
break;
fi
echo "Waiting for ${local.hostname} to initialize..."
sleep 10;
done
if [[ -z $READY ]]; then
echo "${local.hostname} failed to start in time."
echo "Please verify that the instance starts and then re-run `terraform apply`"
exit 1
fi
EOF
}
}
总之,上面的脚本正在做的是:
- 创建具有子网的 VPC(2 个辅助 IP 范围)
- 为所有出口创建 NAT
- 在具有 3 个节点的 VPC(只有私有端点)内创建一个 GKE 集群。
- 创建一个堡垒 GCE,允许通过 tcp:22(通过防火墙)从 Internet 进入,以便通过隧道连接到私有主端点。
这些资源都创建得很好,但是节点状态非常不稳定。只有一个节点(经常)在就绪和未知之间切换状态。其余 2 个节点始终处于未知状态。任何部署(甚至入口控制器部署)都失败了。
我不是 GKE 的专家。
这里有什么问题?为什么节点处于“未知状态”?