Deploy GKE with Terraform
Project Structure
gke-terraform/
├── main.tf
├── variables.tf
├── outputs.tf
├── versions.tf
├── terraform.tfvars
├── modules/
│   ├── networking/
│   │   ├── main.tf
│   │   ├── variables.tf
│   │   └── outputs.tf
│   ├── gke/
│   │   ├── main.tf
│   │   ├── variables.tf
│   │   └── outputs.tf
│   └── security/
│       ├── main.tf
│       ├── variables.tf
│       └── outputs.tf
└── environments/
    ├── dev.tfvars
    ├── staging.tfvars
    └── production.tfvars
Step 1 — Versions and Provider Config
# versions.tf
terraform {
  required_version = ">= 1.6.0"

  required_providers {
    google = {
      source  = "hashicorp/google"
      version = "~> 5.0"
    }
    google-beta = {
      source  = "hashicorp/google-beta"
      version = "~> 5.0"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.0"
    }
  }

  # Remote state backend — left as partial config so each
  # environment can supply its own bucket and prefix at init:
  #   terraform init -backend-config="bucket=..." -backend-config="prefix=gke/<env>"
  backend "gcs" {}
}

provider "google" {
  project = var.project_id
  region  = var.region
}

provider "google-beta" {
  project = var.project_id
  region  = var.region
}

# K8s provider — uses GKE cluster output
provider "kubernetes" {
  host  = "https://${module.gke.endpoint}"
  token = data.google_client_config.default.access_token
  cluster_ca_certificate = base64decode(
    module.gke.ca_certificate
  )
}

data "google_client_config" "default" {}
Step 2 — Variables
# variables.tf
variable "project_id" {
  description = "GCP Project ID"
  type        = string
}

variable "region" {
  description = "GCP region"
  type        = string
  default     = "us-central1"
}

variable "environment" {
  description = "Environment name"
  type        = string
  validation {
    condition     = contains(["dev", "staging", "production"], var.environment)
    error_message = "Must be dev, staging, or production."
  }
}

variable "cluster_name" {
  description = "GKE cluster name"
  type        = string
}

# Networking
# Note: GCP VPC networks have no CIDR of their own — only subnets do,
# so there is deliberately no vpc_cidr variable here.
variable "subnet_cidr" {
  description = "Subnet CIDR"
  type        = string
  default     = "10.0.0.0/20"
}

variable "pods_cidr" {
  description = "Pod IP range"
  type        = string
  default     = "10.4.0.0/14"
}

variable "services_cidr" {
  description = "Services IP range"
  type        = string
  default     = "10.0.16.0/20"
}

variable "master_cidr" {
  description = "Control plane CIDR"
  type        = string
  default     = "172.16.0.0/28"
}

# Node pools
variable "node_pools" {
  description = "Node pool configurations"
  type = map(object({
    machine_type = string
    min_nodes    = number
    max_nodes    = number
    disk_size_gb = number
    disk_type    = string
    spot         = bool
    taints = list(object({
      key    = string
      value  = string
      effect = string
    }))
    labels = map(string)
  }))
}

# Security
variable "authorized_networks" {
  description = "Networks authorized to access K8s API"
  type = list(object({
    cidr_block   = string
    display_name = string
  }))
  default = []
}

variable "enable_private_endpoint" {
  description = "Disable public K8s API endpoint"
  type        = bool
  default     = false
}

# Labels
variable "labels" {
  description = "Labels applied to all resources"
  type        = map(string)
  default     = {}
}
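The CIDR inputs can also be caught at plan time rather than failing at the API. A minimal sketch of the pattern for one variable, using Terraform's can() and cidrhost() functions; the same validation block can be added to the other CIDR variables:

variable "pods_cidr" {
  description = "Pod IP range"
  type        = string
  default     = "10.4.0.0/14"

  validation {
    # cidrhost() fails on anything that isn't a valid CIDR,
    # and can() converts that failure into false
    condition     = can(cidrhost(var.pods_cidr, 0))
    error_message = "pods_cidr must be a valid IPv4 CIDR block."
  }
}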
# terraform.tfvars
project_id   = "mycompany-prod"
region       = "us-central1"
environment  = "production"
cluster_name = "prod-cluster"

subnet_cidr   = "10.0.0.0/20"
pods_cidr     = "10.4.0.0/14"
services_cidr = "10.0.16.0/20"
master_cidr   = "172.16.0.0/28"

node_pools = {
  # System components pool
  system = {
    machine_type = "n2-standard-2"
    min_nodes    = 1
    max_nodes    = 3
    disk_size_gb = 50
    disk_type    = "pd-ssd"
    spot         = false
    taints = [{
      key    = "CriticalAddonsOnly"
      value  = "true"
      effect = "NO_SCHEDULE"
    }]
    labels = { pool = "system" }
  }

  # Application workloads
  application = {
    machine_type = "n2-standard-4"
    min_nodes    = 2
    max_nodes    = 20
    disk_size_gb = 100
    disk_type    = "pd-ssd"
    spot         = false
    taints       = []
    labels       = { pool = "application" }
  }

  # Spot pool for batch workloads
  spot = {
    machine_type = "n2-standard-4"
    min_nodes    = 0
    max_nodes    = 10
    disk_size_gb = 100
    disk_type    = "pd-ssd"
    spot         = true
    taints = [{
      key    = "cloud.google.com/gke-spot"
      value  = "true"
      effect = "NO_SCHEDULE"
    }]
    labels = { pool = "spot" }
  }
}

authorized_networks = [
  { cidr_block = "10.0.0.0/8", display_name = "internal" },
  { cidr_block = "203.0.113.0/24", display_name = "office-vpn" }
]

labels = {
  environment = "production"
  team        = "platform"
  managed_by  = "terraform"
  cost_center = "engineering"
}
Step 3 — Networking Module
# modules/networking/main.tf

# ── VPC ──────────────────────────────────────────────────────
resource "google_compute_network" "vpc" {
  name                    = "${var.cluster_name}-vpc"
  project                 = var.project_id
  auto_create_subnetworks = false
  routing_mode            = "GLOBAL"
  description             = "VPC for GKE cluster ${var.cluster_name}"
}

# ── Subnet ───────────────────────────────────────────────────
resource "google_compute_subnetwork" "subnet" {
  name                     = "${var.cluster_name}-subnet"
  project                  = var.project_id
  region                   = var.region
  network                  = google_compute_network.vpc.id
  ip_cidr_range            = var.subnet_cidr
  private_ip_google_access = true # reach GCP APIs privately

  # Secondary ranges for pods and services
  secondary_ip_range {
    range_name    = "pods"
    ip_cidr_range = var.pods_cidr
  }
  secondary_ip_range {
    range_name    = "services"
    ip_cidr_range = var.services_cidr
  }

  # VPC Flow Logs
  log_config {
    aggregation_interval = "INTERVAL_5_SEC"
    flow_sampling        = 0.5
    metadata             = "INCLUDE_ALL_METADATA"
  }
}

# ── Cloud Router ─────────────────────────────────────────────
resource "google_compute_router" "router" {
  name    = "${var.cluster_name}-router"
  project = var.project_id
  region  = var.region
  network = google_compute_network.vpc.id

  bgp {
    asn = 64514
  }
}

# ── Cloud NAT (outbound internet for private nodes) ──────────
resource "google_compute_router_nat" "nat" {
  name                               = "${var.cluster_name}-nat"
  project                            = var.project_id
  router                             = google_compute_router.router.name
  region                             = var.region
  nat_ip_allocate_option             = "AUTO_ONLY"
  source_subnetwork_ip_ranges_to_nat = "ALL_SUBNETWORKS_ALL_IP_RANGES"

  log_config {
    enable = true
    filter = "ERRORS_ONLY"
  }
}

# ── Firewall Rules ────────────────────────────────────────────

# Allow internal traffic within VPC
resource "google_compute_firewall" "allow_internal" {
  name    = "${var.cluster_name}-allow-internal"
  project = var.project_id
  network = google_compute_network.vpc.id

  allow {
    protocol = "tcp"
    ports    = ["0-65535"]
  }
  allow {
    protocol = "udp"
    ports    = ["0-65535"]
  }
  allow {
    protocol = "icmp"
  }

  source_ranges = [
    var.subnet_cidr,
    var.pods_cidr,
    var.services_cidr
  ]
  description = "Allow internal VPC traffic"
}

# Allow GCP health checks (required for load balancers)
resource "google_compute_firewall" "allow_health_checks" {
  name    = "${var.cluster_name}-allow-health-checks"
  project = var.project_id
  network = google_compute_network.vpc.id

  allow {
    protocol = "tcp"
    ports    = ["10256", "8080", "8443"]
  }

  source_ranges = [
    "35.191.0.0/16", # GCP LB health check range
    "130.211.0.0/22" # GCP LB health check range
  ]
  target_tags = ["gke-${var.cluster_name}"]
  description = "Allow GCP load balancer health checks"
}

# Deny all ingress by default
resource "google_compute_firewall" "deny_all_ingress" {
  name     = "${var.cluster_name}-deny-all-ingress"
  project  = var.project_id
  network  = google_compute_network.vpc.id
  priority = 65534

  deny {
    protocol = "all"
  }

  source_ranges = ["0.0.0.0/0"]
  description   = "Default deny all ingress"
}
# modules/networking/outputs.tf
output "network_id" {
  value = google_compute_network.vpc.id
}

output "network_name" {
  value = google_compute_network.vpc.name
}

output "subnet_id" {
  value = google_compute_subnetwork.subnet.id
}

output "subnet_name" {
  value = google_compute_subnetwork.subnet.name
}

output "pods_range_name" {
  value = "pods"
}

output "services_range_name" {
  value = "services"
}
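If you would rather not repeat the literal range names, the last two outputs can be derived from the subnet resource itself; a small sketch, relying on the fact that the list order matches the declaration order of the secondary_ip_range blocks above:

output "pods_range_name" {
  # first secondary_ip_range block declared on the subnet
  value = google_compute_subnetwork.subnet.secondary_ip_range[0].range_name
}

output "services_range_name" {
  # second secondary_ip_range block
  value = google_compute_subnetwork.subnet.secondary_ip_range[1].range_name
}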
Step 4 — Security Module
# modules/security/main.tf

# ── Service Account for GKE Nodes ────────────────────────────
resource "google_service_account" "gke_nodes" {
  account_id   = "${var.cluster_name}-nodes"
  display_name = "GKE Node Service Account"
  project      = var.project_id
  description  = "Minimal SA for GKE nodes"
}

# Minimal permissions for nodes
resource "google_project_iam_member" "node_permissions" {
  for_each = toset([
    "roles/logging.logWriter",       # write logs
    "roles/monitoring.metricWriter", # write metrics
    "roles/monitoring.viewer",       # read monitoring
    "roles/stackdriver.resourceMetadata.writer",
    "roles/artifactregistry.reader", # pull images
  ])

  project = var.project_id
  role    = each.value
  member  = "serviceAccount:${google_service_account.gke_nodes.email}"
}

# ── KMS Key for etcd encryption ───────────────────────────────
resource "google_kms_key_ring" "gke" {
  name     = "${var.cluster_name}-keyring"
  project  = var.project_id
  location = var.region
}

resource "google_kms_crypto_key" "etcd" {
  name            = "${var.cluster_name}-etcd-key"
  key_ring        = google_kms_key_ring.gke.id
  rotation_period = "7776000s" # 90 days

  lifecycle {
    prevent_destroy = true
  }
}

# Allow GKE to use KMS key
resource "google_kms_crypto_key_iam_member" "gke_kms" {
  crypto_key_id = google_kms_crypto_key.etcd.id
  role          = "roles/cloudkms.cryptoKeyEncrypterDecrypter"
  member        = "serviceAccount:service-${data.google_project.project.number}@container-engine-robot.iam.gserviceaccount.com"
}

data "google_project" "project" {
  project_id = var.project_id
}

# ── Workload Identity Service Accounts ────────────────────────

# Example: Workload Identity for app teams
resource "google_service_account" "workload_identity_sa" {
  for_each = var.workload_identity_bindings

  account_id   = each.key
  display_name = each.value.display_name
  project      = var.project_id
}

resource "google_project_iam_member" "workload_sa_permissions" {
  for_each = {
    for binding in flatten([
      for sa_name, sa_config in var.workload_identity_bindings : [
        for role in sa_config.roles : {
          key     = "${sa_name}-${role}"
          sa_name = sa_name
          role    = role
        }
      ]
    ]) : binding.key => binding
  }

  project = var.project_id
  role    = each.value.role
  member  = "serviceAccount:${google_service_account.workload_identity_sa[each.value.sa_name].email}"
}

resource "google_service_account_iam_member" "workload_identity_binding" {
  for_each = var.workload_identity_bindings

  service_account_id = google_service_account.workload_identity_sa[each.key].name
  role               = "roles/iam.workloadIdentityUser"
  member             = "serviceAccount:${var.project_id}.svc.id.goog[${each.value.namespace}/${each.value.ksa_name}]"
}
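The module's variables.tf is not shown in this guide; here is a minimal sketch of what the references in main.tf imply, in particular the shape of workload_identity_bindings. The types are inferred from the usage above, not copied from a canonical file:

# modules/security/variables.tf — sketch inferred from main.tf
variable "project_id" {
  type = string
}

variable "region" {
  type = string
}

variable "cluster_name" {
  type = string
}

variable "environment" {
  type = string
}

variable "workload_identity_bindings" {
  description = "GSA account_id => Workload Identity binding config"
  type = map(object({
    display_name = string
    namespace    = string       # K8s namespace of the bound KSA
    ksa_name     = string       # Kubernetes ServiceAccount name
    roles        = list(string) # project roles granted to the GSA
  }))
  default = {}
}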
# modules/security/outputs.tf
output "node_sa_email" {
  value = google_service_account.gke_nodes.email
}

output "kms_key_id" {
  value = google_kms_crypto_key.etcd.id
}

output "workload_sa_emails" {
  value = {
    for name, sa in google_service_account.workload_identity_sa :
    name => sa.email
  }
}
Step 5 — GKE Module
# modules/gke/main.tf
locals {
  cluster_name = var.cluster_name
  labels = merge(var.labels, {
    managed_by  = "terraform"
    environment = var.environment
  })
}

# ── GKE Cluster ──────────────────────────────────────────────
resource "google_container_cluster" "primary" {
  provider = google-beta
  name     = local.cluster_name
  project  = var.project_id
  location = var.region # regional for HA

  # Remove default node pool
  remove_default_node_pool = true
  initial_node_count       = 1

  # Networking
  network    = var.network_id
  subnetwork = var.subnet_id

  # VPC-native — required for private clusters
  ip_allocation_policy {
    cluster_secondary_range_name  = var.pods_range_name
    services_secondary_range_name = var.services_range_name
  }

  # ── Private Cluster ──────────────────────────────────────
  private_cluster_config {
    enable_private_nodes    = true
    enable_private_endpoint = var.enable_private_endpoint
    master_ipv4_cidr_block  = var.master_cidr
    master_global_access_config {
      enabled = true # access from any region
    }
  }

  # ── Authorized Networks ──────────────────────────────────
  master_authorized_networks_config {
    dynamic "cidr_blocks" {
      for_each = var.authorized_networks
      content {
        cidr_block   = cidr_blocks.value.cidr_block
        display_name = cidr_blocks.value.display_name
      }
    }
    gcp_public_cidrs_access_enabled = false
  }

  # ── Workload Identity ────────────────────────────────────
  workload_identity_config {
    workload_pool = "${var.project_id}.svc.id.goog"
  }

  # ── Security ─────────────────────────────────────────────
  binary_authorization {
    evaluation_mode = var.environment == "production" ? "PROJECT_SINGLETON_POLICY_ENFORCE" : "DISABLED"
  }

  # etcd encryption using KMS
  database_encryption {
    state    = "ENCRYPTED"
    key_name = var.kms_key_id
  }

  # ── Addons ───────────────────────────────────────────────
  addons_config {
    # HTTP load balancing (required for Ingress)
    http_load_balancing {
      disabled = false
    }
    # Horizontal Pod Autoscaling
    horizontal_pod_autoscaling {
      disabled = false
    }
    # CSI driver for persistent disks
    gce_persistent_disk_csi_driver_config {
      enabled = true
    }
    # GCS FUSE driver
    gcs_fuse_csi_driver_config {
      enabled = true
    }
    # NodeLocal DNSCache
    dns_cache_config {
      enabled = true
    }
  }

  # ── Network Policy ───────────────────────────────────────
  # Dataplane V2 (eBPF via Cilium) enforces NetworkPolicy
  # natively. Do not also enable the Calico network_policy
  # block — GKE rejects that combination.
  datapath_provider = "ADVANCED_DATAPATH"

  # ── Logging and Monitoring ───────────────────────────────
  logging_config {
    enable_components = [
      "SYSTEM_COMPONENTS",
      "WORKLOADS",
      "APISERVER",
      "SCHEDULER",
      "CONTROLLER_MANAGER"
    ]
  }

  monitoring_config {
    enable_components = [
      "SYSTEM_COMPONENTS",
      "APISERVER",
      "SCHEDULER",
      "CONTROLLER_MANAGER",
      "STORAGE",
      "HPA",
      "POD",
      "DAEMONSET",
      "DEPLOYMENT",
      "STATEFULSET"
    ]
    managed_prometheus {
      enabled = true
    }
    advanced_datapath_observability_config {
      enable_metrics = true
      enable_relay   = true
    }
  }

  # ── Release Channel ──────────────────────────────────────
  release_channel {
    channel = var.release_channel
  }

  # ── Maintenance Window ───────────────────────────────────
  maintenance_policy {
    recurring_window {
      start_time = "2024-01-01T02:00:00Z"
      end_time   = "2024-01-01T06:00:00Z"
      recurrence = "FREQ=WEEKLY;BYDAY=SA,SU"
    }
    maintenance_exclusion {
      exclusion_name = "black-friday"
      start_time     = "2024-11-25T00:00:00Z"
      end_time       = "2024-12-02T00:00:00Z"
      exclusion_options {
        scope = "NO_UPGRADES"
      }
    }
  }

  # ── Cluster Autoscaling ──────────────────────────────────
  cluster_autoscaling {
    enabled             = true
    autoscaling_profile = "OPTIMIZE_UTILIZATION"
    resource_limits {
      resource_type = "cpu"
      minimum       = 4
      maximum       = 200
    }
    resource_limits {
      resource_type = "memory"
      minimum       = 16
      maximum       = 800
    }
    auto_provisioning_defaults {
      service_account = var.node_sa_email
      oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]
      management {
        auto_repair  = true
        auto_upgrade = true
      }
      shielded_instance_config {
        enable_secure_boot          = true
        enable_integrity_monitoring = true
      }
      disk_size = 100
      disk_type = "pd-ssd"
    }
  }

  # ── Node Pool Defaults ───────────────────────────────────
  node_pool_defaults {
    node_config_defaults {
      logging_variant = "MAX_THROUGHPUT"
    }
  }

  # ── Security posture ─────────────────────────────────────
  security_posture_config {
    mode               = "BASIC"
    vulnerability_mode = "VULNERABILITY_BASIC"
  }

  resource_labels = local.labels

  lifecycle {
    ignore_changes = [
      initial_node_count,
      node_pool,
    ]
    # lifecycle meta-arguments must be literal values — a
    # var.environment conditional is rejected by Terraform.
    # Set this to true for production clusters.
    prevent_destroy = false
  }

  # No explicit depends_on here: depends_on cannot reference a
  # variable, and the dependency on the node service account is
  # already implicit through var.node_sa_email.
}

# ── Node Pools ────────────────────────────────────────────────
resource "google_container_node_pool" "pools" {
  for_each = var.node_pools

  name     = each.key
  project  = var.project_id
  cluster  = google_container_cluster.primary.name
  location = var.region

  # Autoscaling
  autoscaling {
    min_node_count  = each.value.min_nodes
    max_node_count  = each.value.max_nodes
    location_policy = "BALANCED" # spread across zones
  }

  # Auto-repair and upgrade
  management {
    auto_repair  = true
    auto_upgrade = true
  }

  # Surge upgrade — zero downtime
  upgrade_settings {
    strategy        = "SURGE"
    max_surge       = 1
    max_unavailable = 0
  }

  node_config {
    machine_type    = each.value.machine_type
    disk_size_gb    = each.value.disk_size_gb
    disk_type       = each.value.disk_type
    spot            = each.value.spot
    image_type      = "COS_CONTAINERD"
    service_account = var.node_sa_email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

    # Workload Identity
    workload_metadata_config {
      mode = "GKE_METADATA"
    }

    # Shielded nodes
    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }

    # Taints
    dynamic "taint" {
      for_each = each.value.taints
      content {
        key    = taint.value.key
        value  = taint.value.value
        effect = taint.value.effect
      }
    }

    # Labels
    labels = merge(local.labels, each.value.labels)

    # Network tags
    tags = [
      "gke-${local.cluster_name}",
      "gke-${local.cluster_name}-${each.key}"
    ]

    # Metadata
    metadata = {
      disable-legacy-endpoints = "true"
    }

    # Resource labels for cost tracking
    resource_labels = merge(local.labels, {
      pool = each.key
      spot = tostring(each.value.spot)
    })
  }

  lifecycle {
    ignore_changes = [
      node_config[0].resource_labels,
    ]
  }
}
# modules/gke/outputs.tf
output "cluster_name" {
  description = "GKE cluster name"
  value       = google_container_cluster.primary.name
}

output "endpoint" {
  description = "Cluster endpoint"
  value       = google_container_cluster.primary.endpoint
  sensitive   = true
}

output "ca_certificate" {
  description = "Cluster CA certificate"
  value       = google_container_cluster.primary.master_auth[0].cluster_ca_certificate
  sensitive   = true
}

output "cluster_id" {
  description = "Cluster resource ID"
  value       = google_container_cluster.primary.id
}

output "workload_identity_pool" {
  description = "Workload Identity pool"
  value       = "${var.project_id}.svc.id.goog"
}

output "node_pool_names" {
  description = "Node pool names"
  value       = [for np in google_container_node_pool.pools : np.name]
}

output "get_credentials_command" {
  description = "Command to get cluster credentials"
  value       = "gcloud container clusters get-credentials ${google_container_cluster.primary.name} --region ${var.region} --project ${var.project_id}"
}
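With the spot pool's taint and labels in place, batch workloads have to opt in to those nodes explicitly. A sketch of what that looks like through the kubernetes provider configured in Step 1; the deployment name, namespace, and image are placeholders, and the pool label matches the tfvars above:

resource "kubernetes_deployment" "batch_worker" {
  metadata {
    name      = "batch-worker" # placeholder
    namespace = "default"
  }

  spec {
    replicas = 2
    selector {
      match_labels = { app = "batch-worker" }
    }
    template {
      metadata {
        labels = { app = "batch-worker" }
      }
      spec {
        # land only on nodes labeled pool=spot
        node_selector = { pool = "spot" }

        # tolerate the taint the spot pool applies
        toleration {
          key      = "cloud.google.com/gke-spot"
          operator = "Equal"
          value    = "true"
          effect   = "NoSchedule"
        }

        container {
          name  = "worker"
          image = "us-docker.pkg.dev/mycompany-prod/batch/worker:latest" # placeholder
        }
      }
    }
  }
}

Without both the node_selector and the toleration, pods either cannot schedule onto the spot pool or spill onto it unintentionally; requiring both is the point of the taint-plus-label pattern.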
Step 6 — Root Module
# main.tf

# ── Networking ────────────────────────────────────────────────
module "networking" {
  source = "./modules/networking"

  project_id    = var.project_id
  region        = var.region
  cluster_name  = var.cluster_name
  subnet_cidr   = var.subnet_cidr
  pods_cidr     = var.pods_cidr
  services_cidr = var.services_cidr
  labels        = var.labels
}

# ── Security ──────────────────────────────────────────────────
module "security" {
  source = "./modules/security"

  project_id   = var.project_id
  region       = var.region
  cluster_name = var.cluster_name
  environment  = var.environment

  workload_identity_bindings = {
    "app-sa" = {
      display_name = "Application Service Account"
      namespace    = "production"
      ksa_name     = "app-ksa"
      roles = [
        "roles/storage.objectViewer",
        "roles/secretmanager.secretAccessor"
      ]
    }
  }
}

# ── GKE Cluster ───────────────────────────────────────────────
module "gke" {
  source = "./modules/gke"

  project_id   = var.project_id
  region       = var.region
  cluster_name = var.cluster_name
  environment  = var.environment

  # Networking
  network_id          = module.networking.network_id
  subnet_id           = module.networking.subnet_id
  pods_range_name     = module.networking.pods_range_name
  services_range_name = module.networking.services_range_name
  master_cidr         = var.master_cidr

  # Security
  kms_key_id              = module.security.kms_key_id
  node_sa_email           = module.security.node_sa_email
  authorized_networks     = var.authorized_networks
  enable_private_endpoint = var.enable_private_endpoint

  # Node pools
  node_pools      = var.node_pools
  release_channel = "REGULAR"
  labels          = var.labels

  depends_on = [
    module.networking,
    module.security
  ]
}
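The security module only creates the Google side of the app-sa Workload Identity binding; the Kubernetes ServiceAccount it points at can be managed from the same root module via the kubernetes provider from Step 1. A sketch, assuming the production namespace already exists (or is created with a kubernetes_namespace resource):

resource "kubernetes_service_account" "app_ksa" {
  metadata {
    name      = "app-ksa"
    namespace = "production"
    annotations = {
      # GSA email comes from the security module's output
      "iam.gke.io/gcp-service-account" = module.security.workload_sa_emails["app-sa"]
    }
  }
}

One caveat of configuring the kubernetes provider from module outputs: the cluster must exist before these resources can plan cleanly, so the first apply typically targets the infrastructure and Kubernetes-level resources are applied afterwards.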
# outputs.tf
output "cluster_name" {
  description = "GKE cluster name"
  value       = module.gke.cluster_name
}

output "cluster_endpoint" {
  description = "Cluster API endpoint"
  value       = module.gke.endpoint
  sensitive   = true
}

output "get_credentials" {
  description = "Command to configure kubectl"
  value       = module.gke.get_credentials_command
}

output "workload_identity_pool" {
  description = "Workload Identity pool"
  value       = module.gke.workload_identity_pool
}

output "node_sa_email" {
  description = "Node service account email"
  value       = module.security.node_sa_email
}
Step 7 — Environment Configs
# environments/dev.tfvars
project_id   = "mycompany-dev"
environment  = "dev"
cluster_name = "dev-cluster"
region       = "us-central1"

subnet_cidr   = "10.10.0.0/20"
pods_cidr     = "10.10.16.0/20"
services_cidr = "10.10.32.0/20"
master_cidr   = "172.16.0.32/28"

node_pools = {
  application = {
    machine_type = "n2-standard-2" # smaller for dev
    min_nodes    = 0               # scale to zero
    max_nodes    = 5
    disk_size_gb = 50
    disk_type    = "pd-standard"   # cheaper disk
    spot         = true            # spot VMs in dev
    taints       = []
    labels       = { pool = "application", env = "dev" }
  }
}

authorized_networks = [
  { cidr_block = "0.0.0.0/0", display_name = "all-for-dev" }
]

enable_private_endpoint = false

labels = {
  environment = "dev"
  team        = "platform"
  managed_by  = "terraform"
}
# environments/production.tfvars
project_id   = "mycompany-prod"
environment  = "production"
cluster_name = "prod-cluster"
region       = "us-central1"

subnet_cidr   = "10.0.0.0/20"
pods_cidr     = "10.4.0.0/14"
services_cidr = "10.0.16.0/20"
master_cidr   = "172.16.0.0/28"

node_pools = {
  system = {
    machine_type = "n2-standard-2"
    min_nodes    = 1
    max_nodes    = 3
    disk_size_gb = 50
    disk_type    = "pd-ssd"
    spot         = false
    taints = [{
      key    = "CriticalAddonsOnly"
      value  = "true"
      effect = "NO_SCHEDULE"
    }]
    labels = { pool = "system" }
  }

  application = {
    machine_type = "n2-standard-8"
    min_nodes    = 3
    max_nodes    = 50
    disk_size_gb = 100
    disk_type    = "pd-ssd"
    spot         = false
    taints       = []
    labels       = { pool = "application" }
  }

  spot = {
    machine_type = "n2-standard-4"
    min_nodes    = 0
    max_nodes    = 20
    disk_size_gb = 100
    disk_type    = "pd-ssd"
    spot         = true
    taints = [{
      key    = "cloud.google.com/gke-spot"
      value  = "true"
      effect = "NO_SCHEDULE"
    }]
    labels = { pool = "spot" }
  }
}

authorized_networks = [
  { cidr_block = "10.0.0.0/8", display_name = "internal" },
  { cidr_block = "203.0.113.0/24", display_name = "office-vpn" }
]

enable_private_endpoint = false

labels = {
  environment = "production"
  team        = "platform"
  managed_by  = "terraform"
  cost_center = "engineering"
  criticality = "high"
}
Step 8 — GitHub Actions CI/CD
# .github/workflows/terraform-gke.yml
name: Deploy GKE

on:
  push:
    branches: [main]
    paths:
      - 'gke-terraform/**'
  pull_request:
    branches: [main]
    paths:
      - 'gke-terraform/**'
  workflow_dispatch:
    inputs:
      environment:
        description: Target environment
        required: true
        type: choice
        options: [dev, staging, production]
      action:
        description: Terraform action
        required: true
        type: choice
        options: [plan, apply, destroy]

env:
  TF_VERSION: "1.6.0"
  WORKING_DIR: gke-terraform

jobs:
  # ── Validate ────────────────────────────────────────────────
  validate:
    name: Validate
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
      - name: Format check
        run: terraform fmt -check -recursive
        working-directory: ${{ env.WORKING_DIR }}
      - name: Validate
        run: |
          terraform init -backend=false
          terraform validate
        working-directory: ${{ env.WORKING_DIR }}

  # ── Security scan ───────────────────────────────────────────
  security:
    name: Security Scan
    runs-on: ubuntu-latest
    needs: validate
    steps:
      - uses: actions/checkout@v4
      - name: Checkov scan
        uses: bridgecrewio/checkov-action@master
        with:
          directory: ${{ env.WORKING_DIR }}
          framework: terraform
          soft_fail: false
      - name: tfsec scan
        uses: aquasecurity/tfsec-action@v1.0.0
        with:
          working_directory: ${{ env.WORKING_DIR }}

  # ── Plan ────────────────────────────────────────────────────
  plan:
    name: Plan ${{ matrix.environment }}
    runs-on: ubuntu-latest
    needs: [validate, security]
    strategy:
      matrix:
        environment: [dev, staging, production]
    permissions:
      contents: read
      id-token: write
      pull-requests: write
    steps:
      - uses: actions/checkout@v4
      - id: auth
        uses: google-github-actions/auth@v2
        with:
          workload_identity_provider: ${{ secrets.WIF_PROVIDER }}
          # GitHub expressions have no upper() function, but secret
          # lookups are case-insensitive, so this resolves
          # DEV_TF_SA / STAGING_TF_SA / PRODUCTION_TF_SA
          service_account: ${{ secrets[format('{0}_TF_SA', matrix.environment)] }}
      - uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
      - name: Terraform Init
        run: |
          terraform init \
            -backend-config="bucket=${{ secrets.TF_STATE_BUCKET }}" \
            -backend-config="prefix=gke/${{ matrix.environment }}"
        working-directory: ${{ env.WORKING_DIR }}
      - name: Terraform Plan
        id: plan
        run: |
          terraform plan \
            -var-file="environments/${{ matrix.environment }}.tfvars" \
            -out=tfplan-${{ matrix.environment }} \
            -no-color \
            2>&1 | tee plan-output.txt
          echo "exitcode=${PIPESTATUS[0]}" >> $GITHUB_OUTPUT
        working-directory: ${{ env.WORKING_DIR }}
      - name: Upload plan
        uses: actions/upload-artifact@v4
        with:
          name: tfplan-${{ matrix.environment }}
          path: ${{ env.WORKING_DIR }}/tfplan-${{ matrix.environment }}
          retention-days: 1
      - name: Post plan to PR
        uses: actions/github-script@v7
        if: github.event_name == 'pull_request'
        with:
          script: |
            const fs = require('fs');
            const plan = fs.readFileSync(
              '${{ env.WORKING_DIR }}/plan-output.txt', 'utf8'
            );
            const maxLen = 65000; // GitHub comment limit is 65536 chars
            const truncated = plan.length > maxLen
              ? plan.substring(0, maxLen) + '\n... truncated'
              : plan;
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: `## Terraform Plan — \`${{ matrix.environment }}\`\n\`\`\`\n${truncated}\n\`\`\``
            });

  # ── Apply Dev ───────────────────────────────────────────────
  apply-dev:
    name: Apply Dev
    runs-on: ubuntu-latest
    needs: plan
    if: github.ref == 'refs/heads/main'
    environment: dev
    permissions:
      contents: read
      id-token: write
    steps:
      - uses: actions/checkout@v4
      - id: auth
        uses: google-github-actions/auth@v2
        with:
          workload_identity_provider: ${{ secrets.WIF_PROVIDER }}
          service_account: ${{ secrets.DEV_TF_SA }}
      - uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
          # the wrapper pollutes stdout, which breaks
          # command substitution of `terraform output`
          terraform_wrapper: false
      - name: Download plan
        uses: actions/download-artifact@v4
        with:
          name: tfplan-dev
          path: ${{ env.WORKING_DIR }}
      - name: Terraform Init
        run: |
          terraform init \
            -backend-config="bucket=${{ secrets.TF_STATE_BUCKET }}" \
            -backend-config="prefix=gke/dev"
        working-directory: ${{ env.WORKING_DIR }}
      - name: Terraform Apply
        run: terraform apply -auto-approve tfplan-dev
        working-directory: ${{ env.WORKING_DIR }}
      - uses: google-github-actions/setup-gcloud@v2
        with:
          install_components: gke-gcloud-auth-plugin
      - name: Configure kubectl
        # the get_credentials output holds the full gcloud command
        run: eval "$(terraform output -raw get_credentials)"
        working-directory: ${{ env.WORKING_DIR }}
      - name: Verify cluster
        run: |
          kubectl get nodes
          kubectl get pods -A

  # ── Apply Production (requires approval) ────────────────────
  apply-production:
    name: Apply Production
    runs-on: ubuntu-latest
    needs: apply-dev
    environment: production # approval gate
    permissions:
      contents: read
      id-token: write
    steps:
      - uses: actions/checkout@v4
      - id: auth
        uses: google-github-actions/auth@v2
        with:
          workload_identity_provider: ${{ secrets.WIF_PROVIDER }}
          service_account: ${{ secrets.PRODUCTION_TF_SA }}
      - uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
      - name: Download plan
        uses: actions/download-artifact@v4
        with:
          name: tfplan-production
          path: ${{ env.WORKING_DIR }}
      - name: Terraform Init
        run: |
          terraform init \
            -backend-config="bucket=${{ secrets.TF_STATE_BUCKET }}" \
            -backend-config="prefix=gke/production"
        working-directory: ${{ env.WORKING_DIR }}
      - name: Terraform Apply
        run: terraform apply -auto-approve tfplan-production
        working-directory: ${{ env.WORKING_DIR }}
      - name: Notify deployment
        uses: slackapi/slack-github-action@v1.26.0
        with:
          payload: |
            {
              "text": "✅ GKE Production cluster deployed successfully\nCluster: prod-cluster\nBy: ${{ github.actor }}"
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK }}
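The workflow assumes Workload Identity Federation between GitHub and GCP is already in place (the WIF_PROVIDER secret). A minimal Terraform sketch of that prerequisite, usually applied once from a separate bootstrap configuration; the pool, provider, and repository names here are assumptions, and terraform-sa matches the service account used elsewhere in this guide:

# bootstrap/wif.tf — sketch of the GitHub OIDC federation setup
resource "google_iam_workload_identity_pool" "github" {
  project                   = var.project_id
  workload_identity_pool_id = "github-pool" # assumed name
}

resource "google_iam_workload_identity_pool_provider" "github" {
  project                            = var.project_id
  workload_identity_pool_id          = google_iam_workload_identity_pool.github.workload_identity_pool_id
  workload_identity_pool_provider_id = "github-provider" # assumed name

  attribute_mapping = {
    "google.subject"       = "assertion.sub"
    "attribute.repository" = "assertion.repository"
  }
  # restrict tokens to this repo (assumed org/repo)
  attribute_condition = "attribute.repository == \"mycompany/gke-terraform\""

  oidc {
    issuer_uri = "https://token.actions.githubusercontent.com"
  }
}

# Let workflows from that repo impersonate the Terraform SA
resource "google_service_account_iam_member" "github_wif" {
  service_account_id = "projects/${var.project_id}/serviceAccounts/terraform-sa@${var.project_id}.iam.gserviceaccount.com"
  role               = "roles/iam.workloadIdentityUser"
  member             = "principalSet://iam.googleapis.com/${google_iam_workload_identity_pool.github.name}/attribute.repository/mycompany/gke-terraform"
}

The full resource name of the provider is what goes into the WIF_PROVIDER secret.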
Step 9 — Deploy and Verify
# ── Initial setup ────────────────────────────────────────────
# Create state bucket
gsutil mb -p mycompany-prod \
  -l us-central1 \
  gs://mycompany-terraform-state

# Enable versioning on state bucket
gsutil versioning set on \
  gs://mycompany-terraform-state

# Enable required GCP APIs
gcloud services enable \
  container.googleapis.com \
  compute.googleapis.com \
  iam.googleapis.com \
  cloudkms.googleapis.com \
  artifactregistry.googleapis.com \
  monitoring.googleapis.com \
  logging.googleapis.com \
  --project mycompany-prod

# ── Deploy dev ───────────────────────────────────────────────
cd gke-terraform

terraform init \
  -backend-config="bucket=mycompany-terraform-state" \
  -backend-config="prefix=gke/dev"

terraform plan \
  -var-file="environments/dev.tfvars" \
  -out=tfplan-dev

terraform apply tfplan-dev

# ── Get cluster credentials ──────────────────────────────────
gcloud container clusters get-credentials dev-cluster \
  --region us-central1 \
  --project mycompany-dev

# ── Verify cluster ───────────────────────────────────────────
kubectl get nodes
# NAME                              STATUS   ROLES    AGE
# gke-dev-cluster-application-xxx   Ready    <none>   2m
# gke-dev-cluster-application-yyy   Ready    <none>   2m

kubectl get pods -A
# All system pods should be Running

# Check node pool details
kubectl get nodes -L cloud.google.com/gke-nodepool,topology.kubernetes.io/zone

# Verify Workload Identity
kubectl create serviceaccount test-ksa -n default
kubectl annotate serviceaccount test-ksa \
  iam.gke.io/gcp-service-account=app-sa@mycompany-dev.iam.gserviceaccount.com

# ── Verify private cluster ───────────────────────────────────
kubectl get nodes -o wide
# EXTERNAL-IP should show <none> for private nodes

# ── Check cluster security ───────────────────────────────────
gcloud container clusters describe dev-cluster \
  --region us-central1 \
  --format="yaml(masterAuth,networkConfig,privateClusterConfig)"

# ── Destroy dev when done ────────────────────────────────────
terraform destroy \
  -var-file="environments/dev.tfvars" \
  -auto-approve
Terraform State Management
# List state resources
terraform state list

# Show specific resource
terraform state show module.gke.google_container_cluster.primary

# Move resource (refactoring) — single-quote indexed addresses
# so the shell doesn't interpret the brackets and quotes
terraform state mv \
  'module.gke.google_container_node_pool.pools["app"]' \
  'module.gke.google_container_node_pool.pools["application"]'

# Import existing cluster
terraform import \
  module.gke.google_container_cluster.primary \
  projects/mycompany-prod/locations/us-central1/clusters/prod-cluster

# Remove from state (without destroying)
terraform state rm \
  'module.gke.google_container_node_pool.pools["old-pool"]'

# Backup state
gsutil cp \
  gs://mycompany-terraform-state/gke/production/default.tfstate \
  ./backup-$(date +%Y%m%d).tfstate
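Since Terraform 1.1, a rename like the state mv above can also be expressed declaratively with a moved block, which shows up in the plan for review instead of being a one-off CLI operation. A sketch for the same pool rename, placed inside the gke module (addresses are module-relative there):

# modules/gke/moved.tf — declarative alternative to `terraform state mv`
moved {
  from = google_container_node_pool.pools["app"]
  to   = google_container_node_pool.pools["application"]
}

After the move has been applied everywhere the state lives, the moved block can be deleted.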
Common Issues and Fixes
# Issue 1 — API not enabled
# Error: googleapi: Error 403: ... has not been used in project
gcloud services enable container.googleapis.com

# Issue 2 — Insufficient permissions
# Error: Error creating Cluster: googleapi: Error 403
gcloud projects add-iam-policy-binding mycompany-prod \
  --member="serviceAccount:terraform-sa@mycompany-prod.iam.gserviceaccount.com" \
  --role="roles/container.admin"

# Issue 3 — KMS key permission
# Error: PERMISSION_DENIED: The caller does not have permission
# Key name follows the module's "<cluster>-etcd-key" pattern;
# replace 123 with your project number
gcloud kms keys add-iam-policy-binding prod-cluster-etcd-key \
  --keyring=prod-cluster-keyring \
  --location=us-central1 \
  --member="serviceAccount:service-123@container-engine-robot.iam.gserviceaccount.com" \
  --role="roles/cloudkms.cryptoKeyEncrypterDecrypter"

# Issue 4 — Node pool update requires replacement
# Use lifecycle ignore_changes or a blue/green node pool strategy
# (see the sketch at the end of this section)
terraform plan -target=module.gke.google_container_node_pool.pools

# Issue 5 — Cluster stuck deleting
# Remove protection and retry
terraform state rm module.gke.google_container_cluster.primary
gcloud container clusters delete prod-cluster \
  --region us-central1 \
  --project mycompany-prod
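For Issue 4, blue/green means standing up a replacement pool next to the old one, draining workloads onto it, then removing the old pool. Because node pools are driven by the node_pools map, this is just two tfvars changes; the pool name and machine type below are illustrative:

# environments/production.tfvars — first apply: add the new pool
node_pools = {
  # replacement pool carries the settings that would otherwise
  # force an in-place pool replacement
  application-v2 = {
    machine_type = "n2-standard-16"
    min_nodes    = 3
    max_nodes    = 50
    disk_size_gb = 100
    disk_type    = "pd-ssd"
    spot         = false
    taints       = []
    labels       = { pool = "application-v2" }
  }
  # ...keep the existing "application" pool entry unchanged here
}

Between the two applies, cordon and drain the old pool's nodes with kubectl so workloads reschedule onto application-v2; the second apply then simply removes the old "application" key from the map, which destroys only that pool.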
This gives you a production-ready GKE deployment — private cluster with VPC-native networking, Workload Identity, KMS encryption, multiple node pools, and a full CI/CD pipeline with security scanning, environment promotion, and approval gates.