Terraform Configuration for GKE: Best Practices

Deploy GKE with Terraform

Project Structure

gke-terraform/
├── main.tf
├── variables.tf
├── outputs.tf
├── versions.tf
├── terraform.tfvars
├── modules/
│ ├── networking/
│ │ ├── main.tf
│ │ ├── variables.tf
│ │ └── outputs.tf
│ ├── gke/
│ │ ├── main.tf
│ │ ├── variables.tf
│ │ └── outputs.tf
│ └── security/
│ ├── main.tf
│ ├── variables.tf
│ └── outputs.tf
└── environments/
├── dev.tfvars
├── staging.tfvars
└── production.tfvars

Step 1 — Versions and Provider Config

# versions.tf
terraform {
  required_version = ">= 1.6.0"

  required_providers {
    google = {
      source  = "hashicorp/google"
      version = "~> 5.0"
    }
    google-beta = {
      source  = "hashicorp/google-beta"
      version = "~> 5.0"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.0"
    }
  }

  # Remote state backend — partial configuration.
  # bucket and prefix are intentionally NOT hardcoded here; they are supplied
  # per environment at init time:
  #   terraform init \
  #     -backend-config="bucket=mycompany-terraform-state" \
  #     -backend-config="prefix=gke/dev"
  # Hardcoding prefix = "gke/production" would point every environment at the
  # production state unless the override is remembered on every init.
  backend "gcs" {}
}

provider "google" {
  project = var.project_id
  region  = var.region
}

provider "google-beta" {
  project = var.project_id
  region  = var.region
}

# Kubernetes provider — connects to the GKE cluster created by module.gke,
# authenticating with the caller's short-lived OAuth access token.
provider "kubernetes" {
  host  = "https://${module.gke.endpoint}"
  token = data.google_client_config.default.access_token
  cluster_ca_certificate = base64decode(
    module.gke.ca_certificate
  )
}

# Current gcloud/ADC credentials; source of the access token above.
data "google_client_config" "default" {}

Step 2 — Variables

# variables.tf
variable "project_id" {
  description = "GCP Project ID"
  type        = string
}
variable "region" {
  description = "GCP region"
  type        = string
  default     = "us-central1"
}
variable "environment" {
  description = "Environment name"
  type        = string
  # Restrict to the three environments that have tfvars files under environments/.
  validation {
    condition     = contains(["dev", "staging", "production"], var.environment)
    error_message = "Must be dev, staging, or production"
  }
}
variable "cluster_name" {
  description = "GKE cluster name"
  type        = string
}

# Networking
# NOTE(review): vpc_cidr is set in every tfvars file but is not passed to any
# module in main.tf (the VPC is custom-mode and carries no CIDR of its own) —
# confirm whether this variable can be removed.
variable "vpc_cidr" {
  description = "VPC CIDR range"
  type        = string
  default     = "10.0.0.0/16"
}
variable "subnet_cidr" {
  description = "Subnet CIDR"
  type        = string
  default     = "10.0.0.0/20"
}
variable "pods_cidr" {
  description = "Pod IP range"
  type        = string
  default     = "10.4.0.0/14"
}
variable "services_cidr" {
  description = "Services IP range"
  type        = string
  default     = "10.0.16.0/20"
}
variable "master_cidr" {
  description = "Control plane CIDR"
  type        = string
  default     = "172.16.0.0/28" # GKE requires a /28 for the control plane range
}

# Node pools — map keyed by pool name; each value fully describes one pool.
variable "node_pools" {
  description = "Node pool configurations"
  type = map(object({
    machine_type = string
    min_nodes    = number
    max_nodes    = number
    disk_size_gb = number
    disk_type    = string
    spot         = bool
    taints = list(object({
      key    = string
      value  = string
      effect = string
    }))
    labels = map(string)
  }))
}

# Security
variable "authorized_networks" {
  description = "Networks authorized to access K8s API"
  type = list(object({
    cidr_block   = string
    display_name = string
  }))
  default = []
}
variable "enable_private_endpoint" {
  description = "Disable public K8s API endpoint"
  type        = bool
  default     = false
}

# Labels
variable "labels" {
  description = "Labels applied to all resources"
  type        = map(string)
  default     = {}
}
# terraform.tfvars
# Default variable values (production-shaped); per-environment overrides live
# in environments/*.tfvars.
project_id    = "mycompany-prod"
region        = "us-central1"
environment   = "production"
cluster_name  = "prod-cluster"
vpc_cidr      = "10.0.0.0/16"
subnet_cidr   = "10.0.0.0/20"
pods_cidr     = "10.4.0.0/14"
services_cidr = "10.0.16.0/20"
master_cidr   = "172.16.0.0/28"
node_pools = {
  # System components pool — tainted so only workloads that explicitly
  # tolerate CriticalAddonsOnly land here.
  system = {
    machine_type = "n2-standard-2"
    min_nodes    = 1
    max_nodes    = 3
    disk_size_gb = 50
    disk_type    = "pd-ssd"
    spot         = false
    taints = [{
      key    = "CriticalAddonsOnly"
      value  = "true"
      effect = "NO_SCHEDULE"
    }]
    labels = { pool = "system" }
  }
  # Application workloads — untainted general-purpose pool.
  application = {
    machine_type = "n2-standard-4"
    min_nodes    = 2
    max_nodes    = 20
    disk_size_gb = 100
    disk_type    = "pd-ssd"
    spot         = false
    taints       = []
    labels       = { pool = "application" }
  }
  # Spot pool for batch workloads — preemptible; taint keeps regular
  # workloads off unless they tolerate it.
  spot = {
    machine_type = "n2-standard-4"
    min_nodes    = 0
    max_nodes    = 10
    disk_size_gb = 100
    disk_type    = "pd-ssd"
    spot         = true
    taints = [{
      key    = "cloud.google.com/gke-spot"
      value  = "true"
      effect = "NO_SCHEDULE"
    }]
    labels = { pool = "spot" }
  }
}
authorized_networks = [
  {
    cidr_block   = "10.0.0.0/8"
    display_name = "internal"
  },
  {
    cidr_block   = "203.0.113.0/24"
    display_name = "office-vpn"
  }
]
labels = {
  environment = "production"
  team        = "platform"
  managed_by  = "terraform"
  cost_center = "engineering"
}

Step 3 — Networking Module

# modules/networking/main.tf
# ── VPC ──────────────────────────────────────────────────────
# Custom-mode VPC: subnets are declared explicitly below, not auto-created
# per region.
resource "google_compute_network" "vpc" {
  name                    = "${var.cluster_name}-vpc"
  project                 = var.project_id
  auto_create_subnetworks = false
  routing_mode            = "GLOBAL"
  description             = "VPC for GKE cluster ${var.cluster_name}"
}
# ── Subnet ───────────────────────────────────────────────────
resource "google_compute_subnetwork" "subnet" {
  name                     = "${var.cluster_name}-subnet"
  project                  = var.project_id
  region                   = var.region
  network                  = google_compute_network.vpc.id
  ip_cidr_range            = var.subnet_cidr
  private_ip_google_access = true # reach GCP APIs privately
  # Secondary ranges for pods and services (VPC-native GKE). The names
  # "pods"/"services" are what the GKE module's ip_allocation_policy refers to.
  secondary_ip_range {
    range_name    = "pods"
    ip_cidr_range = var.pods_cidr
  }
  secondary_ip_range {
    range_name    = "services"
    ip_cidr_range = var.services_cidr
  }
  # VPC Flow Logs — NOTE(review): 5s aggregation + full metadata is the most
  # verbose (and most expensive) setting; confirm the cost is intended.
  log_config {
    aggregation_interval = "INTERVAL_5_SEC"
    flow_sampling        = 0.5
    metadata             = "INCLUDE_ALL_METADATA"
  }
}
# ── Cloud Router ─────────────────────────────────────────────
# Required by the Cloud NAT gateway below.
resource "google_compute_router" "router" {
  name    = "${var.cluster_name}-router"
  project = var.project_id
  region  = var.region
  network = google_compute_network.vpc.id
  bgp {
    asn = 64514 # private ASN
  }
}
# ── Cloud NAT (outbound internet for private nodes) ──────────
resource "google_compute_router_nat" "nat" {
  name                               = "${var.cluster_name}-nat"
  project                            = var.project_id
  router                             = google_compute_router.router.name
  region                             = var.region
  nat_ip_allocate_option             = "AUTO_ONLY"
  source_subnetwork_ip_ranges_to_nat = "ALL_SUBNETWORKS_ALL_IP_RANGES"
  log_config {
    enable = true
    filter = "ERRORS_ONLY" # only log NAT allocation failures
  }
}
# ── Firewall Rules ────────────────────────────────────────────
# Allow internal traffic within VPC (node, pod, and service ranges).
resource "google_compute_firewall" "allow_internal" {
  name    = "${var.cluster_name}-allow-internal"
  project = var.project_id
  network = google_compute_network.vpc.id
  allow {
    protocol = "tcp"
    ports    = ["0-65535"]
  }
  allow {
    protocol = "udp"
    ports    = ["0-65535"]
  }
  allow {
    protocol = "icmp"
  }
  source_ranges = [
    var.subnet_cidr,
    var.pods_cidr,
    var.services_cidr
  ]
  description = "Allow internal VPC traffic"
}
# Allow GCP health checks (required for load balancers)
resource "google_compute_firewall" "allow_health_checks" {
  name    = "${var.cluster_name}-allow-health-checks"
  project = var.project_id
  network = google_compute_network.vpc.id
  allow {
    protocol = "tcp"
    ports    = ["10256", "8080", "8443"] # kube-proxy healthz + common backends
  }
  source_ranges = [
    "35.191.0.0/16", # GCP LB health check range
    "130.211.0.0/22" # GCP LB health check range
  ]
  # Must match the network tags the GKE module puts on nodes.
  target_tags = ["gke-${var.cluster_name}"]
  description = "Allow GCP load balancer health checks"
}
# Deny all ingress by default — priority 65534 sits below every explicit
# allow rule above but above the implied rules.
resource "google_compute_firewall" "deny_all_ingress" {
  name     = "${var.cluster_name}-deny-all-ingress"
  project  = var.project_id
  network  = google_compute_network.vpc.id
  priority = 65534
  deny {
    protocol = "all"
  }
  source_ranges = ["0.0.0.0/0"]
  description = "Default deny all ingress"
}
# modules/networking/outputs.tf
# Outputs consumed by the root module and passed into modules/gke.
output "network_id" {
  description = "Self-link ID of the VPC network"
  value       = google_compute_network.vpc.id
}
output "network_name" {
  description = "Name of the VPC network"
  value       = google_compute_network.vpc.name
}
output "subnet_id" {
  description = "Self-link ID of the GKE subnet"
  value       = google_compute_subnetwork.subnet.id
}
output "subnet_name" {
  description = "Name of the GKE subnet"
  value       = google_compute_subnetwork.subnet.name
}
# Range names are read back from the subnet resource rather than duplicated
# as string literals, so a rename in main.tf cannot drift out of sync.
# Index order matches the declaration order of the secondary_ip_range blocks.
output "pods_range_name" {
  description = "Name of the secondary range used for pod IPs"
  value       = google_compute_subnetwork.subnet.secondary_ip_range[0].range_name
}
output "services_range_name" {
  description = "Name of the secondary range used for service IPs"
  value       = google_compute_subnetwork.subnet.secondary_ip_range[1].range_name
}

Step 4 — Security Module

# modules/security/main.tf
# ── Service Account for GKE Nodes ────────────────────────────
# Dedicated, least-privilege SA so nodes do not run as the broad Compute
# default service account.
resource "google_service_account" "gke_nodes" {
  account_id   = "${var.cluster_name}-nodes"
  display_name = "GKE Node Service Account"
  project      = var.project_id
  description  = "Minimal SA for GKE nodes"
}
# Minimal permissions for nodes — one IAM member resource per role.
resource "google_project_iam_member" "node_permissions" {
  for_each = toset([
    "roles/logging.logWriter",       # write logs
    "roles/monitoring.metricWriter", # write metrics
    "roles/monitoring.viewer",       # read monitoring
    "roles/stackdriver.resourceMetadata.writer",
    "roles/artifactregistry.reader", # pull images
  ])
  project = var.project_id
  role    = each.value
  member  = "serviceAccount:${google_service_account.gke_nodes.email}"
}
# ── KMS Key for etcd encryption ───────────────────────────────
# NOTE(review): KMS key rings cannot be deleted in GCP — destroying this
# module orphans the ring, and re-creating one with the same name fails.
resource "google_kms_key_ring" "gke" {
  name     = "${var.cluster_name}-keyring"
  project  = var.project_id
  location = var.region
}
resource "google_kms_crypto_key" "etcd" {
  name            = "${var.cluster_name}-etcd-key"
  key_ring        = google_kms_key_ring.gke.id
  rotation_period = "7776000s" # 90 days
  lifecycle {
    # Losing this key makes KMS-encrypted etcd data unrecoverable.
    prevent_destroy = true
  }
}
# Allow GKE to use the KMS key. The member is the GKE service agent
# (service-<project-number>@container-engine-robot), not the node SA.
resource "google_kms_crypto_key_iam_member" "gke_kms" {
  crypto_key_id = google_kms_crypto_key.etcd.id
  role          = "roles/cloudkms.cryptoKeyEncrypterDecrypter"
  member        = "serviceAccount:service-${data.google_project.project.number}@container-engine-robot.iam.gserviceaccount.com"
}
# Used only to resolve the project number for the service agent above.
data "google_project" "project" {
  project_id = var.project_id
}
# ── Workload Identity Service Accounts ────────────────────────
# One GCP service account per entry in var.workload_identity_bindings.
resource "google_service_account" "workload_identity_sa" {
  for_each     = var.workload_identity_bindings
  account_id   = each.key
  display_name = each.value.display_name
  project      = var.project_id
}
# Flatten {sa => roles[]} into one IAM member per (sa, role) pair, keyed
# "sa-role", so adding/removing a single role only touches that binding.
resource "google_project_iam_member" "workload_sa_permissions" {
  for_each = {
    for binding in flatten([
      for sa_name, sa_config in var.workload_identity_bindings : [
        for role in sa_config.roles : {
          key     = "${sa_name}-${role}"
          sa_name = sa_name
          role    = role
        }
      ]
    ]) : binding.key => binding
  }
  project = var.project_id
  role    = each.value.role
  member  = "serviceAccount:${google_service_account.workload_identity_sa[each.value.sa_name].email}"
}
# Let the Kubernetes SA (namespace/ksa_name) impersonate the matching GCP SA
# through Workload Identity.
resource "google_service_account_iam_member" "workload_identity_binding" {
  for_each           = var.workload_identity_bindings
  service_account_id = google_service_account.workload_identity_sa[each.key].name
  role               = "roles/iam.workloadIdentityUser"
  member             = "serviceAccount:${var.project_id}.svc.id.goog[${each.value.namespace}/${each.value.ksa_name}]"
}
# modules/security/outputs.tf
# Outputs consumed by the root module and passed into modules/gke.
output "node_sa_email" {
  description = "Email of the minimal service account GKE nodes run as"
  value       = google_service_account.gke_nodes.email
}
output "kms_key_id" {
  description = "Resource ID of the KMS key used for etcd encryption"
  value       = google_kms_crypto_key.etcd.id
}
output "workload_sa_emails" {
  description = "Map of Workload Identity SA name to its email"
  value = {
    for name, sa in google_service_account.workload_identity_sa :
    name => sa.email
  }
}

Step 5 — GKE Module

# modules/gke/main.tf
locals {
  cluster_name = var.cluster_name
  # Labels stamped on the cluster and merged into every node pool's labels.
  labels = merge(var.labels, {
    managed_by  = "terraform"
    environment = var.environment
  })
}

# ── GKE Cluster ──────────────────────────────────────────────
resource "google_container_cluster" "primary" {
  provider = google-beta
  name     = local.cluster_name
  project  = var.project_id
  location = var.region # regional for HA

  # The default pool is created and immediately removed; real pools are
  # managed separately in google_container_node_pool.pools.
  remove_default_node_pool = true
  initial_node_count       = 1

  # Environment-dependent destroy protection. Terraform forbids variables in
  # lifecycle meta-arguments (prevent_destroy must be a literal), so the
  # provider's deletion_protection argument is used instead — it accepts an
  # expression and blocks `terraform destroy` at plan time for production.
  deletion_protection = var.environment == "production"

  # Networking
  network    = var.network_id
  subnetwork = var.subnet_id

  # VPC-native (alias IP) — required for private clusters
  ip_allocation_policy {
    cluster_secondary_range_name  = var.pods_range_name
    services_secondary_range_name = var.services_range_name
  }

  # ── Private Cluster ─────────────────────────────────────
  private_cluster_config {
    enable_private_nodes    = true
    enable_private_endpoint = var.enable_private_endpoint
    master_ipv4_cidr_block  = var.master_cidr
    master_global_access_config {
      enabled = true # access from any region
    }
  }

  # ── Authorized Networks ──────────────────────────────────
  master_authorized_networks_config {
    dynamic "cidr_blocks" {
      for_each = var.authorized_networks
      content {
        cidr_block   = cidr_blocks.value.cidr_block
        display_name = cidr_blocks.value.display_name
      }
    }
    gcp_public_cidrs_access_enabled = false
  }

  # ── Workload Identity ────────────────────────────────────
  workload_identity_config {
    workload_pool = "${var.project_id}.svc.id.goog"
  }

  # ── Security ─────────────────────────────────────────────
  binary_authorization {
    evaluation_mode = var.environment == "production" ? "PROJECT_SINGLETON_POLICY_ENFORCE" : "DISABLED"
  }

  # etcd (application-layer secrets) encryption using a customer-managed key
  database_encryption {
    state    = "ENCRYPTED"
    key_name = var.kms_key_id
  }

  # ── Addons ───────────────────────────────────────────────
  addons_config {
    # HTTP load balancing (required for Ingress)
    http_load_balancing {
      disabled = false
    }
    # Horizontal Pod Autoscaling
    horizontal_pod_autoscaling {
      disabled = false
    }
    # CSI driver for persistent disks
    gce_persistent_disk_csi_driver_config {
      enabled = true
    }
    # GCS FUSE driver
    gcs_fuse_csi_driver_config {
      enabled = true
    }
    # NodeLocal DNSCache
    dns_cache_config {
      enabled = true
    }
  }

  # ── Network Policy ───────────────────────────────────────
  # Dataplane V2 (ADVANCED_DATAPATH, eBPF/Cilium) enforces Kubernetes
  # NetworkPolicy natively. Enabling the Calico network_policy addon at the
  # same time is rejected by the GKE API, so no network_policy block is set.
  datapath_provider = "ADVANCED_DATAPATH"

  # ── Logging and Monitoring ───────────────────────────────
  logging_config {
    enable_components = [
      "SYSTEM_COMPONENTS",
      "WORKLOADS",
      "APISERVER",
      "SCHEDULER",
      "CONTROLLER_MANAGER"
    ]
  }
  monitoring_config {
    enable_components = [
      "SYSTEM_COMPONENTS",
      "WORKLOADS",
      "APISERVER",
      "SCHEDULER",
      "CONTROLLER_MANAGER",
      "STORAGE",
      "HPA",
      "POD",
      "DAEMONSET",
      "DEPLOYMENT",
      "STATEFULSET"
    ]
    managed_prometheus {
      enabled = true
    }
    advanced_datapath_observability_config {
      enable_metrics = true
      enable_relay   = true
    }
  }

  # ── Release Channel ──────────────────────────────────────
  release_channel {
    channel = var.release_channel
  }

  # ── Maintenance Window ───────────────────────────────────
  maintenance_policy {
    # Weekend nights, 02:00–06:00 UTC; only the time-of-day and recurrence
    # matter — the date portion is a fixed anchor.
    recurring_window {
      start_time = "2024-01-01T02:00:00Z"
      end_time   = "2024-01-01T06:00:00Z"
      recurrence = "FREQ=WEEKLY;BYDAY=SA,SU"
    }
    maintenance_exclusion {
      exclusion_name = "black-friday"
      start_time     = "2024-11-25T00:00:00Z"
      end_time       = "2024-12-02T00:00:00Z"
      exclusion_options {
        scope = "NO_UPGRADES"
      }
    }
  }

  # ── Cluster Autoscaling (node auto-provisioning) ─────────
  cluster_autoscaling {
    enabled             = true
    autoscaling_profile = "OPTIMIZE_UTILIZATION"
    resource_limits {
      resource_type = "cpu"
      minimum       = 4
      maximum       = 200
    }
    resource_limits {
      resource_type = "memory"
      minimum       = 16
      maximum       = 800
    }
    auto_provisioning_defaults {
      service_account = var.node_sa_email
      oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]
      management {
        auto_repair  = true
        auto_upgrade = true
      }
      shielded_instance_config {
        enable_secure_boot          = true
        enable_integrity_monitoring = true
      }
      disk_size = 100
      disk_type = "pd-ssd"
    }
  }

  # ── Node Pool Defaults ───────────────────────────────────
  node_pool_defaults {
    node_config_defaults {
      logging_variant = "MAX_THROUGHPUT"
    }
  }

  # ── Security posture ─────────────────────────────────────
  security_posture_config {
    mode               = "BASIC"
    vulnerability_mode = "VULNERABILITY_BASIC"
  }

  resource_labels = local.labels

  lifecycle {
    # Pools are managed by google_container_node_pool; ignore drift here.
    ignore_changes = [
      initial_node_count,
      node_pool,
    ]
  }
  # NOTE: no depends_on. `depends_on = [var.node_sa_email]` is invalid —
  # depends_on accepts only resource/module references — and the dependency
  # on the security module is already implicit through var.node_sa_email
  # being used in auto_provisioning_defaults above.
}
# ── Node Pools ────────────────────────────────────────────────
resource "google_container_node_pool" "pools" {
  # One managed pool per entry in var.node_pools; the map key is the pool name.
  for_each = var.node_pools
  name     = each.key
  project  = var.project_id
  cluster  = google_container_cluster.primary.name
  location = var.region
  # Autoscaling — NOTE(review): for a regional pool these min/max counts
  # apply per zone, not in total; confirm the sizing intent.
  autoscaling {
    min_node_count = each.value.min_nodes
    max_node_count = each.value.max_nodes
    location_policy = "BALANCED" # spread across zones
  }
  # Auto-repair and upgrade
  management {
    auto_repair  = true
    auto_upgrade = true
  }
  # Surge upgrade — one extra node at a time, none unavailable (zero downtime)
  upgrade_settings {
    strategy        = "SURGE"
    max_surge       = 1
    max_unavailable = 0
  }
  node_config {
    machine_type = each.value.machine_type
    disk_size_gb = each.value.disk_size_gb
    disk_type    = each.value.disk_type
    spot         = each.value.spot
    image_type   = "COS_CONTAINERD"
    # Least-privilege node SA from the security module; broad OAuth scope is
    # fine because access is constrained by the SA's IAM roles.
    service_account = var.node_sa_email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]
    # Workload Identity — pods get per-KSA identities via the GKE metadata server
    workload_metadata_config {
      mode = "GKE_METADATA"
    }
    # Shielded nodes
    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }
    # Taints, as declared per pool in tfvars
    dynamic "taint" {
      for_each = each.value.taints
      content {
        key    = taint.value.key
        value  = taint.value.value
        effect = taint.value.effect
      }
    }
    # Kubernetes node labels = shared labels + per-pool labels
    labels = merge(local.labels, each.value.labels)
    # Network tags — the first matches the networking module's
    # health-check firewall target_tags.
    tags = [
      "gke-${local.cluster_name}",
      "gke-${local.cluster_name}-${each.key}"
    ]
    # Metadata — disable legacy (v0.1/v1beta1) metadata endpoints
    metadata = {
      disable-legacy-endpoints = "true"
    }
    # GCP resource labels for cost tracking
    resource_labels = merge(local.labels, {
      pool = each.key
      spot = tostring(each.value.spot)
    })
  }
  lifecycle {
    # GKE appends its own resource labels; ignore that drift.
    ignore_changes = [
      node_config[0].resource_labels,
    ]
  }
}
# modules/gke/outputs.tf
# Outputs consumed by the root module and the kubernetes provider.
output "cluster_name" {
  description = "GKE cluster name"
  value       = google_container_cluster.primary.name
}
# Marked sensitive: endpoint and CA feed the kubernetes provider and should
# not be echoed in plan/apply output.
output "endpoint" {
  description = "Cluster endpoint"
  value       = google_container_cluster.primary.endpoint
  sensitive   = true
}
output "ca_certificate" {
  description = "Cluster CA certificate"
  value       = google_container_cluster.primary.master_auth[0].cluster_ca_certificate
  sensitive   = true
}
output "cluster_id" {
  description = "Cluster resource ID"
  value       = google_container_cluster.primary.id
}
output "workload_identity_pool" {
  description = "Workload Identity pool"
  value       = "${var.project_id}.svc.id.goog"
}
output "node_pool_names" {
  description = "Node pool names"
  value       = [for np in google_container_node_pool.pools : np.name]
}
output "get_credentials_command" {
  description = "Command to get cluster credentials"
  value       = "gcloud container clusters get-credentials ${google_container_cluster.primary.name} --region ${var.region} --project ${var.project_id}"
}

Step 6 — Root Module

# main.tf
# ── Networking ────────────────────────────────────────────────
# Root composition: networking and security are built first, then the GKE
# module consumes their outputs.
module "networking" {
  source        = "./modules/networking"
  project_id    = var.project_id
  region        = var.region
  cluster_name  = var.cluster_name
  subnet_cidr   = var.subnet_cidr
  pods_cidr     = var.pods_cidr
  services_cidr = var.services_cidr
  labels        = var.labels
  # NOTE(review): var.vpc_cidr is never passed here — the VPC is custom-mode
  # and has no CIDR of its own; confirm the variable is still needed.
}
# ── Security ──────────────────────────────────────────────────
module "security" {
  source       = "./modules/security"
  project_id   = var.project_id
  region       = var.region
  cluster_name = var.cluster_name
  environment  = var.environment
  # Workload Identity: GCP SA "app-sa" impersonated by KSA production/app-ksa.
  workload_identity_bindings = {
    "app-sa" = {
      display_name = "Application Service Account"
      namespace    = "production"
      ksa_name     = "app-ksa"
      roles = [
        "roles/storage.objectViewer",
        "roles/secretmanager.secretAccessor"
      ]
    }
  }
}
# ── GKE Cluster ───────────────────────────────────────────────
module "gke" {
  source       = "./modules/gke"
  project_id   = var.project_id
  region       = var.region
  cluster_name = var.cluster_name
  environment  = var.environment
  # Networking
  network_id          = module.networking.network_id
  subnet_id           = module.networking.subnet_id
  pods_range_name     = module.networking.pods_range_name
  services_range_name = module.networking.services_range_name
  master_cidr         = var.master_cidr
  # Security
  kms_key_id              = module.security.kms_key_id
  node_sa_email           = module.security.node_sa_email
  authorized_networks     = var.authorized_networks
  enable_private_endpoint = var.enable_private_endpoint
  # Node pools
  node_pools      = var.node_pools
  release_channel = "REGULAR"
  labels          = var.labels
  # Largely redundant — the input references above already create these
  # dependencies implicitly — but harmless and explicit.
  depends_on = [
    module.networking,
    module.security
  ]
}
# outputs.tf
output "cluster_name" {
  description = "GKE cluster name"
  value       = module.gke.cluster_name
}
output "cluster_endpoint" {
  description = "Cluster API endpoint"
  value       = module.gke.endpoint
  sensitive   = true
}
output "get_credentials" {
  description = "Command to configure kubectl"
  value       = module.gke.get_credentials_command
}
output "workload_identity_pool" {
  description = "Workload Identity pool"
  value       = module.gke.workload_identity_pool
}
output "node_sa_email" {
  description = "Node service account email"
  value       = module.security.node_sa_email
}

Step 7 — Environment Configs

# environments/dev.tfvars
# Cost-optimized dev: single spot pool, scale-to-zero, cheap disks.
project_id    = "mycompany-dev"
environment   = "dev"
cluster_name  = "dev-cluster"
region        = "us-central1"
subnet_cidr   = "10.10.0.0/20"
pods_cidr     = "10.10.16.0/20"
services_cidr = "10.10.32.0/20"
master_cidr   = "172.16.0.32/28"
node_pools = {
  application = {
    machine_type = "n2-standard-2" # smaller for dev
    min_nodes    = 0               # scale to zero
    max_nodes    = 5
    disk_size_gb = 50
    disk_type    = "pd-standard"   # cheaper disk
    spot         = true            # spot VMs in dev
    taints       = []
    labels       = { pool = "application", env = "dev" }
  }
}
# WARNING(review): 0.0.0.0/0 exposes the dev API endpoint to the whole
# internet (auth still required). Prefer restricting to office/VPN ranges
# even in dev.
authorized_networks = [
  {
    cidr_block   = "0.0.0.0/0"
    display_name = "all-for-dev"
  }
]
enable_private_endpoint = false
labels = {
  environment = "dev"
  team        = "platform"
  managed_by  = "terraform"
}
# environments/production.tfvars
project_id    = "mycompany-prod"
environment   = "production"
cluster_name  = "prod-cluster"
region        = "us-central1"
subnet_cidr   = "10.0.0.0/20"
pods_cidr     = "10.4.0.0/14"
services_cidr = "10.0.16.0/20"
master_cidr   = "172.16.0.0/28"
node_pools = {
  # Tainted system pool for critical addons only.
  system = {
    machine_type = "n2-standard-2"
    min_nodes    = 1
    max_nodes    = 3
    disk_size_gb = 50
    disk_type    = "pd-ssd"
    spot         = false
    taints = [{
      key    = "CriticalAddonsOnly"
      value  = "true"
      effect = "NO_SCHEDULE"
    }]
    labels = { pool = "system" }
  }
  # General-purpose application pool.
  application = {
    machine_type = "n2-standard-8"
    min_nodes    = 3
    max_nodes    = 50
    disk_size_gb = 100
    disk_type    = "pd-ssd"
    spot         = false
    taints       = []
    labels       = { pool = "application" }
  }
  # Spot capacity for preemption-tolerant batch work.
  spot = {
    machine_type = "n2-standard-4"
    min_nodes    = 0
    max_nodes    = 20
    disk_size_gb = 100
    disk_type    = "pd-ssd"
    spot         = true
    taints = [{
      key    = "cloud.google.com/gke-spot"
      value  = "true"
      effect = "NO_SCHEDULE"
    }]
    labels = { pool = "spot" }
  }
}
authorized_networks = [
  {
    cidr_block   = "10.0.0.0/8"
    display_name = "internal"
  },
  {
    cidr_block   = "203.0.113.0/24"
    display_name = "office-vpn"
  }
]
# NOTE(review): public endpoint kept enabled (gated by authorized networks
# above); consider enable_private_endpoint = true once all access paths run
# through the VPC/VPN.
enable_private_endpoint = false
labels = {
  environment = "production"
  team        = "platform"
  managed_by  = "terraform"
  cost_center = "engineering"
  criticality = "high"
}

Step 8 — GitHub Actions CI/CD

# .github/workflows/terraform-gke.yml
name: Deploy GKE

on:
  push:
    branches: [main]
    paths:
      - 'gke-terraform/**'
  pull_request:
    branches: [main]
    paths:
      - 'gke-terraform/**'
  # NOTE(review): these inputs are not yet referenced by any job condition —
  # wire them up (or remove them) if ad-hoc plan/apply/destroy runs are needed.
  workflow_dispatch:
    inputs:
      environment:
        description: Target environment
        required: true
        type: choice
        options: [dev, staging, production]
      action:
        description: Terraform action
        required: true
        type: choice
        options: [plan, apply, destroy]

env:
  TF_VERSION: "1.6.0"
  WORKING_DIR: gke-terraform

jobs:
  # ── Validate ────────────────────────────────────────────────
  validate:
    name: Validate
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
      - name: Format check
        run: terraform fmt -check -recursive
        working-directory: ${{ env.WORKING_DIR }}
      - name: Validate
        run: |
          terraform init -backend=false
          terraform validate
        working-directory: ${{ env.WORKING_DIR }}

  # ── Security scan ───────────────────────────────────────────
  security:
    name: Security Scan
    runs-on: ubuntu-latest
    needs: validate
    steps:
      - uses: actions/checkout@v4
      - name: Checkov scan
        uses: bridgecrewio/checkov-action@master
        with:
          directory: ${{ env.WORKING_DIR }}
          framework: terraform
          soft_fail: false
      - name: tfsec scan
        uses: aquasecurity/tfsec-action@v1.0.0
        with:
          working_directory: ${{ env.WORKING_DIR }}

  # ── Plan ────────────────────────────────────────────────────
  plan:
    name: Plan ${{ matrix.environment }}
    runs-on: ubuntu-latest
    needs: [validate, security]
    strategy:
      matrix:
        environment: [dev, staging, production]
        # GitHub Actions expressions have no upper() function, so the secret
        # name for each environment's Terraform SA is carried in the matrix
        # rather than derived from matrix.environment at runtime.
        include:
          - environment: dev
            sa_secret: DEV_TF_SA
          - environment: staging
            sa_secret: STAGING_TF_SA
          - environment: production
            sa_secret: PROD_TF_SA
    permissions:
      contents: read
      id-token: write        # OIDC token for Workload Identity Federation
      pull-requests: write   # post plan output as a PR comment
    steps:
      - uses: actions/checkout@v4
      - id: auth
        uses: google-github-actions/auth@v2
        with:
          workload_identity_provider: ${{ secrets.WIF_PROVIDER }}
          service_account: ${{ secrets[matrix.sa_secret] }}
      - uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
      - name: Terraform Init
        run: |
          terraform init \
            -backend-config="bucket=${{ secrets.TF_STATE_BUCKET }}" \
            -backend-config="prefix=gke/${{ matrix.environment }}"
        working-directory: ${{ env.WORKING_DIR }}
      - name: Terraform Plan
        id: plan
        run: |
          terraform plan \
            -var-file="environments/${{ matrix.environment }}.tfvars" \
            -out=tfplan-${{ matrix.environment }} \
            -no-color \
            2>&1 | tee plan-output.txt
          echo "exitcode=${PIPESTATUS[0]}" >> $GITHUB_OUTPUT
        working-directory: ${{ env.WORKING_DIR }}
      - name: Upload plan
        uses: actions/upload-artifact@v4
        with:
          name: tfplan-${{ matrix.environment }}
          path: ${{ env.WORKING_DIR }}/tfplan-${{ matrix.environment }}
          retention-days: 1
      - name: Post plan to PR
        uses: actions/github-script@v7
        if: github.event_name == 'pull_request'
        with:
          script: |
            const fs = require('fs');
            const plan = fs.readFileSync(
              '${{ env.WORKING_DIR }}/plan-output.txt',
              'utf8'
            );
            // GitHub comment bodies are capped at ~65k characters.
            const maxLen = 65000;
            const truncated = plan.length > maxLen
              ? plan.substring(0, maxLen) + '\n... truncated'
              : plan;
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: `## Terraform Plan — \`${{ matrix.environment }}\`
            \`\`\`
            ${truncated}
            \`\`\``
            });

  # ── Apply Dev ───────────────────────────────────────────────
  apply-dev:
    name: Apply Dev
    runs-on: ubuntu-latest
    needs: plan
    if: github.ref == 'refs/heads/main'
    environment: dev
    permissions:
      contents: read
      id-token: write
    steps:
      - uses: actions/checkout@v4
      - id: auth
        uses: google-github-actions/auth@v2
        with:
          workload_identity_provider: ${{ secrets.WIF_PROVIDER }}
          service_account: ${{ secrets.DEV_TF_SA }}
      - uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
      - name: Download plan
        uses: actions/download-artifact@v4
        with:
          name: tfplan-dev
          path: ${{ env.WORKING_DIR }}
      - name: Terraform Init
        run: |
          terraform init \
            -backend-config="bucket=${{ secrets.TF_STATE_BUCKET }}" \
            -backend-config="prefix=gke/dev"
        working-directory: ${{ env.WORKING_DIR }}
      - name: Terraform Apply
        # Applying a saved plan file needs no -auto-approve (it is implied).
        run: terraform apply tfplan-dev
        working-directory: ${{ env.WORKING_DIR }}
      # The previous version referenced ${{ steps.apply.outputs.get_credentials }},
      # but no step with id "apply" exists and terraform steps expose no such
      # output. Use the dedicated action to authenticate kubectl instead.
      - name: Configure kubectl
        uses: google-github-actions/get-gke-credentials@v2
        with:
          cluster_name: dev-cluster
          location: us-central1
      - name: Verify cluster
        run: |
          kubectl get nodes
          kubectl get pods -A

  # ── Apply Production (requires approval) ────────────────────
  apply-production:
    name: Apply Production
    runs-on: ubuntu-latest
    needs: apply-dev
    environment: production # approval gate configured on the GitHub environment
    permissions:
      contents: read
      id-token: write
    steps:
      - uses: actions/checkout@v4
      - id: auth
        uses: google-github-actions/auth@v2
        with:
          workload_identity_provider: ${{ secrets.WIF_PROVIDER }}
          service_account: ${{ secrets.PROD_TF_SA }}
      - uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
      - name: Download plan
        uses: actions/download-artifact@v4
        with:
          name: tfplan-production
          path: ${{ env.WORKING_DIR }}
      - name: Terraform Init
        run: |
          terraform init \
            -backend-config="bucket=${{ secrets.TF_STATE_BUCKET }}" \
            -backend-config="prefix=gke/production"
        working-directory: ${{ env.WORKING_DIR }}
      - name: Terraform Apply
        run: terraform apply tfplan-production
        working-directory: ${{ env.WORKING_DIR }}
      - name: Notify deployment
        uses: slackapi/slack-github-action@v1.26.0
        with:
          payload: |
            {
              "text": "✅ GKE Production cluster deployed successfully\nCluster: prod-cluster\nBy: ${{ github.actor }}"
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK }}

Step 9 — Deploy and Verify

# ── Initial setup (one-time, before the first terraform init) ─
# Create state bucket
gsutil mb -p mycompany-prod \
  -l us-central1 \
  gs://mycompany-terraform-state
# Enable versioning on state bucket — lets you recover earlier state files
gsutil versioning set on \
  gs://mycompany-terraform-state
# Enable required GCP APIs
gcloud services enable \
  container.googleapis.com \
  compute.googleapis.com \
  iam.googleapis.com \
  cloudkms.googleapis.com \
  artifactregistry.googleapis.com \
  monitoring.googleapis.com \
  logging.googleapis.com \
  --project mycompany-prod
# ── Deploy dev ───────────────────────────────────────────────
cd gke-terraform
# Partial backend config: bucket/prefix are supplied here, per environment
terraform init \
  -backend-config="bucket=mycompany-terraform-state" \
  -backend-config="prefix=gke/dev"
terraform plan \
  -var-file="environments/dev.tfvars" \
  -out=tfplan-dev
terraform apply tfplan-dev
# ── Get cluster credentials ──────────────────────────────────
gcloud container clusters get-credentials dev-cluster \
  --region us-central1 \
  --project mycompany-dev
# ── Verify cluster ───────────────────────────────────────────
kubectl get nodes
# NAME                              STATUS  ROLES   AGE
# gke-dev-cluster-application-xxx   Ready   <none>  2m
# gke-dev-cluster-application-yyy   Ready   <none>  2m
kubectl get pods -A
# All system pods should be Running
# Check node pool details
kubectl get nodes -L cloud.google.com/gke-nodepool,topology.kubernetes.io/zone
# Verify Workload Identity — annotate a KSA with the GCP SA it impersonates
kubectl create serviceaccount test-ksa -n default
kubectl annotate serviceaccount test-ksa \
  iam.gke.io/gcp-service-account=app-sa@mycompany-dev.iam.gserviceaccount.com
# ── Verify private cluster ───────────────────────────────────
kubectl get nodes -o wide
# EXTERNAL-IP should show <none> for private nodes
# ── Check cluster security ───────────────────────────────────
gcloud container clusters describe dev-cluster \
  --region us-central1 \
  --format="yaml(masterAuth,networkConfig,privateClusterConfig)"
# ── Destroy dev when done ────────────────────────────────────
# WARNING: destroys the cluster and networking; the KMS key ring survives
# (key rings cannot be deleted in GCP)
terraform destroy \
  -var-file="environments/dev.tfvars" \
  -auto-approve

Terraform State Management

# List state resources
terraform state list
# Show specific resource
terraform state show module.gke.google_container_cluster.primary
# Move resource (refactoring) — rewrites state only, no infrastructure change
terraform state mv \
  module.gke.google_container_node_pool.pools[\"app\"] \
  module.gke.google_container_node_pool.pools[\"application\"]
# Import existing cluster into state
terraform import \
  module.gke.google_container_cluster.primary \
  projects/mycompany-prod/locations/us-central1/clusters/prod-cluster
# Remove from state (without destroying) — Terraform forgets the resource;
# the real infrastructure keeps running unmanaged
terraform state rm \
  module.gke.google_container_node_pool.pools[\"old-pool\"]
# Backup state (the GCS bucket is also versioned — see initial setup)
gsutil cp \
  gs://mycompany-terraform-state/gke/production/default.tfstate \
  ./backup-$(date +%Y%m%d).tfstate

Common Issues and Fixes

# Issue 1 — API not enabled
# Error: googleapi: Error 403: ... has not been used in project
gcloud services enable container.googleapis.com
# Issue 2 — Insufficient permissions for the Terraform service account
# Error: Error creating Cluster: googleapi: Error 403
gcloud projects add-iam-policy-binding mycompany-prod \
  --member="serviceAccount:terraform-sa@mycompany-prod.iam.gserviceaccount.com" \
  --role="roles/container.admin"
# Issue 3 — KMS key permission (grant goes to the GKE service agent,
# service-<project-number>@container-engine-robot)
# Error: PERMISSION_DENIED: The caller does not have permission
gcloud kms keys add-iam-policy-binding etcd-key \
  --keyring=prod-cluster-keyring \
  --location=us-central1 \
  --member="serviceAccount:service-123@container-engine-robot.iam.gserviceaccount.com" \
  --role="roles/cloudkms.cryptoKeyEncrypterDecrypter"
# Issue 4 — Node pool update requires replacement
# Use lifecycle ignore_changes or blue/green node pool strategy
terraform plan -target=module.gke.google_container_node_pool.pools
# Issue 5 — Cluster stuck deleting
# Remove from state and delete out-of-band, then retry.
# WARNING: after `state rm`, Terraform no longer tracks the cluster — the
# gcloud delete below is the only thing actually removing it.
terraform state rm module.gke.google_container_cluster.primary
gcloud container clusters delete prod-cluster \
  --region us-central1 \
  --project mycompany-prod

This gives you a production-ready GKE deployment — private cluster with VPC-native networking, Workload Identity, KMS encryption, multiple node pools, and a full CI/CD pipeline with security scanning, environment promotion, and approval gates.

Leave a Reply