Design a Postgres operator that handles: failover (if primary node fails, promote replica), backup (hourly snapshots to S3), and scaling (add read replicas on demand). Outline the CRD structure, controller logic, and how you'd handle the primary node failure scenario.
CRD Design
Step 1: Define the Postgres CRD with spec for desired state and status for observed state:
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
name: postgresinstances.db.example.com
spec:
group: db.example.com
names:
kind: PostgresInstance
plural: postgresinstances
scope: Namespaced
versions:
- name: v1
served: true
storage: true
schema:
openAPIV3Schema:
type: object
properties:
spec:
type: object
properties:
version:
type: string
default: "14"
primaryReplicas:
type: integer
default: 1
readReplicas:
type: integer
default: 2
backup:
type: object
properties:
schedule:
type: string
default: "0 * * * *" # hourly
retentionDays:
type: integer
default: 30
s3Bucket:
type: string
failoverTimeout:
type: string
default: "300s"
status:
type: object
properties:
phase:
type: string
enum: ["Pending", "Creating", "Ready", "Failing", "Failed"]
primaryPod:
type: string
readReplicaPods:
type: array
items:
type: string
lastBackup:
type: string
lastFailover:
type: string
conditions:
type: array
items:
type: object
properties:
type:
type: string
status:
type: string
message:
type: string
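The controller code below reads and writes fields like spec.backup.s3Bucket and status.primaryPod, so the Go API types backing this CRD would mirror the schema above. A minimal sketch (the dbv1 package name and JSON tags are assumptions consistent with the later snippets):
// dbv1/types.go (sketch)
package dbv1

import metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

type BackupSpec struct {
	Schedule      string `json:"schedule,omitempty"`
	RetentionDays int    `json:"retentionDays,omitempty"`
	S3Bucket      string `json:"s3Bucket,omitempty"`
}

type PostgresInstanceSpec struct {
	Version         string     `json:"version,omitempty"`
	PrimaryReplicas int        `json:"primaryReplicas,omitempty"`
	ReadReplicas    int        `json:"readReplicas,omitempty"`
	Backup          BackupSpec `json:"backup,omitempty"`
	FailoverTimeout string     `json:"failoverTimeout,omitempty"`
}

type PostgresInstanceStatus struct {
	Phase           string             `json:"phase,omitempty"`
	PrimaryPod      string             `json:"primaryPod,omitempty"`
	ReadReplicaPods []string           `json:"readReplicaPods,omitempty"`
	LastBackup      *metav1.Time       `json:"lastBackup,omitempty"`
	LastFailover    *metav1.Time       `json:"lastFailover,omitempty"`
	Conditions      []metav1.Condition `json:"conditions,omitempty"`
}

type PostgresInstance struct {
	metav1.TypeMeta   `json:",inline"`
	metav1.ObjectMeta `json:"metadata,omitempty"`
	Spec   PostgresInstanceSpec   `json:"spec,omitempty"`
	Status PostgresInstanceStatus `json:"status,omitempty"`
}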
Controller Logic
Step 2: Implement reconciliation for each responsibility:
type PostgresReconciler struct {
	client.Client
	Scheme    *runtime.Scheme
	Recorder  record.EventRecorder // used below to emit Kubernetes events
	s3Client  S3Interface
	k8sClient kubernetes.Interface
}
func (r *PostgresReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
pg := &dbv1.PostgresInstance{}
	if err := r.Get(ctx, req.NamespacedName, pg); err != nil {
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}
	// Reconciliation phases
	// 0. FINALIZATION: handle deletion before anything else
	if pg.DeletionTimestamp != nil {
		r.performCleanup(ctx, pg)
		controllerutil.RemoveFinalizer(pg, "postgres.example.com/finalizer")
		return ctrl.Result{}, r.Update(ctx, pg) // persist finalizer removal
	}
	// 1. CLUSTER CREATION
	if pg.Status.Phase == "" {
		controllerutil.AddFinalizer(pg, "postgres.example.com/finalizer")
		if err := r.Update(ctx, pg); err != nil {
			return ctrl.Result{}, err
		}
		r.createPostgresCluster(ctx, pg)
		pg.Status.Phase = "Creating"
		r.Status().Update(ctx, pg)
		return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
	}
// 2. PRIMARY NODE HEALTH CHECK
if err := r.checkPrimaryHealth(ctx, pg); err != nil {
// Primary is dead, initiate failover
r.initiateFailover(ctx, pg)
pg.Status.Phase = "Failing"
r.Status().Update(ctx, pg)
return ctrl.Result{RequeueAfter: 5*time.Second}, nil
}
// 3. REPLICA SCALING
if err := r.reconcileReplicaCount(ctx, pg); err != nil {
return ctrl.Result{}, err
}
// 4. BACKUP
	if pg.Status.LastBackup == nil || time.Since(pg.Status.LastBackup.Time) > time.Hour {
r.performBackup(ctx, pg)
pg.Status.LastBackup = &metav1.Time{Time: time.Now()}
}
pg.Status.Phase = "Ready"
r.Status().Update(ctx, pg)
return ctrl.Result{RequeueAfter: 30*time.Second}, nil
}
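createPostgresCluster is referenced above but not shown. A minimal sketch of the creation path (pod template details elided; the key point is setting an owner reference so child resources are garbage-collected with the parent):
func (r *PostgresReconciler) createPostgresCluster(ctx context.Context, pg *dbv1.PostgresInstance) error {
	primary := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      pg.Name + "-primary",
			Namespace: pg.Namespace,
			Labels:    map[string]string{"app": pg.Name, "role": "primary"},
		},
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{{
				Name:  "postgres",
				Image: "postgres:" + pg.Spec.Version,
			}},
		},
	}
	// Owner reference: deleting the PostgresInstance garbage-collects the pod
	if err := controllerutil.SetControllerReference(pg, primary, r.Scheme); err != nil {
		return err
	}
	if err := r.Create(ctx, primary); err != nil {
		return err
	}
	pg.Status.PrimaryPod = primary.Name
	return nil
}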
Failover Handling
Step 3: Detect and execute failover when primary is unavailable:
func (r *PostgresReconciler) checkPrimaryHealth(ctx context.Context, pg *dbv1.PostgresInstance) error {
primaryPod := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{
Name: pg.Status.PrimaryPod,
Namespace: pg.Namespace,
}, primaryPod)
	if errors.IsNotFound(err) {
		// Primary pod is gone
		return fmt.Errorf("primary pod not found")
	} else if err != nil {
		return fmt.Errorf("failed to get primary pod: %w", err)
	}
// Health check: can we connect to primary?
if !r.canConnectToPostgres(ctx, primaryPod) {
return fmt.Errorf("cannot connect to primary")
}
return nil
}
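canConnectToPostgres is assumed above. One way to implement it is a short-timeout SQL ping against the pod IP; a sketch using database/sql with the lib/pq driver (credential handling elided):
func (r *PostgresReconciler) canConnectToPostgres(ctx context.Context, pod *corev1.Pod) bool {
	// Requires a blank import of the driver: _ "github.com/lib/pq"
	dsn := fmt.Sprintf("host=%s port=5432 user=postgres dbname=postgres sslmode=disable connect_timeout=3",
		pod.Status.PodIP)
	db, err := sql.Open("postgres", dsn)
	if err != nil {
		return false
	}
	defer db.Close()
	pingCtx, cancel := context.WithTimeout(ctx, 3*time.Second)
	defer cancel()
	// A failed ping means the primary is unreachable or not accepting connections
	return db.PingContext(pingCtx) == nil
}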
func (r *PostgresReconciler) initiateFailover(ctx context.Context, pg *dbv1.PostgresInstance) error {
// 1. Choose best replica to promote
replicas := pg.Status.ReadReplicaPods
if len(replicas) == 0 {
return fmt.Errorf("no replicas available for failover")
}
newPrimary := r.selectBestReplica(ctx, replicas)
	// 2. Promote replica: stop replication, make it writable.
	// (Shelling out to kubectl is shown for brevity; a production operator
	// would use the Kubernetes exec API or a direct SQL connection.)
	cmd := fmt.Sprintf("kubectl exec %s -n %s -- psql -U postgres -c 'SELECT pg_promote();'", newPrimary, pg.Namespace)
	if err := exec.CommandContext(ctx, "sh", "-c", cmd).Run(); err != nil {
		return fmt.Errorf("failed to promote replica: %w", err)
	}
	// 3. Update CRD status and persist it
	pg.Status.PrimaryPod = newPrimary
	pg.Status.LastFailover = &metav1.Time{Time: time.Now()}
	if err := r.Status().Update(ctx, pg); err != nil {
		return err
	}
// 4. Create new replica to replace old primary (or reuse old primary if it recovers)
r.createNewReplica(ctx, pg)
// 5. Emit event
r.Recorder.Event(pg, corev1.EventTypeWarning, "FailoverExecuted", fmt.Sprintf("Promoted replica %s to primary", newPrimary))
return nil
}
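selectBestReplica should pick the replica with the most replayed WAL so the promotion loses as little data as possible. A sketch comparing pg_last_wal_replay_lsn() across replicas (queryLSN is a hypothetical helper that runs the query on the named pod and parses the LSN into a comparable integer):
func (r *PostgresReconciler) selectBestReplica(ctx context.Context, replicas []string) string {
	best := replicas[0]
	var bestLSN uint64
	for _, name := range replicas {
		// queryLSN (hypothetical) runs "SELECT pg_last_wal_replay_lsn()"
		// against the replica and converts the X/Y LSN into an integer
		lsn, err := r.queryLSN(ctx, name, "SELECT pg_last_wal_replay_lsn()")
		if err != nil {
			continue // skip unreachable replicas
		}
		if lsn > bestLSN {
			bestLSN = lsn
			best = name
		}
	}
	return best
}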
Backup Implementation
Step 4: Scheduled backups to S3:
func (r *PostgresReconciler) performBackup(ctx context.Context, pg *dbv1.PostgresInstance) error {
// 1. Create backup from primary using pg_dump
backupFile := fmt.Sprintf("/tmp/postgres-backup-%s-%d.sql", pg.Name, time.Now().Unix())
	cmd := fmt.Sprintf("kubectl exec %s -n %s -- pg_dump -U postgres -d postgres > %s",
		pg.Status.PrimaryPod, pg.Namespace, backupFile)
if err := exec.CommandContext(ctx, "sh", "-c", cmd).Run(); err != nil {
return fmt.Errorf("backup failed: %v", err)
}
// 2. Upload to S3
file, err := os.Open(backupFile)
if err != nil {
return err
}
defer file.Close()
	// Key matches the uncompressed dump; gzip before upload if desired
	uploadPath := fmt.Sprintf("backups/%s/%s-%d.sql", pg.Namespace, pg.Name, time.Now().Unix())
	if _, err := r.s3Client.PutObject(ctx, &s3.PutObjectInput{ // PutObject returns (output, error)
		Bucket: aws.String(pg.Spec.Backup.S3Bucket),
		Key:    aws.String(uploadPath),
		Body:   file,
	}); err != nil {
		return fmt.Errorf("failed to upload backup to S3: %w", err)
	}
// 3. Cleanup old backups (retention policy)
r.cleanupOldBackups(ctx, pg)
return nil
}
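cleanupOldBackups can enforce retentionDays by listing the instance's backup prefix and deleting anything older than the cutoff. A sketch against the aws-sdk-go-v2 S3 API (assumes S3Interface exposes ListObjectsV2 and DeleteObject; pagination elided):
func (r *PostgresReconciler) cleanupOldBackups(ctx context.Context, pg *dbv1.PostgresInstance) error {
	cutoff := time.Now().AddDate(0, 0, -pg.Spec.Backup.RetentionDays)
	prefix := fmt.Sprintf("backups/%s/%s-", pg.Namespace, pg.Name)
	out, err := r.s3Client.ListObjectsV2(ctx, &s3.ListObjectsV2Input{
		Bucket: aws.String(pg.Spec.Backup.S3Bucket),
		Prefix: aws.String(prefix),
	})
	if err != nil {
		return err
	}
	for _, obj := range out.Contents {
		// Delete backups past the retention window
		if obj.LastModified != nil && obj.LastModified.Before(cutoff) {
			if _, err := r.s3Client.DeleteObject(ctx, &s3.DeleteObjectInput{
				Bucket: aws.String(pg.Spec.Backup.S3Bucket),
				Key:    obj.Key,
			}); err != nil {
				return err
			}
		}
	}
	return nil
}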
Key Design Decisions
- CRD spec is declarative (desired state), status is observational (actual state)
- Failover is automatic but recorded in status.lastFailover for auditing
- Backups run on schedule via reconciliation loop (not external CronJob)
- Replicas are managed as child resources (owned by PostgresInstance, deleted when parent deleted)
Follow-up: How would you handle network partitioning where primary and replicas can't communicate but both think they're healthy? Design a quorum-based failover.
Your operator creates a StatefulSet for the Postgres cluster. During failover, you delete the old primary pod and promote a replica. But the StatefulSet controller immediately recreates the deleted pod, undoing your failover. How do you prevent this?
StatefulSet ownership conflict: StatefulSet wants to maintain replicas, but your operator needs surgical control during failover. Solve this:
1. Don't use StatefulSet for Postgres pods. Use Deployment instead (gives you more control):
# BEFORE: StatefulSet
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: postgres-cluster
spec:
serviceName: postgres
replicas: 3
selector:
matchLabels:
app: postgres
template:
metadata:
labels:
app: postgres
spec:
containers:
- name: postgres
image: postgres:14
# AFTER: Deployment with pod anti-affinity to spread replicas across nodes
apiVersion: apps/v1
kind: Deployment
metadata:
name: postgres-cluster
spec:
replicas: 3
selector:
matchLabels:
app: postgres
template:
metadata:
labels:
app: postgres
spec:
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- postgres
topologyKey: kubernetes.io/hostname
containers:
- name: postgres
image: postgres:14
2. With a Deployment, you have control. During failover, delete the old primary pod and immediately patch the Deployment (replicas or labels) so the controller doesn't respawn it as primary:
// During failover:
// 1. Promote replica
newPrimary := replicas[0]
r.promoteReplica(ctx, newPrimary)
// 2. Delete old primary pod
oldPrimary := pg.Status.PrimaryPod
oldPod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: oldPrimary, Namespace: pg.Namespace}}
r.Delete(ctx, oldPod)
// 3. Update CRD to track that old primary is gone
pg.Status.PrimaryPod = newPrimary
pg.Status.DeadPods = append(pg.Status.DeadPods, oldPrimary)
// 4. Optional: patch Deployment to reduce replicas temporarily
// so it doesn’t immediately respawn the old primary
deployment := &appsv1.Deployment{}
r.Get(ctx, types.NamespacedName{Name: "postgres-cluster", Namespace: pg.Namespace}, deployment)
// We can modify replica count, labels, etc. to prevent respawning
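For example, with controller-runtime's patch helpers (a sketch; scale back up once the promoted primary is healthy):
// Scale down by one so the old primary isn't respawned mid-failover
patch := client.MergeFrom(deployment.DeepCopy())
replicas := *deployment.Spec.Replicas - 1
deployment.Spec.Replicas = &replicas
if err := r.Patch(ctx, deployment, patch); err != nil {
	return err
}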
3. Alternative: Use Deployment with a custom init container that checks the pod’s role before starting:
// Postgres pod init container
// Checks: "Is this pod supposed to be primary or replica?"
// If it thinks it should be primary but CRD says replica, exit and wait
initContainers:
- name: postgres-role-check
image: postgres-role-checker:1.0
env:
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POSTGRES_INSTANCE
value: postgres-cluster
command:
- /bin/sh
- -c
- |
          # Check whether this pod is supposed to be the primary
          EXPECTED_PRIMARY=$(kubectl get PostgresInstance $POSTGRES_INSTANCE -o jsonpath='{.status.primaryPod}')
          if [ "$POD_NAME" != "$EXPECTED_PRIMARY" ]; then
            # Not the primary: a real init script would configure this pod
            # as a replica here instead of blocking
            sleep infinity
          fi
4. Best practice: Use an Operator framework (Kubebuilder, Operator SDK) that handles this complexity:
// With kubebuilder:
// Define owner references so your PostgresInstance owns the Deployment
// When PostgresInstance is deleted, Deployment is auto-deleted
// You control Deployment replicas and labels programmatically
ctrl.NewControllerManagedBy(mgr).
For(&dbv1.PostgresInstance{}).
Owns(&appsv1.Deployment{}).
Complete(&PostgresReconciler{})
// This ensures clean ownership and prevents conflicts
Follow-up: Design a test that simulates primary node failure and verifies failover completes without StatefulSet interference.
You want to support custom backup strategies: users can specify "hourly", "daily", "on-demand", or "never". Your operator must execute the right strategy. But operators can't schedule cron jobs well inside the reconciliation loop. Design a solution that supports user-defined backup policies without external cron infrastructure.
Backup scheduling inside the operator: use the reconciliation loop plus status tracking to decide when to back up:
1. Add backup policy to the CRD spec:
apiVersion: db.example.com/v1
kind: PostgresInstance
metadata:
name: my-postgres
spec:
backup:
policy: "hourly" # or "daily", "on-demand", "never", "custom"
schedule: "0 * * * *" # cron format for advanced users
s3Bucket: "my-backups"
retentionDays: 30
status:
lastBackupTime: "2026-04-07T10:00:00Z"
nextBackupTime: "2026-04-07T11:00:00Z"
backupStatus: "Success"
2. In reconciliation, check if it's time to backup:
func (r *PostgresReconciler) shouldBackup(ctx context.Context, pg *dbv1.PostgresInstance) bool {
policy := pg.Spec.Backup.Policy
lastBackup := pg.Status.LastBackupTime
now := time.Now()
switch policy {
case "never":
return false
case "on-demand":
		// Check if the user manually triggered a backup
		if pg.Annotations["backup.example.com/trigger"] == "true" {
			delete(pg.Annotations, "backup.example.com/trigger")
			// Caller must persist the cleared annotation (r.Update),
			// otherwise the trigger fires again on the next reconcile
			return true
		}
		return false
case "hourly":
		if lastBackup == nil || now.Sub(lastBackup.Time) > time.Hour {
return true
}
case "daily":
		if lastBackup == nil || now.Sub(lastBackup.Time) > 24*time.Hour {
return true
}
case "custom":
// Parse cron schedule from spec.backup.schedule
nextBackup := r.parseNextBackupTime(pg.Spec.Backup.Schedule, lastBackup)
if now.After(nextBackup) {
return true
}
}
return false
}
func (r *PostgresReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
pg := &dbv1.PostgresInstance{}
	if err := r.Get(ctx, req.NamespacedName, pg); err != nil {
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}
// Check if it’s time to backup
if r.shouldBackup(ctx, pg) {
if err := r.performBackup(ctx, pg); err != nil {
pg.Status.BackupStatus = "Failed"
r.Status().Update(ctx, pg)
return ctrl.Result{RequeueAfter: 5*time.Minute}, err
}
pg.Status.LastBackupTime = &metav1.Time{Time: time.Now()}
pg.Status.BackupStatus = "Success"
}
// Calculate next backup time for scheduling
if pg.Spec.Backup.Policy != "never" {
nextBackup := r.calculateNextBackupTime(pg)
pg.Status.NextBackupTime = &metav1.Time{Time: nextBackup}
// Requeue at next backup time
requeue := time.Until(nextBackup)
r.Status().Update(ctx, pg)
return ctrl.Result{RequeueAfter: requeue}, nil
}
return ctrl.Result{RequeueAfter: 5*time.Minute}, nil
}
3. Support on-demand backups via annotation:
# User manually triggers a backup:
kubectl annotate PostgresInstance my-postgres backup.example.com/trigger=true --overwrite
# The operator detects the annotation and immediately backs up;
# after the backup, the annotation is cleared
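One subtlety: clearing the trigger annotation only takes effect if the operator persists it; otherwise the next reconcile re-triggers a backup. A sketch of the detection path inside Reconcile:
// On-demand trigger: run the backup, then persist the cleared annotation
// so the trigger fires exactly once
if pg.Annotations["backup.example.com/trigger"] == "true" {
	if err := r.performBackup(ctx, pg); err != nil {
		return ctrl.Result{}, err
	}
	delete(pg.Annotations, "backup.example.com/trigger")
	if err := r.Update(ctx, pg); err != nil {
		return ctrl.Result{}, err
	}
}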
4. Handle cron schedule parsing for advanced users:
import "github.com/robfig/cron/v3"
func (r *PostgresReconciler) calculateNextBackupTime(pg *dbv1.PostgresInstance) time.Time {
if pg.Spec.Backup.Policy == "custom" && pg.Spec.Backup.Schedule != "" {
// Parse cron expression
parser := cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow)
schedule, err := parser.Parse(pg.Spec.Backup.Schedule)
if err != nil {
			return time.Now().Add(time.Hour) // Default fallback
}
return schedule.Next(time.Now())
}
	switch pg.Spec.Backup.Policy {
	case "hourly":
		return time.Now().Add(time.Hour)
	case "daily":
		return time.Now().Add(24 * time.Hour)
	}
	return time.Now().Add(time.Hour)
}
5. This approach has advantages:
- No external cron infrastructure needed
- Flexible: users can change policy without redeploying operator
- On-demand backups via annotation
- Status shows exact next backup time
Follow-up: How would you implement a backup verification that ensures backups are restorable without restoring them?
You deployed your Postgres operator. Users create PostgresInstance resources, and backups start automatically. But after a month, you realize backups have been silently failing with S3 permission errors, and no one noticed. The operator logs the error but keeps running. How do you make backup failures visible and actionable?
Silent failures are dangerous. Make failures visible via multiple channels:
1. Add conditions to the CRD status to surface issues:
apiVersion: db.example.com/v1
kind: PostgresInstance
metadata:
name: my-postgres
status:
conditions:
- type: BackupHealthy
status: "False"
reason: "S3PermissionDenied"
message: "Cannot write to S3 bucket: AccessDenied"
lastTransitionTime: "2026-04-07T12:00:00Z"
- type: PrimaryHealthy
status: "True"
- type: ReplicasReady
status: "True"
2. In reconciliation, set conditions based on actual state:
func (r *PostgresReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
pg := &dbv1.PostgresInstance{}
	if err := r.Get(ctx, req.NamespacedName, pg); err != nil {
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}
// Perform backup
if r.shouldBackup(ctx, pg) {
if err := r.performBackup(ctx, pg); err != nil {
// Set condition to indicate backup failure
meta.SetStatusCondition(&pg.Status.Conditions, metav1.Condition{
Type: "BackupHealthy",
Status: "False",
Reason: "BackupFailed",
Message: fmt.Sprintf("Backup failed: %v", err),
ObservedGeneration: pg.Generation,
})
r.Status().Update(ctx, pg)
// Emit event so it shows in kubectl describe
r.Recorder.Event(pg, corev1.EventTypeWarning, "BackupFailed", err.Error())
// Requeue sooner to retry
return ctrl.Result{RequeueAfter: 5*time.Minute}, nil
}
// Success: clear the condition
meta.SetStatusCondition(&pg.Status.Conditions, metav1.Condition{
Type: "BackupHealthy",
Status: "True",
Reason: "BackupSucceeded",
Message: "Backup completed successfully",
ObservedGeneration: pg.Generation,
})
}
r.Status().Update(ctx, pg)
return ctrl.Result{RequeueAfter: 1*time.Hour}, nil
}
3. Export metrics to monitoring (Prometheus):
import "github.com/prometheus/client_golang/prometheus"
var (
backupFailures = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "postgres_backup_failures_total",
Help: "Total number of failed backups",
},
[]string{"instance", "reason"},
)
lastBackupTimestamp = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "postgres_last_backup_timestamp_seconds",
Help: "Unix timestamp of last successful backup",
},
[]string{"instance"},
)
)
// In reconciliation:
if err := r.performBackup(ctx, pg); err != nil {
backupFailures.WithLabelValues(pg.Name, extractErrorReason(err)).Inc()
return ctrl.Result{}, err
}
lastBackupTimestamp.WithLabelValues(pg.Name).Set(float64(time.Now().Unix()))
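These collectors must also be registered. With controller-runtime, the idiomatic place is the global metrics registry, which the manager already serves on its /metrics endpoint (a sketch):
import "sigs.k8s.io/controller-runtime/pkg/metrics"

func init() {
	// Expose the custom collectors on the manager's /metrics endpoint
	metrics.Registry.MustRegister(backupFailures, lastBackupTimestamp)
}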
4. Add alerting rules to Prometheus:
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: postgres-operator-alerts
spec:
groups:
- name: postgres
rules:
      - alert: PostgresBackupFailed
expr: |
increase(postgres_backup_failures_total[1h]) > 0
for: 1m
annotations:
summary: "Postgres backup failed for {{ $labels.instance }}"
description: "Backup failure: {{ $labels.reason }}"
      - alert: PostgresBackupStale
expr: |
time() - postgres_last_backup_timestamp_seconds > 86400 # 24 hours
for: 10m
annotations:
summary: "No backup for {{ $labels.instance }} in 24 hours"
5. Add a kubectl plugin to surface issues easily:
#!/bin/bash
# kubectl-pg-status plugin
INSTANCE=$1
kubectl get PostgresInstance $INSTANCE -o json | jq '.status.conditions[] | select(.status=="False") | {type, reason, message}'
Usage:
kubectl pg-status my-postgres
Output:
{
"type": "BackupHealthy",
"reason": "S3PermissionDenied",
"message": "Cannot write to S3: AccessDenied"
}
6. Users can now see issues immediately:
kubectl describe PostgresInstance my-postgres
# Shows conditions with reasons
kubectl logs -n postgres-system deployment/postgres-operator | grep -iE "backup|error"
# Shows detailed logs
# Prometheus alerts notify the on-call team
Best Practice: Every operator action should be auditable: conditions for state, events for history, metrics for monitoring, logs for debugging.
Follow-up: Design a dashboard that shows backup health, failover history, and replica status across all PostgresInstance objects in the cluster.
Your operator manages Postgres clusters. A user creates a PostgresInstance with 10 read replicas, and the operator creates 10 pods successfully. But when scaling down from 10 to 3, the operator kills 7 pods while the data on those pods' PVCs remains; the orphaned PVCs still consume 1TB of storage. Design how your operator should handle PVC cleanup during scale-down.
Scale-down with PVC cleanup is dangerous. Must be explicit and auditable:
1. Distinguish between pod deletion and data deletion:
// In reconciliation, when scaling down:
// BEFORE: scaling = 10
// AFTER: scaling = 3
// Must delete 7 replicas and their data
// Step 1: Delete pods first (but keep PVCs)
for i := 3; i < 10; i++ {
pod := &corev1.Pod{}
r.Get(ctx, types.NamespacedName{Name: fmt.Sprintf("postgres-replica-%d", i)}, pod)
r.Delete(ctx, pod)
}
// Wait for pods to terminate (a real controller would requeue with
// ctrl.Result{RequeueAfter: ...} instead of sleeping in the reconcile loop)
time.Sleep(30 * time.Second)
// Step 2: Decide what to do with PVCs
// Option A: Keep them (safe, but wastes storage)
// Option B: Delete them (risky, data is lost)
2. Add deletion policy to CRD to make it explicit:
apiVersion: db.example.com/v1
kind: PostgresInstance
metadata:
name: my-postgres
spec:
readReplicas: 3
dataRetention:
policy: "delete" # or "retain"
confirmDeletion: true # Requires explicit annotation to delete
status:
replicaCount: 3
replicaPVCs:
- postgres-replica-1-pvc
- postgres-replica-2-pvc
- postgres-replica-3-pvc
3. Implement safe PVC deletion with confirmation:
func (r *PostgresReconciler) reconcileReplicaCount(ctx context.Context, pg *dbv1.PostgresInstance) error {
// Current replicas
currentPods := r.getReplicaPods(ctx, pg)
// Desired replicas
desired := pg.Spec.ReadReplicas
current := len(currentPods)
if current > desired {
// Scaling down
podsToDelete := currentPods[desired:]
// Check deletion policy
if pg.Spec.DataRetention.Policy == "delete" {
if pg.Spec.DataRetention.ConfirmDeletion {
// Require explicit confirmation via annotation
if pg.Annotations["postgres.example.com/confirm-deletion"] != "true" {
// Emit warning event
				r.Recorder.Event(pg, corev1.EventTypeWarning, "DeletionPending",
					"Scale down requested but requires confirmation; annotate with postgres.example.com/confirm-deletion=true")
return fmt.Errorf("deletion not confirmed")
}
}
// Perform deletion
for _, pod := range podsToDelete {
pvc := r.getPVCForPod(ctx, pod)
// Delete pod first
r.Delete(ctx, pod)
				// Wait for pod to terminate (in practice, requeue rather than sleep)
				time.Sleep(10 * time.Second)
// Delete PVC
r.Delete(ctx, pvc)
// Record in status
pg.Status.DeletedPVCs = append(pg.Status.DeletedPVCs, pvc.Name)
}
} else if pg.Spec.DataRetention.Policy == "retain" {
// Just delete pods, keep PVCs
for _, pod := range podsToDelete {
r.Delete(ctx, pod)
}
// PVCs remain for recovery/inspection
log.Info("Retained PVCs for deleted replicas", "pvcs", r.getPVCNames(podsToDelete))
}
}
return nil
}
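getPVCForPod is assumed above; it can be derived from the pod spec's volumes. A sketch (assumes the data volume is the first PVC-backed volume in the pod):
func (r *PostgresReconciler) getPVCForPod(ctx context.Context, pod *corev1.Pod) *corev1.PersistentVolumeClaim {
	for _, vol := range pod.Spec.Volumes {
		if vol.PersistentVolumeClaim == nil {
			continue
		}
		pvc := &corev1.PersistentVolumeClaim{}
		if err := r.Get(ctx, types.NamespacedName{
			Name:      vol.PersistentVolumeClaim.ClaimName,
			Namespace: pod.Namespace,
		}, pvc); err == nil {
			return pvc
		}
	}
	return nil
}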
4. Usage: require explicit confirmation:
# Scale down request
kubectl patch PostgresInstance my-postgres -p '{"spec":{"readReplicas":3}}'
# Operator detects pending deletion and emits an event
kubectl get events
# -> PostgresInstance my-postgres: Scale down requires confirmation
# User explicitly confirms
kubectl annotate PostgresInstance my-postgres postgres.example.com/confirm-deletion=true --overwrite
# Operator detects confirmation, deletes replicas and PVCs
5. Add a safeguard: require manual annotation for destructive operations:
// Prevent accidental deletions
if pg.Spec.DataRetention.Policy == "delete" && pg.Spec.DataRetention.ConfirmDeletion {
if pg.Annotations["postgres.example.com/confirm-deletion"] != "true" {
return fmt.Errorf("deletion not confirmed")
}
// Auto-clear confirmation after 1 hour to prevent accidental re-deletes
lastConfirmed := pg.Annotations["postgres.example.com/last-confirmed"]
if lastConfirmed != "" {
lastTime, _ := time.Parse(time.RFC3339, lastConfirmed)
		if time.Since(lastTime) > 1*time.Hour {
			delete(pg.Annotations, "postgres.example.com/confirm-deletion")
			r.Update(ctx, pg) // persist the cleared annotation
			return fmt.Errorf("deletion confirmation expired")
		}
}
}
6. Alternative: Move PVCs to archive storage instead of deleting:
// Instead of deleting PVC immediately:
// 1. Copy data to long-term storage (S3, archive bucket)
r.archivePVCData(ctx, pvc)
// 2. Mark PVC for deletion after archive succeeds
pvc.Annotations["archive.example.com/archived"] = "true"
pvc.Annotations["archive.example.com/delete-after"] = time.Now().Add(30 * 24 * time.Hour).Format(time.RFC3339) // Go has no time.Day constant
r.Update(ctx, pvc)
// 3. A separate cleanup controller periodically deletes PVCs marked for deletion
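The cleanup controller in step 3 can be a small reconciler that deletes PVCs whose archive deadline has passed. A sketch (the PVCCleanupReconciler type is illustrative; annotation keys match those above):
type PVCCleanupReconciler struct {
	client.Client
}

func (r *PVCCleanupReconciler) cleanupExpiredPVCs(ctx context.Context) error {
	pvcs := &corev1.PersistentVolumeClaimList{}
	if err := r.List(ctx, pvcs); err != nil {
		return err
	}
	for i := range pvcs.Items {
		pvc := &pvcs.Items[i]
		if pvc.Annotations["archive.example.com/archived"] != "true" {
			continue
		}
		deadline, err := time.Parse(time.RFC3339, pvc.Annotations["archive.example.com/delete-after"])
		if err != nil || time.Now().Before(deadline) {
			continue
		}
		// Archived and past its retention deadline: safe to delete
		if err := r.Delete(ctx, pvc); err != nil {
			return err
		}
	}
	return nil
}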
Best Practice: Data deletion should ALWAYS require explicit confirmation. Offer retention options (keep PVCs, archive to S3) to prevent accidental data loss.
Follow-up: Design an audit log that tracks every destructive operation (pod deletion, PVC deletion) with who triggered it and when.
Your operator has a webhook that validates PostgresInstance resources before they're created. The webhook rejects invalid configurations (e.g., readReplicas > 10, s3Bucket not accessible). But the webhook pod crashes periodically, and when it does, all PostgresInstance operations hang (kubectl apply blocks). Design a resilient validation strategy.
Webhook crashes block all CRD operations. Must be resilient:
1. Implement the webhook with failure policy set to "ignore" to allow operations even if webhook fails:
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingWebhookConfiguration
metadata:
name: postgres-validator
webhooks:
- name: postgres-validator.example.com
failurePolicy: Ignore # Allow operations if webhook is unavailable
timeoutSeconds: 5 # Don't wait forever
sideEffects: None
admissionReviewVersions: ["v1"]
clientConfig:
service:
name: postgres-operator
namespace: postgres-system
path: "/validate"
      caBundle: LS0tLS...  # base64-encoded CA certificate (truncated)
rules:
- operations: ["CREATE", "UPDATE"]
apiGroups: ["db.example.com"]
apiVersions: ["v1"]
resources: ["postgresinstances"]
scope: "Namespaced"
2. Implement webhook with proper error handling:
func validatePostgres(w http.ResponseWriter, req *http.Request) {
	admissionReview := admissionv1.AdmissionReview{}
	json.NewDecoder(req.Body).Decode(&admissionReview)
	pg := &dbv1.PostgresInstance{}
	json.Unmarshal(admissionReview.Request.Object.Raw, pg)
	allowed := true
	var reason string
	// Validate readReplicas
	if pg.Spec.ReadReplicas > 10 {
		allowed = false
		reason = "readReplicas cannot exceed 10"
	}
	// Validate S3 bucket accessibility ("validator" is an illustrative helper
	// holding an S3 client; slow network checks are best kept out of webhooks)
	if !validator.canAccessS3Bucket(pg.Spec.Backup.S3Bucket) {
		allowed = false
		reason = fmt.Sprintf("S3 bucket %s not accessible", pg.Spec.Backup.S3Bucket)
	}
// Return admission review response
admissionResponse := &admissionv1.AdmissionResponse{
UID: admissionReview.Request.UID,
Allowed: allowed,
Result: &metav1.Status{
Message: reason,
},
}
json.NewEncoder(w).Encode(admissionv1.AdmissionReview{
TypeMeta: metav1.TypeMeta{
APIVersion: "admission.k8s.io/v1",
Kind: "AdmissionReview",
},
Response: admissionResponse,
})
}
3. Add health check endpoint to detect webhook failures:
// Webhook health endpoint
func healthCheck(w http.ResponseWriter, req *http.Request) {
	// Check if the webhook can reach its dependencies (S3, external APIs);
	// "validator" is the same illustrative helper as above
	if validator.s3Client == nil || !validator.canReachS3() {
		w.WriteHeader(http.StatusInternalServerError)
		w.Write([]byte("S3 unreachable"))
		return
	}
w.WriteHeader(http.StatusOK)
w.Write([]byte("ok"))
}
# In the webhook pod spec, add liveness/readiness probes:
livenessProbe:
httpGet:
path: /health
port: 8443
scheme: HTTPS
failureThreshold: 2
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: 8443
scheme: HTTPS
failureThreshold: 1
periodSeconds: 5
4. Use multiple webhook replicas for redundancy:
apiVersion: apps/v1
kind: Deployment
metadata:
name: postgres-operator-webhook
namespace: postgres-system
spec:
replicas: 3 # Multiple replicas
selector:
matchLabels:
app: postgres-operator-webhook
template:
metadata:
labels:
app: postgres-operator-webhook
spec:
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- postgres-operator-webhook
topologyKey: kubernetes.io/hostname
5. Alternative: Move validation into the controller instead of the webhook:
// Instead of blocking at admission time:
// Allow CRD creation, but immediately validate in reconciliation
func (r *PostgresReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
pg := &dbv1.PostgresInstance{}
	if err := r.Get(ctx, req.NamespacedName, pg); err != nil {
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}
// Validate
validationErrors := r.validatePostgres(pg)
if len(validationErrors) > 0 {
// Set condition instead of failing
meta.SetStatusCondition(&pg.Status.Conditions, metav1.Condition{
Type: "Valid",
Status: "False",
Reason: "ValidationFailed",
Message: strings.Join(validationErrors, "; "),
})
r.Status().Update(ctx, pg)
return ctrl.Result{RequeueAfter: 5*time.Minute}, nil
}
// Proceed with reconciliation
}
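The validatePostgres helper referenced above can repeat the webhook's checks and return human-readable errors for the condition message. A sketch (field names follow the CRD as extended in this document):
func (r *PostgresReconciler) validatePostgres(pg *dbv1.PostgresInstance) []string {
	var errs []string
	if pg.Spec.ReadReplicas > 10 {
		errs = append(errs, "readReplicas cannot exceed 10")
	}
	if pg.Spec.Backup.Policy != "never" && pg.Spec.Backup.S3Bucket == "" {
		errs = append(errs, "backup.s3Bucket is required when backups are enabled")
	}
	// Slow checks (e.g., probing the S3 bucket) are acceptable here because
	// the reconcile loop, unlike the webhook, is off the request path
	return errs
}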
Best Practice: Use webhook for fast rejection of clearly invalid configs, but make failure graceful. Use controller validation as fallback. This way, temporary webhook failures don’t block users.
Follow-up: Design a system that allows webhook to be updated/redeployed without blocking any PostgresInstance operations.
Your operator creates a network policy that allows Postgres pods to communicate with S3 for backups. A cluster admin later deletes the network policy for security audit reasons. The operator reconciles and immediately recreates it. The admin is frustrated: "Why can't I delete policies I don't want?" Design how your operator should respect manual deletions.
Operator ownership conflict: operator creates resources, admin deletes them, operator recreates. Respect the delete intent:
1. Implement a "pause" annotation that lets admins opt out of operator management:
apiVersion: db.example.com/v1
kind: PostgresInstance
metadata:
name: my-postgres
annotations:
postgres.example.com/paused: "true" # Admin can pause operator
spec:
readReplicas: 3
status:
paused: true # Operator stops reconciling this object
2. In reconciliation, check for pause:
func (r *PostgresReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
pg := &dbv1.PostgresInstance{}
	if err := r.Get(ctx, req.NamespacedName, pg); err != nil {
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}
// Check for pause annotation
if pg.Annotations["postgres.example.com/paused"] == "true" {
pg.Status.Paused = true
r.Status().Update(ctx, pg)
// Stop reconciliation, but don’t delete child resources
// This allows admin to manually manage them
return ctrl.Result{}, nil
}
pg.Status.Paused = false
// Normal reconciliation continues
}
3. For fine-grained control, allow admins to disable specific features:
apiVersion: db.example.com/v1
kind: PostgresInstance
metadata:
name: my-postgres
spec:
features:
backup:
enabled: true
networkPolicy:
enabled: false # Admin disabled network policy creation
managed: false # Don’t recreate if manually deleted
failover:
enabled: true
4. In reconciliation, respect feature flags:
func (r *PostgresReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
pg := &dbv1.PostgresInstance{}
	if err := r.Get(ctx, req.NamespacedName, pg); err != nil {
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}
// Create network policy only if enabled and managed
if pg.Spec.Features.NetworkPolicy.Enabled {
np := &networkingv1.NetworkPolicy{}
		err := r.Get(ctx, types.NamespacedName{Name: pg.Name + "-policy", Namespace: pg.Namespace}, np)
if errors.IsNotFound(err) {
// Create if missing
r.createNetworkPolicy(ctx, pg)
} else if err != nil {
return ctrl.Result{}, err
} else {
// Exists: check if admin manually deleted/modified it
if pg.Spec.Features.NetworkPolicy.Managed {
// Operator owns it, update if needed
r.updateNetworkPolicy(ctx, pg, np)
} else {
// Admin owns it, don’t modify
log.Info("NetworkPolicy not managed by operator", "instance", pg.Name)
}
}
} else {
// Feature disabled: delete if operator created it
r.deleteNetworkPolicy(ctx, pg)
}
}
5. Use owner references to identify operator-managed resources:
// When creating a resource, set owner reference
networkPolicy := &networkingv1.NetworkPolicy{
ObjectMeta: metav1.ObjectMeta{
Name: pg.Name + "-policy",
Namespace: pg.Namespace,
OwnerReferences: []metav1.OwnerReference{
*metav1.NewControllerRef(pg, dbv1.GroupVersion.WithKind("PostgresInstance")),
},
},
}
// Later, if admin deletes the NetworkPolicy:
// 1. Operator detects it’s missing
// 2. Checks if it has an owner reference to this PostgresInstance
// 3. If yes and managed=true, recreates it
// 4. If yes and managed=false, respects deletion and doesn’t recreate
6. Allow granular opt-out and document safe manual deletion:
# The operator doesn't automatically delete child resources.
# Instead, document how an admin can safely delete them:
# For the NetworkPolicy:
kubectl delete networkpolicy my-postgres-policy -n default
# For the backup job:
kubectl delete cronjob my-postgres-backup -n default
# The operator will just re-create them on the next reconcile.
# To permanently disable, mark the feature as unmanaged:
kubectl annotate PostgresInstance my-postgres postgres.example.com/managed-networkpolicy=false
Best Practice: Operator should be conservative: create resources if they don’t exist, update if they drift, but respect deletion intent if admin removes them and marks as "not managed".
Follow-up: Design a governance policy that prevents cluster admins from accidentally deleting critical operator-managed resources.