From 36b8a193ce16afefdcd93079e7d264bb3c70167b Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Thu, 26 Mar 2026 15:52:29 +0200 Subject: [PATCH 01/19] Add tiered storage cache as a separate PVC --- api/v1beta1/kafkacluster_types.go | 21 ++-- api/v1beta1/kafkacluster_types_test.go | 18 ++++ pkg/resources/cruisecontrol/configmap.go | 7 ++ pkg/resources/cruisecontrol/configmap_test.go | 101 ++++++++++++++++++ 4 files changed, 140 insertions(+), 7 deletions(-) diff --git a/api/v1beta1/kafkacluster_types.go b/api/v1beta1/kafkacluster_types.go index 0e03045fe..32e4bb3a8 100644 --- a/api/v1beta1/kafkacluster_types.go +++ b/api/v1beta1/kafkacluster_types.go @@ -572,6 +572,12 @@ type StorageConfig struct { // the `pvcSpec` is used by default. // +optional EmptyDir *corev1.EmptyDirVolumeSource `json:"emptyDir,omitempty"` + + // TieredStorageCache indicates this storage is used for Kafka tiered storage cache + // (e.g., for rsm.config.fetch.chunk.cache.path). When set to true, this storage will be + // excluded from Cruise Control capacity calculations and will not be used as a Kafka log.dir. + // +optional + TieredStorageCache bool `json:"tieredStorageCache,omitempty"` } // ListenersConfig defines the Kafka listener types @@ -1031,16 +1037,17 @@ func (bConfig *BrokerConfig) GetTerminationGracePeriod() int64 { } // GetStorageMountPaths returns a string with comma-separated storage mount paths that the broker uses +// for Kafka log.dirs. Tiered storage cache volumes are excluded. func (bConfig *BrokerConfig) GetStorageMountPaths() string { - var mountPaths string - for i, sc := range bConfig.StorageConfigs { - if i != len(bConfig.StorageConfigs)-1 { - mountPaths += sc.MountPath + "," - } else { - mountPaths += sc.MountPath + var mountPaths []string + for _, sc := range bConfig.StorageConfigs { + // Skip tiered storage cache volumes - they should not be in log.dirs + if sc.TieredStorageCache { + continue } + mountPaths = append(mountPaths, sc.MountPath) } - return mountPaths + return strings.Join(mountPaths, ",") } // GetNodeSelector returns the node selector for cruise control diff --git a/api/v1beta1/kafkacluster_types_test.go b/api/v1beta1/kafkacluster_types_test.go index 38a7be38e..10c3c5362 100644 --- a/api/v1beta1/kafkacluster_types_test.go +++ b/api/v1beta1/kafkacluster_types_test.go @@ -622,6 +622,24 @@ func TestGetStorageMountPaths(t *testing.T) { }, expectedMountPaths: "test-log-1,test-log-2,test-log-3,test-log-4,test-log-5", }, + { + testName: "BrokerConfig with tiered storage cache should exclude it from mount paths", + brokerConfig: &BrokerConfig{ + StorageConfigs: []StorageConfig{ + { + MountPath: "test-log-1", + }, + { + MountPath: "test-log-2", + }, + { + MountPath: "/tiered-storage-cache", + TieredStorageCache: true, + }, + }, + }, + expectedMountPaths: "test-log-1,test-log-2", + }, } for _, test := range testCases { diff --git a/pkg/resources/cruisecontrol/configmap.go b/pkg/resources/cruisecontrol/configmap.go index 20735de89..10d079d2f 100644 --- a/pkg/resources/cruisecontrol/configmap.go +++ b/pkg/resources/cruisecontrol/configmap.go @@ -346,6 +346,13 @@ func generateBrokerDisks(brokerState v1beta1.Broker, kafkaClusterSpec v1beta1.Ka // Generate log dir configuration logDirs := make(map[string]string, len(storageConfigs)) for path, conf := range storageConfigs { + // Skip tiered storage cache volumes - they should not be included in Cruise Control capacity + if conf.TieredStorageCache { + log.V(1).Info("skipping tiered storage cache volume from Cruise Control capacity", + v1beta1.BrokerIdLabelKey, brokerState.Id, "mountPath", path) + continue + } + size := parseMountPathWithSize(conf) log.V(1).Info(fmt.Sprintf("broker log.dir %s size in MB: %d", path, size), v1beta1.BrokerIdLabelKey, brokerState.Id) diff --git a/pkg/resources/cruisecontrol/configmap_test.go b/pkg/resources/cruisecontrol/configmap_test.go index 89ec08f27..5e2420ed5 100644 --- a/pkg/resources/cruisecontrol/configmap_test.go +++ b/pkg/resources/cruisecontrol/configmap_test.go @@ -1017,4 +1017,105 @@ func TestGenerateCapacityConfigWithUserProvidedInput(t *testing.T) { } }) } + + // Test tiered storage cache exclusion from capacity + t.Run("tiered storage cache should be excluded from capacity", func(t *testing.T) { + quantity, _ := resource.ParseQuantity("100Gi") + tieredCacheQuantity, _ := resource.ParseQuantity("50Gi") + cpuQuantity, _ := resource.ParseQuantity("2000m") + + kafkaCluster := v1beta1.KafkaCluster{ + Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{ + { + Id: 0, + BrokerConfig: &v1beta1.BrokerConfig{ + Resources: &v1.ResourceRequirements{ + Limits: v1.ResourceList{ + "cpu": cpuQuantity, + }, + }, + StorageConfigs: []v1beta1.StorageConfig{ + { + MountPath: "/kafka-logs", + PvcSpec: &v1.PersistentVolumeClaimSpec{ + Resources: v1.VolumeResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceStorage: quantity, + }, + }, + }, + }, + { + MountPath: "/tiered-storage-cache", + TieredStorageCache: true, + PvcSpec: &v1.PersistentVolumeClaimSpec{ + Resources: v1.VolumeResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceStorage: tieredCacheQuantity, + }, + }, + }, + }, + }, + }, + }, + }, + }, + Status: v1beta1.KafkaClusterStatus{ + BrokersState: map[string]v1beta1.BrokerState{ + "0": {}, + }, + }, + } + + expectedConfiguration := `{ + "brokerCapacities": [ + { + "brokerId": "0", + "capacity": { + "DISK": { + "/kafka-logs/kafka": "107374" + }, + "CPU": "200", + "NW_IN": "125000", + "NW_OUT": "125000" + }, + "doc": "Capacity unit used for disk is in MB, cpu is in percentage, network throughput is in KB." + } + ] +}` + + var actual JBODInvariantCapacityConfig + rawStringActual, err := GenerateCapacityConfig(&kafkaCluster, logr.Discard(), nil) + if err != nil { + t.Error(err, "could not generate capacity config") + } + err = json.Unmarshal([]byte(rawStringActual), &actual) + if err != nil { + t.Error(err, "could not unmarshal actual json") + } + + var expected JBODInvariantCapacityConfig + err = json.Unmarshal([]byte(expectedConfiguration), &expected) + if err != nil { + t.Error(err, "could not unmarshal expected json") + } + + if !reflect.DeepEqual(actual, expected) { + t.Errorf("Expected: %+v, got: %+v", expected, actual) + } + + // Verify that tiered storage cache is NOT in the capacity + actualCapacity := actual.Capacities[0].(map[string]interface{}) + diskCapacity := actualCapacity["capacity"].(map[string]interface{})["DISK"].(map[string]interface{}) + + if _, exists := diskCapacity["/tiered-storage-cache/kafka"]; exists { + t.Error("Tiered storage cache should not be in Cruise Control capacity") + } + + if _, exists := diskCapacity["/kafka-logs/kafka"]; !exists { + t.Error("Regular storage should be in Cruise Control capacity") + } + }) } From 89e99881f7fd087e88d9083b5026d1b161290f31 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Wed, 1 Apr 2026 12:28:52 +0300 Subject: [PATCH 02/19] initial commit --- charts/kafka-operator/crds/kafkaclusters.yaml | 12 + .../kafka.banzaicloud.io_kafkaclusters.yaml | 12 + .../samples/kafkacluster_tiered_storage.yaml | 147 +++++++ ...r_tiered_storage_cache_resize_initial.yaml | 98 +++++ ...er_tiered_storage_cache_resize_shrunk.yaml | 98 +++++ docs/tiered-storage-pvc-resize.md | 302 +++++++++++++++ pkg/resources/kafka/kafka.go | 237 +++++++++++- pkg/resources/kafka/kafka_test.go | 234 +++++++++++ pkg/resources/kafka/pvc.go | 9 +- tests/e2e/test_tiered_storage_cache_resize.go | 364 ++++++++++++++++++ 10 files changed, 1506 insertions(+), 7 deletions(-) create mode 100644 config/samples/kafkacluster_tiered_storage.yaml create mode 100644 config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml create mode 100644 config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml create mode 100644 docs/tiered-storage-pvc-resize.md create mode 100644 tests/e2e/test_tiered_storage_cache_resize.go diff --git a/charts/kafka-operator/crds/kafkaclusters.yaml b/charts/kafka-operator/crds/kafkaclusters.yaml index ff4636057..e527f7a1e 100644 --- a/charts/kafka-operator/crds/kafkaclusters.yaml +++ b/charts/kafka-operator/crds/kafkaclusters.yaml @@ -5100,6 +5100,12 @@ spec: the PersistentVolume backing this claim. type: string type: object + tieredStorageCache: + description: |- + TieredStorageCache indicates this storage is used for Kafka tiered storage cache + (e.g., for rsm.config.fetch.chunk.cache.path). When set to true, this storage will be + excluded from Cruise Control capacity calculations and will not be used as a Kafka log.dir. + type: boolean required: - mountPath type: object @@ -12124,6 +12130,12 @@ spec: to the PersistentVolume backing this claim. type: string type: object + tieredStorageCache: + description: |- + TieredStorageCache indicates this storage is used for Kafka tiered storage cache + (e.g., for rsm.config.fetch.chunk.cache.path). When set to true, this storage will be + excluded from Cruise Control capacity calculations and will not be used as a Kafka log.dir. + type: boolean required: - mountPath type: object diff --git a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml index ff4636057..d99ceea45 100644 --- a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml +++ b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml @@ -5100,6 +5100,12 @@ spec: the PersistentVolume backing this claim. type: string type: object + tieredStorageCache: + description: |- + TieredStorageCache indicates this storage is used for Kafka tiered storage cache + (e.g., for rsm.config.fetch.chunk.cache.path). When set to true, this storage will be + excluded from Cruise Control capacity calculations and will not be used as a Kafka log.dir. + type: boolean required: - mountPath type: object @@ -12124,6 +12130,12 @@ spec: to the PersistentVolume backing this claim. type: string type: object + tieredStorageCache: + description: |- + TieredStorageCache indicates this storage is used for Kafka tiered storage cache + (e.g., for rsm.config.fetch.chunk.cache.path). When set to true, this storage will be + excluded from Cruise Control capacity calculations and will not be used as a Kafka log.dir. + type: boolean required: - mountPath type: object diff --git a/config/samples/kafkacluster_tiered_storage.yaml b/config/samples/kafkacluster_tiered_storage.yaml new file mode 100644 index 000000000..e79c5b72b --- /dev/null +++ b/config/samples/kafkacluster_tiered_storage.yaml @@ -0,0 +1,147 @@ +apiVersion: kafka.banzaicloud.io/v1beta1 +kind: KafkaCluster +metadata: + labels: + controller-tools.k8s.io: "1.0" + name: kafka-tiered-storage +spec: + kRaft: true + headlessServiceEnabled: true + oneBrokerPerNode: false + clusterImage: "ghcr.io/adobe/koperator/kafka:2.13-3.9.1" + + # Cluster-wide Kafka configuration including tiered storage settings + readOnlyConfig: | + auto.create.topics.enable=false + cruise.control.metrics.topic.auto.create=true + cruise.control.metrics.topic.num.partitions=1 + cruise.control.metrics.topic.replication.factor=2 + + # Tiered Storage Configuration + # Enable remote storage + remote.log.storage.system.enable=true + + # Remote storage manager class implementation + remote.log.storage.manager.class.name=org.apache.kafka.server.log.remote.storage.RemoteLogStorageManager + + # Remote log metadata manager class implementation + remote.log.metadata.manager.class.name=org.apache.kafka.server.log.remote.metadata.storage.TopicBasedRemoteLogMetadataManager + + # Remote storage manager configuration prefix + rsm.config.remote.storage.manager.impl.class=org.apache.kafka.server.log.remote.storage.RemoteLogStorageManager + + # Tiered storage cache configuration - this path matches the tieredStorageCache volume mount + rsm.config.fetch.chunk.cache.path=/tiered-storage-cache + + brokers: + - id: 0 + brokerConfig: + # Process roles for KRaft mode + processRoles: + - broker + + # Storage configurations + storageConfigs: + # Primary Kafka log storage - included in Cruise Control capacity + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 100Gi + + # Tiered storage cache - excluded from Cruise Control capacity and log.dirs + - mountPath: "/tiered-storage-cache" + tieredStorageCache: true + pvcSpec: + accessModes: + - ReadWriteOnce + # Can use a different storage class optimized for cache (e.g., faster SSD) + # storageClassName: fast-ssd + resources: + requests: + storage: 50Gi + + resourceRequirements: + limits: + cpu: "2" + memory: 4Gi + requests: + cpu: "1" + memory: 2Gi + + - id: 1 + brokerConfig: + processRoles: + - broker + + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 100Gi + + - mountPath: "/tiered-storage-cache" + tieredStorageCache: true + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + + resourceRequirements: + limits: + cpu: "2" + memory: 4Gi + requests: + cpu: "1" + memory: 2Gi + + - id: 2 + brokerConfig: + processRoles: + - broker + + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 100Gi + + - mountPath: "/tiered-storage-cache" + tieredStorageCache: true + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + + resourceRequirements: + limits: + cpu: "2" + memory: 4Gi + requests: + cpu: "1" + memory: 2Gi + + listenersConfig: + internalListeners: + - type: "plaintext" + name: "internal" + containerPort: 29092 + usedForInnerBrokerCommunication: true + - type: "plaintext" + name: "controller" + containerPort: 29093 + usedForInnerBrokerCommunication: false + usedForControllerCommunication: true + diff --git a/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml b/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml new file mode 100644 index 000000000..d6cae7a43 --- /dev/null +++ b/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml @@ -0,0 +1,98 @@ +apiVersion: kafka.banzaicloud.io/v1beta1 +kind: KafkaCluster +metadata: + name: kafka-ts-resize + namespace: kafka +spec: + kRaft: true + headlessServiceEnabled: true + oneBrokerPerNode: false + clusterImage: "ghcr.io/adobe/koperator/kafka:2.13-3.9.1" + readOnlyConfig: | + auto.create.topics.enable=false + rollingUpgradeConfig: + failureThreshold: 1 + cruiseControlConfig: + topicConfig: + partitions: 1 + replicationFactor: 2 + brokers: + - id: 0 + brokerConfig: + processRoles: + - broker + terminationGracePeriodSeconds: 10 + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + - mountPath: "/tiered-storage-cache" + tieredStorageCache: true + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + resourceRequirements: + limits: + cpu: "1" + memory: 2Gi + requests: + cpu: "200m" + memory: 1Gi + - id: 1 + brokerConfig: + processRoles: + - controller + terminationGracePeriodSeconds: 10 + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + resourceRequirements: + limits: + cpu: "500m" + memory: 1Gi + requests: + cpu: "100m" + memory: 512Mi + - id: 2 + brokerConfig: + processRoles: + - broker + terminationGracePeriodSeconds: 10 + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + resourceRequirements: + limits: + cpu: "500m" + memory: 1Gi + requests: + cpu: "100m" + memory: 512Mi + listenersConfig: + internalListeners: + - type: "plaintext" + name: "internal" + containerPort: 29092 + usedForInnerBrokerCommunication: true + - type: "plaintext" + name: "controller" + containerPort: 29093 + usedForInnerBrokerCommunication: false + usedForControllerCommunication: true diff --git a/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml b/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml new file mode 100644 index 000000000..88a7fc27d --- /dev/null +++ b/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml @@ -0,0 +1,98 @@ +apiVersion: kafka.banzaicloud.io/v1beta1 +kind: KafkaCluster +metadata: + name: kafka-ts-resize + namespace: kafka +spec: + kRaft: true + headlessServiceEnabled: true + oneBrokerPerNode: false + clusterImage: "ghcr.io/adobe/koperator/kafka:2.13-3.9.1" + readOnlyConfig: | + auto.create.topics.enable=false + rollingUpgradeConfig: + failureThreshold: 1 + cruiseControlConfig: + topicConfig: + partitions: 1 + replicationFactor: 2 + brokers: + - id: 0 + brokerConfig: + processRoles: + - broker + terminationGracePeriodSeconds: 10 + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + - mountPath: "/tiered-storage-cache" + tieredStorageCache: true + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi # Reduced from 2Gi to trigger delete-and-recreate + resourceRequirements: + limits: + cpu: "1" + memory: 2Gi + requests: + cpu: "200m" + memory: 1Gi + - id: 1 + brokerConfig: + processRoles: + - controller + terminationGracePeriodSeconds: 10 + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + resourceRequirements: + limits: + cpu: "500m" + memory: 1Gi + requests: + cpu: "100m" + memory: 512Mi + - id: 2 + brokerConfig: + processRoles: + - broker + terminationGracePeriodSeconds: 10 + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + resourceRequirements: + limits: + cpu: "500m" + memory: 1Gi + requests: + cpu: "100m" + memory: 512Mi + listenersConfig: + internalListeners: + - type: "plaintext" + name: "internal" + containerPort: 29092 + usedForInnerBrokerCommunication: true + - type: "plaintext" + name: "controller" + containerPort: 29093 + usedForInnerBrokerCommunication: false + usedForControllerCommunication: true diff --git a/docs/tiered-storage-pvc-resize.md b/docs/tiered-storage-pvc-resize.md new file mode 100644 index 000000000..7336360f4 --- /dev/null +++ b/docs/tiered-storage-pvc-resize.md @@ -0,0 +1,302 @@ +# Tiered Storage Cache PVC Resize: Reconciliation Flow + +This document explains how koperator handles resizing (shrinking) a tiered storage +cache PVC, why the flow is more involved than a regular PVC update, and the +design decisions behind the current implementation. + +--- + +## Background: why cache PVCs are different + +Kubernetes does not allow shrinking a PVC — the only supported in-place change is +growing it. For regular Kafka log volumes koperator enforces the same constraint +(`isDesiredStorageValueInvalid` returns an error on any size decrease). + +Tiered storage cache volumes are different: the data they hold is ephemeral. When a +broker restarts it will repopulate the cache from remote storage. Losing the cache +does not cause data loss; it only causes a temporary increase in read latency. +Because of this koperator implements a **delete-and-recreate** strategy for cache +PVC shrinks — but it must do so safely, coordinating with the rolling upgrade +machinery so that only one broker is down at a time and cluster health gates are +respected. + +--- + +## Components involved + +| Component | File | Role | +|-----------|------|------| +| `reconcileKafkaPvc` | `pkg/resources/kafka/kafka.go` | Creates / updates / deletes PVCs | +| `reconcileKafkaPod` | `pkg/resources/kafka/kafka.go` | Creates pods or hands off to rolling upgrade | +| `handleRollingUpgrade` | `pkg/resources/kafka/kafka.go` | Deletes a pod after checking health gates | +| `getCreatedPvcForBroker` | `pkg/resources/kafka/kafka.go` | Looks up which PVCs exist for a broker and feeds them into the pod spec | +| `generateDataVolumeAndVolumeMount` | `pkg/resources/kafka/pod.go` | Translates PVC list → pod Volume + VolumeMount entries using `ClaimName` | + +--- + +## PVC annotations used during resize + +Two annotations are written directly on PVC objects to carry state through the +resize. Because annotations live on the Kubernetes object they survive reconciler +restarts, making every step re-entrant. + +| Annotation | Value | Written on | Meaning | +|------------|-------|------------|---------| +| `koperator.adobe.com/cache-resize-state` | `pending-deletion` | Old PVC | This PVC is being replaced; skip it in pod spec generation and delete it once the broker pod stops | +| `koperator.adobe.com/cache-resize-state` | `replacement` | New PVC | This is the replacement PVC; defer to the rolling upgrade before treating it as normal | +| `koperator.adobe.com/replaces-pvc` | `` | New PVC | Traceability — records which PVC is being replaced | + +--- + +## Reconciliation loop order + +Within a single `Reconcile()` call the kafka sub-reconciler runs in this order: + +``` +1. Services +2. PodDisruptionBudgets +3. reconcileKafkaPodDelete ← graceful downscale via Cruise Control +4. Listener statuses / PKI +5. reconcileKafkaPvc ← ALL PVC work happens here, before any pod work +6. Discover running pods + bound PVCs +7. reorderBrokers ← priority sort +8. FOR each broker: + a. ConfigMap + b. per-broker Service + c. reconcileKafkaPod ← pod create OR handleRollingUpgrade + d. reconcilePerBrokerDynamicConfig +9. reconcileClusterWideDynamicConfig +``` + +PVC reconciliation always runs before pod reconciliation. The new PVC is therefore +created (and starts provisioning) before `handleRollingUpgrade` is evaluated for +the same broker in the same cycle. + +--- + +## Full flow: shrinking a tiered storage cache PVC + +### Cycle N — resize detected (pod running) + +**`reconcileKafkaPvc` — per-broker setup** + +``` +r.List(brokerPodList) +brokerPodExists = true + +No pending-deletion PVCs exist yet → no cleanup +No replacement PVCs exist yet → no resize-complete strip +effectivePvcCount == len(desiredPvcs) → no CC disk removal triggered +``` + +**`reconcileKafkaPvc` — per-desired-PVC loop** + +``` +CheckIfObjectUpdated: currentSize > desiredSize → enters resize branch + + 1. Annotate old PVC: + koperator.adobe.com/cache-resize-state: pending-deletion + (r.Update — durable immediately in etcd) + + 2. Create replacement PVC with desiredSize: + koperator.adobe.com/cache-resize-state: replacement + koperator.adobe.com/replaces-pvc: + (r.Create — provisioning starts now, in parallel with gate evaluation) + + 3. Set broker ConfigurationState = ConfigOutOfSync + (triggers handleRollingUpgrade on every cycle until pod restarts) + + continue +``` + +**`reconcileKafkaPod` — same cycle** + +`handleRollingUpgrade` sees `ConfigOutOfSync`, evaluates gates: + +| Gate | Required to pass | +|------|-----------------| +| Pod count | All expected pods exist | +| Concurrent restart limit | Terminating/Pending pods < `ConcurrentBrokerRestartCountPerRack` | +| Rack awareness | Only pods from same AZ as restarting pods | +| Replica health | `offline + out-of-sync < FailureThreshold` | + +If all pass → **broker pod is deleted** → requeue 15 s. +If any fail → requeue 15 s, try again next cycle (state is fully durable via PVC annotations). + +--- + +### Between cycles (rolling upgrade gates blocking) + +Each reconcile cycle sees: + +``` +reconcileKafkaPvc: + brokerPodExists = true + No pending-deletion PVCs → no cleanup + Replacement PVC exists, no pending-deletion → resize-complete strip + … but pod is still up → strip does NOT fire (pod must be up AND no pending-deletion) + alreadyCreated loop: skips pending-deletion PVC, finds replacement PVC + replacement PVC guard: ensure ConfigOutOfSync, continue + +reconcileKafkaPod: + handleRollingUpgrade re-evaluates gates → requeue if blocked +``` + +State is preserved entirely in PVC annotations — no in-memory or status-field +bookkeeping required. Reconciler can crash and restart at any point. + +--- + +### Cycle N+1 — pod is gone + +**`reconcileKafkaPvc` — per-broker setup** + +``` +r.List(brokerPodList) +brokerPodExists = false + +Cleanup loop: finds PVC with pending-deletion annotation + → r.Delete(oldPvc) ← safe now, broker is not running + → r.List(pvcList) ← re-list; only replacement PVC remains +``` + +**`getCreatedPvcForBroker` (called later to build pod spec)** + +Filters out any PVC with `cache-resize-state: pending-deletion` before matching +mount paths. Returns the replacement PVC for the mount path. The pod spec is built +with `ClaimName` pointing to the new PVC. + +**`reconcileKafkaPod`** + +No pod exists → creates new pod referencing the replacement PVC. Kubernetes holds +the pod in `Pending` until the replacement PVC reaches `Bound` phase. + +Because provisioning started in cycle N (when the replacement PVC was created), +the PVC is likely already `Bound` by now, minimising pod startup latency. + +--- + +### Cycle N+2 — pod is running again + +**`reconcileKafkaPvc` — resize-complete strip** + +``` +brokerPodExists = true +No pending-deletion PVCs (deleted in N+1) +Replacement PVC exists → resize complete → strip annotations: + delete koperator.adobe.com/cache-resize-state + delete koperator.adobe.com/replaces-pvc + r.Update(replacementPvc) +``` + +The PVC is now an ordinary PVC. Subsequent reconciles treat it normally. + +--- + +## Sequence diagram + +``` +Cycle N (pod UP, resize detected) + reconcileKafkaPvc + ├─ r.Update(oldPvc) annotate pending-deletion + ├─ r.Create(newPvc) replacement PVC, provisioning starts + └─ ConfigOutOfSync set + reconcileKafkaPod + └─ handleRollingUpgrade + ├─ [gates fail] → requeue 15s + └─ [gates pass] → delete pod → requeue 15s + +Cycle N+k (pod UP, gates failing — any number of cycles) + reconcileKafkaPvc + └─ replacement PVC guard: ensure ConfigOutOfSync, continue + reconcileKafkaPod + └─ handleRollingUpgrade → requeue + +Cycle N+k+1 (pod GONE) + reconcileKafkaPvc + ├─ delete pending-deletion PVC + └─ re-list: only replacement PVC remains + reconcileKafkaPod + └─ no pod → create pod (ClaimName = replacement PVC) + └─ pod Pending until PVC bound (likely already bound) + +Cycle N+k+2 (pod RUNNING) + reconcileKafkaPvc + └─ no pending-deletion PVC + replacement PVC exists → strip annotations + └─ replacement PVC becomes ordinary PVC +``` + +--- + +## Properties of this design + +| Property | Value | +|----------|-------| +| State survives reconciler crash | Yes — PVC annotations are durable in etcd | +| Idempotent re-entry | Yes — each phase checks annotations and resumes correctly | +| Atomicity gap | Eliminated — new PVC is created before old is deleted | +| Provisioning overlaps gate evaluation | Yes — new PVC created in cycle N, not N+1 | +| Observable via kubectl | Yes — `kubectl get pvc -o yaml` shows resize state directly | +| ConfigOutOfSync overloading | Reduced — `ConfigOutOfSync` still used, but the *reason* is legible in PVC annotations | +| CC disk rebalance for cache PVCs | Fixed — tiered cache PVCs are explicitly excluded from `GracefulDiskRebalanceRequired` logic | + +--- + +## Interaction with disk removal detection + +`reconcileKafkaPvc` detects regular disk removal by comparing PVC counts: + +```go +if effectivePvcCount > len(desiredPvcs) { handleDiskRemoval(...) } +``` + +The count uses `effectivePvcCount` which **excludes replacement PVCs**. During a +resize the old (pending-deletion) + new (replacement) PVCs temporarily co-exist for +the same mount path. Without this exclusion the count check would incorrectly +trigger a Cruise Control disk-removal operation. + +Inside `handleDiskRemoval` itself, the `pending-deletion` PVC is also safe: its +mount path still matches the desired spec, so `foundInDesired = true` and it is +skipped by the disk-removal state machine. + +--- + +## Interaction with GracefulDiskRebalanceRequired + +When a new PVC first becomes `Bound`, koperator normally sets +`GracefulDiskRebalanceRequired` to ask Cruise Control to rebalance data across +the broker's disks. This is correct for Kafka log volumes but wrong for tiered +storage cache volumes — CC must not account for ephemeral cache storage. + +The `alreadyCreated` loop now explicitly skips `GracefulDiskRebalanceRequired` for +any PVC annotated `tieredStorageCache: true`, including the replacement PVC. + +--- + +## Known limitations + +### ConfigOutOfSync still shared with config changes + +`ConfigOutOfSync` is set to trigger the rolling upgrade. It is the same bit used +for Kafka property changes. An observer cannot distinguish "resize pending" from +"config change" by status alone — the PVC annotations must be inspected. + +### Concurrent resize + complete storage-config removal + +If the tiered storage cache storage config is removed entirely from the spec while +a resize is in progress, both the `pending-deletion` and `replacement` PVCs have a +mount path that is no longer present in the desired spec. Without special handling +this would route them into the Cruise Control `remove_disks` path — which fails with +"log dir not found" because CC has no knowledge of ephemeral cache paths — leaving +the operator stuck in a 20-second requeue loop. + +koperator handles this as follows: + +**Pod DOWN**: both the `pending-deletion` and the orphaned `replacement` PVC are +deleted directly in `reconcileKafkaPvc` (bypassing Cruise Control). The operator +proceeds normally on the next cycle. + +**Pod UP**: koperator sets `ConfigOutOfSync` to trigger a rolling restart via +`handleRollingUpgrade`, then skips `handleDiskRemoval` for this broker entirely +(`continue` in the broker loop). Once the pod stops, the next cycle falls into the +Pod DOWN path above and cleans up both PVCs. diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index 72c907bcd..261fcc117 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -68,6 +68,14 @@ const ( brokerConfigTemplate = "%s-config" brokerStorageTemplate = "%s-%d-storage-%d-" + // pvcCacheResizeStateAnnotation is set on PVCs during a tiered storage cache resize to track + // which PVC is the old one awaiting deletion and which is the new replacement. + pvcCacheResizeStateAnnotation = "koperator.adobe.com/cache-resize-state" + pvcCacheResizePendingDeletion = "pending-deletion" + pvcCacheResizeReplacement = "replacement" + // pvcCacheResizeReplacesPVC records the name of the old PVC that the replacement supersedes. + pvcCacheResizeReplacesPVC = "koperator.adobe.com/replaces-pvc" + brokerConfigMapVolumeMount = "broker-config" kafkaDataVolumeMount = "kafka-data" @@ -153,6 +161,17 @@ func getCreatedPvcForBroker( return nil, err } + // Exclude PVCs that are pending deletion during a tiered storage cache resize. + // The replacement PVC (same mount path, smaller size) is already present and will be used instead. + n := 0 + for _, pvc := range foundPvcList.Items { + if pvc.Annotations[pvcCacheResizeStateAnnotation] != pvcCacheResizePendingDeletion { + foundPvcList.Items[n] = pvc + n++ + } + } + foundPvcList.Items = foundPvcList.Items[:n] + var missing []string for i := range storageConfigs { if storageConfigs[i].PvcSpec == nil { @@ -312,6 +331,12 @@ func (r *Reconciler) Reconcile(log logr.Logger) error { if storage.PvcSpec == nil { continue } + // Tiered storage cache PVCs are only meaningful on broker nodes. + // Controller-only nodes do not serve Kafka client traffic or hold tiered-storage + // segment data, so skip cache PVCs for them. + if storage.TieredStorageCache && brokerConfig.IsControllerNode() && !brokerConfig.IsBrokerNode() { + continue + } o, err := r.pvc(broker.Id, index, storage) if err != nil { return errors.WrapIfWithDetails(err, "failed to generate resource", "resources", "PersistentVolumeClaim") @@ -1180,8 +1205,130 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro } } - // Handle disk removal - if len(pvcList.Items) > len(desiredPvcs) { + // Resolve pod existence once per broker — used by both cleanup and resize logic below. + // A pod with a non-nil DeletionTimestamp is Terminating: its containers have stopped (or + // are stopping) and any PVCs it mounted will be released once the pod is fully gone. + // Treating a Terminating pod as "not existing" lets the pending-deletion PVC be cleaned up + // during the Terminating window rather than waiting for the narrow gap between the pod being + // fully removed from etcd and the new pod being created in the same reconcile cycle. + brokerPodList := &corev1.PodList{} + brokerPodMatchingLabels := client.MatchingLabels( + apiutil.MergeLabels( + apiutil.LabelsForKafka(r.KafkaCluster.Name), + map[string]string{banzaiv1beta1.BrokerIdLabelKey: brokerId}, + ), + ) + if err := r.List(ctx, brokerPodList, client.InNamespace(r.KafkaCluster.GetNamespace()), brokerPodMatchingLabels); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, "getting broker pods failed", "brokerId", brokerId) + } + brokerPodExists := false + for i := range brokerPodList.Items { + if brokerPodList.Items[i].DeletionTimestamp == nil { + brokerPodExists = true + break + } + } + + // Build a set of desired mount paths for orphaned-PVC detection below. + desiredMountPaths := make(map[string]bool, len(desiredPvcs)) + for _, dp := range desiredPvcs { + desiredMountPaths[dp.Annotations["mountPath"]] = true + } + + if brokerPodExists { + // Resize complete when the replacement PVC exists but the pending-deletion PVC is already gone: + // strip the replacement annotation so the PVC is treated as normal going forward. + var hasPendingDeletion bool + for _, pvc := range pvcList.Items { + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { + hasPendingDeletion = true + break + } + } + if !hasPendingDeletion { + for _, pvc := range pvcList.Items { + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement { + pvcCopy := pvc.DeepCopy() + delete(pvcCopy.Annotations, pvcCacheResizeStateAnnotation) + delete(pvcCopy.Annotations, pvcCacheResizeReplacesPVC) + if err := r.Update(ctx, pvcCopy); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, + "removing replacement annotation from tiered storage cache PVC failed", "pvc", pvc.Name) + } + log.Info("Tiered storage cache PVC resize complete, removed replacement annotation", "pvc", pvc.Name) + } + } + } + } else { + // Broker pod is down — safe to delete any PVCs that were waiting for the pod to stop. + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { + log.Info("Broker pod is down — deleting pending-deletion tiered storage cache PVC", + "brokerId", brokerId, "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return errorfactory.New(errorfactory.APIFailure{}, err, + "deleting pending-deletion tiered storage cache PVC failed", "pvc", pvc.Name) + } + } + } + // Also delete replacement PVCs whose mount path is no longer desired — these are + // orphaned by a storage config removal that happened while a cache resize was in flight. + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement && + !desiredMountPaths[pvc.Annotations["mountPath"]] { + log.Info("Broker pod is down — deleting orphaned replacement tiered storage cache PVC", + "brokerId", brokerId, "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return errorfactory.New(errorfactory.APIFailure{}, err, + "deleting orphaned replacement tiered storage cache PVC failed", "pvc", pvc.Name) + } + } + } + // Re-list so the rest of this iteration sees the current state. + if err := r.List(ctx, pvcList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, "getting resource failed", "kind", desiredType) + } + } + + // If the pod is still running and there are orphaned cache resize PVCs (storage config + // removed while a resize was in flight), trigger a rolling restart so the pod stops + // and we can delete the orphaned PVCs on the next cycle. Routing these through + // Cruise Control remove_disks would fail because cache paths are not Kafka log dirs. + if brokerPodExists { + var hasOrphanedCachePvc bool + for _, pvc := range pvcList.Items { + resizeState := pvc.Annotations[pvcCacheResizeStateAnnotation] + if (resizeState == pvcCacheResizePendingDeletion || resizeState == pvcCacheResizeReplacement) && + !desiredMountPaths[pvc.Annotations["mountPath"]] { + hasOrphanedCachePvc = true + break + } + } + if hasOrphanedCachePvc { + log.Info("Orphaned cache resize PVCs detected with broker pod running — triggering rolling restart for safe cleanup", + "brokerId", brokerId) + if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + banzaiv1beta1.ConfigOutOfSync, log); err != nil { + return errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not mark broker ConfigOutOfSync for orphaned cache PVC cleanup", "brokerId", brokerId) + } + } + continue // Skip disk removal — wait for pod to stop before deleting cache PVCs. + } + } + + // Handle disk removal — count only non-replacement PVCs to avoid false positives while a + // tiered storage cache resize is in flight (old + new PVC temporarily coexist). + effectivePvcCount := 0 + for _, pvc := range pvcList.Items { + if pvc.Annotations[pvcCacheResizeStateAnnotation] != pvcCacheResizeReplacement { + effectivePvcCount++ + } + } + if effectivePvcCount > len(desiredPvcs) { waitForDiskRemovalToFinish, err = handleDiskRemoval(ctx, pvcList, desiredPvcs, r, brokerId, log, desiredType, brokerVolumesState) if err != nil { return err @@ -1213,9 +1360,20 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro alreadyCreated := false for _, pvc := range pvcList.Items { - if mountPath == pvc.Annotations["mountPath"] { - currentPvc = pvc.DeepCopy() - alreadyCreated = true + if mountPath != pvc.Annotations["mountPath"] { + continue + } + // Skip the old PVC that is waiting to be deleted once the broker pod stops. + // The replacement PVC (same mount path) will be matched on the next iteration. + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { + continue + } + currentPvc = pvc.DeepCopy() + alreadyCreated = true + // Trigger a CC disk rebalance only for regular data volumes. + // Tiered storage cache PVCs are ephemeral — CC must not account for them. + isCachePvc := pvc.Annotations["tieredStorageCache"] == "true" + if !isCachePvc { // Checking pvc state, if bounded, so the broker has already restarted and the CC GracefulDiskRebalance has not happened yet, // then we make it happening with status update. // If disk removal was set, and the disk was added back, we also need to mark the volume for rebalance @@ -1224,8 +1382,8 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro (!found || volumeState.CruiseControlVolumeState.IsDiskRemoval()) { brokerVolumesState[mountPath] = banzaiv1beta1.VolumeState{CruiseControlVolumeState: banzaiv1beta1.GracefulDiskRebalanceRequired} } - break } + break } if !alreadyCreated { @@ -1239,11 +1397,78 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro continue } if err == nil { + // A replacement PVC is the new smaller PVC staged during a tiered storage cache resize. + // It already has the right size; we only need to keep the rolling upgrade running so + // the broker pod stops and the pending-deletion PVC can be cleaned up. + if currentPvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement { + if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + banzaiv1beta1.ConfigOutOfSync, log); err != nil { + return errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not mark broker ConfigOutOfSync for pending tiered storage cache PVC resize", "brokerId", brokerId) + } + } + continue + } + if k8sutil.CheckIfObjectUpdated(log, desiredType, currentPvc, desiredPvc) { if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { return errors.WrapIf(err, "could not apply last state to annotation") } + // Check if this is a tiered storage cache volume + isTieredCache := currentPvc.Annotations["tieredStorageCache"] == "true" + desiredSize := desiredPvc.Spec.Resources.Requests.Storage().Value() + currentSize := currentPvc.Spec.Resources.Requests.Storage().Value() + + // Tiered storage cache PVC shrink: stage the replacement PVC immediately so + // provisioning runs in parallel with rolling-upgrade gate evaluation. + // The old PVC is annotated pending-deletion and removed once the broker pod stops. + if isTieredCache && desiredSize < currentSize { + log.Info("Tiered storage cache size decrease detected — staging replacement PVC", + "brokerId", brokerId, + "mountPath", mountPath, + "currentSize", currentSize, + "desiredSize", desiredSize) + + // Annotate old PVC so it is skipped by getCreatedPvcForBroker and cleaned up + // the moment the broker pod stops. + oldPvcCopy := currentPvc.DeepCopy() + oldPvcCopy.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizePendingDeletion + if err := r.Update(ctx, oldPvcCopy); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, + "annotating old tiered storage cache PVC for deletion failed", "pvc", currentPvc.Name) + } + log.Info("Marked old tiered storage cache PVC for deletion", "brokerId", brokerId, "pvc", currentPvc.Name) + + // Create the replacement PVC immediately so provisioning starts now. + desiredPvc.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizeReplacement + desiredPvc.Annotations[pvcCacheResizeReplacesPVC] = currentPvc.Name + if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { + return errors.WrapIf(err, "could not apply last state to annotation") + } + if err := r.Create(ctx, desiredPvc); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, + "creating replacement tiered storage cache PVC failed", "brokerId", brokerId) + } + log.Info("Created replacement tiered storage cache PVC", + "brokerId", brokerId, "pvc", desiredPvc.Name, "desiredSize", desiredSize) + + // Trigger rolling upgrade so the broker pod is restarted and the old PVC + // can be deleted on the next reconcile cycle. + if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { + log.Info("Marking broker ConfigOutOfSync to trigger rolling upgrade for tiered storage cache PVC resize", + "brokerId", brokerId) + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + banzaiv1beta1.ConfigOutOfSync, log); err != nil { + return errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not mark broker ConfigOutOfSync for tiered storage cache PVC resize", "brokerId", brokerId) + } + } + continue + } + + // Regular validation: size decreases are forbidden for non-cache volumes. if isDesiredStorageValueInvalid(desiredPvc, currentPvc) { return errorfactory.New(errorfactory.InternalError{}, errors.New("could not modify pvc size"), "one can not reduce the size of a PVC", "kind", desiredType) diff --git a/pkg/resources/kafka/kafka_test.go b/pkg/resources/kafka/kafka_test.go index 4ca517e6f..323a87264 100644 --- a/pkg/resources/kafka/kafka_test.go +++ b/pkg/resources/kafka/kafka_test.go @@ -32,8 +32,10 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/types" + "github.com/banzaicloud/k8s-objectmatcher/patch" "github.com/banzaicloud/koperator/api/v1alpha1" "github.com/banzaicloud/koperator/api/v1beta1" controllerMocks "github.com/banzaicloud/koperator/controllers/tests/mocks" @@ -1257,6 +1259,15 @@ func execPvcTest(t *testing.T, testCases []PvcTestCase) { }, } + // Mock pod list — the new code always checks pod existence before disk removal. + // For these tests the broker pod is always absent. + mockClient.EXPECT().List( + context.TODO(), + gomock.AssignableToTypeOf(&corev1.PodList{}), + client.InNamespace("kafka"), + gomock.Any(), + ).Return(nil).AnyTimes() + // Set up the mockClient to return the provided test.existingPvcs mockClient.EXPECT().List( context.TODO(), @@ -1336,6 +1347,229 @@ func createPvc(name, brokerId, mountPath string) *corev1.PersistentVolumeClaim { } } +func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { + t.Parallel() + + const ( + clusterName = "kafka" + namespace = "kafka" + brokerId = "0" + mountPath = "/tiered-storage-cache" + ) + + makeTieredCachePvc := func(name, size string) *corev1.PersistentVolumeClaim { + qty := resource.MustParse(size) + pvc := &corev1.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Labels: map[string]string{ + v1beta1.BrokerIdLabelKey: brokerId, + v1beta1.AppLabelKey: "kafka", + v1beta1.KafkaCRLabelKey: clusterName, + }, + Annotations: map[string]string{ + "mountPath": mountPath, + "tieredStorageCache": "true", + }, + }, + Spec: corev1.PersistentVolumeClaimSpec{ + Resources: corev1.VolumeResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceStorage: qty, + }, + }, + }, + Status: corev1.PersistentVolumeClaimStatus{Phase: corev1.ClaimBound}, + } + // Set last-applied annotation so CheckIfObjectUpdated detects size differences + _ = patch.DefaultAnnotator.SetLastAppliedAnnotation(pvc) + return pvc + } + + runningPod := corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "kafka-0", + Namespace: namespace, + }, + } + terminatingPod := corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "kafka-0", + Namespace: namespace, + DeletionTimestamp: &metav1.Time{Time: time.Now()}, + }, + } + + testCases := []struct { + testName string + existingPvc *corev1.PersistentVolumeClaim + desiredPvc *corev1.PersistentVolumeClaim + existingPods []corev1.Pod + // expectedUpdatePvc: old PVC annotated pending-deletion, or annotation stripped on resize-complete + expectedUpdatePvc bool + expectedCreatePvc bool + expectedDeletePvc bool + expectedError bool + }{ + { + // Pod is up: annotate old PVC as pending-deletion, create replacement PVC, + // set ConfigOutOfSync to trigger rolling upgrade. No error — state is durable. + testName: "size decrease with running pod — old PVC annotated, replacement PVC created", + existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), + existingPods: []corev1.Pod{runningPod}, + expectedUpdatePvc: true, + expectedCreatePvc: true, + expectedDeletePvc: false, + expectedError: false, + }, + { + // Terminating pod is treated as having no running pod: staging starts immediately + // so the replacement PVC is provisioned during the drain window. + testName: "size decrease with terminating pod — old PVC annotated, replacement PVC created", + existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), + existingPods: []corev1.Pod{terminatingPod}, + expectedUpdatePvc: true, + expectedCreatePvc: true, + expectedDeletePvc: false, + expectedError: false, + }, + { + // Pod is terminating and old PVC already has pending-deletion annotation: + // the Terminating pod is treated as gone, so cleanup fires immediately. + testName: "pending-deletion PVC with terminating pod — old PVC deleted", + existingPvc: func() *corev1.PersistentVolumeClaim { + pvc := makeTieredCachePvc("cache-pvc-1", "100Gi") + pvc.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizePendingDeletion + return pvc + }(), + desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), + existingPods: []corev1.Pod{terminatingPod}, + expectedUpdatePvc: false, + expectedCreatePvc: true, + expectedDeletePvc: true, + expectedError: false, + }, + { + // Pod is already gone on first detection: stage the replacement anyway so the + // state machine is consistent. Cleanup of pending-deletion PVC happens on the + // next cycle once the annotation is observed. + testName: "size decrease with pod already gone — old PVC annotated, replacement PVC created", + existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), + existingPods: []corev1.Pod{}, + expectedUpdatePvc: true, + expectedCreatePvc: true, + expectedDeletePvc: false, + expectedError: false, + }, + { + // Pod is gone and old PVC already has pending-deletion annotation: + // cleanup fires — old PVC is deleted. Replacement PVC already exists. + testName: "size decrease with pod gone and old PVC pending-deletion — old PVC deleted", + existingPvc: func() *corev1.PersistentVolumeClaim { + pvc := makeTieredCachePvc("cache-pvc-1", "100Gi") + pvc.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizePendingDeletion + return pvc + }(), + desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), + existingPods: []corev1.Pod{}, + expectedUpdatePvc: false, + expectedCreatePvc: true, + expectedDeletePvc: true, + expectedError: false, + }, + { + // size increase — no special handling, regular PVC update path. + testName: "size increase — regular PVC update path", + existingPvc: makeTieredCachePvc("cache-pvc-1", "50Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + existingPods: []corev1.Pod{}, + expectedUpdatePvc: true, + expectedCreatePvc: false, + expectedDeletePvc: false, + expectedError: false, + }, + } + + mockCtrl := gomock.NewController(t) + + for _, test := range testCases { + test := test + t.Run(test.testName, func(t *testing.T) { + mockClient := mocks.NewMockClient(mockCtrl) + mockSubResourceClient := mocks.NewMockSubResourceClient(mockCtrl) + + r := Reconciler{ + Reconciler: resources.Reconciler{ + Client: mockClient, + KafkaCluster: &v1beta1.KafkaCluster{ + ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: namespace}, + Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{ + Id: 0, + BrokerConfig: &v1beta1.BrokerConfig{Roles: []string{"broker"}}, + }}, + }, + }, + }, + } + + // Mock PVC list — always returns the existing PVC + mockClient.EXPECT().List( + context.TODO(), + gomock.AssignableToTypeOf(&corev1.PersistentVolumeClaimList{}), + client.InNamespace(namespace), + gomock.Any(), + ).Do(func(ctx context.Context, list *corev1.PersistentVolumeClaimList, opts ...client.ListOption) { + list.Items = []corev1.PersistentVolumeClaim{*test.existingPvc} + }).Return(nil).AnyTimes() + + // Mock Pod list — returns the configured pods + mockClient.EXPECT().List( + context.TODO(), + gomock.AssignableToTypeOf(&corev1.PodList{}), + client.InNamespace(namespace), + gomock.Any(), + ).Do(func(ctx context.Context, list *corev1.PodList, opts ...client.ListOption) { + list.Items = test.existingPods + }).Return(nil).AnyTimes() + + // Mock PVC update (annotating pending-deletion or stripping replacement annotation) + if test.expectedUpdatePvc { + mockClient.EXPECT().Update(context.TODO(), gomock.AssignableToTypeOf(&corev1.PersistentVolumeClaim{})).Return(nil).AnyTimes() + } + + // Mock PVC deletion + if test.expectedDeletePvc { + mockClient.EXPECT().Delete(context.TODO(), gomock.AssignableToTypeOf(&corev1.PersistentVolumeClaim{})).Return(nil).Times(1) + } + + // Mock PVC creation + if test.expectedCreatePvc { + mockClient.EXPECT().Create(context.TODO(), gomock.AssignableToTypeOf(&corev1.PersistentVolumeClaim{})).Return(nil).Times(1) + } + + mockClient.EXPECT().Status().Return(mockSubResourceClient).AnyTimes() + mockSubResourceClient.EXPECT().Update(context.Background(), gomock.AssignableToTypeOf(&v1beta1.KafkaCluster{})).Return(nil).AnyTimes() + + brokersDesiredPvcs := map[string][]*corev1.PersistentVolumeClaim{ + brokerId: {test.desiredPvc}, + } + + err := r.reconcileKafkaPvc(context.TODO(), logf.Log, brokersDesiredPvcs) + + if test.expectedError { + assert.NotNil(t, err, "expected an error but got nil") + } else { + assert.Nil(t, err, "expected no error but got: %v", err) + } + }) + } +} + // nolint funlen func TestReconcileConcurrentBrokerRestartsAllowed(t *testing.T) { t.Parallel() diff --git a/pkg/resources/kafka/pvc.go b/pkg/resources/kafka/pvc.go index 29516f397..67dd6167e 100644 --- a/pkg/resources/kafka/pvc.go +++ b/pkg/resources/kafka/pvc.go @@ -62,6 +62,13 @@ func (r *Reconciler) pvc(brokerId int32, storageIndex int, storage v1beta1.Stora return nil, errors.WrapIfWithDetails(err, "couldn't unmarshal Pvc spec", errCtx...) } + annotations := map[string]string{"mountPath": storage.MountPath} + + // Mark tiered storage cache PVCs with annotation for special handling + if storage.TieredStorageCache { + annotations["tieredStorageCache"] = "true" + } + return &corev1.PersistentVolumeClaim{ ObjectMeta: templates.ObjectMetaWithGeneratedNameAndAnnotations( fmt.Sprintf(brokerStorageTemplate, r.KafkaCluster.Name, brokerId, storageIndex), @@ -69,7 +76,7 @@ func (r *Reconciler) pvc(brokerId int32, storageIndex int, storage v1beta1.Stora apiutil.LabelsForKafka(r.KafkaCluster.Name), map[string]string{v1beta1.BrokerIdLabelKey: fmt.Sprintf("%d", brokerId)}, ), - map[string]string{"mountPath": storage.MountPath}, r.KafkaCluster), + annotations, r.KafkaCluster), Spec: pvcSpec, }, nil } diff --git a/tests/e2e/test_tiered_storage_cache_resize.go b/tests/e2e/test_tiered_storage_cache_resize.go new file mode 100644 index 000000000..0decedbc5 --- /dev/null +++ b/tests/e2e/test_tiered_storage_cache_resize.go @@ -0,0 +1,364 @@ +// Copyright 2025 Adobe. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package e2e + +import ( + "context" + "encoding/json" + "fmt" + "strings" + "time" + + "emperror.dev/errors" + "github.com/gruntwork-io/terratest/modules/k8s" + ginkgo "github.com/onsi/ginkgo/v2" + gomega "github.com/onsi/gomega" +) + +const ( + tsResizeClusterName = "kafka-ts-resize" + + tsResizeInitialManifest = "../../config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml" + tsResizeShrunkManifest = "../../config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml" + + tsResizeCacheMountPath = "/tiered-storage-cache" + tsResizeInitialSize = "2Gi" + tsResizeShrunkSize = "1Gi" + + // Annotation keys written by the cache-resize reconciler. + pvcResizeStateAnnotation = "koperator.adobe.com/cache-resize-state" + pvcResizeStatePending = "pending-deletion" + pvcResizeStateReplacement = "replacement" + + tsResizePhaseTimeout = 10 * time.Minute + tsResizePollingInterval = 15 * time.Second +) + +// pvcItem is a minimal representation of a PVC for assertion helpers. +type pvcItem struct { + Name string + Annotations map[string]string + StorageSize string +} + +// listBrokerCachePVCs returns PVCs for the given broker that have the +// tiered-storage-cache mount path annotation. +func listBrokerCachePVCs(kubectlOptions k8s.KubectlOptions, clusterName string, brokerID int) ([]pvcItem, error) { + selector := fmt.Sprintf("%s=%s,brokerId=%d", kafkaCRLabelKey, clusterName, brokerID) + + rawOutput, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + "get", "persistentvolumeclaims", + "-l", selector, + "--output", "json", + ) + if err != nil { + return nil, errors.WrapIf(err, "listing PVCs failed") + } + + var pvcList struct { + Items []struct { + Metadata struct { + Name string `json:"name"` + Annotations map[string]string `json:"annotations"` + } `json:"metadata"` + Spec struct { + Resources struct { + Requests struct { + Storage string `json:"storage"` + } `json:"requests"` + } `json:"resources"` + } `json:"spec"` + } `json:"items"` + } + if err := json.Unmarshal([]byte(rawOutput), &pvcList); err != nil { + return nil, errors.WrapIf(err, "parsing PVC list JSON failed") + } + + var result []pvcItem + for _, item := range pvcList.Items { + if item.Metadata.Annotations["mountPath"] == tsResizeCacheMountPath { + result = append(result, pvcItem{ + Name: item.Metadata.Name, + Annotations: item.Metadata.Annotations, + StorageSize: item.Spec.Resources.Requests.Storage, + }) + } + } + return result, nil +} + +// getBrokerPodUID returns the UID of the running (non-terminating) broker pod for the +// given broker ID, or an error if no such pod is found. +func getBrokerPodUID(kubectlOptions k8s.KubectlOptions, clusterName string, brokerID int) (string, error) { + rawOutput, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + "get", "pod", + "-l", fmt.Sprintf("%s=%s,brokerId=%d,app=kafka", kafkaCRLabelKey, clusterName, brokerID), + "--output", "json", + ) + if err != nil { + return "", errors.WrapIf(err, "listing broker pods failed") + } + + var podList struct { + Items []struct { + Metadata struct { + UID string `json:"uid"` + DeletionTimestamp *string `json:"deletionTimestamp"` + } `json:"metadata"` + } `json:"items"` + } + if err := json.Unmarshal([]byte(rawOutput), &podList); err != nil { + return "", errors.WrapIf(err, "parsing pod list JSON failed") + } + + for _, item := range podList.Items { + if item.Metadata.DeletionTimestamp == nil { + return item.Metadata.UID, nil + } + } + return "", fmt.Errorf("no running broker pod found for broker %d in cluster %s", brokerID, clusterName) +} + +// testTieredStorageCachePvcResize tests the full multi-phase delete-and-recreate +// flow for a tiered storage cache PVC shrink. It: +// +// 1. Installs a minimal KRaft cluster with a 2Gi cache PVC on broker 0. +// 2. Applies an updated manifest that shrinks the cache to 1Gi. +// 3. Waits for Phase 1 (staging): both the old PVC (pending-deletion) and +// the new PVC (replacement) exist simultaneously. +// 4. Waits for Phase 2 (pod cycle): the pod restarts, the old PVC is deleted, +// and the new pod starts referencing the replacement PVC. +// 5. Waits for Phase 3 (completion): the replacement annotation is stripped and +// the broker pod is running again. +// 6. Verifies the surviving PVC carries the new 1Gi size. +// 7. Cleans up the cluster. +func testTieredStorageCachePvcResize() bool { + return ginkgo.When("Testing tiered storage cache PVC shrink (delete-and-recreate flow)", ginkgo.Ordered, func() { + var kubectlOptions k8s.KubectlOptions + var err error + var broker0PodUID string // UID of the broker-0 pod before the resize, used to detect recycling + + ginkgo.It("Acquiring K8s config and context", func() { + kubectlOptions, err = kubectlOptionsForCurrentContext() + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + kubectlOptions.Namespace = koperatorLocalHelmDescriptor.Namespace + }) + + // Pre-cleanup: forcibly remove any leftover cluster from a previous interrupted run + // so the install step always starts from a clean slate. + // We cannot rely on requireDeleteKafkaCluster here because a stuck finalizer (left by a + // mid-reconcile interruption) prevents the cascade deletion from completing in time. + // Instead we strip the finalizer first, delete the CR, then explicitly delete every + // owned resource so nothing blocks the subsequent fresh install. + ginkgo.It("Pre-cleanup: removing any leftover kafka-ts-resize cluster", func() { + // 1. Remove finalizers so the CR can be deleted regardless of operator state. + _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + "patch", kafkaKind, tsResizeClusterName, + "--type=merge", `--patch={"metadata":{"finalizers":[]}}`, + ) + // 2. Delete the CR itself (ignore not-found). + _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + "delete", kafkaKind, tsResizeClusterName, "--ignore-not-found", + ) + // 3. Explicitly delete PVCs — they carry a pvc-protection finalizer that + // blocks cascade GC while pods are still bound. + _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + "delete", "persistentvolumeclaims", + "-l", fmt.Sprintf("%s=%s", kafkaCRLabelKey, tsResizeClusterName), + "--ignore-not-found", + ) + // 4. Delete pods so they release PVC mounts promptly. + _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + "delete", "pods", + "-l", fmt.Sprintf("%s=%s", kafkaCRLabelKey, tsResizeClusterName), + "--ignore-not-found", "--grace-period=0", + ) + // 5. Wait for the CR itself to be gone (GC will handle the rest). + gomega.Eventually(context.Background(), func() error { + out, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + "get", kafkaKind, tsResizeClusterName, + ) + if err != nil || strings.Contains(out, "NotFound") || strings.Contains(out, "not found") { + return nil + } + return errors.New("KafkaCluster CR still exists") + }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) + }) + + ginkgo.It("Installing KafkaCluster with tiered storage cache PVC (2Gi)", func() { + ginkgo.By("Applying initial KafkaCluster manifest") + applyK8sResourceManifest(kubectlOptions, tsResizeInitialManifest) + + ginkgo.By("Waiting for all broker pods to be ready") + // kubectl wait --for=condition=Ready fails immediately when no pods exist yet. + // We don't wait for ClusterRunning — the cluster may stay in ClusterReconciling + // because GracefulDiskRebalanceRequired for log volumes needs CC, which this minimal + // test cluster does not deploy. Pod readiness is sufficient to proceed with the resize. + gomega.Eventually(context.Background(), func() error { + output, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + "get", "pod", + "-l", fmt.Sprintf("%s=%s,app=kafka,isBrokerNode=true", kafkaCRLabelKey, tsResizeClusterName), + "--output", "json", + ) + if err != nil { + return errors.WrapIf(err, "listing pods failed") + } + var podList struct { + Items []struct { + Status struct { + Conditions []struct { + Type string `json:"type"` + Status string `json:"status"` + } `json:"conditions"` + } `json:"status"` + } `json:"items"` + } + if err := json.Unmarshal([]byte(output), &podList); err != nil { + return errors.WrapIf(err, "parsing pod list JSON failed") + } + if len(podList.Items) < 1 { + return fmt.Errorf("expected at least 1 broker pod, got %d", len(podList.Items)) + } + for _, pod := range podList.Items { + ready := false + for _, cond := range pod.Status.Conditions { + if cond.Type == "Ready" && cond.Status == "True" { + ready = true + break + } + } + if !ready { + return errors.New("not all pods are ready yet") + } + } + return nil + }, kafkaClusterCreateTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) + + ginkgo.By("Capturing broker-0 pod UID before resize") + broker0PodUID, err = getBrokerPodUID(kubectlOptions, tsResizeClusterName, 0) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(broker0PodUID).NotTo(gomega.BeEmpty()) + + ginkgo.By("Verifying initial cache PVC size is " + tsResizeInitialSize) + pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(pvcs).To(gomega.HaveLen(1), "expected exactly one cache PVC for broker 0") + gomega.Expect(pvcs[0].StorageSize).To(gomega.Equal(tsResizeInitialSize)) + }) + + ginkgo.It("Triggering cache PVC shrink from 2Gi to 1Gi", func() { + ginkgo.By("Applying shrunk KafkaCluster manifest") + applyK8sResourceManifest(kubectlOptions, tsResizeShrunkManifest) + }) + + ginkgo.It("Phase 1: old PVC annotated pending-deletion and replacement PVC created", func() { + ginkgo.By("Waiting until both pending-deletion and replacement PVCs coexist for broker 0") + gomega.Eventually(context.Background(), func() error { + pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + if err != nil { + return err + } + var hasPendingDeletion, hasReplacement bool + for _, pvc := range pvcs { + switch pvc.Annotations[pvcResizeStateAnnotation] { + case pvcResizeStatePending: + hasPendingDeletion = true + case pvcResizeStateReplacement: + hasReplacement = true + } + } + if !hasPendingDeletion { + return errors.New("no PVC with pending-deletion annotation yet") + } + if !hasReplacement { + return errors.New("no PVC with replacement annotation yet") + } + return nil + }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) + }) + + ginkgo.It("Phase 2: broker pod restarts, pending-deletion PVC is deleted", func() { + ginkgo.By("Waiting for broker-0 pod to be recycled (UID change indicates rolling restart)") + // We detect recycling by UID change rather than waiting for the pod count to hit zero. + // The pod may restart fast enough to be back before the next polling tick, causing a + // "pod count == 0" check to time out even when the restart already completed. + gomega.Eventually(context.Background(), func() error { + uid, err := getBrokerPodUID(kubectlOptions, tsResizeClusterName, 0) + if err != nil { + // Pod may be absent mid-restart; treat as not-yet-recycled. + return err + } + if uid == broker0PodUID { + return errors.New("broker-0 pod has not been recycled yet (same UID)") + } + return nil + }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) + + ginkgo.By("Waiting for the pending-deletion PVC to be deleted") + gomega.Eventually(context.Background(), func() error { + pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + if err != nil { + return err + } + for _, pvc := range pvcs { + if pvc.Annotations[pvcResizeStateAnnotation] == pvcResizeStatePending { + return fmt.Errorf("pending-deletion PVC %s still exists", pvc.Name) + } + } + return nil + }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) + }) + + ginkgo.It("Phase 3: broker pod running again with new PVC, replacement annotation stripped", func() { + ginkgo.By("Waiting for the broker-0 pod to come back Ready") + err = waitK8sResourceCondition(kubectlOptions, "pod", "condition=Ready", + tsResizePhaseTimeout, + fmt.Sprintf("%s=%s,brokerId=0,app=kafka", kafkaCRLabelKey, tsResizeClusterName), "") + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for replacement annotation to be stripped from the surviving PVC") + gomega.Eventually(context.Background(), func() error { + pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + if err != nil { + return err + } + if len(pvcs) != 1 { + return fmt.Errorf("expected 1 cache PVC for broker 0, got %d", len(pvcs)) + } + if state := pvcs[0].Annotations[pvcResizeStateAnnotation]; state != "" { + return fmt.Errorf("cache-resize-state annotation still present: %q", state) + } + return nil + }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) + }) + + ginkgo.It("Verifying the surviving cache PVC has the new size "+tsResizeShrunkSize, func() { + pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(pvcs).To(gomega.HaveLen(1)) + gomega.Expect(pvcs[0].StorageSize).To(gomega.Equal(tsResizeShrunkSize), + "surviving PVC should carry the shrunk size") + gomega.Expect(strings.Contains(pvcs[0].Name, tsResizeClusterName)).To(gomega.BeTrue(), + "surviving PVC should belong to the test cluster") + }) + + // requireDeleteKafkaCluster receives a copy of kubectlOptions at registration + // time, so Namespace must be set here (not inside the It block above). + kubectlOptions.Namespace = koperatorLocalHelmDescriptor.Namespace + // requireDeleteKafkaCluster registers its own It block — must be called at + // the When scope, not nested inside an It. + requireDeleteKafkaCluster(kubectlOptions, tsResizeClusterName) + }) +} From 73d7b8b8e46017bcc8ed97bf83e6911a4838ee51 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Wed, 1 Apr 2026 15:01:34 +0300 Subject: [PATCH 03/19] fixed test --- .../samples/kafkacluster_tiered_storage.yaml | 147 ------------------ ...r_tiered_storage_cache_resize_initial.yaml | 25 +-- ...er_tiered_storage_cache_resize_shrunk.yaml | 25 +-- tests/e2e/koperator_suite_test.go | 1 + 4 files changed, 3 insertions(+), 195 deletions(-) delete mode 100644 config/samples/kafkacluster_tiered_storage.yaml diff --git a/config/samples/kafkacluster_tiered_storage.yaml b/config/samples/kafkacluster_tiered_storage.yaml deleted file mode 100644 index e79c5b72b..000000000 --- a/config/samples/kafkacluster_tiered_storage.yaml +++ /dev/null @@ -1,147 +0,0 @@ -apiVersion: kafka.banzaicloud.io/v1beta1 -kind: KafkaCluster -metadata: - labels: - controller-tools.k8s.io: "1.0" - name: kafka-tiered-storage -spec: - kRaft: true - headlessServiceEnabled: true - oneBrokerPerNode: false - clusterImage: "ghcr.io/adobe/koperator/kafka:2.13-3.9.1" - - # Cluster-wide Kafka configuration including tiered storage settings - readOnlyConfig: | - auto.create.topics.enable=false - cruise.control.metrics.topic.auto.create=true - cruise.control.metrics.topic.num.partitions=1 - cruise.control.metrics.topic.replication.factor=2 - - # Tiered Storage Configuration - # Enable remote storage - remote.log.storage.system.enable=true - - # Remote storage manager class implementation - remote.log.storage.manager.class.name=org.apache.kafka.server.log.remote.storage.RemoteLogStorageManager - - # Remote log metadata manager class implementation - remote.log.metadata.manager.class.name=org.apache.kafka.server.log.remote.metadata.storage.TopicBasedRemoteLogMetadataManager - - # Remote storage manager configuration prefix - rsm.config.remote.storage.manager.impl.class=org.apache.kafka.server.log.remote.storage.RemoteLogStorageManager - - # Tiered storage cache configuration - this path matches the tieredStorageCache volume mount - rsm.config.fetch.chunk.cache.path=/tiered-storage-cache - - brokers: - - id: 0 - brokerConfig: - # Process roles for KRaft mode - processRoles: - - broker - - # Storage configurations - storageConfigs: - # Primary Kafka log storage - included in Cruise Control capacity - - mountPath: "/kafka-logs" - pvcSpec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 100Gi - - # Tiered storage cache - excluded from Cruise Control capacity and log.dirs - - mountPath: "/tiered-storage-cache" - tieredStorageCache: true - pvcSpec: - accessModes: - - ReadWriteOnce - # Can use a different storage class optimized for cache (e.g., faster SSD) - # storageClassName: fast-ssd - resources: - requests: - storage: 50Gi - - resourceRequirements: - limits: - cpu: "2" - memory: 4Gi - requests: - cpu: "1" - memory: 2Gi - - - id: 1 - brokerConfig: - processRoles: - - broker - - storageConfigs: - - mountPath: "/kafka-logs" - pvcSpec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 100Gi - - - mountPath: "/tiered-storage-cache" - tieredStorageCache: true - pvcSpec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi - - resourceRequirements: - limits: - cpu: "2" - memory: 4Gi - requests: - cpu: "1" - memory: 2Gi - - - id: 2 - brokerConfig: - processRoles: - - broker - - storageConfigs: - - mountPath: "/kafka-logs" - pvcSpec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 100Gi - - - mountPath: "/tiered-storage-cache" - tieredStorageCache: true - pvcSpec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi - - resourceRequirements: - limits: - cpu: "2" - memory: 4Gi - requests: - cpu: "1" - memory: 2Gi - - listenersConfig: - internalListeners: - - type: "plaintext" - name: "internal" - containerPort: 29092 - usedForInnerBrokerCommunication: true - - type: "plaintext" - name: "controller" - containerPort: 29093 - usedForInnerBrokerCommunication: false - usedForControllerCommunication: true - diff --git a/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml b/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml index d6cae7a43..b4df5bbd3 100644 --- a/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml +++ b/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml @@ -12,10 +12,7 @@ spec: auto.create.topics.enable=false rollingUpgradeConfig: failureThreshold: 1 - cruiseControlConfig: - topicConfig: - partitions: 1 - replicationFactor: 2 + cruiseControlConfig: {} brokers: - id: 0 brokerConfig: @@ -65,26 +62,6 @@ spec: requests: cpu: "100m" memory: 512Mi - - id: 2 - brokerConfig: - processRoles: - - broker - terminationGracePeriodSeconds: 10 - storageConfigs: - - mountPath: "/kafka-logs" - pvcSpec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 2Gi - resourceRequirements: - limits: - cpu: "500m" - memory: 1Gi - requests: - cpu: "100m" - memory: 512Mi listenersConfig: internalListeners: - type: "plaintext" diff --git a/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml b/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml index 88a7fc27d..5d158905a 100644 --- a/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml +++ b/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml @@ -12,10 +12,7 @@ spec: auto.create.topics.enable=false rollingUpgradeConfig: failureThreshold: 1 - cruiseControlConfig: - topicConfig: - partitions: 1 - replicationFactor: 2 + cruiseControlConfig: {} brokers: - id: 0 brokerConfig: @@ -65,26 +62,6 @@ spec: requests: cpu: "100m" memory: 512Mi - - id: 2 - brokerConfig: - processRoles: - - broker - terminationGracePeriodSeconds: 10 - storageConfigs: - - mountPath: "/kafka-logs" - pvcSpec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 2Gi - resourceRequirements: - limits: - cpu: "500m" - memory: 1Gi - requests: - cpu: "100m" - memory: 512Mi listenersConfig: internalListeners: - type: "plaintext" diff --git a/tests/e2e/koperator_suite_test.go b/tests/e2e/koperator_suite_test.go index 3e2c35c3a..64c461e9c 100644 --- a/tests/e2e/koperator_suite_test.go +++ b/tests/e2e/koperator_suite_test.go @@ -78,6 +78,7 @@ var _ = ginkgo.When("Testing e2e test altogether", ginkgo.Ordered, func() { testProduceConsumeInternal() testJmxExporter() testUninstallKafkaCluster() + testTieredStorageCachePvcResize() testUninstall() snapshotClusterAndCompare(snapshottedInfo) }) From 83967634f9b1af3b749fd7c82e95e6961406f27c Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Wed, 1 Apr 2026 16:22:00 +0300 Subject: [PATCH 04/19] fixed docs --- .../kafka.banzaicloud.io_kafkaclusters.yaml | 2 +- docs/tiered-storage-pvc-resize.md | 301 +++--------------- 2 files changed, 45 insertions(+), 258 deletions(-) diff --git a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml index d99ceea45..e527f7a1e 100644 --- a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml +++ b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml @@ -5100,7 +5100,7 @@ spec: the PersistentVolume backing this claim. type: string type: object - tieredStorageCache: + tieredStorageCache: description: |- TieredStorageCache indicates this storage is used for Kafka tiered storage cache (e.g., for rsm.config.fetch.chunk.cache.path). When set to true, this storage will be diff --git a/docs/tiered-storage-pvc-resize.md b/docs/tiered-storage-pvc-resize.md index 7336360f4..820f1e9a8 100644 --- a/docs/tiered-storage-pvc-resize.md +++ b/docs/tiered-storage-pvc-resize.md @@ -1,230 +1,55 @@ -# Tiered Storage Cache PVC Resize: Reconciliation Flow +# Tiered Storage Cache PVC Resize -This document explains how koperator handles resizing (shrinking) a tiered storage -cache PVC, why the flow is more involved than a regular PVC update, and the -design decisions behind the current implementation. +Kubernetes does not support shrinking a PVC in-place. Because tiered storage cache +data is ephemeral (repopulated from remote storage on broker restart), koperator +implements a **delete-and-recreate** strategy for cache PVC shrinks, coordinated +with the rolling upgrade machinery so only one broker is affected at a time. --- -## Background: why cache PVCs are different +## Annotations -Kubernetes does not allow shrinking a PVC — the only supported in-place change is -growing it. For regular Kafka log volumes koperator enforces the same constraint -(`isDesiredStorageValueInvalid` returns an error on any size decrease). - -Tiered storage cache volumes are different: the data they hold is ephemeral. When a -broker restarts it will repopulate the cache from remote storage. Losing the cache -does not cause data loss; it only causes a temporary increase in read latency. -Because of this koperator implements a **delete-and-recreate** strategy for cache -PVC shrinks — but it must do so safely, coordinating with the rolling upgrade -machinery so that only one broker is down at a time and cluster health gates are -respected. - ---- - -## Components involved - -| Component | File | Role | -|-----------|------|------| -| `reconcileKafkaPvc` | `pkg/resources/kafka/kafka.go` | Creates / updates / deletes PVCs | -| `reconcileKafkaPod` | `pkg/resources/kafka/kafka.go` | Creates pods or hands off to rolling upgrade | -| `handleRollingUpgrade` | `pkg/resources/kafka/kafka.go` | Deletes a pod after checking health gates | -| `getCreatedPvcForBroker` | `pkg/resources/kafka/kafka.go` | Looks up which PVCs exist for a broker and feeds them into the pod spec | -| `generateDataVolumeAndVolumeMount` | `pkg/resources/kafka/pod.go` | Translates PVC list → pod Volume + VolumeMount entries using `ClaimName` | - ---- - -## PVC annotations used during resize - -Two annotations are written directly on PVC objects to carry state through the -resize. Because annotations live on the Kubernetes object they survive reconciler -restarts, making every step re-entrant. +Two annotations are written on PVC objects to carry state across reconcile cycles. +They survive reconciler restarts, making every step re-entrant. | Annotation | Value | Written on | Meaning | |------------|-------|------------|---------| -| `koperator.adobe.com/cache-resize-state` | `pending-deletion` | Old PVC | This PVC is being replaced; skip it in pod spec generation and delete it once the broker pod stops | -| `koperator.adobe.com/cache-resize-state` | `replacement` | New PVC | This is the replacement PVC; defer to the rolling upgrade before treating it as normal | +| `koperator.adobe.com/cache-resize-state` | `pending-deletion` | Old PVC | Being replaced; excluded from pod spec; deleted once broker pod stops | +| `koperator.adobe.com/cache-resize-state` | `replacement` | New PVC | Replacement PVC; rolling upgrade must complete before annotations are stripped | | `koperator.adobe.com/replaces-pvc` | `` | New PVC | Traceability — records which PVC is being replaced | --- -## Reconciliation loop order - -Within a single `Reconcile()` call the kafka sub-reconciler runs in this order: - -``` -1. Services -2. PodDisruptionBudgets -3. reconcileKafkaPodDelete ← graceful downscale via Cruise Control -4. Listener statuses / PKI -5. reconcileKafkaPvc ← ALL PVC work happens here, before any pod work -6. Discover running pods + bound PVCs -7. reorderBrokers ← priority sort -8. FOR each broker: - a. ConfigMap - b. per-broker Service - c. reconcileKafkaPod ← pod create OR handleRollingUpgrade - d. reconcilePerBrokerDynamicConfig -9. reconcileClusterWideDynamicConfig -``` - -PVC reconciliation always runs before pod reconciliation. The new PVC is therefore -created (and starts provisioning) before `handleRollingUpgrade` is evaluated for -the same broker in the same cycle. - ---- - -## Full flow: shrinking a tiered storage cache PVC - -### Cycle N — resize detected (pod running) - -**`reconcileKafkaPvc` — per-broker setup** - -``` -r.List(brokerPodList) -brokerPodExists = true - -No pending-deletion PVCs exist yet → no cleanup -No replacement PVCs exist yet → no resize-complete strip -effectivePvcCount == len(desiredPvcs) → no CC disk removal triggered -``` - -**`reconcileKafkaPvc` — per-desired-PVC loop** +## Resize flow -``` -CheckIfObjectUpdated: currentSize > desiredSize → enters resize branch - - 1. Annotate old PVC: - koperator.adobe.com/cache-resize-state: pending-deletion - (r.Update — durable immediately in etcd) +### Cycle N — resize detected, pod running - 2. Create replacement PVC with desiredSize: - koperator.adobe.com/cache-resize-state: replacement - koperator.adobe.com/replaces-pvc: - (r.Create — provisioning starts now, in parallel with gate evaluation) +1. The old PVC is annotated `pending-deletion`. +2. A replacement PVC with the new (smaller) size is created and annotated `replacement`. Provisioning starts immediately. +3. The broker's `ConfigurationState` is set to `ConfigOutOfSync` to trigger a rolling restart via `handleRollingUpgrade`. +4. `handleRollingUpgrade` evaluates health gates (replica health, concurrent restart limit, rack awareness). If all pass the broker pod is deleted and the cycle requeues. If any gate fails the state is preserved in PVC annotations and retried next cycle. - 3. Set broker ConfigurationState = ConfigOutOfSync - (triggers handleRollingUpgrade on every cycle until pod restarts) +### Cycle N+1 — pod is absent - continue -``` +A pod is considered absent when it either does not exist or has a non-nil `DeletionTimestamp` (Terminating). Treating a Terminating pod as absent allows cleanup to start during the pod's Terminating window rather than waiting for it to fully disappear from etcd. -**`reconcileKafkaPod` — same cycle** +1. The pending-deletion PVC is deleted. +2. A new broker pod is created referencing the replacement PVC. Because provisioning started in cycle N the PVC is likely already `Bound`, minimising startup latency. -`handleRollingUpgrade` sees `ConfigOutOfSync`, evaluates gates: +### Cycle N+2 — pod is present again -| Gate | Required to pass | -|------|-----------------| -| Pod count | All expected pods exist | -| Concurrent restart limit | Terminating/Pending pods < `ConcurrentBrokerRestartCountPerRack` | -| Rack awareness | Only pods from same AZ as restarting pods | -| Replica health | `offline + out-of-sync < FailureThreshold` | +The strip fires as soon as a non-Terminating pod exists for the broker and no pending-deletion PVC remains — the pod does not need to be fully Running. -If all pass → **broker pod is deleted** → requeue 15 s. -If any fail → requeue 15 s, try again next cycle (state is fully durable via PVC annotations). +1. No pending-deletion PVC remains and the replacement PVC exists → resize is complete. +2. The `cache-resize-state` and `replaces-pvc` annotations are stripped from the replacement PVC, which becomes an ordinary PVC from this point forward. --- -### Between cycles (rolling upgrade gates blocking) +## Grow vs shrink -Each reconcile cycle sees: +A cache PVC **grow** takes the normal Kubernetes in-place expansion path: the PVC spec is updated with the larger size and Kubernetes expands the volume without a pod restart (requires `allowVolumeExpansion: true` on the StorageClass). No annotations are written and no rolling restart is triggered. -``` -reconcileKafkaPvc: - brokerPodExists = true - No pending-deletion PVCs → no cleanup - Replacement PVC exists, no pending-deletion → resize-complete strip - … but pod is still up → strip does NOT fire (pod must be up AND no pending-deletion) - alreadyCreated loop: skips pending-deletion PVC, finds replacement PVC - replacement PVC guard: ensure ConfigOutOfSync, continue - -reconcileKafkaPod: - handleRollingUpgrade re-evaluates gates → requeue if blocked -``` - -State is preserved entirely in PVC annotations — no in-memory or status-field -bookkeeping required. Reconciler can crash and restart at any point. - ---- - -### Cycle N+1 — pod is gone - -**`reconcileKafkaPvc` — per-broker setup** - -``` -r.List(brokerPodList) -brokerPodExists = false - -Cleanup loop: finds PVC with pending-deletion annotation - → r.Delete(oldPvc) ← safe now, broker is not running - → r.List(pvcList) ← re-list; only replacement PVC remains -``` - -**`getCreatedPvcForBroker` (called later to build pod spec)** - -Filters out any PVC with `cache-resize-state: pending-deletion` before matching -mount paths. Returns the replacement PVC for the mount path. The pod spec is built -with `ClaimName` pointing to the new PVC. - -**`reconcileKafkaPod`** - -No pod exists → creates new pod referencing the replacement PVC. Kubernetes holds -the pod in `Pending` until the replacement PVC reaches `Bound` phase. - -Because provisioning started in cycle N (when the replacement PVC was created), -the PVC is likely already `Bound` by now, minimising pod startup latency. - ---- - -### Cycle N+2 — pod is running again - -**`reconcileKafkaPvc` — resize-complete strip** - -``` -brokerPodExists = true -No pending-deletion PVCs (deleted in N+1) -Replacement PVC exists → resize complete → strip annotations: - delete koperator.adobe.com/cache-resize-state - delete koperator.adobe.com/replaces-pvc - r.Update(replacementPvc) -``` - -The PVC is now an ordinary PVC. Subsequent reconciles treat it normally. - ---- - -## Sequence diagram - -``` -Cycle N (pod UP, resize detected) - reconcileKafkaPvc - ├─ r.Update(oldPvc) annotate pending-deletion - ├─ r.Create(newPvc) replacement PVC, provisioning starts - └─ ConfigOutOfSync set - reconcileKafkaPod - └─ handleRollingUpgrade - ├─ [gates fail] → requeue 15s - └─ [gates pass] → delete pod → requeue 15s - -Cycle N+k (pod UP, gates failing — any number of cycles) - reconcileKafkaPvc - └─ replacement PVC guard: ensure ConfigOutOfSync, continue - reconcileKafkaPod - └─ handleRollingUpgrade → requeue - -Cycle N+k+1 (pod GONE) - reconcileKafkaPvc - ├─ delete pending-deletion PVC - └─ re-list: only replacement PVC remains - reconcileKafkaPod - └─ no pod → create pod (ClaimName = replacement PVC) - └─ pod Pending until PVC bound (likely already bound) - -Cycle N+k+2 (pod RUNNING) - reconcileKafkaPvc - └─ no pending-deletion PVC + replacement PVC exists → strip annotations - └─ replacement PVC becomes ordinary PVC -``` +A cache PVC **shrink** uses the delete-and-recreate flow described above. Shrinking is only supported for tiered storage cache volumes — regular Kafka log volumes reject any size decrease. --- @@ -232,8 +57,7 @@ Cycle N+k+2 (pod RUNNING) | Property | Value | |----------|-------| -| State survives reconciler crash | Yes — PVC annotations are durable in etcd | -| Idempotent re-entry | Yes — each phase checks annotations and resumes correctly | +| State survives reconciler crash | Mostly — PVC annotations are durable in etcd; the one non-re-entrant window is between annotating the old PVC and creating the replacement, but `ConfigOutOfSync` set in that cycle persists in broker status so the rolling upgrade still proceeds | | Atomicity gap | Eliminated — new PVC is created before old is deleted | | Provisioning overlaps gate evaluation | Yes — new PVC created in cycle N, not N+1 | | Observable via kubectl | Yes — `kubectl get pvc -o yaml` shows resize state directly | @@ -242,61 +66,24 @@ Cycle N+k+2 (pod RUNNING) --- -## Interaction with disk removal detection - -`reconcileKafkaPvc` detects regular disk removal by comparing PVC counts: +## Sequence diagram -```go -if effectivePvcCount > len(desiredPvcs) { handleDiskRemoval(...) } ``` +Cycle N (pod UP, resize detected) + ├─ annotate old PVC: pending-deletion + ├─ create replacement PVC (provisioning starts) + ├─ set ConfigOutOfSync + └─ handleRollingUpgrade + ├─ [gates fail] → requeue 15s, retry next cycle + └─ [gates pass] → delete pod → requeue 15s -The count uses `effectivePvcCount` which **excludes replacement PVCs**. During a -resize the old (pending-deletion) + new (replacement) PVCs temporarily co-exist for -the same mount path. Without this exclusion the count check would incorrectly -trigger a Cruise Control disk-removal operation. - -Inside `handleDiskRemoval` itself, the `pending-deletion` PVC is also safe: its -mount path still matches the desired spec, so `foundInDesired = true` and it is -skipped by the disk-removal state machine. - ---- - -## Interaction with GracefulDiskRebalanceRequired - -When a new PVC first becomes `Bound`, koperator normally sets -`GracefulDiskRebalanceRequired` to ask Cruise Control to rebalance data across -the broker's disks. This is correct for Kafka log volumes but wrong for tiered -storage cache volumes — CC must not account for ephemeral cache storage. - -The `alreadyCreated` loop now explicitly skips `GracefulDiskRebalanceRequired` for -any PVC annotated `tieredStorageCache: true`, including the replacement PVC. - ---- - -## Known limitations - -### ConfigOutOfSync still shared with config changes - -`ConfigOutOfSync` is set to trigger the rolling upgrade. It is the same bit used -for Kafka property changes. An observer cannot distinguish "resize pending" from -"config change" by status alone — the PVC annotations must be inspected. - -### Concurrent resize + complete storage-config removal - -If the tiered storage cache storage config is removed entirely from the spec while -a resize is in progress, both the `pending-deletion` and `replacement` PVCs have a -mount path that is no longer present in the desired spec. Without special handling -this would route them into the Cruise Control `remove_disks` path — which fails with -"log dir not found" because CC has no knowledge of ephemeral cache paths — leaving -the operator stuck in a 20-second requeue loop. - -koperator handles this as follows: +Cycle N+k (pod UP, gates failing — any number of cycles) + └─ ensure ConfigOutOfSync, requeue -**Pod DOWN**: both the `pending-deletion` and the orphaned `replacement` PVC are -deleted directly in `reconcileKafkaPvc` (bypassing Cruise Control). The operator -proceeds normally on the next cycle. +Cycle N+k+1 (pod ABSENT — gone or Terminating) + ├─ delete pending-deletion PVC + └─ create new pod bound to replacement PVC -**Pod UP**: koperator sets `ConfigOutOfSync` to trigger a rolling restart via -`handleRollingUpgrade`, then skips `handleDiskRemoval` for this broker entirely -(`continue` in the broker loop). Once the pod stops, the next cycle falls into the -Pod DOWN path above and cleans up both PVCs. +Cycle N+k+2 (pod PRESENT — non-Terminating, not necessarily Running) + └─ strip annotations → replacement PVC becomes ordinary PVC +``` From 874788e7768952e58181c9bbe6cc77e664765627 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Thu, 2 Apr 2026 11:43:32 +0300 Subject: [PATCH 05/19] lint errors --- pkg/resources/kafka/configmap.go | 4 +- pkg/resources/kafka/kafka.go | 558 ++++++++++-------- pkg/resources/kafka/kafka_test.go | 19 +- pkg/resources/kafka/pvc.go | 2 +- tests/e2e/test_tiered_storage_cache_resize.go | 22 +- 5 files changed, 335 insertions(+), 270 deletions(-) diff --git a/pkg/resources/kafka/configmap.go b/pkg/resources/kafka/configmap.go index 6e1874a61..02ec0a1b0 100644 --- a/pkg/resources/kafka/configmap.go +++ b/pkg/resources/kafka/configmap.go @@ -245,14 +245,14 @@ func configureBrokerKRaftMode(bConfig *v1beta1.BrokerConfig, brokerID int32, kaf // this is to support the zk to kRaft migration func shouldUseKRaftModeForBroker(brokerReadOnlyConfig *properties.Properties) bool { migrationBrokerKRaftMode, found := brokerReadOnlyConfig.Get(kafkautils.MigrationBrokerKRaftMode) - return !found || migrationBrokerKRaftMode.Value() == "true" + return !found || migrationBrokerKRaftMode.Value() == annotationTrue } // Returns true by default (not in migration) OR when MigrationBrokerControllerQuorumConfigEnabled is set and 'true'. // this is to support the zk to kRaft migration func shouldConfigureControllerQuorumForBroker(brokerReadOnlyConfig *properties.Properties) bool { migrationBrokerControllerQuorumConfigEnabled, found := brokerReadOnlyConfig.Get(kafkautils.MigrationBrokerControllerQuorumConfigEnabled) - return !found || migrationBrokerControllerQuorumConfigEnabled.Value() == "true" + return !found || migrationBrokerControllerQuorumConfigEnabled.Value() == annotationTrue } func configureBrokerZKMode(brokerID int32, kafkaCluster *v1beta1.KafkaCluster, config *properties.Properties, extListenerStatuses, intListenerStatuses, diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index 261fcc117..3cdf99aca 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -76,6 +76,9 @@ const ( // pvcCacheResizeReplacesPVC records the name of the old PVC that the replacement supersedes. pvcCacheResizeReplacesPVC = "koperator.adobe.com/replaces-pvc" + // annotationTrue is the string value used for boolean-true annotations and config comparisons. + annotationTrue = "true" + brokerConfigMapVolumeMount = "broker-config" kafkaDataVolumeMount = "kafka-data" @@ -1162,7 +1165,6 @@ func (r *Reconciler) isPodTainted(log logr.Logger, pod *corev1.Pod) bool { return selector.Matches(labels.Set(pod.Labels)) } -//nolint:funlen func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, brokersDesiredPvcs map[string][]*corev1.PersistentVolumeClaim) error { brokersVolumesState := make(map[string]map[string]banzaiv1beta1.VolumeState) var brokerIds []string @@ -1171,21 +1173,16 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro for brokerId, desiredPvcs := range brokersDesiredPvcs { desiredType := reflect.TypeOf(&corev1.PersistentVolumeClaim{}) brokerVolumesState := make(map[string]banzaiv1beta1.VolumeState) - pvcList := &corev1.PersistentVolumeClaimList{} - matchingLabels := client.MatchingLabels( apiutil.MergeLabels( apiutil.LabelsForKafka(r.KafkaCluster.Name), map[string]string{banzaiv1beta1.BrokerIdLabelKey: brokerId}, ), ) - log = log.WithValues("kind", desiredType) - err := r.List(ctx, pvcList, - client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels) - if err != nil { + if err := r.List(ctx, pvcList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { return errorfactory.New(errorfactory.APIFailure{}, err, "getting resource failed", "kind", desiredType) } @@ -1193,7 +1190,6 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro if err != nil { return errors.WrapIfWithDetails(err, "could not determine if broker is controller", "brokerId", brokerId) } - if isController { if len(desiredPvcs) != 1 { return errors.New("controller broker can have only one volume") @@ -1211,113 +1207,22 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro // Treating a Terminating pod as "not existing" lets the pending-deletion PVC be cleaned up // during the Terminating window rather than waiting for the narrow gap between the pod being // fully removed from etcd and the new pod being created in the same reconcile cycle. - brokerPodList := &corev1.PodList{} - brokerPodMatchingLabels := client.MatchingLabels( - apiutil.MergeLabels( - apiutil.LabelsForKafka(r.KafkaCluster.Name), - map[string]string{banzaiv1beta1.BrokerIdLabelKey: brokerId}, - ), - ) - if err := r.List(ctx, brokerPodList, client.InNamespace(r.KafkaCluster.GetNamespace()), brokerPodMatchingLabels); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, "getting broker pods failed", "brokerId", brokerId) - } - brokerPodExists := false - for i := range brokerPodList.Items { - if brokerPodList.Items[i].DeletionTimestamp == nil { - brokerPodExists = true - break - } + brokerPodExists, err := r.getBrokerPodExists(ctx, brokerId) + if err != nil { + return err } - // Build a set of desired mount paths for orphaned-PVC detection below. desiredMountPaths := make(map[string]bool, len(desiredPvcs)) for _, dp := range desiredPvcs { desiredMountPaths[dp.Annotations["mountPath"]] = true } - if brokerPodExists { - // Resize complete when the replacement PVC exists but the pending-deletion PVC is already gone: - // strip the replacement annotation so the PVC is treated as normal going forward. - var hasPendingDeletion bool - for _, pvc := range pvcList.Items { - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { - hasPendingDeletion = true - break - } - } - if !hasPendingDeletion { - for _, pvc := range pvcList.Items { - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement { - pvcCopy := pvc.DeepCopy() - delete(pvcCopy.Annotations, pvcCacheResizeStateAnnotation) - delete(pvcCopy.Annotations, pvcCacheResizeReplacesPVC) - if err := r.Update(ctx, pvcCopy); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, - "removing replacement annotation from tiered storage cache PVC failed", "pvc", pvc.Name) - } - log.Info("Tiered storage cache PVC resize complete, removed replacement annotation", "pvc", pvc.Name) - } - } - } - } else { - // Broker pod is down — safe to delete any PVCs that were waiting for the pod to stop. - for i := range pvcList.Items { - pvc := &pvcList.Items[i] - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { - log.Info("Broker pod is down — deleting pending-deletion tiered storage cache PVC", - "brokerId", brokerId, "pvc", pvc.Name) - if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { - return errorfactory.New(errorfactory.APIFailure{}, err, - "deleting pending-deletion tiered storage cache PVC failed", "pvc", pvc.Name) - } - } - } - // Also delete replacement PVCs whose mount path is no longer desired — these are - // orphaned by a storage config removal that happened while a cache resize was in flight. - for i := range pvcList.Items { - pvc := &pvcList.Items[i] - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement && - !desiredMountPaths[pvc.Annotations["mountPath"]] { - log.Info("Broker pod is down — deleting orphaned replacement tiered storage cache PVC", - "brokerId", brokerId, "pvc", pvc.Name) - if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { - return errorfactory.New(errorfactory.APIFailure{}, err, - "deleting orphaned replacement tiered storage cache PVC failed", "pvc", pvc.Name) - } - } - } - // Re-list so the rest of this iteration sees the current state. - if err := r.List(ctx, pvcList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, "getting resource failed", "kind", desiredType) - } + skipBroker, err := r.handleBrokerCacheResizeCleanup(ctx, log, brokerId, pvcList, desiredMountPaths, brokerPodExists, matchingLabels) + if err != nil { + return err } - - // If the pod is still running and there are orphaned cache resize PVCs (storage config - // removed while a resize was in flight), trigger a rolling restart so the pod stops - // and we can delete the orphaned PVCs on the next cycle. Routing these through - // Cruise Control remove_disks would fail because cache paths are not Kafka log dirs. - if brokerPodExists { - var hasOrphanedCachePvc bool - for _, pvc := range pvcList.Items { - resizeState := pvc.Annotations[pvcCacheResizeStateAnnotation] - if (resizeState == pvcCacheResizePendingDeletion || resizeState == pvcCacheResizeReplacement) && - !desiredMountPaths[pvc.Annotations["mountPath"]] { - hasOrphanedCachePvc = true - break - } - } - if hasOrphanedCachePvc { - log.Info("Orphaned cache resize PVCs detected with broker pod running — triggering rolling restart for safe cleanup", - "brokerId", brokerId) - if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { - if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, - banzaiv1beta1.ConfigOutOfSync, log); err != nil { - return errorfactory.New(errorfactory.StatusUpdateError{}, err, - "could not mark broker ConfigOutOfSync for orphaned cache PVC cleanup", "brokerId", brokerId) - } - } - continue // Skip disk removal — wait for pod to stop before deleting cache PVCs. - } + if skipBroker { + continue } // Handle disk removal — count only non-replacement PVCs to avoid false positives while a @@ -1335,176 +1240,333 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro } } - for _, desiredPvc := range desiredPvcs { - currentPvc := desiredPvc.DeepCopy() - log.V(1).Info("searching with label because name is empty") + if err := r.reconcileDesiredPvcsForBroker(ctx, log, brokerId, desiredPvcs, pvcList, matchingLabels, desiredType, brokerVolumesState); err != nil { + return err + } - err := r.List(ctx, pvcList, - client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels) - if err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, "getting resource failed", "kind", desiredType) - } + if len(brokerVolumesState) > 0 { + brokerIds = append(brokerIds, brokerId) + brokersVolumesState[brokerId] = brokerVolumesState + } + } - mountPath := currentPvc.Annotations["mountPath"] - // Creating the first PersistentVolume For Pod - if len(pvcList.Items) == 0 { - if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { - return errors.WrapIf(err, "could not apply last state to annotation") - } - if err := r.Create(ctx, desiredPvc); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, "creating resource failed", "kind", desiredType) - } - log.Info("resource created") - continue - } + if len(brokersVolumesState) > 0 { + if err := k8sutil.UpdateBrokerStatus(r.Client, brokerIds, r.KafkaCluster, brokersVolumesState, log); err != nil { + return err + } + } + + if waitForDiskRemovalToFinish { + return errorfactory.New(errorfactory.CruiseControlTaskRunning{}, errors.New("Disk removal pending"), "Disk removal pending") + } + + return nil +} - alreadyCreated := false +// getBrokerPodExists returns true if a non-terminating pod exists for the given broker. +func (r *Reconciler) getBrokerPodExists(ctx context.Context, brokerId string) (bool, error) { + brokerPodList := &corev1.PodList{} + matchingLabels := client.MatchingLabels( + apiutil.MergeLabels( + apiutil.LabelsForKafka(r.KafkaCluster.Name), + map[string]string{banzaiv1beta1.BrokerIdLabelKey: brokerId}, + ), + ) + if err := r.List(ctx, brokerPodList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { + return false, errorfactory.New(errorfactory.APIFailure{}, err, "getting broker pods failed", "brokerId", brokerId) + } + for i := range brokerPodList.Items { + if brokerPodList.Items[i].DeletionTimestamp == nil { + return true, nil + } + } + return false, nil +} + +// handleBrokerCacheResizeCleanup manages tiered storage cache resize PVC cleanup based on broker pod state. +// It returns true when the caller should skip the rest of the reconcile loop for this broker (orphaned PVCs +// detected while the pod is running — a rolling restart must complete first). +// +//nolint:funlen +func (r *Reconciler) handleBrokerCacheResizeCleanup( + ctx context.Context, + log logr.Logger, + brokerId string, + pvcList *corev1.PersistentVolumeClaimList, + desiredMountPaths map[string]bool, + brokerPodExists bool, + matchingLabels client.MatchingLabels, +) (bool, error) { + desiredType := reflect.TypeOf(&corev1.PersistentVolumeClaim{}) + if brokerPodExists { + // Resize complete when the replacement PVC exists but the pending-deletion PVC is already gone: + // strip the replacement annotation so the PVC is treated as normal going forward. + var hasPendingDeletion bool + for _, pvc := range pvcList.Items { + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { + hasPendingDeletion = true + break + } + } + if !hasPendingDeletion { for _, pvc := range pvcList.Items { - if mountPath != pvc.Annotations["mountPath"] { - continue + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement { + pvcCopy := pvc.DeepCopy() + delete(pvcCopy.Annotations, pvcCacheResizeStateAnnotation) + delete(pvcCopy.Annotations, pvcCacheResizeReplacesPVC) + if err := r.Update(ctx, pvcCopy); err != nil { + return false, errorfactory.New(errorfactory.APIFailure{}, err, + "removing replacement annotation from tiered storage cache PVC failed", "pvc", pvc.Name) + } + log.Info("Tiered storage cache PVC resize complete, removed replacement annotation", "pvc", pvc.Name) } - // Skip the old PVC that is waiting to be deleted once the broker pod stops. - // The replacement PVC (same mount path) will be matched on the next iteration. - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { - continue + } + } + } else { + // Broker pod is down — safe to delete any PVCs that were waiting for the pod to stop. + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { + log.Info("Broker pod is down — deleting pending-deletion tiered storage cache PVC", + "brokerId", brokerId, "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return false, errorfactory.New(errorfactory.APIFailure{}, err, + "deleting pending-deletion tiered storage cache PVC failed", "pvc", pvc.Name) } - currentPvc = pvc.DeepCopy() - alreadyCreated = true - // Trigger a CC disk rebalance only for regular data volumes. - // Tiered storage cache PVCs are ephemeral — CC must not account for them. - isCachePvc := pvc.Annotations["tieredStorageCache"] == "true" - if !isCachePvc { - // Checking pvc state, if bounded, so the broker has already restarted and the CC GracefulDiskRebalance has not happened yet, - // then we make it happening with status update. - // If disk removal was set, and the disk was added back, we also need to mark the volume for rebalance - volumeState, found := r.KafkaCluster.Status.BrokersState[brokerId].GracefulActionState.VolumeStates[mountPath] - if currentPvc.Status.Phase == corev1.ClaimBound && - (!found || volumeState.CruiseControlVolumeState.IsDiskRemoval()) { - brokerVolumesState[mountPath] = banzaiv1beta1.VolumeState{CruiseControlVolumeState: banzaiv1beta1.GracefulDiskRebalanceRequired} - } + } + } + // Also delete replacement PVCs whose mount path is no longer desired — these are + // orphaned by a storage config removal that happened while a cache resize was in flight. + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement && + !desiredMountPaths[pvc.Annotations["mountPath"]] { + log.Info("Broker pod is down — deleting orphaned replacement tiered storage cache PVC", + "brokerId", brokerId, "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return false, errorfactory.New(errorfactory.APIFailure{}, err, + "deleting orphaned replacement tiered storage cache PVC failed", "pvc", pvc.Name) } - break } + } + // Re-list so the rest of this iteration sees the current state. + if err := r.List(ctx, pvcList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { + return false, errorfactory.New(errorfactory.APIFailure{}, err, "getting resource failed", "kind", desiredType) + } + } - if !alreadyCreated { - // Creating the 2+ PersistentVolumes for Pod - if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { - return errors.WrapIf(err, "could not apply last state to annotation") - } - if err := r.Create(ctx, desiredPvc); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, "creating resource failed", "kind", desiredType) + // If the pod is still running and there are orphaned cache resize PVCs (storage config + // removed while a resize was in flight), trigger a rolling restart so the pod stops + // and we can delete the orphaned PVCs on the next cycle. Routing these through + // Cruise Control remove_disks would fail because cache paths are not Kafka log dirs. + if brokerPodExists { + var hasOrphanedCachePvc bool + for _, pvc := range pvcList.Items { + resizeState := pvc.Annotations[pvcCacheResizeStateAnnotation] + if (resizeState == pvcCacheResizePendingDeletion || resizeState == pvcCacheResizeReplacement) && + !desiredMountPaths[pvc.Annotations["mountPath"]] { + hasOrphanedCachePvc = true + break + } + } + if hasOrphanedCachePvc { + log.Info("Orphaned cache resize PVCs detected with broker pod running — triggering rolling restart for safe cleanup", + "brokerId", brokerId) + if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + banzaiv1beta1.ConfigOutOfSync, log); err != nil { + return false, errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not mark broker ConfigOutOfSync for orphaned cache PVC cleanup", "brokerId", brokerId) } + } + return true, nil // Skip disk removal — wait for pod to stop before deleting cache PVCs. + } + } + + return false, nil +} + +// reconcileDesiredPvcsForBroker reconciles the desired PVCs for a single broker. +// +//nolint:funlen +func (r *Reconciler) reconcileDesiredPvcsForBroker( + ctx context.Context, + log logr.Logger, + brokerId string, + desiredPvcs []*corev1.PersistentVolumeClaim, + pvcList *corev1.PersistentVolumeClaimList, + matchingLabels client.MatchingLabels, + desiredType reflect.Type, + brokerVolumesState map[string]banzaiv1beta1.VolumeState, +) error { + for _, desiredPvc := range desiredPvcs { + currentPvc := desiredPvc.DeepCopy() + log.V(1).Info("searching with label because name is empty") + + if err := r.List(ctx, pvcList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, "getting resource failed", "kind", desiredType) + } + + mountPath := currentPvc.Annotations["mountPath"] + // Creating the first PersistentVolume For Pod + if len(pvcList.Items) == 0 { + if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { + return errors.WrapIf(err, "could not apply last state to annotation") + } + if err := r.Create(ctx, desiredPvc); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, "creating resource failed", "kind", desiredType) + } + log.Info("resource created") + continue + } + + alreadyCreated := false + for _, pvc := range pvcList.Items { + if mountPath != pvc.Annotations["mountPath"] { continue } - if err == nil { - // A replacement PVC is the new smaller PVC staged during a tiered storage cache resize. - // It already has the right size; we only need to keep the rolling upgrade running so - // the broker pod stops and the pending-deletion PVC can be cleaned up. - if currentPvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement { - if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { - if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, - banzaiv1beta1.ConfigOutOfSync, log); err != nil { - return errorfactory.New(errorfactory.StatusUpdateError{}, err, - "could not mark broker ConfigOutOfSync for pending tiered storage cache PVC resize", "brokerId", brokerId) - } - } - continue + // Skip the old PVC that is waiting to be deleted once the broker pod stops. + // The replacement PVC (same mount path) will be matched on the next iteration. + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { + continue + } + currentPvc = pvc.DeepCopy() + alreadyCreated = true + // Trigger a CC disk rebalance only for regular data volumes. + // Tiered storage cache PVCs are ephemeral — CC must not account for them. + if pvc.Annotations["tieredStorageCache"] != annotationTrue { + // Checking pvc state, if bounded, so the broker has already restarted and the CC GracefulDiskRebalance has not happened yet, + // then we make it happening with status update. + // If disk removal was set, and the disk was added back, we also need to mark the volume for rebalance + volumeState, found := r.KafkaCluster.Status.BrokersState[brokerId].GracefulActionState.VolumeStates[mountPath] + if currentPvc.Status.Phase == corev1.ClaimBound && + (!found || volumeState.CruiseControlVolumeState.IsDiskRemoval()) { + brokerVolumesState[mountPath] = banzaiv1beta1.VolumeState{CruiseControlVolumeState: banzaiv1beta1.GracefulDiskRebalanceRequired} } + } + break + } - if k8sutil.CheckIfObjectUpdated(log, desiredType, currentPvc, desiredPvc) { - if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { - return errors.WrapIf(err, "could not apply last state to annotation") - } + if !alreadyCreated { + // Creating the 2+ PersistentVolumes for Pod + if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { + return errors.WrapIf(err, "could not apply last state to annotation") + } + if err := r.Create(ctx, desiredPvc); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, "creating resource failed", "kind", desiredType) + } + continue + } - // Check if this is a tiered storage cache volume - isTieredCache := currentPvc.Annotations["tieredStorageCache"] == "true" - desiredSize := desiredPvc.Spec.Resources.Requests.Storage().Value() - currentSize := currentPvc.Spec.Resources.Requests.Storage().Value() - - // Tiered storage cache PVC shrink: stage the replacement PVC immediately so - // provisioning runs in parallel with rolling-upgrade gate evaluation. - // The old PVC is annotated pending-deletion and removed once the broker pod stops. - if isTieredCache && desiredSize < currentSize { - log.Info("Tiered storage cache size decrease detected — staging replacement PVC", - "brokerId", brokerId, - "mountPath", mountPath, - "currentSize", currentSize, - "desiredSize", desiredSize) - - // Annotate old PVC so it is skipped by getCreatedPvcForBroker and cleaned up - // the moment the broker pod stops. - oldPvcCopy := currentPvc.DeepCopy() - oldPvcCopy.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizePendingDeletion - if err := r.Update(ctx, oldPvcCopy); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, - "annotating old tiered storage cache PVC for deletion failed", "pvc", currentPvc.Name) - } - log.Info("Marked old tiered storage cache PVC for deletion", "brokerId", brokerId, "pvc", currentPvc.Name) + // A replacement PVC is the new smaller PVC staged during a tiered storage cache resize. + // It already has the right size; we only need to keep the rolling upgrade running so + // the broker pod stops and the pending-deletion PVC can be cleaned up. + if currentPvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement { + if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + banzaiv1beta1.ConfigOutOfSync, log); err != nil { + return errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not mark broker ConfigOutOfSync for pending tiered storage cache PVC resize", "brokerId", brokerId) + } + } + continue + } - // Create the replacement PVC immediately so provisioning starts now. - desiredPvc.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizeReplacement - desiredPvc.Annotations[pvcCacheResizeReplacesPVC] = currentPvc.Name - if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { - return errors.WrapIf(err, "could not apply last state to annotation") - } - if err := r.Create(ctx, desiredPvc); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, - "creating replacement tiered storage cache PVC failed", "brokerId", brokerId) - } - log.Info("Created replacement tiered storage cache PVC", - "brokerId", brokerId, "pvc", desiredPvc.Name, "desiredSize", desiredSize) - - // Trigger rolling upgrade so the broker pod is restarted and the old PVC - // can be deleted on the next reconcile cycle. - if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { - log.Info("Marking broker ConfigOutOfSync to trigger rolling upgrade for tiered storage cache PVC resize", - "brokerId", brokerId) - if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, - banzaiv1beta1.ConfigOutOfSync, log); err != nil { - return errorfactory.New(errorfactory.StatusUpdateError{}, err, - "could not mark broker ConfigOutOfSync for tiered storage cache PVC resize", "brokerId", brokerId) - } - } - continue - } + if !k8sutil.CheckIfObjectUpdated(log, desiredType, currentPvc, desiredPvc) { + continue + } - // Regular validation: size decreases are forbidden for non-cache volumes. - if isDesiredStorageValueInvalid(desiredPvc, currentPvc) { - return errorfactory.New(errorfactory.InternalError{}, errors.New("could not modify pvc size"), - "one can not reduce the size of a PVC", "kind", desiredType) - } + if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { + return errors.WrapIf(err, "could not apply last state to annotation") + } - resReq := desiredPvc.Spec.Resources.Requests - labels := desiredPvc.Labels - desiredPvc = currentPvc.DeepCopy() - desiredPvc.Spec.Resources.Requests = resReq - desiredPvc.Labels = labels + // Check if this is a tiered storage cache volume + isTieredCache := currentPvc.Annotations["tieredStorageCache"] == annotationTrue + desiredSize := desiredPvc.Spec.Resources.Requests.Storage().Value() + currentSize := currentPvc.Spec.Resources.Requests.Storage().Value() - if err := r.Update(ctx, desiredPvc); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, "updating resource failed", "kind", desiredType) - } - log.Info("resource updated") - } + // Tiered storage cache PVC shrink: stage the replacement PVC immediately so + // provisioning runs in parallel with rolling-upgrade gate evaluation. + // The old PVC is annotated pending-deletion and removed once the broker pod stops. + if isTieredCache && desiredSize < currentSize { + if err := r.stageTieredCachePVCShrink(ctx, log, brokerId, mountPath, currentPvc, desiredPvc, currentSize, desiredSize); err != nil { + return err } + continue } - if len(brokerVolumesState) > 0 { - brokerIds = append(brokerIds, brokerId) - brokersVolumesState[brokerId] = brokerVolumesState + // Regular validation: size decreases are forbidden for non-cache volumes. + if isDesiredStorageValueInvalid(desiredPvc, currentPvc) { + return errorfactory.New(errorfactory.InternalError{}, errors.New("could not modify pvc size"), + "one can not reduce the size of a PVC", "kind", desiredType) } - } - if len(brokersVolumesState) > 0 { - err := k8sutil.UpdateBrokerStatus(r.Client, brokerIds, r.KafkaCluster, brokersVolumesState, log) - if err != nil { - return err + resReq := desiredPvc.Spec.Resources.Requests + labels := desiredPvc.Labels + desiredPvc = currentPvc.DeepCopy() + desiredPvc.Spec.Resources.Requests = resReq + desiredPvc.Labels = labels + + if err := r.Update(ctx, desiredPvc); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, "updating resource failed", "kind", desiredType) } + log.Info("resource updated") } + return nil +} - if waitForDiskRemovalToFinish { - return errorfactory.New(errorfactory.CruiseControlTaskRunning{}, errors.New("Disk removal pending"), "Disk removal pending") +// stageTieredCachePVCShrink handles shrinking a tiered storage cache PVC by staging a replacement +// and triggering a rolling restart so the old PVC can be deleted once the broker pod stops. +func (r *Reconciler) stageTieredCachePVCShrink( + ctx context.Context, + log logr.Logger, + brokerId string, + mountPath string, + currentPvc *corev1.PersistentVolumeClaim, + desiredPvc *corev1.PersistentVolumeClaim, + currentSize int64, + desiredSize int64, +) error { + log.Info("Tiered storage cache size decrease detected — staging replacement PVC", + "brokerId", brokerId, + "mountPath", mountPath, + "currentSize", currentSize, + "desiredSize", desiredSize) + + // Annotate old PVC so it is skipped by getCreatedPvcForBroker and cleaned up + // the moment the broker pod stops. + oldPvcCopy := currentPvc.DeepCopy() + oldPvcCopy.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizePendingDeletion + if err := r.Update(ctx, oldPvcCopy); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, + "annotating old tiered storage cache PVC for deletion failed", "pvc", currentPvc.Name) + } + log.Info("Marked old tiered storage cache PVC for deletion", "brokerId", brokerId, "pvc", currentPvc.Name) + + // Create the replacement PVC immediately so provisioning starts now. + desiredPvc.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizeReplacement + desiredPvc.Annotations[pvcCacheResizeReplacesPVC] = currentPvc.Name + if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { + return errors.WrapIf(err, "could not apply last state to annotation") + } + if err := r.Create(ctx, desiredPvc); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, + "creating replacement tiered storage cache PVC failed", "brokerId", brokerId) } + log.Info("Created replacement tiered storage cache PVC", + "brokerId", brokerId, "pvc", desiredPvc.Name, "desiredSize", desiredSize) + // Trigger rolling upgrade so the broker pod is restarted and the old PVC + // can be deleted on the next reconcile cycle. + if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { + log.Info("Marking broker ConfigOutOfSync to trigger rolling upgrade for tiered storage cache PVC resize", + "brokerId", brokerId) + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + banzaiv1beta1.ConfigOutOfSync, log); err != nil { + return errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not mark broker ConfigOutOfSync for tiered storage cache PVC resize", "brokerId", brokerId) + } + } return nil } diff --git a/pkg/resources/kafka/kafka_test.go b/pkg/resources/kafka/kafka_test.go index 323a87264..c07fa002f 100644 --- a/pkg/resources/kafka/kafka_test.go +++ b/pkg/resources/kafka/kafka_test.go @@ -31,11 +31,12 @@ import ( logf "sigs.k8s.io/controller-runtime/pkg/log" corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "github.com/banzaicloud/k8s-objectmatcher/patch" + "github.com/banzaicloud/koperator/api/v1alpha1" "github.com/banzaicloud/koperator/api/v1beta1" controllerMocks "github.com/banzaicloud/koperator/controllers/tests/mocks" @@ -1402,15 +1403,15 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { } testCases := []struct { - testName string - existingPvc *corev1.PersistentVolumeClaim - desiredPvc *corev1.PersistentVolumeClaim - existingPods []corev1.Pod + testName string + existingPvc *corev1.PersistentVolumeClaim + desiredPvc *corev1.PersistentVolumeClaim + existingPods []corev1.Pod // expectedUpdatePvc: old PVC annotated pending-deletion, or annotation stripped on resize-complete - expectedUpdatePvc bool - expectedCreatePvc bool - expectedDeletePvc bool - expectedError bool + expectedUpdatePvc bool + expectedCreatePvc bool + expectedDeletePvc bool + expectedError bool }{ { // Pod is up: annotate old PVC as pending-deletion, create replacement PVC, diff --git a/pkg/resources/kafka/pvc.go b/pkg/resources/kafka/pvc.go index 67dd6167e..b423743e4 100644 --- a/pkg/resources/kafka/pvc.go +++ b/pkg/resources/kafka/pvc.go @@ -66,7 +66,7 @@ func (r *Reconciler) pvc(brokerId int32, storageIndex int, storage v1beta1.Stora // Mark tiered storage cache PVCs with annotation for special handling if storage.TieredStorageCache { - annotations["tieredStorageCache"] = "true" + annotations["tieredStorageCache"] = annotationTrue } return &corev1.PersistentVolumeClaim{ diff --git a/tests/e2e/test_tiered_storage_cache_resize.go b/tests/e2e/test_tiered_storage_cache_resize.go index 0decedbc5..b45ffdb04 100644 --- a/tests/e2e/test_tiered_storage_cache_resize.go +++ b/tests/e2e/test_tiered_storage_cache_resize.go @@ -38,12 +38,14 @@ const ( tsResizeShrunkSize = "1Gi" // Annotation keys written by the cache-resize reconciler. - pvcResizeStateAnnotation = "koperator.adobe.com/cache-resize-state" - pvcResizeStatePending = "pending-deletion" + pvcResizeStateAnnotation = "koperator.adobe.com/cache-resize-state" + pvcResizeStatePending = "pending-deletion" pvcResizeStateReplacement = "replacement" tsResizePhaseTimeout = 10 * time.Minute tsResizePollingInterval = 15 * time.Second + + tsResizeBrokerID = 0 ) // pvcItem is a minimal representation of a PVC for assertion helpers. @@ -53,10 +55,10 @@ type pvcItem struct { StorageSize string } -// listBrokerCachePVCs returns PVCs for the given broker that have the +// listBrokerCachePVCs returns PVCs for broker tsResizeBrokerID that have the // tiered-storage-cache mount path annotation. -func listBrokerCachePVCs(kubectlOptions k8s.KubectlOptions, clusterName string, brokerID int) ([]pvcItem, error) { - selector := fmt.Sprintf("%s=%s,brokerId=%d", kafkaCRLabelKey, clusterName, brokerID) +func listBrokerCachePVCs(kubectlOptions k8s.KubectlOptions) ([]pvcItem, error) { + selector := fmt.Sprintf("%s=%s,brokerId=%d", kafkaCRLabelKey, tsResizeClusterName, tsResizeBrokerID) rawOutput, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, "get", "persistentvolumeclaims", @@ -252,7 +254,7 @@ func testTieredStorageCachePvcResize() bool { gomega.Expect(broker0PodUID).NotTo(gomega.BeEmpty()) ginkgo.By("Verifying initial cache PVC size is " + tsResizeInitialSize) - pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + pvcs, err := listBrokerCachePVCs(kubectlOptions) gomega.Expect(err).NotTo(gomega.HaveOccurred()) gomega.Expect(pvcs).To(gomega.HaveLen(1), "expected exactly one cache PVC for broker 0") gomega.Expect(pvcs[0].StorageSize).To(gomega.Equal(tsResizeInitialSize)) @@ -266,7 +268,7 @@ func testTieredStorageCachePvcResize() bool { ginkgo.It("Phase 1: old PVC annotated pending-deletion and replacement PVC created", func() { ginkgo.By("Waiting until both pending-deletion and replacement PVCs coexist for broker 0") gomega.Eventually(context.Background(), func() error { - pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + pvcs, err := listBrokerCachePVCs(kubectlOptions) if err != nil { return err } @@ -308,7 +310,7 @@ func testTieredStorageCachePvcResize() bool { ginkgo.By("Waiting for the pending-deletion PVC to be deleted") gomega.Eventually(context.Background(), func() error { - pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + pvcs, err := listBrokerCachePVCs(kubectlOptions) if err != nil { return err } @@ -330,7 +332,7 @@ func testTieredStorageCachePvcResize() bool { ginkgo.By("Waiting for replacement annotation to be stripped from the surviving PVC") gomega.Eventually(context.Background(), func() error { - pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + pvcs, err := listBrokerCachePVCs(kubectlOptions) if err != nil { return err } @@ -345,7 +347,7 @@ func testTieredStorageCachePvcResize() bool { }) ginkgo.It("Verifying the surviving cache PVC has the new size "+tsResizeShrunkSize, func() { - pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + pvcs, err := listBrokerCachePVCs(kubectlOptions) gomega.Expect(err).NotTo(gomega.HaveOccurred()) gomega.Expect(pvcs).To(gomega.HaveLen(1)) gomega.Expect(pvcs[0].StorageSize).To(gomega.Equal(tsResizeShrunkSize), From 252a3a64f48f5e478518da75fffce0b423825683 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Thu, 2 Apr 2026 14:00:53 +0300 Subject: [PATCH 06/19] lock setup_envtest_version, add comment for rennovate to pick it up --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 01f5ba493..b4afd2eb1 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,7 @@ GOLANGCI_VERSION = 2.11.4 # renovate: datasource=github-releases depName=golangc LICENSEI_VERSION = 0.9.0 # renovate: datasource=github-releases depName=goph/licensei CONTROLLER_GEN_VERSION = v0.20.1 # renovate: datasource=github-releases depName=kubernetes-sigs/controller-tools ENVTEST_K8S_VERSION = 1.35.0 # renovate: datasource=github-releases depName=kubernetes-sigs/controller-tools extractVersion=^envtest-v(?.+)$ -SETUP_ENVTEST_VERSION := latest +SETUP_ENVTEST_VERSION := v0.23.3 # renovate: datasource=github-releases depName=kubernetes-sigs/controller-runtime ADDLICENSE_VERSION := 1.2.0 # renovate: datasource=github-releases depName=google/addlicense GOTEMPLATE_VERSION := 3.12.0 # renovate: datasource=github-releases depName=cznic/gotemplate MOCKGEN_VERSION := 0.6.0 # renovate: datasource=github-releases depName=uber-go/mock From 27c1b7a8ebc8b0bb846efdadd42427a00b32328d Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Thu, 2 Apr 2026 15:59:38 +0300 Subject: [PATCH 07/19] add annotation during reconcile --- pkg/resources/kafka/kafka.go | 47 +++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index 3cdf99aca..830652bd5 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -1225,6 +1225,27 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro continue } + // Delete tiered storage cache PVCs whose mount path is no longer desired. + // These are not Kafka log dirs so they must never go through CC disk removal. + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.Annotations["tieredStorageCache"] != annotationTrue { + continue + } + if !desiredMountPaths[pvc.Annotations["mountPath"]] { + log.Info("Deleting removed tiered storage cache PVC", + "brokerId", brokerId, "mountPath", pvc.Annotations["mountPath"], "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return errorfactory.New(errorfactory.APIFailure{}, err, + "deleting removed tiered storage cache PVC failed", "pvc", pvc.Name) + } + } + } + // Re-list so the disk removal count below reflects the deletions above. + if err := r.List(ctx, pvcList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, "getting resource failed", "kind", desiredType) + } + // Handle disk removal — count only non-replacement PVCs to avoid false positives while a // tiered storage cache resize is in flight (old + new PVC temporarily coexist). effectivePvcCount := 0 @@ -1432,9 +1453,27 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( } currentPvc = pvc.DeepCopy() alreadyCreated = true + + // Backfill the tieredStorageCache annotation if the desired PVC has it but the existing + // PVC does not — handles PVCs created before the annotation was introduced. + if desiredPvc.Annotations["tieredStorageCache"] == annotationTrue && + currentPvc.Annotations["tieredStorageCache"] != annotationTrue { + pvcCopy := currentPvc.DeepCopy() + if pvcCopy.Annotations == nil { + pvcCopy.Annotations = make(map[string]string) + } + pvcCopy.Annotations["tieredStorageCache"] = annotationTrue + if err := r.Update(ctx, pvcCopy); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, + "backfilling tieredStorageCache annotation on existing PVC failed", "pvc", pvcCopy.Name) + } + log.Info("Backfilled tieredStorageCache annotation on existing PVC", "pvc", pvcCopy.Name) + currentPvc = pvcCopy + } + // Trigger a CC disk rebalance only for regular data volumes. // Tiered storage cache PVCs are ephemeral — CC must not account for them. - if pvc.Annotations["tieredStorageCache"] != annotationTrue { + if currentPvc.Annotations["tieredStorageCache"] != annotationTrue { // Checking pvc state, if bounded, so the broker has already restarted and the CC GracefulDiskRebalance has not happened yet, // then we make it happening with status update. // If disk removal was set, and the disk was added back, we also need to mark the volume for rebalance @@ -1480,8 +1519,10 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( return errors.WrapIf(err, "could not apply last state to annotation") } - // Check if this is a tiered storage cache volume - isTieredCache := currentPvc.Annotations["tieredStorageCache"] == annotationTrue + // Check if this is a tiered storage cache volume. Fall back to the desired PVC annotation + // for PVCs created before the tieredStorageCache annotation was introduced. + isTieredCache := currentPvc.Annotations["tieredStorageCache"] == annotationTrue || + desiredPvc.Annotations["tieredStorageCache"] == annotationTrue desiredSize := desiredPvc.Spec.Resources.Requests.Storage().Value() currentSize := currentPvc.Spec.Resources.Requests.Storage().Value() From aff525258fcec64dc2e455379fcc5941787ac4f2 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Thu, 2 Apr 2026 16:01:35 +0300 Subject: [PATCH 08/19] fixed SETUP_ENVTEST_VERSION temporarily to allow builds to pass --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b4afd2eb1..3126de365 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,7 @@ GOLANGCI_VERSION = 2.11.4 # renovate: datasource=github-releases depName=golangc LICENSEI_VERSION = 0.9.0 # renovate: datasource=github-releases depName=goph/licensei CONTROLLER_GEN_VERSION = v0.20.1 # renovate: datasource=github-releases depName=kubernetes-sigs/controller-tools ENVTEST_K8S_VERSION = 1.35.0 # renovate: datasource=github-releases depName=kubernetes-sigs/controller-tools extractVersion=^envtest-v(?.+)$ -SETUP_ENVTEST_VERSION := v0.23.3 # renovate: datasource=github-releases depName=kubernetes-sigs/controller-runtime +SETUP_ENVTEST_VERSION := v0.0.0-20260311120938-7f576c06d187 # last commit with go 1.25 requirement; 598e330b (2026-03-31) bumped to go 1.26 ADDLICENSE_VERSION := 1.2.0 # renovate: datasource=github-releases depName=google/addlicense GOTEMPLATE_VERSION := 3.12.0 # renovate: datasource=github-releases depName=cznic/gotemplate MOCKGEN_VERSION := 0.6.0 # renovate: datasource=github-releases depName=uber-go/mock From fdc14f43c84cb8fa0186996ab35ca5620a7669e2 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Tue, 7 Apr 2026 14:13:14 +0300 Subject: [PATCH 09/19] pr review --- api/v1beta1/common_types.go | 13 + charts/kafka-operator/crds/kafkaclusters.yaml | 10 + .../kafka.banzaicloud.io_kafkaclusters.yaml | 10 + docs/tiered-storage-pvc-resize.md | 86 +++-- pkg/k8sutil/status.go | 11 + pkg/resources/kafka/configmap.go | 4 + pkg/resources/kafka/kafka.go | 318 ++++++++++++------ pkg/resources/kafka/kafka_test.go | 119 +++---- tests/e2e/test_tiered_storage_cache_resize.go | 116 ++++--- 9 files changed, 461 insertions(+), 226 deletions(-) diff --git a/api/v1beta1/common_types.go b/api/v1beta1/common_types.go index 902967438..c596c36ac 100644 --- a/api/v1beta1/common_types.go +++ b/api/v1beta1/common_types.go @@ -222,6 +222,16 @@ type VolumeState struct { CruiseControlOperationReference *corev1.LocalObjectReference `json:"cruiseControlOperationReference,omitempty"` } +// CacheResizeState tracks the resize lifecycle of a tiered storage cache PVC for a given mount path. +type CacheResizeState string + +const ( + // CacheResizePendingDeletion indicates that the old cache PVC at this mount path is waiting + // to be deleted once the broker pod stops. A replacement PVC with the new desired size has + // already been created at the same mount path. + CacheResizePendingDeletion CacheResizeState = "pending-deletion" +) + // BrokerState holds information about broker state type BrokerState struct { // RackAwarenessState holds info about rack awareness status @@ -240,6 +250,9 @@ type BrokerState struct { Image string `json:"image,omitempty"` // Compressed data from broker configuration to restore broker pod in specific cases ConfigurationBackup string `json:"configurationBackup,omitempty"` + // CacheVolumeStates tracks in-flight tiered storage cache PVC resize operations, keyed by mount path. + // An entry is present only while a resize is in progress; it is cleared once cleanup completes. + CacheVolumeStates map[string]CacheResizeState `json:"cacheVolumeStates,omitempty"` } const ( diff --git a/charts/kafka-operator/crds/kafkaclusters.yaml b/charts/kafka-operator/crds/kafkaclusters.yaml index e527f7a1e..0debe0c6f 100644 --- a/charts/kafka-operator/crds/kafkaclusters.yaml +++ b/charts/kafka-operator/crds/kafkaclusters.yaml @@ -23825,6 +23825,16 @@ spec: additionalProperties: description: BrokerState holds information about broker state properties: + cacheVolumeStates: + additionalProperties: + description: CacheResizeState tracks the resize lifecycle of + a tiered storage cache PVC for a given mount path. + type: string + description: CacheVolumeStates tracks in-flight tiered storage + cache PVC resize operations, keyed by mount path. An entry + is present only while a resize is in progress; it is cleared + once cleanup completes. + type: object configurationBackup: description: Compressed data from broker configuration to restore broker pod in specific cases diff --git a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml index e527f7a1e..0debe0c6f 100644 --- a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml +++ b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml @@ -23825,6 +23825,16 @@ spec: additionalProperties: description: BrokerState holds information about broker state properties: + cacheVolumeStates: + additionalProperties: + description: CacheResizeState tracks the resize lifecycle of + a tiered storage cache PVC for a given mount path. + type: string + description: CacheVolumeStates tracks in-flight tiered storage + cache PVC resize operations, keyed by mount path. An entry + is present only while a resize is in progress; it is cleared + once cleanup completes. + type: object configurationBackup: description: Compressed data from broker configuration to restore broker pod in specific cases diff --git a/docs/tiered-storage-pvc-resize.md b/docs/tiered-storage-pvc-resize.md index 820f1e9a8..dd21d7a1d 100644 --- a/docs/tiered-storage-pvc-resize.md +++ b/docs/tiered-storage-pvc-resize.md @@ -7,16 +7,27 @@ with the rolling upgrade machinery so only one broker is affected at a time. --- -## Annotations +## State tracking -Two annotations are written on PVC objects to carry state across reconcile cycles. -They survive reconciler restarts, making every step re-entrant. +Resize state is stored in the `KafkaCluster` CR status under +`status.brokersState[].cacheVolumeStates`, keyed by mount path. +This keeps the KafkaCluster CR the single source of truth for all in-flight +broker operations and avoids a second, parallel state store on PVC objects. -| Annotation | Value | Written on | Meaning | -|------------|-------|------------|---------| -| `koperator.adobe.com/cache-resize-state` | `pending-deletion` | Old PVC | Being replaced; excluded from pod spec; deleted once broker pod stops | -| `koperator.adobe.com/cache-resize-state` | `replacement` | New PVC | Replacement PVC; rolling upgrade must complete before annotations are stripped | -| `koperator.adobe.com/replaces-pvc` | `` | New PVC | Traceability — records which PVC is being replaced | +| Field | Value | Meaning | +|-------|-------|---------| +| `status.brokersState[N].cacheVolumeStates[]` | `pending-deletion` | A resize is in flight for this mount path. The old PVC (larger size) is waiting to be deleted once the broker pod stops; the replacement PVC (desired smaller size) has already been created. | + +The entry is cleared once the old PVC has been deleted and the broker pod has +restarted. An empty map means no resize is in progress. + +Two PVC annotations that describe what a PVC **is** (not operational state) are +always present on cache PVCs: + +| Annotation | Value | Purpose | +|------------|-------|---------| +| `mountPath` | `` | Used throughout reconcile logic to match PVCs to storage configs | +| `tieredStorageCache` | `"true"` | Identifies cache PVCs for special handling: skipped from `log.dirs` and CC capacity config | --- @@ -24,32 +35,45 @@ They survive reconciler restarts, making every step re-entrant. ### Cycle N — resize detected, pod running -1. The old PVC is annotated `pending-deletion`. -2. A replacement PVC with the new (smaller) size is created and annotated `replacement`. Provisioning starts immediately. -3. The broker's `ConfigurationState` is set to `ConfigOutOfSync` to trigger a rolling restart via `handleRollingUpgrade`. -4. `handleRollingUpgrade` evaluates health gates (replica health, concurrent restart limit, rack awareness). If all pass the broker pod is deleted and the cycle requeues. If any gate fails the state is preserved in PVC annotations and retried next cycle. +1. `status.brokersState[N].cacheVolumeStates[]` is set to `pending-deletion` + in the KafkaCluster CR status. This is the durable record that a resize is in flight. +2. A replacement PVC with the new (smaller) size is created. Provisioning starts immediately. +3. The broker's `ConfigurationState` is set to `ConfigOutOfSync` to trigger a rolling restart + via `handleRollingUpgrade`. +4. `handleRollingUpgrade` evaluates health gates (replica health, concurrent restart limit, + rack awareness). If all pass the broker pod is deleted and the cycle requeues. If any gate + fails the state persists in the CR and is retried next cycle. ### Cycle N+1 — pod is absent -A pod is considered absent when it either does not exist or has a non-nil `DeletionTimestamp` (Terminating). Treating a Terminating pod as absent allows cleanup to start during the pod's Terminating window rather than waiting for it to fully disappear from etcd. +A pod is considered absent when it either does not exist or has a non-nil +`DeletionTimestamp` (Terminating). Treating a Terminating pod as absent allows +cleanup to start during the pod's Terminating window rather than waiting for it +to fully disappear from etcd. -1. The pending-deletion PVC is deleted. -2. A new broker pod is created referencing the replacement PVC. Because provisioning started in cycle N the PVC is likely already `Bound`, minimising startup latency. +1. The old PVC (the one whose size differs from the desired size at that mount path) + is deleted. +2. The `cacheVolumeStates` entry for that mount path is cleared from the CR status. +3. A new broker pod is created referencing the replacement PVC. Because provisioning + started in cycle N the PVC is likely already `Bound`, minimising startup latency. ### Cycle N+2 — pod is present again -The strip fires as soon as a non-Terminating pod exists for the broker and no pending-deletion PVC remains — the pod does not need to be fully Running. - -1. No pending-deletion PVC remains and the replacement PVC exists → resize is complete. -2. The `cache-resize-state` and `replaces-pvc` annotations are stripped from the replacement PVC, which becomes an ordinary PVC from this point forward. +1. No `cacheVolumeStates` entry remains for the mount path → resize is complete. +2. The replacement PVC is now an ordinary cache PVC with no special state attached. --- ## Grow vs shrink -A cache PVC **grow** takes the normal Kubernetes in-place expansion path: the PVC spec is updated with the larger size and Kubernetes expands the volume without a pod restart (requires `allowVolumeExpansion: true` on the StorageClass). No annotations are written and no rolling restart is triggered. +A cache PVC **grow** takes the normal Kubernetes in-place expansion path: the PVC +spec is updated with the larger size and Kubernetes expands the volume without a +pod restart (requires `allowVolumeExpansion: true` on the StorageClass). No +`cacheVolumeStates` entry is written and no rolling restart is triggered. -A cache PVC **shrink** uses the delete-and-recreate flow described above. Shrinking is only supported for tiered storage cache volumes — regular Kafka log volumes reject any size decrease. +A cache PVC **shrink** uses the delete-and-recreate flow described above. +Shrinking is only supported for tiered storage cache volumes — regular Kafka log +volumes reject any size decrease with an error. --- @@ -57,12 +81,13 @@ A cache PVC **shrink** uses the delete-and-recreate flow described above. Shrink | Property | Value | |----------|-------| -| State survives reconciler crash | Mostly — PVC annotations are durable in etcd; the one non-re-entrant window is between annotating the old PVC and creating the replacement, but `ConfigOutOfSync` set in that cycle persists in broker status so the rolling upgrade still proceeds | -| Atomicity gap | Eliminated — new PVC is created before old is deleted | -| Provisioning overlaps gate evaluation | Yes — new PVC created in cycle N, not N+1 | -| Observable via kubectl | Yes — `kubectl get pvc -o yaml` shows resize state directly | -| ConfigOutOfSync overloading | Reduced — `ConfigOutOfSync` still used, but the *reason* is legible in PVC annotations | -| CC disk rebalance for cache PVCs | Fixed — tiered cache PVCs are explicitly excluded from `GracefulDiskRebalanceRequired` logic | +| State survives reconciler crash | Yes — `cacheVolumeStates` is written to the KafkaCluster CR (etcd) before the replacement PVC is created; every step is re-entrant | +| Single source of truth | Yes — all broker state (configuration, graceful actions, cache resize) lives in `status.brokersState` | +| Atomicity gap | Eliminated — replacement PVC is created before old is deleted | +| Provisioning overlaps gate evaluation | Yes — replacement PVC created in cycle N, not N+1 | +| Observable via kubectl | Yes — `kubectl get kafkacluster -o jsonpath='{.status.brokersState}'` shows resize state; an empty `cacheVolumeStates` means no resize is in progress | +| CC disk rebalance for cache PVCs | Excluded — tiered cache PVCs are explicitly skipped from `GracefulDiskRebalanceRequired` and CC capacity config | +| `log.dirs` for cache PVCs | Excluded — `generateStorageConfig` skips volumes with `TieredStorageCache: true` | --- @@ -70,7 +95,7 @@ A cache PVC **shrink** uses the delete-and-recreate flow described above. Shrink ``` Cycle N (pod UP, resize detected) - ├─ annotate old PVC: pending-deletion + ├─ set cacheVolumeStates[mountPath] = pending-deletion in CR status ├─ create replacement PVC (provisioning starts) ├─ set ConfigOutOfSync └─ handleRollingUpgrade @@ -81,9 +106,10 @@ Cycle N+k (pod UP, gates failing — any number of cycles) └─ ensure ConfigOutOfSync, requeue Cycle N+k+1 (pod ABSENT — gone or Terminating) - ├─ delete pending-deletion PVC + ├─ delete old PVC (identified as the PVC at mountPath whose size ≠ desired) + ├─ clear cacheVolumeStates[mountPath] from CR status └─ create new pod bound to replacement PVC Cycle N+k+2 (pod PRESENT — non-Terminating, not necessarily Running) - └─ strip annotations → replacement PVC becomes ordinary PVC + └─ cacheVolumeStates entry is absent → resize complete, no further action ``` diff --git a/pkg/k8sutil/status.go b/pkg/k8sutil/status.go index 4836cb3ab..e48a305ae 100644 --- a/pkg/k8sutil/status.go +++ b/pkg/k8sutil/status.go @@ -174,6 +174,17 @@ func generateBrokerState(brokerIDs []string, cluster *banzaicloudv1beta1.KafkaCl case banzaicloudv1beta1.KafkaVersion: brokerState.Image = s.Image brokerState.Version = s.Version + case map[string]banzaicloudv1beta1.CacheResizeState: + if brokerState.CacheVolumeStates == nil { + brokerState.CacheVolumeStates = make(map[string]banzaicloudv1beta1.CacheResizeState) + } + for mountPath, state := range s { + if state == "" { + delete(brokerState.CacheVolumeStates, mountPath) + } else { + brokerState.CacheVolumeStates[mountPath] = state + } + } } brokersState[brokerID] = brokerState } diff --git a/pkg/resources/kafka/configmap.go b/pkg/resources/kafka/configmap.go index 02ec0a1b0..00282f7d2 100644 --- a/pkg/resources/kafka/configmap.go +++ b/pkg/resources/kafka/configmap.go @@ -411,6 +411,10 @@ func getMountPathsFromBrokerConfigMap(configMap *corev1.ConfigMap) ([]string, er func generateStorageConfig(sConfig []v1beta1.StorageConfig) []string { mountPaths := make([]string, 0, len(sConfig)) for _, storage := range sConfig { + // Tiered storage cache volumes are not Kafka log dirs — exclude them from log.dirs. + if storage.TieredStorageCache { + continue + } mountPaths = append(mountPaths, util.StorageConfigKafkaMountPath(storage.MountPath)) } return mountPaths diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index 830652bd5..fa2af8f13 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -68,14 +68,6 @@ const ( brokerConfigTemplate = "%s-config" brokerStorageTemplate = "%s-%d-storage-%d-" - // pvcCacheResizeStateAnnotation is set on PVCs during a tiered storage cache resize to track - // which PVC is the old one awaiting deletion and which is the new replacement. - pvcCacheResizeStateAnnotation = "koperator.adobe.com/cache-resize-state" - pvcCacheResizePendingDeletion = "pending-deletion" - pvcCacheResizeReplacement = "replacement" - // pvcCacheResizeReplacesPVC records the name of the old PVC that the replacement supersedes. - pvcCacheResizeReplacesPVC = "koperator.adobe.com/replaces-pvc" - // annotationTrue is the string value used for boolean-true annotations and config comparisons. annotationTrue = "true" @@ -151,7 +143,9 @@ func getCreatedPvcForBroker( c client.Reader, brokerID int32, storageConfigs []banzaiv1beta1.StorageConfig, - namespace, crName string) ([]corev1.PersistentVolumeClaim, error) { + namespace, crName string, + pendingDeletionMountPaths map[string]bool, +) ([]corev1.PersistentVolumeClaim, error) { foundPvcList := &corev1.PersistentVolumeClaimList{} matchingLabels := client.MatchingLabels( apiutil.MergeLabels( @@ -164,14 +158,27 @@ func getCreatedPvcForBroker( return nil, err } - // Exclude PVCs that are pending deletion during a tiered storage cache resize. - // The replacement PVC (same mount path, smaller size) is already present and will be used instead. + // Filter the PVC list: + // 1. Always exclude terminating PVCs (DeletionTimestamp set) — they are released from pods + // and should not be mounted by the new broker pod. + // 2. For mount paths with an in-flight cache resize (pendingDeletionMountPaths), two PVCs + // temporarily coexist (old + replacement). Keep only one so the pod spec has a unique + // mount path (the reconciler will use the correctly-sized one during PVC reconciliation). + seenMountPaths := make(map[string]bool) n := 0 for _, pvc := range foundPvcList.Items { - if pvc.Annotations[pvcCacheResizeStateAnnotation] != pvcCacheResizePendingDeletion { - foundPvcList.Items[n] = pvc - n++ + if pvc.DeletionTimestamp != nil { + continue + } + mp := pvc.Annotations["mountPath"] + if pendingDeletionMountPaths[mp] { + if seenMountPaths[mp] { + continue + } + seenMountPaths[mp] = true } + foundPvcList.Items[n] = pvc + n++ } foundPvcList.Items = foundPvcList.Items[:n] @@ -460,7 +467,14 @@ func (r *Reconciler) Reconcile(log logr.Logger) error { } } - pvcs, err := getCreatedPvcForBroker(ctx, r.Client, broker.Id, brokerConfig.StorageConfigs, r.KafkaCluster.Namespace, r.KafkaCluster.Name) + pendingDeletionMountPaths := make(map[string]bool) + brokerIdStr := strconv.Itoa(int(broker.Id)) + for mp, state := range r.KafkaCluster.Status.BrokersState[brokerIdStr].CacheVolumeStates { + if state == banzaiv1beta1.CacheResizePendingDeletion { + pendingDeletionMountPaths[mp] = true + } + } + pvcs, err := getCreatedPvcForBroker(ctx, r.Client, broker.Id, brokerConfig.StorageConfigs, r.KafkaCluster.Namespace, r.KafkaCluster.Name, pendingDeletionMountPaths) if err != nil { return errors.WrapIfWithDetails(err, "failed to list PVC's") } @@ -1217,7 +1231,7 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro desiredMountPaths[dp.Annotations["mountPath"]] = true } - skipBroker, err := r.handleBrokerCacheResizeCleanup(ctx, log, brokerId, pvcList, desiredMountPaths, brokerPodExists, matchingLabels) + skipBroker, err := r.handleBrokerCacheResizeCleanup(ctx, log, brokerId, pvcList, desiredPvcs, desiredMountPaths, brokerPodExists, matchingLabels) if err != nil { return err } @@ -1229,7 +1243,7 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro // These are not Kafka log dirs so they must never go through CC disk removal. for i := range pvcList.Items { pvc := &pvcList.Items[i] - if pvc.Annotations["tieredStorageCache"] != annotationTrue { + if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { continue } if !desiredMountPaths[pvc.Annotations["mountPath"]] { @@ -1241,16 +1255,66 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro } } } + + // Delete duplicate non-terminating cache PVCs at desired mount paths when no resize is in + // flight. These are orphaned replacements from a double-staging race (two reconcile cycles + // both staged a replacement before the first deletion propagated). Keep only the Bound one; + // delete any Pending duplicates. Safe only when the broker pod is down. + if !brokerPodExists { + cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates + boundByMountPath := make(map[string]bool) + for _, pvc := range pvcList.Items { + if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { + continue + } + mp := pvc.Annotations["mountPath"] + if !desiredMountPaths[mp] || cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { + continue + } + if pvc.Status.Phase == corev1.ClaimBound { + boundByMountPath[mp] = true + } + } + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { + continue + } + mp := pvc.Annotations["mountPath"] + if !desiredMountPaths[mp] || cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { + continue + } + // Delete a Pending duplicate only when a Bound PVC already exists at this path. + if pvc.Status.Phase == corev1.ClaimPending && boundByMountPath[mp] { + log.Info("Deleting orphaned Pending cache PVC (duplicate at same mount path)", + "brokerId", brokerId, "mountPath", mp, "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return errorfactory.New(errorfactory.APIFailure{}, err, + "deleting orphaned Pending cache PVC failed", "pvc", pvc.Name) + } + } + } + } + // Re-list so the disk removal count below reflects the deletions above. if err := r.List(ctx, pvcList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { return errorfactory.New(errorfactory.APIFailure{}, err, "getting resource failed", "kind", desiredType) } - // Handle disk removal — count only non-replacement PVCs to avoid false positives while a - // tiered storage cache resize is in flight (old + new PVC temporarily coexist). + // Handle disk removal — for mount paths with an in-flight cache resize two PVCs temporarily + // coexist (old pending-deletion + new replacement). Count each such path only once to avoid + // a false-positive disk-removal trigger. + cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates + countedCacheMountPaths := make(map[string]bool) effectivePvcCount := 0 for _, pvc := range pvcList.Items { - if pvc.Annotations[pvcCacheResizeStateAnnotation] != pvcCacheResizeReplacement { + mp := pvc.Annotations["mountPath"] + if cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { + if !countedCacheMountPaths[mp] { + countedCacheMountPaths[mp] = true + effectivePvcCount++ + } + } else { effectivePvcCount++ } } @@ -1305,8 +1369,8 @@ func (r *Reconciler) getBrokerPodExists(ctx context.Context, brokerId string) (b } // handleBrokerCacheResizeCleanup manages tiered storage cache resize PVC cleanup based on broker pod state. -// It returns true when the caller should skip the rest of the reconcile loop for this broker (orphaned PVCs -// detected while the pod is running — a rolling restart must complete first). +// It returns true when the caller should skip the rest of the reconcile loop for this broker (orphaned resize +// state detected while the pod is running — a rolling restart must complete first). // //nolint:funlen func (r *Reconciler) handleBrokerCacheResizeCleanup( @@ -1314,60 +1378,109 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( log logr.Logger, brokerId string, pvcList *corev1.PersistentVolumeClaimList, + desiredPvcs []*corev1.PersistentVolumeClaim, desiredMountPaths map[string]bool, brokerPodExists bool, matchingLabels client.MatchingLabels, ) (bool, error) { + cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates + if len(cacheVolumeStates) == 0 { + return false, nil + } + desiredType := reflect.TypeOf(&corev1.PersistentVolumeClaim{}) + + // Build desired size per mount path for identifying the old PVC during cleanup. + desiredSizeByMountPath := make(map[string]int64, len(desiredPvcs)) + for _, dp := range desiredPvcs { + if s := dp.Spec.Resources.Requests.Storage(); s != nil { + desiredSizeByMountPath[dp.Annotations["mountPath"]] = s.Value() + } + } + + // stateUpdates accumulates CacheVolumeStates changes to apply in a single status update. + // An empty CacheResizeState value means "delete this entry from the map". + stateUpdates := make(map[string]banzaiv1beta1.CacheResizeState) + if brokerPodExists { - // Resize complete when the replacement PVC exists but the pending-deletion PVC is already gone: - // strip the replacement annotation so the PVC is treated as normal going forward. - var hasPendingDeletion bool - for _, pvc := range pvcList.Items { - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { - hasPendingDeletion = true - break + // Resize complete when the replacement PVC (size == desired) already exists at the mount + // path and no old PVC (size != desired) remains. Using a size-based check instead of a + // count-based one makes the logic crash-safe: if the reconciler crashes between writing + // pending-deletion state and creating the replacement PVC, the next cycle sees only the + // old PVC (size != desired) and correctly does NOT clear state, letting + // reconcileDesiredPvcsForBroker re-create the replacement. + for mp, state := range cacheVolumeStates { + if state != banzaiv1beta1.CacheResizePendingDeletion { + continue } - } - if !hasPendingDeletion { + desiredSize := desiredSizeByMountPath[mp] + replacementExists := false + oldPvcExists := false for _, pvc := range pvcList.Items { - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement { - pvcCopy := pvc.DeepCopy() - delete(pvcCopy.Annotations, pvcCacheResizeStateAnnotation) - delete(pvcCopy.Annotations, pvcCacheResizeReplacesPVC) - if err := r.Update(ctx, pvcCopy); err != nil { - return false, errorfactory.New(errorfactory.APIFailure{}, err, - "removing replacement annotation from tiered storage cache PVC failed", "pvc", pvc.Name) + if pvc.Annotations["mountPath"] != mp { + continue + } + if pvcSize := pvc.Spec.Resources.Requests.Storage(); pvcSize != nil { + if pvcSize.Value() == desiredSize { + replacementExists = true + } else { + oldPvcExists = true } - log.Info("Tiered storage cache PVC resize complete, removed replacement annotation", "pvc", pvc.Name) } } + if replacementExists && !oldPvcExists { + stateUpdates[mp] = "" + log.Info("Tiered storage cache PVC resize complete — clearing resize state", + "brokerId", brokerId, "mountPath", mp) + } } } else { - // Broker pod is down — safe to delete any PVCs that were waiting for the pod to stop. - for i := range pvcList.Items { - pvc := &pvcList.Items[i] - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { - log.Info("Broker pod is down — deleting pending-deletion tiered storage cache PVC", - "brokerId", brokerId, "pvc", pvc.Name) - if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { - return false, errorfactory.New(errorfactory.APIFailure{}, err, - "deleting pending-deletion tiered storage cache PVC failed", "pvc", pvc.Name) + // Broker pod is down — safe to delete old PVCs and clean up orphaned state. + for mp, state := range cacheVolumeStates { + if state != banzaiv1beta1.CacheResizePendingDeletion { + continue + } + if !desiredMountPaths[mp] { + // Mount path is no longer desired: delete all PVCs at this path (old + replacement). + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.Annotations["mountPath"] != mp { + continue + } + log.Info("Broker pod is down — deleting orphaned cache resize PVC", + "brokerId", brokerId, "mountPath", mp, "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return false, errorfactory.New(errorfactory.APIFailure{}, err, + "deleting orphaned cache resize PVC failed", "pvc", pvc.Name) + } + } + } else { + // Mount path still desired: delete the old PVC (the one whose size differs from desired). + desiredSize := desiredSizeByMountPath[mp] + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.Annotations["mountPath"] != mp { + continue + } + if pvc.Spec.Resources.Requests.Storage().Value() != desiredSize { + log.Info("Broker pod is down — deleting pending-deletion tiered storage cache PVC", + "brokerId", brokerId, "mountPath", mp, "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return false, errorfactory.New(errorfactory.APIFailure{}, err, + "deleting pending-deletion tiered storage cache PVC failed", "pvc", pvc.Name) + } + break + } } } + stateUpdates[mp] = "" } - // Also delete replacement PVCs whose mount path is no longer desired — these are - // orphaned by a storage config removal that happened while a cache resize was in flight. - for i := range pvcList.Items { - pvc := &pvcList.Items[i] - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement && - !desiredMountPaths[pvc.Annotations["mountPath"]] { - log.Info("Broker pod is down — deleting orphaned replacement tiered storage cache PVC", - "brokerId", brokerId, "pvc", pvc.Name) - if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { - return false, errorfactory.New(errorfactory.APIFailure{}, err, - "deleting orphaned replacement tiered storage cache PVC failed", "pvc", pvc.Name) - } + + if len(stateUpdates) > 0 { + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + stateUpdates, log); err != nil { + return false, errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not clear cache resize state after PVC cleanup", "brokerId", brokerId) } } // Re-list so the rest of this iteration sees the current state. @@ -1376,31 +1489,31 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( } } - // If the pod is still running and there are orphaned cache resize PVCs (storage config - // removed while a resize was in flight), trigger a rolling restart so the pod stops - // and we can delete the orphaned PVCs on the next cycle. Routing these through - // Cruise Control remove_disks would fail because cache paths are not Kafka log dirs. - if brokerPodExists { - var hasOrphanedCachePvc bool - for _, pvc := range pvcList.Items { - resizeState := pvc.Annotations[pvcCacheResizeStateAnnotation] - if (resizeState == pvcCacheResizePendingDeletion || resizeState == pvcCacheResizeReplacement) && - !desiredMountPaths[pvc.Annotations["mountPath"]] { - hasOrphanedCachePvc = true - break - } + if brokerPodExists && len(stateUpdates) > 0 { + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + stateUpdates, log); err != nil { + return false, errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not clear cache resize state after resize complete", "brokerId", brokerId) } - if hasOrphanedCachePvc { - log.Info("Orphaned cache resize PVCs detected with broker pod running — triggering rolling restart for safe cleanup", - "brokerId", brokerId) - if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { - if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, - banzaiv1beta1.ConfigOutOfSync, log); err != nil { - return false, errorfactory.New(errorfactory.StatusUpdateError{}, err, - "could not mark broker ConfigOutOfSync for orphaned cache PVC cleanup", "brokerId", brokerId) + } + + // If the pod is still running and any resize state entry refers to a mount path that is no + // longer desired (storage config removed mid-resize), trigger a rolling restart so the pod + // stops and the orphaned PVCs can be deleted on the next cycle. + if brokerPodExists { + for mp := range r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates { + if !desiredMountPaths[mp] { + log.Info("Orphaned cache resize state detected with broker pod running — triggering rolling restart", + "brokerId", brokerId, "mountPath", mp) + if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + banzaiv1beta1.ConfigOutOfSync, log); err != nil { + return false, errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not mark broker ConfigOutOfSync for orphaned cache PVC cleanup", "brokerId", brokerId) + } } + return true, nil } - return true, nil // Skip disk removal — wait for pod to stop before deleting cache PVCs. } } @@ -1446,11 +1559,20 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( if mountPath != pvc.Annotations["mountPath"] { continue } - // Skip the old PVC that is waiting to be deleted once the broker pod stops. - // The replacement PVC (same mount path) will be matched on the next iteration. - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { + // Skip PVCs that are being deleted — the API server may still return them briefly + // after a Delete call while the pvc-protection finalizer is being removed. + // Matching a terminating PVC as the current PVC would cause a spurious re-staging. + if pvc.DeletionTimestamp != nil { continue } + // Skip the old PVC that is waiting to be deleted once the broker pod stops. + // We identify it as the PVC whose size differs from the desired size while a + // pending-deletion resize is in flight for this mount path. + if r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates[mountPath] == banzaiv1beta1.CacheResizePendingDeletion { + if desiredPvc.Spec.Resources.Requests.Storage().Value() != pvc.Spec.Resources.Requests.Storage().Value() { + continue + } + } currentPvc = pvc.DeepCopy() alreadyCreated = true @@ -1497,10 +1619,10 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( continue } - // A replacement PVC is the new smaller PVC staged during a tiered storage cache resize. - // It already has the right size; we only need to keep the rolling upgrade running so - // the broker pod stops and the pending-deletion PVC can be cleaned up. - if currentPvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement { + // A resize is in flight for this mount path: the matched PVC is the replacement. + // Keep triggering ConfigOutOfSync so the rolling upgrade completes and the old PVC + // can be deleted on the next reconcile cycle. + if r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates[mountPath] == banzaiv1beta1.CacheResizePendingDeletion { if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, banzaiv1beta1.ConfigOutOfSync, log); err != nil { @@ -1574,19 +1696,17 @@ func (r *Reconciler) stageTieredCachePVCShrink( "currentSize", currentSize, "desiredSize", desiredSize) - // Annotate old PVC so it is skipped by getCreatedPvcForBroker and cleaned up - // the moment the broker pod stops. - oldPvcCopy := currentPvc.DeepCopy() - oldPvcCopy.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizePendingDeletion - if err := r.Update(ctx, oldPvcCopy); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, - "annotating old tiered storage cache PVC for deletion failed", "pvc", currentPvc.Name) + // Record the resize in brokerState so the reconciler knows to exclude the old PVC and + // keep the rolling upgrade running until the old PVC is cleaned up. + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + map[string]banzaiv1beta1.CacheResizeState{mountPath: banzaiv1beta1.CacheResizePendingDeletion}, log); err != nil { + return errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not record cache resize state for tiered storage cache PVC resize", "brokerId", brokerId) } - log.Info("Marked old tiered storage cache PVC for deletion", "brokerId", brokerId, "pvc", currentPvc.Name) + log.Info("Recorded pending-deletion resize state for tiered storage cache PVC", + "brokerId", brokerId, "mountPath", mountPath, "pvc", currentPvc.Name) // Create the replacement PVC immediately so provisioning starts now. - desiredPvc.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizeReplacement - desiredPvc.Annotations[pvcCacheResizeReplacesPVC] = currentPvc.Name if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { return errors.WrapIf(err, "could not apply last state to annotation") } diff --git a/pkg/resources/kafka/kafka_test.go b/pkg/resources/kafka/kafka_test.go index c07fa002f..8eab7e2d2 100644 --- a/pkg/resources/kafka/kafka_test.go +++ b/pkg/resources/kafka/kafka_test.go @@ -1403,24 +1403,24 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { } testCases := []struct { - testName string - existingPvc *corev1.PersistentVolumeClaim - desiredPvc *corev1.PersistentVolumeClaim - existingPods []corev1.Pod - // expectedUpdatePvc: old PVC annotated pending-deletion, or annotation stripped on resize-complete - expectedUpdatePvc bool - expectedCreatePvc bool - expectedDeletePvc bool - expectedError bool + testName string + existingPvc *corev1.PersistentVolumeClaim + desiredPvc *corev1.PersistentVolumeClaim + existingPods []corev1.Pod + initialCacheVolumeState v1beta1.CacheResizeState // pre-existing brokerState for mountPath, if any + expectedUpdatePvc bool + expectedCreatePvc bool + expectedDeletePvc bool + expectedError bool }{ { - // Pod is up: annotate old PVC as pending-deletion, create replacement PVC, - // set ConfigOutOfSync to trigger rolling upgrade. No error — state is durable. - testName: "size decrease with running pod — old PVC annotated, replacement PVC created", + // Pod is up, no prior resize state: record CacheResizePendingDeletion in brokerState, + // create replacement PVC, set ConfigOutOfSync to trigger rolling upgrade. + testName: "size decrease with running pod — resize state recorded, replacement PVC created", existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), existingPods: []corev1.Pod{runningPod}, - expectedUpdatePvc: true, + expectedUpdatePvc: false, expectedCreatePvc: true, expectedDeletePvc: false, expectedError: false, @@ -1428,59 +1428,52 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { { // Terminating pod is treated as having no running pod: staging starts immediately // so the replacement PVC is provisioned during the drain window. - testName: "size decrease with terminating pod — old PVC annotated, replacement PVC created", + testName: "size decrease with terminating pod — resize state recorded, replacement PVC created", existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), existingPods: []corev1.Pod{terminatingPod}, - expectedUpdatePvc: true, + expectedUpdatePvc: false, expectedCreatePvc: true, expectedDeletePvc: false, expectedError: false, }, { - // Pod is terminating and old PVC already has pending-deletion annotation: - // the Terminating pod is treated as gone, so cleanup fires immediately. - testName: "pending-deletion PVC with terminating pod — old PVC deleted", - existingPvc: func() *corev1.PersistentVolumeClaim { - pvc := makeTieredCachePvc("cache-pvc-1", "100Gi") - pvc.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizePendingDeletion - return pvc - }(), - desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), - existingPods: []corev1.Pod{terminatingPod}, - expectedUpdatePvc: false, - expectedCreatePvc: true, - expectedDeletePvc: true, - expectedError: false, + // Resize state already recorded and pod is terminating (treated as gone): + // cleanup fires — old PVC (larger size) is deleted, state is cleared. + testName: "pending-deletion state with terminating pod — old PVC deleted", + existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), + existingPods: []corev1.Pod{terminatingPod}, + initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + expectedUpdatePvc: false, + expectedCreatePvc: true, + expectedDeletePvc: true, + expectedError: false, }, { - // Pod is already gone on first detection: stage the replacement anyway so the - // state machine is consistent. Cleanup of pending-deletion PVC happens on the - // next cycle once the annotation is observed. - testName: "size decrease with pod already gone — old PVC annotated, replacement PVC created", + // Pod is already gone, no prior resize state: record state and create replacement PVC. + // Cleanup of old PVC happens on next cycle when reconciler re-observes the state. + testName: "size decrease with pod already gone — resize state recorded, replacement PVC created", existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), existingPods: []corev1.Pod{}, - expectedUpdatePvc: true, + expectedUpdatePvc: false, expectedCreatePvc: true, expectedDeletePvc: false, expectedError: false, }, { - // Pod is gone and old PVC already has pending-deletion annotation: - // cleanup fires — old PVC is deleted. Replacement PVC already exists. - testName: "size decrease with pod gone and old PVC pending-deletion — old PVC deleted", - existingPvc: func() *corev1.PersistentVolumeClaim { - pvc := makeTieredCachePvc("cache-pvc-1", "100Gi") - pvc.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizePendingDeletion - return pvc - }(), - desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), - existingPods: []corev1.Pod{}, - expectedUpdatePvc: false, - expectedCreatePvc: true, - expectedDeletePvc: true, - expectedError: false, + // Resize state already recorded and pod is gone: + // cleanup fires — old PVC (larger size) is deleted, state is cleared. + testName: "pending-deletion state with pod gone — old PVC deleted", + existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), + existingPods: []corev1.Pod{}, + initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + expectedUpdatePvc: false, + expectedCreatePvc: true, + expectedDeletePvc: true, + expectedError: false, }, { // size increase — no special handling, regular PVC update path. @@ -1503,18 +1496,28 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { mockClient := mocks.NewMockClient(mockCtrl) mockSubResourceClient := mocks.NewMockSubResourceClient(mockCtrl) - r := Reconciler{ - Reconciler: resources.Reconciler{ - Client: mockClient, - KafkaCluster: &v1beta1.KafkaCluster{ - ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: namespace}, - Spec: v1beta1.KafkaClusterSpec{ - Brokers: []v1beta1.Broker{{ - Id: 0, - BrokerConfig: &v1beta1.BrokerConfig{Roles: []string{"broker"}}, - }}, + kafkaCluster := &v1beta1.KafkaCluster{ + ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: namespace}, + Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{ + Id: 0, + BrokerConfig: &v1beta1.BrokerConfig{Roles: []string{"broker"}}, + }}, + }, + } + if test.initialCacheVolumeState != "" { + kafkaCluster.Status.BrokersState = map[string]v1beta1.BrokerState{ + brokerId: { + CacheVolumeStates: map[string]v1beta1.CacheResizeState{ + mountPath: test.initialCacheVolumeState, }, }, + } + } + r := Reconciler{ + Reconciler: resources.Reconciler{ + Client: mockClient, + KafkaCluster: kafkaCluster, }, } diff --git a/tests/e2e/test_tiered_storage_cache_resize.go b/tests/e2e/test_tiered_storage_cache_resize.go index b45ffdb04..0256775f8 100644 --- a/tests/e2e/test_tiered_storage_cache_resize.go +++ b/tests/e2e/test_tiered_storage_cache_resize.go @@ -37,11 +37,6 @@ const ( tsResizeInitialSize = "2Gi" tsResizeShrunkSize = "1Gi" - // Annotation keys written by the cache-resize reconciler. - pvcResizeStateAnnotation = "koperator.adobe.com/cache-resize-state" - pvcResizeStatePending = "pending-deletion" - pvcResizeStateReplacement = "replacement" - tsResizePhaseTimeout = 10 * time.Minute tsResizePollingInterval = 15 * time.Second @@ -53,6 +48,36 @@ type pvcItem struct { Name string Annotations map[string]string StorageSize string + Phase string +} + +// getCacheResizeState returns the CacheVolumeStates entry for the given broker and mount path +// from the KafkaCluster CR status, or an empty string if not set. +func getCacheResizeState(kubectlOptions k8s.KubectlOptions, clusterName, brokerID, mountPath string) (string, error) { + rawOutput, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + "get", kafkaKind, clusterName, + "--output", "json", + ) + if err != nil { + return "", errors.WrapIf(err, "getting KafkaCluster failed") + } + + var cr struct { + Status struct { + BrokersState map[string]struct { + CacheVolumeStates map[string]string `json:"cacheVolumeStates"` + } `json:"brokersState"` + } `json:"status"` + } + if err := json.Unmarshal([]byte(rawOutput), &cr); err != nil { + return "", errors.WrapIf(err, "parsing KafkaCluster JSON failed") + } + + brokerState, ok := cr.Status.BrokersState[brokerID] + if !ok { + return "", nil + } + return brokerState.CacheVolumeStates[mountPath], nil } // listBrokerCachePVCs returns PVCs for broker tsResizeBrokerID that have the @@ -82,6 +107,9 @@ func listBrokerCachePVCs(kubectlOptions k8s.KubectlOptions) ([]pvcItem, error) { } `json:"requests"` } `json:"resources"` } `json:"spec"` + Status struct { + Phase string `json:"phase"` + } `json:"status"` } `json:"items"` } if err := json.Unmarshal([]byte(rawOutput), &pvcList); err != nil { @@ -95,6 +123,7 @@ func listBrokerCachePVCs(kubectlOptions k8s.KubectlOptions) ([]pvcItem, error) { Name: item.Metadata.Name, Annotations: item.Metadata.Annotations, StorageSize: item.Spec.Resources.Requests.Storage, + Phase: item.Status.Phase, }) } } @@ -265,33 +294,31 @@ func testTieredStorageCachePvcResize() bool { applyK8sResourceManifest(kubectlOptions, tsResizeShrunkManifest) }) - ginkgo.It("Phase 1: old PVC annotated pending-deletion and replacement PVC created", func() { - ginkgo.By("Waiting until both pending-deletion and replacement PVCs coexist for broker 0") + ginkgo.It("Phase 1: resize state recorded in brokerState and replacement PVC created", func() { + ginkgo.By("Waiting until CacheVolumeStates has pending-deletion and two PVCs coexist for broker 0") gomega.Eventually(context.Background(), func() error { - pvcs, err := listBrokerCachePVCs(kubectlOptions) + state, err := getCacheResizeState(kubectlOptions, tsResizeClusterName, + fmt.Sprintf("%d", tsResizeBrokerID), tsResizeCacheMountPath) if err != nil { return err } - var hasPendingDeletion, hasReplacement bool - for _, pvc := range pvcs { - switch pvc.Annotations[pvcResizeStateAnnotation] { - case pvcResizeStatePending: - hasPendingDeletion = true - case pvcResizeStateReplacement: - hasReplacement = true - } + if state != "pending-deletion" { + return fmt.Errorf("expected cacheVolumeStates[%s]=%q, got %q", + tsResizeCacheMountPath, "pending-deletion", state) } - if !hasPendingDeletion { - return errors.New("no PVC with pending-deletion annotation yet") + pvcs, err := listBrokerCachePVCs(kubectlOptions) + if err != nil { + return err } - if !hasReplacement { - return errors.New("no PVC with replacement annotation yet") + if len(pvcs) < 2 { + return fmt.Errorf("expected 2 cache PVCs (old + replacement) for broker %d, got %d", + tsResizeBrokerID, len(pvcs)) } return nil }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) }) - ginkgo.It("Phase 2: broker pod restarts, pending-deletion PVC is deleted", func() { + ginkgo.It("Phase 2: broker pod restarts, old PVC is deleted, resize state is cleared", func() { ginkgo.By("Waiting for broker-0 pod to be recycled (UID change indicates rolling restart)") // We detect recycling by UID change rather than waiting for the pod count to hit zero. // The pod may restart fast enough to be back before the next polling tick, causing a @@ -308,42 +335,53 @@ func testTieredStorageCachePvcResize() bool { return nil }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) - ginkgo.By("Waiting for the pending-deletion PVC to be deleted") + ginkgo.By("Waiting for old PVC to be deleted and resize state to be cleared") gomega.Eventually(context.Background(), func() error { + // The old (larger) PVC should be gone — only the replacement should remain. pvcs, err := listBrokerCachePVCs(kubectlOptions) if err != nil { return err } + activePvcs := make([]pvcItem, 0, len(pvcs)) for _, pvc := range pvcs { - if pvc.Annotations[pvcResizeStateAnnotation] == pvcResizeStatePending { - return fmt.Errorf("pending-deletion PVC %s still exists", pvc.Name) + // Ignore PVCs that are being deleted (DeletionTimestamp set but not yet gone). + if pvc.Phase != "" { + activePvcs = append(activePvcs, pvc) } } + if len(activePvcs) != 1 { + return fmt.Errorf("expected 1 active cache PVC for broker %d after pod restart, got %d", + tsResizeBrokerID, len(activePvcs)) + } + // The resize state should be cleared once the old PVC is gone. + state, err := getCacheResizeState(kubectlOptions, tsResizeClusterName, + fmt.Sprintf("%d", tsResizeBrokerID), tsResizeCacheMountPath) + if err != nil { + return err + } + if state != "" { + return fmt.Errorf("expected cacheVolumeStates[%s] to be cleared, got %q", + tsResizeCacheMountPath, state) + } return nil }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) }) - ginkgo.It("Phase 3: broker pod running again with new PVC, replacement annotation stripped", func() { + ginkgo.It("Phase 3: broker pod running again with the new smaller PVC", func() { ginkgo.By("Waiting for the broker-0 pod to come back Ready") err = waitK8sResourceCondition(kubectlOptions, "pod", "condition=Ready", tsResizePhaseTimeout, fmt.Sprintf("%s=%s,brokerId=0,app=kafka", kafkaCRLabelKey, tsResizeClusterName), "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Waiting for replacement annotation to be stripped from the surviving PVC") - gomega.Eventually(context.Background(), func() error { - pvcs, err := listBrokerCachePVCs(kubectlOptions) - if err != nil { - return err - } - if len(pvcs) != 1 { - return fmt.Errorf("expected 1 cache PVC for broker 0, got %d", len(pvcs)) - } - if state := pvcs[0].Annotations[pvcResizeStateAnnotation]; state != "" { - return fmt.Errorf("cache-resize-state annotation still present: %q", state) - } - return nil - }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) + ginkgo.By("Verifying exactly one cache PVC remains with no resize state") + pvcs, err := listBrokerCachePVCs(kubectlOptions) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(pvcs).To(gomega.HaveLen(1), "expected exactly one cache PVC after resize completes") + state, err := getCacheResizeState(kubectlOptions, tsResizeClusterName, + fmt.Sprintf("%d", tsResizeBrokerID), tsResizeCacheMountPath) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(state).To(gomega.BeEmpty(), "cacheVolumeStates entry should be cleared after resize") }) ginkgo.It("Verifying the surviving cache PVC has the new size "+tsResizeShrunkSize, func() { From 7c22df7f00781deeefbba21295c3821df301c2ce Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Tue, 7 Apr 2026 14:40:05 +0300 Subject: [PATCH 10/19] lint, test timeout and generated manifest --- api/v1beta1/zz_generated.deepcopy.go | 7 + charts/kafka-operator/crds/kafkaclusters.yaml | 11 +- .../kafka.banzaicloud.io_kafkaclusters.yaml | 11 +- pkg/resources/kafka/kafka.go | 165 ++++++++++-------- tests/e2e/test_multidisk_removal.go | 11 +- 5 files changed, 120 insertions(+), 85 deletions(-) diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index cef0ed5b3..639d426b0 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -210,6 +210,13 @@ func (in *BrokerState) DeepCopyInto(out *BrokerState) { *out = make(ExternalListenerConfigNames, len(*in)) copy(*out, *in) } + if in.CacheVolumeStates != nil { + in, out := &in.CacheVolumeStates, &out.CacheVolumeStates + *out = make(map[string]CacheResizeState, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BrokerState. diff --git a/charts/kafka-operator/crds/kafkaclusters.yaml b/charts/kafka-operator/crds/kafkaclusters.yaml index 0debe0c6f..8b5dff9a0 100644 --- a/charts/kafka-operator/crds/kafkaclusters.yaml +++ b/charts/kafka-operator/crds/kafkaclusters.yaml @@ -23827,13 +23827,12 @@ spec: properties: cacheVolumeStates: additionalProperties: - description: CacheResizeState tracks the resize lifecycle of - a tiered storage cache PVC for a given mount path. + description: CacheResizeState tracks the resize lifecycle + of a tiered storage cache PVC for a given mount path. type: string - description: CacheVolumeStates tracks in-flight tiered storage - cache PVC resize operations, keyed by mount path. An entry - is present only while a resize is in progress; it is cleared - once cleanup completes. + description: |- + CacheVolumeStates tracks in-flight tiered storage cache PVC resize operations, keyed by mount path. + An entry is present only while a resize is in progress; it is cleared once cleanup completes. type: object configurationBackup: description: Compressed data from broker configuration to restore diff --git a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml index 0debe0c6f..8b5dff9a0 100644 --- a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml +++ b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml @@ -23827,13 +23827,12 @@ spec: properties: cacheVolumeStates: additionalProperties: - description: CacheResizeState tracks the resize lifecycle of - a tiered storage cache PVC for a given mount path. + description: CacheResizeState tracks the resize lifecycle + of a tiered storage cache PVC for a given mount path. type: string - description: CacheVolumeStates tracks in-flight tiered storage - cache PVC resize operations, keyed by mount path. An entry - is present only while a resize is in progress; it is cleared - once cleanup completes. + description: |- + CacheVolumeStates tracks in-flight tiered storage cache PVC resize operations, keyed by mount path. + An entry is present only while a resize is in progress; it is cleared once cleanup completes. type: object configurationBackup: description: Compressed data from broker configuration to restore diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index fa2af8f13..f9a144765 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -1239,60 +1239,13 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro continue } - // Delete tiered storage cache PVCs whose mount path is no longer desired. - // These are not Kafka log dirs so they must never go through CC disk removal. - for i := range pvcList.Items { - pvc := &pvcList.Items[i] - if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { - continue - } - if !desiredMountPaths[pvc.Annotations["mountPath"]] { - log.Info("Deleting removed tiered storage cache PVC", - "brokerId", brokerId, "mountPath", pvc.Annotations["mountPath"], "pvc", pvc.Name) - if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { - return errorfactory.New(errorfactory.APIFailure{}, err, - "deleting removed tiered storage cache PVC failed", "pvc", pvc.Name) - } - } + if err := r.deleteRemovedCachePVCs(ctx, log, brokerId, pvcList, desiredMountPaths); err != nil { + return err } - // Delete duplicate non-terminating cache PVCs at desired mount paths when no resize is in - // flight. These are orphaned replacements from a double-staging race (two reconcile cycles - // both staged a replacement before the first deletion propagated). Keep only the Bound one; - // delete any Pending duplicates. Safe only when the broker pod is down. if !brokerPodExists { - cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates - boundByMountPath := make(map[string]bool) - for _, pvc := range pvcList.Items { - if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { - continue - } - mp := pvc.Annotations["mountPath"] - if !desiredMountPaths[mp] || cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { - continue - } - if pvc.Status.Phase == corev1.ClaimBound { - boundByMountPath[mp] = true - } - } - for i := range pvcList.Items { - pvc := &pvcList.Items[i] - if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { - continue - } - mp := pvc.Annotations["mountPath"] - if !desiredMountPaths[mp] || cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { - continue - } - // Delete a Pending duplicate only when a Bound PVC already exists at this path. - if pvc.Status.Phase == corev1.ClaimPending && boundByMountPath[mp] { - log.Info("Deleting orphaned Pending cache PVC (duplicate at same mount path)", - "brokerId", brokerId, "mountPath", mp, "pvc", pvc.Name) - if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { - return errorfactory.New(errorfactory.APIFailure{}, err, - "deleting orphaned Pending cache PVC failed", "pvc", pvc.Name) - } - } + if err := r.cleanupOrphanedDuplicateCachePVCs(ctx, log, brokerId, pvcList, desiredMountPaths); err != nil { + return err } } @@ -1301,24 +1254,7 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro return errorfactory.New(errorfactory.APIFailure{}, err, "getting resource failed", "kind", desiredType) } - // Handle disk removal — for mount paths with an in-flight cache resize two PVCs temporarily - // coexist (old pending-deletion + new replacement). Count each such path only once to avoid - // a false-positive disk-removal trigger. - cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates - countedCacheMountPaths := make(map[string]bool) - effectivePvcCount := 0 - for _, pvc := range pvcList.Items { - mp := pvc.Annotations["mountPath"] - if cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { - if !countedCacheMountPaths[mp] { - countedCacheMountPaths[mp] = true - effectivePvcCount++ - } - } else { - effectivePvcCount++ - } - } - if effectivePvcCount > len(desiredPvcs) { + if r.effectivePvcCount(brokerId, pvcList) > len(desiredPvcs) { waitForDiskRemovalToFinish, err = handleDiskRemoval(ctx, pvcList, desiredPvcs, r, brokerId, log, desiredType, brokerVolumesState) if err != nil { return err @@ -1348,6 +1284,97 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro return nil } +// deleteRemovedCachePVCs deletes tiered storage cache PVCs whose mount path is no longer desired. +// These volumes are not Kafka log dirs and must never go through CC disk removal. +func (r *Reconciler) deleteRemovedCachePVCs( + ctx context.Context, + log logr.Logger, + brokerId string, + pvcList *corev1.PersistentVolumeClaimList, + desiredMountPaths map[string]bool, +) error { + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { + continue + } + if !desiredMountPaths[pvc.Annotations["mountPath"]] { + log.Info("Deleting removed tiered storage cache PVC", + "brokerId", brokerId, "mountPath", pvc.Annotations["mountPath"], "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return errorfactory.New(errorfactory.APIFailure{}, err, + "deleting removed tiered storage cache PVC failed", "pvc", pvc.Name) + } + } + } + return nil +} + +// cleanupOrphanedDuplicateCachePVCs removes Pending cache PVCs when a Bound PVC already exists at +// the same mount path and no resize is in flight. These are orphaned replacements left by a +// double-staging race. Safe to call only when the broker pod is down. +func (r *Reconciler) cleanupOrphanedDuplicateCachePVCs( + ctx context.Context, + log logr.Logger, + brokerId string, + pvcList *corev1.PersistentVolumeClaimList, + desiredMountPaths map[string]bool, +) error { + cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates + boundByMountPath := make(map[string]bool) + for _, pvc := range pvcList.Items { + if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { + continue + } + mp := pvc.Annotations["mountPath"] + if !desiredMountPaths[mp] || cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { + continue + } + if pvc.Status.Phase == corev1.ClaimBound { + boundByMountPath[mp] = true + } + } + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { + continue + } + mp := pvc.Annotations["mountPath"] + if !desiredMountPaths[mp] || cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { + continue + } + if pvc.Status.Phase == corev1.ClaimPending && boundByMountPath[mp] { + log.Info("Deleting orphaned Pending cache PVC (duplicate at same mount path)", + "brokerId", brokerId, "mountPath", mp, "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return errorfactory.New(errorfactory.APIFailure{}, err, + "deleting orphaned Pending cache PVC failed", "pvc", pvc.Name) + } + } + } + return nil +} + +// effectivePvcCount returns the number of PVCs for a broker, counting each pending-deletion cache +// mount path only once (old + replacement coexist during a shrink but represent one logical disk). +func (r *Reconciler) effectivePvcCount(brokerId string, pvcList *corev1.PersistentVolumeClaimList) int { + cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates + counted := make(map[string]bool) + total := 0 + for _, pvc := range pvcList.Items { + mp := pvc.Annotations["mountPath"] + if cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { + if !counted[mp] { + counted[mp] = true + total++ + } + } else { + total++ + } + } + return total +} + // getBrokerPodExists returns true if a non-terminating pod exists for the given broker. func (r *Reconciler) getBrokerPodExists(ctx context.Context, brokerId string) (bool, error) { brokerPodList := &corev1.PodList{} diff --git a/tests/e2e/test_multidisk_removal.go b/tests/e2e/test_multidisk_removal.go index 18a730c89..b68f209b3 100644 --- a/tests/e2e/test_multidisk_removal.go +++ b/tests/e2e/test_multidisk_removal.go @@ -33,9 +33,10 @@ import ( ) const ( - multidiskRemovalTimeout = 1000 * time.Second // this test can take long - multidiskRemovalPollInterval = 15 * time.Second - brokerConfigTemplateFormat = "%s-config-%d" + multidiskRemovalTimeout = 1000 * time.Second // this test can take long + multidiskRemovalPollInterval = 15 * time.Second + multidiskRemovalBrokerReadinessWait = 360 * time.Second // rolling restart of all brokers after disk removal + brokerConfigTemplateFormat = "%s-config-%d" ) var ( @@ -75,7 +76,9 @@ func testMultiDiskRemoval() bool { }) ginkgo.It("Asserting Kafka brokers remain healthy", func() { - err := waitK8sResourceCondition(kubectlOptions, "pod", "condition=Ready", defaultPodReadinessWaitTime, + // Use multidiskRemovalBrokerReadinessWait: after disk removal, CC triggers a rolling + // restart of all brokers, so 180s (defaultPodReadinessWaitTime) is too tight in kind. + err := waitK8sResourceCondition(kubectlOptions, "pod", "condition=Ready", multidiskRemovalBrokerReadinessWait, v1beta1.KafkaCRLabelKey+"="+kafkaClusterName+",app=kafka", "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) }) From 1cb25d2dadfbf63e96347d14b4e204ca408d102a Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Wed, 8 Apr 2026 15:53:57 +0300 Subject: [PATCH 11/19] fix e2e test --- pkg/resources/kafka/kafka.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index f9a144765..651e15c5e 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -1660,14 +1660,6 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( continue } - if !k8sutil.CheckIfObjectUpdated(log, desiredType, currentPvc, desiredPvc) { - continue - } - - if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { - return errors.WrapIf(err, "could not apply last state to annotation") - } - // Check if this is a tiered storage cache volume. Fall back to the desired PVC annotation // for PVCs created before the tieredStorageCache annotation was introduced. isTieredCache := currentPvc.Annotations["tieredStorageCache"] == annotationTrue || @@ -1685,6 +1677,14 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( continue } + if !k8sutil.CheckIfObjectUpdated(log, desiredType, currentPvc, desiredPvc) { + continue + } + + if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { + return errors.WrapIf(err, "could not apply last state to annotation") + } + // Regular validation: size decreases are forbidden for non-cache volumes. if isDesiredStorageValueInvalid(desiredPvc, currentPvc) { return errorfactory.New(errorfactory.InternalError{}, errors.New("could not modify pvc size"), From 262a9ebe27e3826e734c8dc9383103f7f96be9d8 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Wed, 6 May 2026 18:22:09 +0300 Subject: [PATCH 12/19] fix: prevent data loss via tieredStorageCache spec flip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A KafkaCluster CR writer could flip tieredStorageCache from false to true on an existing storageConfigs entry, causing the operator to silently exclude the volume from log.dirs and Cruise Control capacity, and subsequently route a shrink through the delete-and-recreate path that bypasses graceful disk drain — destroying live log data. Fix in three layers: 1. Validating webhook rejects flips on existing storageConfigs entries (matched per mountPath, scoped per BrokerConfigGroup and per Broker). 2. deleteRemovedCachePVCs is now gated on !brokerPodExists so a removal while the broker pod is still running cannot issue a Delete against a mounted PVC. 3. The tiered-cache classification trusts only the persisted PVC annotation, never the desired-spec annotation. The annotation backfill block is removed, since it converted the unprotected CR field into a persistent ground-truth tag on live data PVCs. Co-Authored-By: Claude Opus 4.7 (1M context) --- pkg/resources/kafka/kafka.go | 36 ++--- pkg/webhooks/errors.go | 5 + pkg/webhooks/kafkacluster_validator.go | 58 ++++++++ pkg/webhooks/kafkacluster_validator_test.go | 139 ++++++++++++++++++++ 4 files changed, 213 insertions(+), 25 deletions(-) diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index df88e562d..42c44b18f 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -1238,11 +1238,13 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro continue } - if err := r.deleteRemovedCachePVCs(ctx, log, brokerId, pvcList, desiredMountPaths); err != nil { - return err - } - + // Only delete cache PVCs when the broker pod is gone — issuing Delete against a mounted + // PVC sets the DeletionTimestamp and is held by the pvc-protection finalizer, but the + // intent is wrong and the resulting PVC state confuses subsequent reconciles. if !brokerPodExists { + if err := r.deleteRemovedCachePVCs(ctx, log, brokerId, pvcList, desiredMountPaths); err != nil { + return err + } if err := r.cleanupOrphanedDuplicateCachePVCs(ctx, log, brokerId, pvcList, desiredMountPaths); err != nil { return err } @@ -1602,23 +1604,6 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( currentPvc = pvc.DeepCopy() alreadyCreated = true - // Backfill the tieredStorageCache annotation if the desired PVC has it but the existing - // PVC does not — handles PVCs created before the annotation was introduced. - if desiredPvc.Annotations["tieredStorageCache"] == annotationTrue && - currentPvc.Annotations["tieredStorageCache"] != annotationTrue { - pvcCopy := currentPvc.DeepCopy() - if pvcCopy.Annotations == nil { - pvcCopy.Annotations = make(map[string]string) - } - pvcCopy.Annotations["tieredStorageCache"] = annotationTrue - if err := r.Update(ctx, pvcCopy); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, - "backfilling tieredStorageCache annotation on existing PVC failed", "pvc", pvcCopy.Name) - } - log.Info("Backfilled tieredStorageCache annotation on existing PVC", "pvc", pvcCopy.Name) - currentPvc = pvcCopy - } - // Trigger a CC disk rebalance only for regular data volumes. // Tiered storage cache PVCs are ephemeral — CC must not account for them. if currentPvc.Annotations["tieredStorageCache"] != annotationTrue { @@ -1659,10 +1644,11 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( continue } - // Check if this is a tiered storage cache volume. Fall back to the desired PVC annotation - // for PVCs created before the tieredStorageCache annotation was introduced. - isTieredCache := currentPvc.Annotations["tieredStorageCache"] == annotationTrue || - desiredPvc.Annotations["tieredStorageCache"] == annotationTrue + // Trust only the persisted PVC annotation when classifying — never the desired-spec + // annotation. Reading from the desired side would let a CR-edit (flipping + // tieredStorageCache: false → true on an existing data volume) route a real log-dir PVC + // through the cache-shrink delete-and-recreate path, bypassing graceful disk drain. + isTieredCache := currentPvc.Annotations["tieredStorageCache"] == annotationTrue desiredSize := desiredPvc.Spec.Resources.Requests.Storage().Value() currentSize := currentPvc.Spec.Resources.Requests.Storage().Value() diff --git a/pkg/webhooks/errors.go b/pkg/webhooks/errors.go index 58f3d9ef1..0938847e5 100644 --- a/pkg/webhooks/errors.go +++ b/pkg/webhooks/errors.go @@ -29,6 +29,7 @@ const ( unsupportedRemovingStorageMsg = "removing storage from a broker is not supported" invalidExternalListenerStartingPortErrMsg = "invalid external listener starting port number" invalidContainerPortForIngressControllerErrMsg = "invalid trarget port number for ingress controller deployment" + immutableTieredStorageCacheErrMsg = "tieredStorageCache is immutable on existing storageConfigs entries; remove the entry and re-add to change" // errorDuringValidationMsg is added to infrastructure errors (e.g. failed to connect), but not to field validation errors errorDuringValidationMsg = "error during validation" @@ -62,6 +63,10 @@ func IsAdmissionInvalidExternalListenerPort(err error) bool { return apierrors.IsInvalid(err) && strings.Contains(err.Error(), invalidExternalListenerStartingPortErrMsg) } +func IsAdmissionImmutableTieredStorageCache(err error) bool { + return apierrors.IsInvalid(err) && strings.Contains(err.Error(), immutableTieredStorageCacheErrMsg) +} + func IsAdmissionErrorDuringValidation(err error) bool { return apierrors.IsInternalError(err) && strings.Contains(err.Error(), errorDuringValidationMsg) } diff --git a/pkg/webhooks/kafkacluster_validator.go b/pkg/webhooks/kafkacluster_validator.go index d9f7b8932..6b51d7901 100644 --- a/pkg/webhooks/kafkacluster_validator.go +++ b/pkg/webhooks/kafkacluster_validator.go @@ -39,6 +39,7 @@ type KafkaClusterValidator struct { func (s KafkaClusterValidator) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (warnings admission.Warnings, err error) { var allErrs field.ErrorList + kafkaClusterOld := oldObj.(*banzaicloudv1beta1.KafkaCluster) kafkaClusterNew := newObj.(*banzaicloudv1beta1.KafkaCluster) log := s.Log.WithValues("name", kafkaClusterNew.GetName(), "namespace", kafkaClusterNew.GetNamespace()) @@ -47,6 +48,8 @@ func (s KafkaClusterValidator) ValidateUpdate(ctx context.Context, oldObj, newOb allErrs = append(allErrs, listenerErrs...) } + allErrs = append(allErrs, checkTieredStorageCacheImmutability(&kafkaClusterOld.Spec, &kafkaClusterNew.Spec)...) + if len(allErrs) == 0 { return nil, nil } @@ -81,6 +84,61 @@ func (s KafkaClusterValidator) ValidateDelete(ctx context.Context, obj runtime.O return nil, nil } +// checkTieredStorageCacheImmutability rejects updates that flip the TieredStorageCache flag on an +// existing storageConfigs entry (matched by mountPath, scoped per brokerConfigGroup or per broker). +// Flipping false→true on a live data PVC silently removes it from log.dirs and Cruise Control +// capacity, and enables a delete-and-recreate shrink path that bypasses graceful disk drain — an +// irreversible data-loss vector when applied to a volume that already holds Kafka log segments. +// To change this property, callers must remove the entry and re-add it in a subsequent apply. +func checkTieredStorageCacheImmutability(oldSpec, newSpec *banzaicloudv1beta1.KafkaClusterSpec) field.ErrorList { + var allErrs field.ErrorList + + for groupName, newGroup := range newSpec.BrokerConfigGroups { + oldGroup, ok := oldSpec.BrokerConfigGroups[groupName] + if !ok { + continue + } + groupPath := field.NewPath("spec").Child("brokerConfigGroups").Key(groupName).Child("storageConfigs") + allErrs = append(allErrs, diffTieredStorageCache(oldGroup.StorageConfigs, newGroup.StorageConfigs, groupPath)...) + } + + oldByID := make(map[int32]banzaicloudv1beta1.Broker, len(oldSpec.Brokers)) + for _, b := range oldSpec.Brokers { + oldByID[b.Id] = b + } + for i, newBroker := range newSpec.Brokers { + oldBroker, ok := oldByID[newBroker.Id] + if !ok || oldBroker.BrokerConfig == nil || newBroker.BrokerConfig == nil { + continue + } + brokerPath := field.NewPath("spec").Child("brokers").Index(i).Child("brokerConfig").Child("storageConfigs") + allErrs = append(allErrs, diffTieredStorageCache(oldBroker.BrokerConfig.StorageConfigs, newBroker.BrokerConfig.StorageConfigs, brokerPath)...) + } + + return allErrs +} + +func diffTieredStorageCache(oldConfigs, newConfigs []banzaicloudv1beta1.StorageConfig, basePath *field.Path) field.ErrorList { + var allErrs field.ErrorList + oldByPath := make(map[string]banzaicloudv1beta1.StorageConfig, len(oldConfigs)) + for _, sc := range oldConfigs { + oldByPath[sc.MountPath] = sc + } + for i, newSC := range newConfigs { + oldSC, ok := oldByPath[newSC.MountPath] + if !ok { + continue + } + if oldSC.TieredStorageCache != newSC.TieredStorageCache { + allErrs = append(allErrs, field.Forbidden( + basePath.Index(i).Child("tieredStorageCache"), + immutableTieredStorageCacheErrMsg, + )) + } + } + return allErrs +} + // checkListeners validates the spec.listenersConfig object func checkInternalAndExternalListeners(kafkaClusterSpec *banzaicloudv1beta1.KafkaClusterSpec) field.ErrorList { var allErrs field.ErrorList diff --git a/pkg/webhooks/kafkacluster_validator_test.go b/pkg/webhooks/kafkacluster_validator_test.go index 952c3f389..6193a9a8b 100644 --- a/pkg/webhooks/kafkacluster_validator_test.go +++ b/pkg/webhooks/kafkacluster_validator_test.go @@ -258,6 +258,145 @@ func TestCheckExternalListenerStartingPort(t *testing.T) { } } +func TestCheckTieredStorageCacheImmutability(t *testing.T) { + mp := func(path string, tc bool) v1beta1.StorageConfig { + return v1beta1.StorageConfig{MountPath: path, TieredStorageCache: tc} + } + + testCases := []struct { + testName string + oldSpec v1beta1.KafkaClusterSpec + newSpec v1beta1.KafkaClusterSpec + expected field.ErrorList + }{ + { + testName: "no change — empty", + oldSpec: v1beta1.KafkaClusterSpec{}, + newSpec: v1beta1.KafkaClusterSpec{}, + expected: nil, + }, + { + testName: "brokerConfigGroups: flip false→true on existing mountPath rejected", + oldSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", false)}}, + }, + }, + newSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", true)}}, + }, + }, + expected: append(field.ErrorList{}, + field.Forbidden( + field.NewPath("spec").Child("brokerConfigGroups").Key("default").Child("storageConfigs").Index(0).Child("tieredStorageCache"), + immutableTieredStorageCacheErrMsg, + ), + ), + }, + { + testName: "brokerConfigGroups: flip true→false on existing mountPath rejected", + oldSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", true)}}, + }, + }, + newSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", false)}}, + }, + }, + expected: append(field.ErrorList{}, + field.Forbidden( + field.NewPath("spec").Child("brokerConfigGroups").Key("default").Child("storageConfigs").Index(0).Child("tieredStorageCache"), + immutableTieredStorageCacheErrMsg, + ), + ), + }, + { + testName: "brokerConfigGroups: new entry with tieredStorageCache=true allowed", + oldSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", false)}}, + }, + }, + newSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "default": {StorageConfigs: []v1beta1.StorageConfig{ + mp("/kafka-logs/0", false), + mp("/cache/0", true), + }}, + }, + }, + expected: nil, + }, + { + testName: "brokerConfigGroups: remove entry, then re-add with different value — allowed (no entry to match against in old at the same mountPath after removal scenario; test single-pass: removed entry simply not in new)", + oldSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", false)}}, + }, + }, + newSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "default": {StorageConfigs: []v1beta1.StorageConfig{}}, + }, + }, + expected: nil, + }, + { + testName: "brokers[].brokerConfig: flip false→true rejected", + oldSpec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ + StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", false)}, + }}}, + }, + newSpec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ + StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", true)}, + }}}, + }, + expected: append(field.ErrorList{}, + field.Forbidden( + field.NewPath("spec").Child("brokers").Index(0).Child("brokerConfig").Child("storageConfigs").Index(0).Child("tieredStorageCache"), + immutableTieredStorageCacheErrMsg, + ), + ), + }, + { + testName: "brokers[].brokerConfig: nil old config — no error (entry didn't exist)", + oldSpec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: nil}}, + }, + newSpec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ + StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", true)}, + }}}, + }, + expected: nil, + }, + { + testName: "brokerConfigGroup not present in old — no error (whole group is new)", + oldSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{}, + }, + newSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "new-group": {StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", true)}}, + }, + }, + expected: nil, + }, + } + + for _, testCase := range testCases { + t.Run(testCase.testName, func(t *testing.T) { + got := checkTieredStorageCacheImmutability(&testCase.oldSpec, &testCase.newSpec) + require.Equal(t, testCase.expected, got) + }) + } +} + func TestCheckTargetPortsCollisionForEnvoy(t *testing.T) { testCases := []struct { testName string From e84b2402efb1e766734b251fbb6200b7b09c5d67 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Wed, 6 May 2026 18:23:28 +0300 Subject: [PATCH 13/19] fix: eliminate cache PVC duplication and orphan-cleanup race Two related correctness issues in the tiered storage cache resize path: (HIGH-3) When the client times out on the replacement-PVC Create but the server has already accepted the request, the next reconcile cycle's cache hasn't observed the new PVC yet. The size filter in reconcileDesiredPvcsForBroker then excludes the old PVC, alreadyCreated remains false, and a second Create with a fresh GenerateName produces a duplicate replacement. Add a strongly-consistent (DirectClient) read before falling through to Create, gated on CacheResizePendingDeletion state at this mountPath. If a replacement at the desired size already exists, reuse it. (HIGH-2) cleanupOrphanedDuplicateCachePVCs existed specifically to mop up the duplicates produced by the staging race above. Its own implementation had a cache-lag bug: after handleBrokerCacheResizeCleanup deleted the old PVC and cleared the in-memory state in the same cycle, the cached PVC list still showed the deleted PVC as Bound (no DeletionTimestamp yet), so the function classified the legitimate replacement as a duplicate and deleted it. With the staging race closed, the cleanup function has no remaining purpose, and keeping a function whose implementation has a race that deletes legitimate replacements is worse than removing it. Delete the function and its call site. Co-Authored-By: Claude Opus 4.7 (1M context) --- pkg/resources/kafka/kafka.go | 76 +++++++++++++----------------------- 1 file changed, 28 insertions(+), 48 deletions(-) diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index 42c44b18f..1e7f205c4 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -1245,9 +1245,6 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro if err := r.deleteRemovedCachePVCs(ctx, log, brokerId, pvcList, desiredMountPaths); err != nil { return err } - if err := r.cleanupOrphanedDuplicateCachePVCs(ctx, log, brokerId, pvcList, desiredMountPaths); err != nil { - return err - } } // Re-list so the disk removal count below reflects the deletions above. @@ -1311,51 +1308,6 @@ func (r *Reconciler) deleteRemovedCachePVCs( return nil } -// cleanupOrphanedDuplicateCachePVCs removes Pending cache PVCs when a Bound PVC already exists at -// the same mount path and no resize is in flight. These are orphaned replacements left by a -// double-staging race. Safe to call only when the broker pod is down. -func (r *Reconciler) cleanupOrphanedDuplicateCachePVCs( - ctx context.Context, - log logr.Logger, - brokerId string, - pvcList *corev1.PersistentVolumeClaimList, - desiredMountPaths map[string]bool, -) error { - cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates - boundByMountPath := make(map[string]bool) - for _, pvc := range pvcList.Items { - if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { - continue - } - mp := pvc.Annotations["mountPath"] - if !desiredMountPaths[mp] || cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { - continue - } - if pvc.Status.Phase == corev1.ClaimBound { - boundByMountPath[mp] = true - } - } - for i := range pvcList.Items { - pvc := &pvcList.Items[i] - if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { - continue - } - mp := pvc.Annotations["mountPath"] - if !desiredMountPaths[mp] || cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { - continue - } - if pvc.Status.Phase == corev1.ClaimPending && boundByMountPath[mp] { - log.Info("Deleting orphaned Pending cache PVC (duplicate at same mount path)", - "brokerId", brokerId, "mountPath", mp, "pvc", pvc.Name) - if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { - return errorfactory.New(errorfactory.APIFailure{}, err, - "deleting orphaned Pending cache PVC failed", "pvc", pvc.Name) - } - } - } - return nil -} - // effectivePvcCount returns the number of PVCs for a broker, counting each pending-deletion cache // mount path only once (old + replacement coexist during a shrink but represent one logical disk). func (r *Reconciler) effectivePvcCount(brokerId string, pvcList *corev1.PersistentVolumeClaimList) int { @@ -1620,6 +1572,34 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( } if !alreadyCreated { + // During an in-flight cache resize, a prior reconcile cycle may have already created + // the replacement PVC server-side even if the client got a timeout, or the cached + // client may not yet have observed an in-flight Create. The size filter above + // excludes the old PVC, so alreadyCreated is false even when a replacement exists. + // A strongly-consistent (uncached) read picks up that replacement so we don't issue a + // second Create with a fresh GenerateName. + if r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates[mountPath] == banzaiv1beta1.CacheResizePendingDeletion { + liveList := &corev1.PersistentVolumeClaimList{} + if err := r.DirectClient.List(ctx, liveList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, "uncached list of PVCs failed", "kind", desiredType) + } + desiredSize := desiredPvc.Spec.Resources.Requests.Storage().Value() + for i := range liveList.Items { + p := &liveList.Items[i] + if p.DeletionTimestamp != nil || p.Annotations["mountPath"] != mountPath { + continue + } + if p.Spec.Resources.Requests.Storage().Value() == desiredSize { + log.Info("Replacement cache PVC already exists from a prior partial attempt; skipping Create", + "brokerId", brokerId, "mountPath", mountPath, "pvc", p.Name) + alreadyCreated = true + break + } + } + if alreadyCreated { + continue + } + } // Creating the 2+ PersistentVolumes for Pod if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { return errors.WrapIf(err, "could not apply last state to annotation") From eeae9566ac34343ee4eb8a80dfeeb07140a93f5f Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Thu, 7 May 2026 10:36:11 +0300 Subject: [PATCH 14/19] refactor: replace tieredStorageCache annotation key literal with constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PVC annotation key "tieredStorageCache" was a bare string literal at five sites across pvc.go and kafka.go (write at PVC creation, three reads gating cache-specific behavior, one test fixture). A typo at any read site would silently misclassify a cache PVC as a regular log-dir PVC, sending it through Cruise Control rebalance — silent data loss surface. Define TieredStorageCacheAnnotationKey in api/v1beta1 alongside the existing label/annotation key constants and reference it everywhere. Compile-time-checked from now on. Co-Authored-By: Claude Opus 4.7 (1M context) --- api/v1beta1/kafkacluster_types.go | 6 ++++++ pkg/resources/kafka/kafka.go | 6 +++--- pkg/resources/kafka/kafka_test.go | 4 ++-- pkg/resources/kafka/pvc.go | 2 +- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/api/v1beta1/kafkacluster_types.go b/api/v1beta1/kafkacluster_types.go index 32e4bb3a8..681ba483e 100644 --- a/api/v1beta1/kafkacluster_types.go +++ b/api/v1beta1/kafkacluster_types.go @@ -52,6 +52,12 @@ const ( // IsControllerNodeKey is used to identify if the kafka pod is a controller or broker_controller IsControllerNodeKey = "isControllerNode" + // TieredStorageCacheAnnotationKey marks a PVC as backing a Kafka tiered storage cache. + // Cache PVCs are excluded from log.dirs and Cruise Control capacity, and follow a + // delete-and-recreate path on shrink. Read sites must use this constant rather than the + // literal so a typo cannot silently misclassify a cache PVC as a regular log-dir volume. + TieredStorageCacheAnnotationKey = "tieredStorageCache" + // DefaultCruiseControlImage is the default CC image used when users don't specify it in CruiseControlConfig.Image DefaultCruiseControlImage = "adobe/cruise-control:3.0.3-adbe-20250804" // renovate: datasource=docker depName=adobe/cruise-control diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index 1e7f205c4..28d8eac90 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -1293,7 +1293,7 @@ func (r *Reconciler) deleteRemovedCachePVCs( ) error { for i := range pvcList.Items { pvc := &pvcList.Items[i] - if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { + if pvc.DeletionTimestamp != nil || pvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] != annotationTrue { continue } if !desiredMountPaths[pvc.Annotations["mountPath"]] { @@ -1558,7 +1558,7 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( // Trigger a CC disk rebalance only for regular data volumes. // Tiered storage cache PVCs are ephemeral — CC must not account for them. - if currentPvc.Annotations["tieredStorageCache"] != annotationTrue { + if currentPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] != annotationTrue { // Checking pvc state, if bounded, so the broker has already restarted and the CC GracefulDiskRebalance has not happened yet, // then we make it happening with status update. // If disk removal was set, and the disk was added back, we also need to mark the volume for rebalance @@ -1628,7 +1628,7 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( // annotation. Reading from the desired side would let a CR-edit (flipping // tieredStorageCache: false → true on an existing data volume) route a real log-dir PVC // through the cache-shrink delete-and-recreate path, bypassing graceful disk drain. - isTieredCache := currentPvc.Annotations["tieredStorageCache"] == annotationTrue + isTieredCache := currentPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == annotationTrue desiredSize := desiredPvc.Spec.Resources.Requests.Storage().Value() currentSize := currentPvc.Spec.Resources.Requests.Storage().Value() diff --git a/pkg/resources/kafka/kafka_test.go b/pkg/resources/kafka/kafka_test.go index 8eab7e2d2..9128788ba 100644 --- a/pkg/resources/kafka/kafka_test.go +++ b/pkg/resources/kafka/kafka_test.go @@ -1370,8 +1370,8 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { v1beta1.KafkaCRLabelKey: clusterName, }, Annotations: map[string]string{ - "mountPath": mountPath, - "tieredStorageCache": "true", + "mountPath": mountPath, + v1beta1.TieredStorageCacheAnnotationKey: "true", }, }, Spec: corev1.PersistentVolumeClaimSpec{ diff --git a/pkg/resources/kafka/pvc.go b/pkg/resources/kafka/pvc.go index b423743e4..803885604 100644 --- a/pkg/resources/kafka/pvc.go +++ b/pkg/resources/kafka/pvc.go @@ -66,7 +66,7 @@ func (r *Reconciler) pvc(brokerId int32, storageIndex int, storage v1beta1.Stora // Mark tiered storage cache PVCs with annotation for special handling if storage.TieredStorageCache { - annotations["tieredStorageCache"] = annotationTrue + annotations[v1beta1.TieredStorageCacheAnnotationKey] = annotationTrue } return &corev1.PersistentVolumeClaim{ From 62ad71c6afbbbefe29b29868236432c067f40b04 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Thu, 7 May 2026 11:15:26 +0300 Subject: [PATCH 15/19] test: cover crash recovery and idempotent re-staging for cache resize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add three test cases to TestReconcileKafkaPvcTieredCacheResize covering the recovery paths exercised by the HIGH-3 idempotency fix: 1. Pending-deletion state with running pod and no replacement — the reconciler crashed between writing state and creating the replacement. Asserts: replacement is re-staged, old PVC is not deleted, state is preserved (since the resize hasn't completed). 2. Pending-deletion with replacement present (symmetric) — the inner-match loop's size filter correctly identifies the replacement when both PVCs are visible to the cached client. No duplicate Create. 3. Pending-deletion with replacement visible only via DirectClient (asymmetric) — simulates the cache-lag window where Create succeeded server-side but the watch event hasn't propagated. The cached Client sees only the old PVC; the idempotency check uses DirectClient and detects the replacement. This case is the actual HIGH-3 regression guard: removing the DirectClient lookup makes it fail. The test setup now wires a separate mockDirectClient so cached and uncached views can diverge per-test-case. Co-Authored-By: Claude Opus 4.7 (1M context) --- pkg/resources/kafka/kafka_test.go | 82 +++++++++++++++++++++++++++++-- 1 file changed, 79 insertions(+), 3 deletions(-) diff --git a/pkg/resources/kafka/kafka_test.go b/pkg/resources/kafka/kafka_test.go index 9128788ba..7803630b4 100644 --- a/pkg/resources/kafka/kafka_test.go +++ b/pkg/resources/kafka/kafka_test.go @@ -1403,8 +1403,13 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { } testCases := []struct { - testName string - existingPvc *corev1.PersistentVolumeClaim + testName string + existingPvc *corev1.PersistentVolumeClaim + additionalExistingPvc *corev1.PersistentVolumeClaim // optional second PVC at the same mountPath (e.g., pre-staged replacement) + // directClientOnlyPvc: when set, this PVC is returned only by DirectClient (uncached) reads, + // not by the cached Client. Simulates the cache-lag window after a Create where the watch + // event hasn't propagated yet — the scenario the HIGH-3 idempotency fix protects against. + directClientOnlyPvc *corev1.PersistentVolumeClaim desiredPvc *corev1.PersistentVolumeClaim existingPods []corev1.Pod initialCacheVolumeState v1beta1.CacheResizeState // pre-existing brokerState for mountPath, if any @@ -1486,6 +1491,54 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { expectedDeletePvc: false, expectedError: false, }, + { + // Crash-recovery: state was written in a prior cycle but the replacement Create + // failed before completing. Pod is still running, so the old PVC must NOT be + // deleted and state must NOT be cleared. The reconciler must re-create the + // replacement. + testName: "pending-deletion with running pod and no replacement — replacement re-staged", + existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), + existingPods: []corev1.Pod{runningPod}, + initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + expectedUpdatePvc: false, + expectedCreatePvc: true, + expectedDeletePvc: false, + expectedError: false, + }, + { + // Idempotency (symmetric): pending-deletion state and a replacement at desired + // size already exists in both cached and uncached views. The inner-match loop + // finds the replacement; no second Create. + testName: "pending-deletion with running pod and replacement present — no duplicate Create", + existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + additionalExistingPvc: makeTieredCachePvc("cache-pvc-replacement", "50Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-replacement", "50Gi"), + existingPods: []corev1.Pod{runningPod}, + initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + expectedUpdatePvc: false, + expectedCreatePvc: false, + expectedDeletePvc: false, + expectedError: false, + }, + { + // Idempotency (asymmetric — the actual HIGH-3 cache-lag scenario): the prior + // Create succeeded server-side, so the replacement exists in etcd, but the cached + // Client hasn't yet observed the watch event. The inner match loop sees only the + // old PVC (size-filtered out) and would fall through to a second Create — except + // the idempotency check uses DirectClient (uncached) and detects the replacement. + // If a future refactor removes the DirectClient lookup, this test fails. + testName: "pending-deletion with replacement visible only via DirectClient — no duplicate Create", + existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + directClientOnlyPvc: makeTieredCachePvc("cache-pvc-replacement", "50Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-replacement", "50Gi"), + existingPods: []corev1.Pod{runningPod}, + initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + expectedUpdatePvc: false, + expectedCreatePvc: false, + expectedDeletePvc: false, + expectedError: false, + }, } mockCtrl := gomock.NewController(t) @@ -1494,6 +1547,7 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { test := test t.Run(test.testName, func(t *testing.T) { mockClient := mocks.NewMockClient(mockCtrl) + mockDirectClient := mocks.NewMockClient(mockCtrl) // separate so tests can simulate cache vs apiserver asymmetry mockSubResourceClient := mocks.NewMockSubResourceClient(mockCtrl) kafkaCluster := &v1beta1.KafkaCluster{ @@ -1517,11 +1571,14 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { r := Reconciler{ Reconciler: resources.Reconciler{ Client: mockClient, + DirectClient: mockDirectClient, KafkaCluster: kafkaCluster, }, } - // Mock PVC list — always returns the existing PVC + // Cached Client PVC list: returns the existing PVC plus any additional. Does NOT + // include directClientOnlyPvc — that simulates the cache-lag window where a Create + // has succeeded server-side but the watch event hasn't propagated. mockClient.EXPECT().List( context.TODO(), gomock.AssignableToTypeOf(&corev1.PersistentVolumeClaimList{}), @@ -1529,6 +1586,25 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { gomock.Any(), ).Do(func(ctx context.Context, list *corev1.PersistentVolumeClaimList, opts ...client.ListOption) { list.Items = []corev1.PersistentVolumeClaim{*test.existingPvc} + if test.additionalExistingPvc != nil { + list.Items = append(list.Items, *test.additionalExistingPvc) + } + }).Return(nil).AnyTimes() + + // Uncached DirectClient PVC list: same as cached, plus directClientOnlyPvc when set. + mockDirectClient.EXPECT().List( + context.TODO(), + gomock.AssignableToTypeOf(&corev1.PersistentVolumeClaimList{}), + client.InNamespace(namespace), + gomock.Any(), + ).Do(func(ctx context.Context, list *corev1.PersistentVolumeClaimList, opts ...client.ListOption) { + list.Items = []corev1.PersistentVolumeClaim{*test.existingPvc} + if test.additionalExistingPvc != nil { + list.Items = append(list.Items, *test.additionalExistingPvc) + } + if test.directClientOnlyPvc != nil { + list.Items = append(list.Items, *test.directClientOnlyPvc) + } }).Return(nil).AnyTimes() // Mock Pod list — returns the configured pods From 7723489b936ddeb5d559b5cf79600d1b941ca826 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Thu, 7 May 2026 11:57:15 +0300 Subject: [PATCH 16/19] e2e tests --- ...r_tiered_storage_cache_resize_initial.yaml | 31 +++++++++++++++++- ...er_tiered_storage_cache_resize_shrunk.yaml | 31 +++++++++++++++++- tests/e2e/test_tiered_storage_cache_resize.go | 32 +++++++++++-------- 3 files changed, 79 insertions(+), 15 deletions(-) diff --git a/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml b/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml index b4df5bbd3..39ec42a3f 100644 --- a/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml +++ b/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml @@ -10,9 +10,15 @@ spec: clusterImage: "ghcr.io/adobe/koperator/kafka:2.13-3.9.1" readOnlyConfig: | auto.create.topics.enable=false + cruise.control.metrics.topic.auto.create=true + cruise.control.metrics.topic.num.partitions=1 + cruise.control.metrics.topic.replication.factor=1 rollingUpgradeConfig: failureThreshold: 1 - cruiseControlConfig: {} + cruiseControlConfig: + topicConfig: + partitions: 1 + replicationFactor: 2 brokers: - id: 0 brokerConfig: @@ -42,7 +48,30 @@ spec: requests: cpu: "200m" memory: 1Gi + # Second broker: present only to satisfy the CRD's minimum replicationFactor=2 constraint + # for the Cruise Control coordination topic. Carries no tiered storage cache PVC and is + # not the broker under test (tsResizeBrokerID=0). - id: 1 + brokerConfig: + processRoles: + - broker + terminationGracePeriodSeconds: 10 + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + resourceRequirements: + limits: + cpu: "1" + memory: 2Gi + requests: + cpu: "200m" + memory: 1Gi + - id: 2 brokerConfig: processRoles: - controller diff --git a/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml b/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml index 5d158905a..16b966a3d 100644 --- a/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml +++ b/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml @@ -10,9 +10,15 @@ spec: clusterImage: "ghcr.io/adobe/koperator/kafka:2.13-3.9.1" readOnlyConfig: | auto.create.topics.enable=false + cruise.control.metrics.topic.auto.create=true + cruise.control.metrics.topic.num.partitions=1 + cruise.control.metrics.topic.replication.factor=1 rollingUpgradeConfig: failureThreshold: 1 - cruiseControlConfig: {} + cruiseControlConfig: + topicConfig: + partitions: 1 + replicationFactor: 2 brokers: - id: 0 brokerConfig: @@ -42,7 +48,30 @@ spec: requests: cpu: "200m" memory: 1Gi + # Second broker: present only to satisfy the CRD's minimum replicationFactor=2 constraint + # for the Cruise Control coordination topic. Carries no tiered storage cache PVC and is + # not the broker under test (tsResizeBrokerID=0). - id: 1 + brokerConfig: + processRoles: + - broker + terminationGracePeriodSeconds: 10 + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + resourceRequirements: + limits: + cpu: "1" + memory: 2Gi + requests: + cpu: "200m" + memory: 1Gi + - id: 2 brokerConfig: processRoles: - controller diff --git a/tests/e2e/test_tiered_storage_cache_resize.go b/tests/e2e/test_tiered_storage_cache_resize.go index 0256775f8..5e6e1cfb3 100644 --- a/tests/e2e/test_tiered_storage_cache_resize.go +++ b/tests/e2e/test_tiered_storage_cache_resize.go @@ -233,10 +233,10 @@ func testTieredStorageCachePvcResize() bool { applyK8sResourceManifest(kubectlOptions, tsResizeInitialManifest) ginkgo.By("Waiting for all broker pods to be ready") - // kubectl wait --for=condition=Ready fails immediately when no pods exist yet. - // We don't wait for ClusterRunning — the cluster may stay in ClusterReconciling - // because GracefulDiskRebalanceRequired for log volumes needs CC, which this minimal - // test cluster does not deploy. Pod readiness is sufficient to proceed with the resize. + // kubectl wait --for=condition=Ready fails immediately when no pods exist yet, so + // poll the pod list explicitly. Pod readiness is sufficient to proceed with the + // resize — we don't gate on ClusterRunning because Cruise Control may still be + // completing the post-startup rebalance when the resize is initiated. gomega.Eventually(context.Background(), func() error { output, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, "get", "pod", @@ -295,26 +295,32 @@ func testTieredStorageCachePvcResize() bool { }) ginkgo.It("Phase 1: resize state recorded in brokerState and replacement PVC created", func() { - ginkgo.By("Waiting until CacheVolumeStates has pending-deletion and two PVCs coexist for broker 0") + // Two acceptable success paths — the resize can complete in seconds, often faster + // than the polling interval, so insisting on the intermediate "pending-deletion" + // snapshot makes this phase flake-prone: + // A) In-progress: cacheVolumeStates[mp]=="pending-deletion" AND ≥2 cache PVCs. + // B) Already completed: state cleared AND exactly one cache PVC at the shrunk size. + // Either proves the resize was staged correctly; Phase 2/3 cover the remaining + // invariants regardless of which path we observed. + ginkgo.By("Waiting until the resize is observable as in-progress or completed") gomega.Eventually(context.Background(), func() error { state, err := getCacheResizeState(kubectlOptions, tsResizeClusterName, fmt.Sprintf("%d", tsResizeBrokerID), tsResizeCacheMountPath) if err != nil { return err } - if state != "pending-deletion" { - return fmt.Errorf("expected cacheVolumeStates[%s]=%q, got %q", - tsResizeCacheMountPath, "pending-deletion", state) - } pvcs, err := listBrokerCachePVCs(kubectlOptions) if err != nil { return err } - if len(pvcs) < 2 { - return fmt.Errorf("expected 2 cache PVCs (old + replacement) for broker %d, got %d", - tsResizeBrokerID, len(pvcs)) + if state == "pending-deletion" && len(pvcs) >= 2 { + return nil } - return nil + if state == "" && len(pvcs) == 1 && pvcs[0].StorageSize == tsResizeShrunkSize { + return nil + } + return fmt.Errorf("not at expected staging state: cacheVolumeStates[%s]=%q, %d cache PVC(s)", + tsResizeCacheMountPath, state, len(pvcs)) }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) }) From 9259c1596b11250521ba615abce00197e8d417d3 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Thu, 7 May 2026 16:51:54 +0300 Subject: [PATCH 17/19] agent review --- api/v1beta1/common_types.go | 20 +- api/v1beta1/zz_generated.deepcopy.go | 6 +- charts/kafka-operator/crds/kafkaclusters.yaml | 21 +- .../kafka.banzaicloud.io_kafkaclusters.yaml | 21 +- docs/tiered-storage-pvc-resize.md | 31 +-- pkg/k8sutil/status.go | 10 +- pkg/resources/kafka/kafka.go | 141 ++++++++++---- pkg/resources/kafka/kafka_test.go | 20 +- pkg/webhooks/kafkacluster_validator.go | 92 +++++---- pkg/webhooks/kafkacluster_validator_test.go | 180 ++++++++++-------- tests/e2e/test_tiered_storage_cache_resize.go | 27 +++ 11 files changed, 355 insertions(+), 214 deletions(-) diff --git a/api/v1beta1/common_types.go b/api/v1beta1/common_types.go index c596c36ac..5e4e4f17d 100644 --- a/api/v1beta1/common_types.go +++ b/api/v1beta1/common_types.go @@ -222,14 +222,17 @@ type VolumeState struct { CruiseControlOperationReference *corev1.LocalObjectReference `json:"cruiseControlOperationReference,omitempty"` } -// CacheResizeState tracks the resize lifecycle of a tiered storage cache PVC for a given mount path. -type CacheResizeState string +// TieredCacheVolumeState tracks the lifecycle state of a tiered storage cache PVC for a given mount path. +type TieredCacheVolumeState string const ( - // CacheResizePendingDeletion indicates that the old cache PVC at this mount path is waiting + // TieredCacheVolumeActive indicates the mount path is an active tiered storage cache volume + // with no resize operation in progress. + TieredCacheVolumeActive TieredCacheVolumeState = "active" + // TieredCacheVolumePendingDeletion indicates that the old cache PVC at this mount path is waiting // to be deleted once the broker pod stops. A replacement PVC with the new desired size has // already been created at the same mount path. - CacheResizePendingDeletion CacheResizeState = "pending-deletion" + TieredCacheVolumePendingDeletion TieredCacheVolumeState = "pending-deletion" ) // BrokerState holds information about broker state @@ -250,9 +253,12 @@ type BrokerState struct { Image string `json:"image,omitempty"` // Compressed data from broker configuration to restore broker pod in specific cases ConfigurationBackup string `json:"configurationBackup,omitempty"` - // CacheVolumeStates tracks in-flight tiered storage cache PVC resize operations, keyed by mount path. - // An entry is present only while a resize is in progress; it is cleared once cleanup completes. - CacheVolumeStates map[string]CacheResizeState `json:"cacheVolumeStates,omitempty"` + // TieredCacheVolumes tracks tiered storage cache PVC state for this broker, keyed by mount path. + // "active" means the PVC is a cache volume with no resize in progress. + // "pending-deletion" means a resize is in flight (old PVC waiting for pod to stop). + // Absent entry means the mount path is not a cache volume (or the PVC was removed). + // Written at PVC creation and updated through the resize lifecycle. + TieredCacheVolumes map[string]TieredCacheVolumeState `json:"tieredCacheVolumes,omitempty"` } const ( diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index 639d426b0..5622b1775 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -210,9 +210,9 @@ func (in *BrokerState) DeepCopyInto(out *BrokerState) { *out = make(ExternalListenerConfigNames, len(*in)) copy(*out, *in) } - if in.CacheVolumeStates != nil { - in, out := &in.CacheVolumeStates, &out.CacheVolumeStates - *out = make(map[string]CacheResizeState, len(*in)) + if in.TieredCacheVolumes != nil { + in, out := &in.TieredCacheVolumes, &out.TieredCacheVolumes + *out = make(map[string]TieredCacheVolumeState, len(*in)) for key, val := range *in { (*out)[key] = val } diff --git a/charts/kafka-operator/crds/kafkaclusters.yaml b/charts/kafka-operator/crds/kafkaclusters.yaml index 8b5dff9a0..040925fc2 100644 --- a/charts/kafka-operator/crds/kafkaclusters.yaml +++ b/charts/kafka-operator/crds/kafkaclusters.yaml @@ -23825,15 +23825,6 @@ spec: additionalProperties: description: BrokerState holds information about broker state properties: - cacheVolumeStates: - additionalProperties: - description: CacheResizeState tracks the resize lifecycle - of a tiered storage cache PVC for a given mount path. - type: string - description: |- - CacheVolumeStates tracks in-flight tiered storage cache PVC resize operations, keyed by mount path. - An entry is present only while a resize is in progress; it is cleared once cleanup completes. - type: object configurationBackup: description: Compressed data from broker configuration to restore broker pod in specific cases @@ -23914,6 +23905,18 @@ spec: description: RackAwarenessState holds info about rack awareness status type: string + tieredCacheVolumes: + additionalProperties: + description: TieredCacheVolumeState tracks the lifecycle state + of a tiered storage cache PVC for a given mount path. + type: string + description: |- + TieredCacheVolumes tracks tiered storage cache PVC state for this broker, keyed by mount path. + "active" means the PVC is a cache volume with no resize in progress. + "pending-deletion" means a resize is in flight (old PVC waiting for pod to stop). + Absent entry means the mount path is not a cache volume (or the PVC was removed). + Written at PVC creation and updated through the resize lifecycle. + type: object version: description: Version holds the current version of the broker in semver format diff --git a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml index 8b5dff9a0..040925fc2 100644 --- a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml +++ b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml @@ -23825,15 +23825,6 @@ spec: additionalProperties: description: BrokerState holds information about broker state properties: - cacheVolumeStates: - additionalProperties: - description: CacheResizeState tracks the resize lifecycle - of a tiered storage cache PVC for a given mount path. - type: string - description: |- - CacheVolumeStates tracks in-flight tiered storage cache PVC resize operations, keyed by mount path. - An entry is present only while a resize is in progress; it is cleared once cleanup completes. - type: object configurationBackup: description: Compressed data from broker configuration to restore broker pod in specific cases @@ -23914,6 +23905,18 @@ spec: description: RackAwarenessState holds info about rack awareness status type: string + tieredCacheVolumes: + additionalProperties: + description: TieredCacheVolumeState tracks the lifecycle state + of a tiered storage cache PVC for a given mount path. + type: string + description: |- + TieredCacheVolumes tracks tiered storage cache PVC state for this broker, keyed by mount path. + "active" means the PVC is a cache volume with no resize in progress. + "pending-deletion" means a resize is in flight (old PVC waiting for pod to stop). + Absent entry means the mount path is not a cache volume (or the PVC was removed). + Written at PVC creation and updated through the resize lifecycle. + type: object version: description: Version holds the current version of the broker in semver format diff --git a/docs/tiered-storage-pvc-resize.md b/docs/tiered-storage-pvc-resize.md index dd21d7a1d..59d5e50b2 100644 --- a/docs/tiered-storage-pvc-resize.md +++ b/docs/tiered-storage-pvc-resize.md @@ -10,16 +10,19 @@ with the rolling upgrade machinery so only one broker is affected at a time. ## State tracking Resize state is stored in the `KafkaCluster` CR status under -`status.brokersState[].cacheVolumeStates`, keyed by mount path. +`status.brokersState[].tieredCacheVolumes`, keyed by mount path. This keeps the KafkaCluster CR the single source of truth for all in-flight broker operations and avoids a second, parallel state store on PVC objects. | Field | Value | Meaning | |-------|-------|---------| -| `status.brokersState[N].cacheVolumeStates[]` | `pending-deletion` | A resize is in flight for this mount path. The old PVC (larger size) is waiting to be deleted once the broker pod stops; the replacement PVC (desired smaller size) has already been created. | +| `status.brokersState[N].tieredCacheVolumes[]` | `active` | The mount path is a tiered storage cache volume. No resize is in progress. | +| `status.brokersState[N].tieredCacheVolumes[]` | `pending-deletion` | A resize is in flight for this mount path. The old PVC (larger size) is waiting to be deleted once the broker pod stops; the replacement PVC (desired smaller size) has already been created. | +| *(absent)* | — | The mount path is not a tiered storage cache volume (or the PVC has been removed). | -The entry is cleared once the old PVC has been deleted and the broker pod has -restarted. An empty map means no resize is in progress. +The entry transitions from `pending-deletion` back to `active` once the old PVC +has been deleted and the resize is complete. An absent entry means no cache PVC +exists at that path. Two PVC annotations that describe what a PVC **is** (not operational state) are always present on cache PVCs: @@ -35,7 +38,7 @@ always present on cache PVCs: ### Cycle N — resize detected, pod running -1. `status.brokersState[N].cacheVolumeStates[]` is set to `pending-deletion` +1. `status.brokersState[N].tieredCacheVolumes[]` is set to `pending-deletion` in the KafkaCluster CR status. This is the durable record that a resize is in flight. 2. A replacement PVC with the new (smaller) size is created. Provisioning starts immediately. 3. The broker's `ConfigurationState` is set to `ConfigOutOfSync` to trigger a rolling restart @@ -53,14 +56,14 @@ to fully disappear from etcd. 1. The old PVC (the one whose size differs from the desired size at that mount path) is deleted. -2. The `cacheVolumeStates` entry for that mount path is cleared from the CR status. +2. The `tieredCacheVolumes` entry for that mount path is set to `active` in the CR status. 3. A new broker pod is created referencing the replacement PVC. Because provisioning started in cycle N the PVC is likely already `Bound`, minimising startup latency. ### Cycle N+2 — pod is present again -1. No `cacheVolumeStates` entry remains for the mount path → resize is complete. -2. The replacement PVC is now an ordinary cache PVC with no special state attached. +1. No `pending-deletion` entry remains for the mount path → resize is complete. +2. The replacement PVC is now an ordinary cache PVC with `tieredCacheVolumes[] = active`. --- @@ -69,7 +72,7 @@ to fully disappear from etcd. A cache PVC **grow** takes the normal Kubernetes in-place expansion path: the PVC spec is updated with the larger size and Kubernetes expands the volume without a pod restart (requires `allowVolumeExpansion: true` on the StorageClass). No -`cacheVolumeStates` entry is written and no rolling restart is triggered. +`tieredCacheVolumes` state change is made and no rolling restart is triggered. A cache PVC **shrink** uses the delete-and-recreate flow described above. Shrinking is only supported for tiered storage cache volumes — regular Kafka log @@ -81,11 +84,11 @@ volumes reject any size decrease with an error. | Property | Value | |----------|-------| -| State survives reconciler crash | Yes — `cacheVolumeStates` is written to the KafkaCluster CR (etcd) before the replacement PVC is created; every step is re-entrant | +| State survives reconciler crash | Yes — `tieredCacheVolumes` is written to the KafkaCluster CR (etcd) before the replacement PVC is created; every step is re-entrant | | Single source of truth | Yes — all broker state (configuration, graceful actions, cache resize) lives in `status.brokersState` | | Atomicity gap | Eliminated — replacement PVC is created before old is deleted | | Provisioning overlaps gate evaluation | Yes — replacement PVC created in cycle N, not N+1 | -| Observable via kubectl | Yes — `kubectl get kafkacluster -o jsonpath='{.status.brokersState}'` shows resize state; an empty `cacheVolumeStates` means no resize is in progress | +| Observable via kubectl | Yes — `kubectl get kafkacluster -o jsonpath='{.status.brokersState}'` shows resize state; `pending-deletion` entries indicate an in-flight resize | | CC disk rebalance for cache PVCs | Excluded — tiered cache PVCs are explicitly skipped from `GracefulDiskRebalanceRequired` and CC capacity config | | `log.dirs` for cache PVCs | Excluded — `generateStorageConfig` skips volumes with `TieredStorageCache: true` | @@ -95,7 +98,7 @@ volumes reject any size decrease with an error. ``` Cycle N (pod UP, resize detected) - ├─ set cacheVolumeStates[mountPath] = pending-deletion in CR status + ├─ set tieredCacheVolumes[mountPath] = pending-deletion in CR status ├─ create replacement PVC (provisioning starts) ├─ set ConfigOutOfSync └─ handleRollingUpgrade @@ -107,9 +110,9 @@ Cycle N+k (pod UP, gates failing — any number of cycles) Cycle N+k+1 (pod ABSENT — gone or Terminating) ├─ delete old PVC (identified as the PVC at mountPath whose size ≠ desired) - ├─ clear cacheVolumeStates[mountPath] from CR status + ├─ set tieredCacheVolumes[mountPath] = active in CR status └─ create new pod bound to replacement PVC Cycle N+k+2 (pod PRESENT — non-Terminating, not necessarily Running) - └─ cacheVolumeStates entry is absent → resize complete, no further action + └─ tieredCacheVolumes[mountPath] = active → resize complete, no further action ``` diff --git a/pkg/k8sutil/status.go b/pkg/k8sutil/status.go index e48a305ae..f287727ed 100644 --- a/pkg/k8sutil/status.go +++ b/pkg/k8sutil/status.go @@ -174,15 +174,15 @@ func generateBrokerState(brokerIDs []string, cluster *banzaicloudv1beta1.KafkaCl case banzaicloudv1beta1.KafkaVersion: brokerState.Image = s.Image brokerState.Version = s.Version - case map[string]banzaicloudv1beta1.CacheResizeState: - if brokerState.CacheVolumeStates == nil { - brokerState.CacheVolumeStates = make(map[string]banzaicloudv1beta1.CacheResizeState) + case map[string]banzaicloudv1beta1.TieredCacheVolumeState: + if brokerState.TieredCacheVolumes == nil { + brokerState.TieredCacheVolumes = make(map[string]banzaicloudv1beta1.TieredCacheVolumeState) } for mountPath, state := range s { if state == "" { - delete(brokerState.CacheVolumeStates, mountPath) + delete(brokerState.TieredCacheVolumes, mountPath) } else { - brokerState.CacheVolumeStates[mountPath] = state + brokerState.TieredCacheVolumes[mountPath] = state } } } diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index 28d8eac90..0d69ba612 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -469,8 +469,8 @@ func (r *Reconciler) Reconcile(log logr.Logger) error { pendingDeletionMountPaths := make(map[string]bool) brokerIdStr := strconv.Itoa(int(broker.Id)) - for mp, state := range r.KafkaCluster.Status.BrokersState[brokerIdStr].CacheVolumeStates { - if state == banzaiv1beta1.CacheResizePendingDeletion { + for mp, state := range r.KafkaCluster.Status.BrokersState[brokerIdStr].TieredCacheVolumes { + if state == banzaiv1beta1.TieredCacheVolumePendingDeletion { pendingDeletionMountPaths[mp] = true } } @@ -1282,8 +1282,20 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro return nil } -// deleteRemovedCachePVCs deletes tiered storage cache PVCs whose mount path is no longer desired. -// These volumes are not Kafka log dirs and must never go through CC disk removal. +// isBrokerPVCTieredCache reports whether the given PVC is a tiered storage cache volume, +// using status.BrokersState as the authoritative source of truth (written at PVC creation). +// Falls back to the PVC annotation for PVCs created before TieredCacheVolumes was introduced. +func (r *Reconciler) isBrokerPVCTieredCache(pvc *corev1.PersistentVolumeClaim, brokerId string) bool { + mountPath := pvc.Annotations["mountPath"] + if state := r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes[mountPath]; state != "" { + return true + } + return pvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == annotationTrue +} + +// deleteRemovedCachePVCs deletes tiered storage cache PVCs whose mount path is no longer desired +// and clears their status entries. These volumes are not Kafka log dirs and must never go through +// CC disk removal. func (r *Reconciler) deleteRemovedCachePVCs( ctx context.Context, log logr.Logger, @@ -1291,19 +1303,54 @@ func (r *Reconciler) deleteRemovedCachePVCs( pvcList *corev1.PersistentVolumeClaimList, desiredMountPaths map[string]bool, ) error { + // pathsToUntrack accumulates mount paths whose status entry must be cleared. + // Collected in one pass so a single UpdateBrokerStatus call covers all of them. + pathsToUntrack := make(map[string]banzaiv1beta1.TieredCacheVolumeState) + for i := range pvcList.Items { pvc := &pvcList.Items[i] - if pvc.DeletionTimestamp != nil || pvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] != annotationTrue { + if !r.isBrokerPVCTieredCache(pvc, brokerId) { continue } - if !desiredMountPaths[pvc.Annotations["mountPath"]] { + mountPath := pvc.Annotations["mountPath"] + if desiredMountPaths[mountPath] { + continue + } + // Issue Delete only if the PVC is not already terminating. + if pvc.DeletionTimestamp == nil { log.Info("Deleting removed tiered storage cache PVC", - "brokerId", brokerId, "mountPath", pvc.Annotations["mountPath"], "pvc", pvc.Name) + "brokerId", brokerId, "mountPath", mountPath, "pvc", pvc.Name) if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { return errorfactory.New(errorfactory.APIFailure{}, err, "deleting removed tiered storage cache PVC failed", "pvc", pvc.Name) } } + // Clear status regardless of PVC termination state: once deletion has been + // requested (DeletionTimestamp set), the status entry is no longer needed. + pathsToUntrack[mountPath] = "" + } + + // Second pass: clear status entries for mount paths that are no longer desired + // but whose PVC is already fully finalized and absent from the list. This handles + // the retry path where a prior cycle deleted the PVC but failed to clear status. + for mountPath, state := range r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes { + if state == "" || desiredMountPaths[mountPath] { + continue + } + if _, alreadyHandled := pathsToUntrack[mountPath]; !alreadyHandled { + log.Info("Clearing stale tiered cache volume status for already-deleted mount path", + "brokerId", brokerId, "mountPath", mountPath) + pathsToUntrack[mountPath] = "" + } + } + + if len(pathsToUntrack) == 0 { + return nil + } + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + pathsToUntrack, log); err != nil { + return errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not clear tiered cache volume state after PVC deletion", "brokerId", brokerId) } return nil } @@ -1311,12 +1358,12 @@ func (r *Reconciler) deleteRemovedCachePVCs( // effectivePvcCount returns the number of PVCs for a broker, counting each pending-deletion cache // mount path only once (old + replacement coexist during a shrink but represent one logical disk). func (r *Reconciler) effectivePvcCount(brokerId string, pvcList *corev1.PersistentVolumeClaimList) int { - cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates + tieredCacheVolumes := r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes counted := make(map[string]bool) total := 0 for _, pvc := range pvcList.Items { mp := pvc.Annotations["mountPath"] - if cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { + if tieredCacheVolumes[mp] == banzaiv1beta1.TieredCacheVolumePendingDeletion { if !counted[mp] { counted[mp] = true total++ @@ -1352,7 +1399,7 @@ func (r *Reconciler) getBrokerPodExists(ctx context.Context, brokerId string) (b // It returns true when the caller should skip the rest of the reconcile loop for this broker (orphaned resize // state detected while the pod is running — a rolling restart must complete first). // -//nolint:funlen +//nolint:funlen,gocyclo func (r *Reconciler) handleBrokerCacheResizeCleanup( ctx context.Context, log logr.Logger, @@ -1363,8 +1410,15 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( brokerPodExists bool, matchingLabels client.MatchingLabels, ) (bool, error) { - cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates - if len(cacheVolumeStates) == 0 { + tieredCacheVolumes := r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes + hasPendingDeletion := false + for _, state := range tieredCacheVolumes { + if state == banzaiv1beta1.TieredCacheVolumePendingDeletion { + hasPendingDeletion = true + break + } + } + if !hasPendingDeletion { return false, nil } @@ -1378,9 +1432,9 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( } } - // stateUpdates accumulates CacheVolumeStates changes to apply in a single status update. - // An empty CacheResizeState value means "delete this entry from the map". - stateUpdates := make(map[string]banzaiv1beta1.CacheResizeState) + // stateUpdates accumulates TieredCacheVolumes changes to apply in a single status update. + // An empty TieredCacheVolumeState value means "delete this entry from the map". + stateUpdates := make(map[string]banzaiv1beta1.TieredCacheVolumeState) if brokerPodExists { // Resize complete when the replacement PVC (size == desired) already exists at the mount @@ -1389,8 +1443,8 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( // pending-deletion state and creating the replacement PVC, the next cycle sees only the // old PVC (size != desired) and correctly does NOT clear state, letting // reconcileDesiredPvcsForBroker re-create the replacement. - for mp, state := range cacheVolumeStates { - if state != banzaiv1beta1.CacheResizePendingDeletion { + for mp, state := range tieredCacheVolumes { + if state != banzaiv1beta1.TieredCacheVolumePendingDeletion { continue } desiredSize := desiredSizeByMountPath[mp] @@ -1409,15 +1463,15 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( } } if replacementExists && !oldPvcExists { - stateUpdates[mp] = "" + stateUpdates[mp] = banzaiv1beta1.TieredCacheVolumeActive log.Info("Tiered storage cache PVC resize complete — clearing resize state", "brokerId", brokerId, "mountPath", mp) } } } else { // Broker pod is down — safe to delete old PVCs and clean up orphaned state. - for mp, state := range cacheVolumeStates { - if state != banzaiv1beta1.CacheResizePendingDeletion { + for mp, state := range tieredCacheVolumes { + if state != banzaiv1beta1.TieredCacheVolumePendingDeletion { continue } if !desiredMountPaths[mp] { @@ -1434,6 +1488,7 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( "deleting orphaned cache resize PVC failed", "pvc", pvc.Name) } } + stateUpdates[mp] = "" // entry removed: volume gone, no longer a cache PVC } else { // Mount path still desired: delete the old PVC (the one whose size differs from desired). desiredSize := desiredSizeByMountPath[mp] @@ -1452,8 +1507,8 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( break } } + stateUpdates[mp] = banzaiv1beta1.TieredCacheVolumeActive // resize done, still a cache PVC } - stateUpdates[mp] = "" } if len(stateUpdates) > 0 { @@ -1481,7 +1536,10 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( // longer desired (storage config removed mid-resize), trigger a rolling restart so the pod // stops and the orphaned PVCs can be deleted on the next cycle. if brokerPodExists { - for mp := range r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates { + for mp, state := range r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes { + if state != banzaiv1beta1.TieredCacheVolumePendingDeletion { + continue + } if !desiredMountPaths[mp] { log.Info("Orphaned cache resize state detected with broker pod running — triggering rolling restart", "brokerId", brokerId, "mountPath", mp) @@ -1502,7 +1560,7 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( // reconcileDesiredPvcsForBroker reconciles the desired PVCs for a single broker. // -//nolint:funlen +//nolint:funlen,gocyclo func (r *Reconciler) reconcileDesiredPvcsForBroker( ctx context.Context, log logr.Logger, @@ -1531,6 +1589,13 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( return errorfactory.New(errorfactory.APIFailure{}, err, "creating resource failed", "kind", desiredType) } log.Info("resource created") + if desiredPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == annotationTrue { + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + map[string]banzaiv1beta1.TieredCacheVolumeState{mountPath: banzaiv1beta1.TieredCacheVolumeActive}, log); err != nil { + return errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not record tiered cache volume state after PVC creation", "brokerId", brokerId) + } + } continue } @@ -1548,7 +1613,7 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( // Skip the old PVC that is waiting to be deleted once the broker pod stops. // We identify it as the PVC whose size differs from the desired size while a // pending-deletion resize is in flight for this mount path. - if r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates[mountPath] == banzaiv1beta1.CacheResizePendingDeletion { + if r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes[mountPath] == banzaiv1beta1.TieredCacheVolumePendingDeletion { if desiredPvc.Spec.Resources.Requests.Storage().Value() != pvc.Spec.Resources.Requests.Storage().Value() { continue } @@ -1558,7 +1623,7 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( // Trigger a CC disk rebalance only for regular data volumes. // Tiered storage cache PVCs are ephemeral — CC must not account for them. - if currentPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] != annotationTrue { + if !r.isBrokerPVCTieredCache(currentPvc, brokerId) { // Checking pvc state, if bounded, so the broker has already restarted and the CC GracefulDiskRebalance has not happened yet, // then we make it happening with status update. // If disk removal was set, and the disk was added back, we also need to mark the volume for rebalance @@ -1578,7 +1643,7 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( // excludes the old PVC, so alreadyCreated is false even when a replacement exists. // A strongly-consistent (uncached) read picks up that replacement so we don't issue a // second Create with a fresh GenerateName. - if r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates[mountPath] == banzaiv1beta1.CacheResizePendingDeletion { + if r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes[mountPath] == banzaiv1beta1.TieredCacheVolumePendingDeletion { liveList := &corev1.PersistentVolumeClaimList{} if err := r.DirectClient.List(ctx, liveList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { return errorfactory.New(errorfactory.APIFailure{}, err, "uncached list of PVCs failed", "kind", desiredType) @@ -1607,13 +1672,20 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( if err := r.Create(ctx, desiredPvc); err != nil { return errorfactory.New(errorfactory.APIFailure{}, err, "creating resource failed", "kind", desiredType) } + if desiredPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == annotationTrue { + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + map[string]banzaiv1beta1.TieredCacheVolumeState{mountPath: banzaiv1beta1.TieredCacheVolumeActive}, log); err != nil { + return errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not record tiered cache volume state after PVC creation", "brokerId", brokerId) + } + } continue } // A resize is in flight for this mount path: the matched PVC is the replacement. // Keep triggering ConfigOutOfSync so the rolling upgrade completes and the old PVC // can be deleted on the next reconcile cycle. - if r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates[mountPath] == banzaiv1beta1.CacheResizePendingDeletion { + if r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes[mountPath] == banzaiv1beta1.TieredCacheVolumePendingDeletion { if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, banzaiv1beta1.ConfigOutOfSync, log); err != nil { @@ -1624,11 +1696,11 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( continue } - // Trust only the persisted PVC annotation when classifying — never the desired-spec - // annotation. Reading from the desired side would let a CR-edit (flipping - // tieredStorageCache: false → true on an existing data volume) route a real log-dir PVC - // through the cache-shrink delete-and-recreate path, bypassing graceful disk drain. - isTieredCache := currentPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == annotationTrue + // Use the committed brokerState (written at PVC creation) as the authoritative source of + // truth — never the desired spec. Reading from the desired side would let a CR edit + // (flipping tieredStorageCache: false → true on an existing data volume) route a real + // log-dir PVC through the cache-shrink delete-and-recreate path, bypassing graceful drain. + isTieredCache := r.isBrokerPVCTieredCache(currentPvc, brokerId) desiredSize := desiredPvc.Spec.Resources.Requests.Storage().Value() currentSize := currentPvc.Spec.Resources.Requests.Storage().Value() @@ -1691,7 +1763,7 @@ func (r *Reconciler) stageTieredCachePVCShrink( // Record the resize in brokerState so the reconciler knows to exclude the old PVC and // keep the rolling upgrade running until the old PVC is cleaned up. if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, - map[string]banzaiv1beta1.CacheResizeState{mountPath: banzaiv1beta1.CacheResizePendingDeletion}, log); err != nil { + map[string]banzaiv1beta1.TieredCacheVolumeState{mountPath: banzaiv1beta1.TieredCacheVolumePendingDeletion}, log); err != nil { return errorfactory.New(errorfactory.StatusUpdateError{}, err, "could not record cache resize state for tiered storage cache PVC resize", "brokerId", brokerId) } @@ -1703,6 +1775,9 @@ func (r *Reconciler) stageTieredCachePVCShrink( return errors.WrapIf(err, "could not apply last state to annotation") } if err := r.Create(ctx, desiredPvc); err != nil { + // Resize state is already recorded; the next reconcile cycle will re-create the replacement. + log.Error(err, "creating replacement tiered storage cache PVC failed after recording resize state; will retry", + "brokerId", brokerId, "mountPath", mountPath) return errorfactory.New(errorfactory.APIFailure{}, err, "creating replacement tiered storage cache PVC failed", "brokerId", brokerId) } diff --git a/pkg/resources/kafka/kafka_test.go b/pkg/resources/kafka/kafka_test.go index 7803630b4..9882441d2 100644 --- a/pkg/resources/kafka/kafka_test.go +++ b/pkg/resources/kafka/kafka_test.go @@ -1412,14 +1412,14 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { directClientOnlyPvc *corev1.PersistentVolumeClaim desiredPvc *corev1.PersistentVolumeClaim existingPods []corev1.Pod - initialCacheVolumeState v1beta1.CacheResizeState // pre-existing brokerState for mountPath, if any + initialTieredCacheState v1beta1.TieredCacheVolumeState // pre-existing brokerState for mountPath, if any expectedUpdatePvc bool expectedCreatePvc bool expectedDeletePvc bool expectedError bool }{ { - // Pod is up, no prior resize state: record CacheResizePendingDeletion in brokerState, + // Pod is up, no prior resize state: record TieredCacheVolumePendingDeletion in brokerState, // create replacement PVC, set ConfigOutOfSync to trigger rolling upgrade. testName: "size decrease with running pod — resize state recorded, replacement PVC created", existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), @@ -1449,7 +1449,7 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), existingPods: []corev1.Pod{terminatingPod}, - initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + initialTieredCacheState: v1beta1.TieredCacheVolumePendingDeletion, expectedUpdatePvc: false, expectedCreatePvc: true, expectedDeletePvc: true, @@ -1474,7 +1474,7 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), existingPods: []corev1.Pod{}, - initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + initialTieredCacheState: v1beta1.TieredCacheVolumePendingDeletion, expectedUpdatePvc: false, expectedCreatePvc: true, expectedDeletePvc: true, @@ -1500,7 +1500,7 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), existingPods: []corev1.Pod{runningPod}, - initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + initialTieredCacheState: v1beta1.TieredCacheVolumePendingDeletion, expectedUpdatePvc: false, expectedCreatePvc: true, expectedDeletePvc: false, @@ -1515,7 +1515,7 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { additionalExistingPvc: makeTieredCachePvc("cache-pvc-replacement", "50Gi"), desiredPvc: makeTieredCachePvc("cache-pvc-replacement", "50Gi"), existingPods: []corev1.Pod{runningPod}, - initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + initialTieredCacheState: v1beta1.TieredCacheVolumePendingDeletion, expectedUpdatePvc: false, expectedCreatePvc: false, expectedDeletePvc: false, @@ -1533,7 +1533,7 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { directClientOnlyPvc: makeTieredCachePvc("cache-pvc-replacement", "50Gi"), desiredPvc: makeTieredCachePvc("cache-pvc-replacement", "50Gi"), existingPods: []corev1.Pod{runningPod}, - initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + initialTieredCacheState: v1beta1.TieredCacheVolumePendingDeletion, expectedUpdatePvc: false, expectedCreatePvc: false, expectedDeletePvc: false, @@ -1559,11 +1559,11 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { }}, }, } - if test.initialCacheVolumeState != "" { + if test.initialTieredCacheState != "" { kafkaCluster.Status.BrokersState = map[string]v1beta1.BrokerState{ brokerId: { - CacheVolumeStates: map[string]v1beta1.CacheResizeState{ - mountPath: test.initialCacheVolumeState, + TieredCacheVolumes: map[string]v1beta1.TieredCacheVolumeState{ + mountPath: test.initialTieredCacheState, }, }, } diff --git a/pkg/webhooks/kafkacluster_validator.go b/pkg/webhooks/kafkacluster_validator.go index 6b51d7901..a2941cd41 100644 --- a/pkg/webhooks/kafkacluster_validator.go +++ b/pkg/webhooks/kafkacluster_validator.go @@ -18,6 +18,7 @@ package webhooks import ( "context" "fmt" + "strconv" corev1 "k8s.io/api/core/v1" @@ -48,7 +49,7 @@ func (s KafkaClusterValidator) ValidateUpdate(ctx context.Context, oldObj, newOb allErrs = append(allErrs, listenerErrs...) } - allErrs = append(allErrs, checkTieredStorageCacheImmutability(&kafkaClusterOld.Spec, &kafkaClusterNew.Spec)...) + allErrs = append(allErrs, checkTieredStorageCacheImmutability(kafkaClusterOld, kafkaClusterNew)...) if len(allErrs) == 0 { return nil, nil @@ -84,59 +85,66 @@ func (s KafkaClusterValidator) ValidateDelete(ctx context.Context, obj runtime.O return nil, nil } -// checkTieredStorageCacheImmutability rejects updates that flip the TieredStorageCache flag on an -// existing storageConfigs entry (matched by mountPath, scoped per brokerConfigGroup or per broker). -// Flipping false→true on a live data PVC silently removes it from log.dirs and Cruise Control -// capacity, and enables a delete-and-recreate shrink path that bypasses graceful disk drain — an -// irreversible data-loss vector when applied to a volume that already holds Kafka log segments. -// To change this property, callers must remove the entry and re-add it in a subsequent apply. -func checkTieredStorageCacheImmutability(oldSpec, newSpec *banzaicloudv1beta1.KafkaClusterSpec) field.ErrorList { +// checkTieredStorageCacheImmutability rejects updates that change the tieredStorageCache +// classification of an existing PVC. It uses status.BrokersState[brokerID].TieredCacheVolumes +// as the authoritative source of truth — the reconciler writes this map when it creates each PVC. +// Comparing against committed status catches all bypass paths (in-place flip, group-switch, +// inline-shadow override) without requiring spec-level merging or effective-config resolution. +// Removing a mountPath from the spec is allowed (delete-and-re-add path). +func checkTieredStorageCacheImmutability(oldCluster, newCluster *banzaicloudv1beta1.KafkaCluster) field.ErrorList { var allErrs field.ErrorList - for groupName, newGroup := range newSpec.BrokerConfigGroups { - oldGroup, ok := oldSpec.BrokerConfigGroups[groupName] - if !ok { - continue - } - groupPath := field.NewPath("spec").Child("brokerConfigGroups").Key(groupName).Child("storageConfigs") - allErrs = append(allErrs, diffTieredStorageCache(oldGroup.StorageConfigs, newGroup.StorageConfigs, groupPath)...) - } - - oldByID := make(map[int32]banzaicloudv1beta1.Broker, len(oldSpec.Brokers)) - for _, b := range oldSpec.Brokers { - oldByID[b.Id] = b - } - for i, newBroker := range newSpec.Brokers { - oldBroker, ok := oldByID[newBroker.Id] - if !ok || oldBroker.BrokerConfig == nil || newBroker.BrokerConfig == nil { - continue + for brokerIDStr, brokerState := range oldCluster.Status.BrokersState { + for mountPath, state := range brokerState.TieredCacheVolumes { + if state == "" { + continue // not a cache PVC + } + newValue, fieldPath, found := findTieredStorageCacheInSpec(&newCluster.Spec, brokerIDStr, mountPath) + if !found { + continue // mountPath removed from spec — remove-and-re-add is allowed + } + if !newValue { + allErrs = append(allErrs, field.Forbidden(fieldPath, immutableTieredStorageCacheErrMsg)) + } } - brokerPath := field.NewPath("spec").Child("brokers").Index(i).Child("brokerConfig").Child("storageConfigs") - allErrs = append(allErrs, diffTieredStorageCache(oldBroker.BrokerConfig.StorageConfigs, newBroker.BrokerConfig.StorageConfigs, brokerPath)...) } return allErrs } -func diffTieredStorageCache(oldConfigs, newConfigs []banzaicloudv1beta1.StorageConfig, basePath *field.Path) field.ErrorList { - var allErrs field.ErrorList - oldByPath := make(map[string]banzaicloudv1beta1.StorageConfig, len(oldConfigs)) - for _, sc := range oldConfigs { - oldByPath[sc.MountPath] = sc - } - for i, newSC := range newConfigs { - oldSC, ok := oldByPath[newSC.MountPath] - if !ok { +// findTieredStorageCacheInSpec resolves the intended tieredStorageCache value for a given +// broker (by string ID, matching reconciler convention) and mountPath from the raw spec. +// Inline storageConfigs take priority over brokerConfigGroup (matching dedupStorageConfigs order). +// Returns (value, fieldPath, found=true) when the mountPath is present in the new spec. +func findTieredStorageCacheInSpec(spec *banzaicloudv1beta1.KafkaClusterSpec, brokerIDStr, mountPath string) (bool, *field.Path, bool) { + for i, broker := range spec.Brokers { + if strconv.Itoa(int(broker.Id)) != brokerIDStr { continue } - if oldSC.TieredStorageCache != newSC.TieredStorageCache { - allErrs = append(allErrs, field.Forbidden( - basePath.Index(i).Child("tieredStorageCache"), - immutableTieredStorageCacheErrMsg, - )) + // Inline config has priority (mirrors dedupStorageConfigs order in GetBrokerConfig) + if broker.BrokerConfig != nil { + for k, sc := range broker.BrokerConfig.StorageConfigs { + if sc.MountPath == mountPath { + p := field.NewPath("spec").Child("brokers").Index(i). + Child("brokerConfig").Child("storageConfigs").Index(k).Child("tieredStorageCache") + return sc.TieredStorageCache, p, true + } + } + } + if broker.BrokerConfigGroup != "" { + if group, ok := spec.BrokerConfigGroups[broker.BrokerConfigGroup]; ok { + for k, sc := range group.StorageConfigs { + if sc.MountPath == mountPath { + p := field.NewPath("spec").Child("brokerConfigGroups"). + Key(broker.BrokerConfigGroup).Child("storageConfigs").Index(k).Child("tieredStorageCache") + return sc.TieredStorageCache, p, true + } + } + } } + break } - return allErrs + return false, nil, false } // checkListeners validates the spec.listenersConfig object diff --git a/pkg/webhooks/kafkacluster_validator_test.go b/pkg/webhooks/kafkacluster_validator_test.go index 6193a9a8b..7d81acf69 100644 --- a/pkg/webhooks/kafkacluster_validator_test.go +++ b/pkg/webhooks/kafkacluster_validator_test.go @@ -259,53 +259,65 @@ func TestCheckExternalListenerStartingPort(t *testing.T) { } func TestCheckTieredStorageCacheImmutability(t *testing.T) { - mp := func(path string, tc bool) v1beta1.StorageConfig { + sc := func(path string, tc bool) v1beta1.StorageConfig { return v1beta1.StorageConfig{MountPath: path, TieredStorageCache: tc} } + // committed returns a KafkaCluster whose status records the given tieredCache volumes for broker 0. + // Only cache PVCs are tracked (TieredCacheVolumeActive or TieredCacheVolumePendingDeletion); + // non-cache PVCs have no entry. + committed := func(vols map[string]v1beta1.TieredCacheVolumeState) *v1beta1.KafkaCluster { + return &v1beta1.KafkaCluster{ + Status: v1beta1.KafkaClusterStatus{ + BrokersState: map[string]v1beta1.BrokerState{ + "0": {TieredCacheVolumes: vols}, + }, + }, + } + } testCases := []struct { - testName string - oldSpec v1beta1.KafkaClusterSpec - newSpec v1beta1.KafkaClusterSpec - expected field.ErrorList + testName string + oldCluster *v1beta1.KafkaCluster + newCluster *v1beta1.KafkaCluster + expected field.ErrorList }{ { - testName: "no change — empty", - oldSpec: v1beta1.KafkaClusterSpec{}, - newSpec: v1beta1.KafkaClusterSpec{}, + // No committed state — new cluster, any spec is valid. + testName: "no committed state — any spec change allowed", + oldCluster: &v1beta1.KafkaCluster{}, + newCluster: &v1beta1.KafkaCluster{Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ + StorageConfigs: []v1beta1.StorageConfig{sc("/data", true)}, + }}}, + }}, expected: nil, }, { - testName: "brokerConfigGroups: flip false→true on existing mountPath rejected", - oldSpec: v1beta1.KafkaClusterSpec{ - BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ - "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", false)}}, - }, - }, - newSpec: v1beta1.KafkaClusterSpec{ - BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ - "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", true)}}, - }, - }, + // Committed status says /data IS a cache volume; inline spec now marks it as non-cache. + testName: "in-place inline flip true→false rejected", + oldCluster: committed(map[string]v1beta1.TieredCacheVolumeState{"/data": v1beta1.TieredCacheVolumeActive}), + newCluster: &v1beta1.KafkaCluster{Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ + StorageConfigs: []v1beta1.StorageConfig{sc("/data", false)}, + }}}, + }}, expected: append(field.ErrorList{}, field.Forbidden( - field.NewPath("spec").Child("brokerConfigGroups").Key("default").Child("storageConfigs").Index(0).Child("tieredStorageCache"), + field.NewPath("spec").Child("brokers").Index(0).Child("brokerConfig").Child("storageConfigs").Index(0).Child("tieredStorageCache"), immutableTieredStorageCacheErrMsg, ), ), }, { - testName: "brokerConfigGroups: flip true→false on existing mountPath rejected", - oldSpec: v1beta1.KafkaClusterSpec{ - BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ - "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", true)}}, - }, - }, - newSpec: v1beta1.KafkaClusterSpec{ + // Committed status says /cache IS a cache volume; group spec now marks it as non-cache. + testName: "in-place group flip true→false rejected", + oldCluster: committed(map[string]v1beta1.TieredCacheVolumeState{"/cache": v1beta1.TieredCacheVolumeActive}), + newCluster: &v1beta1.KafkaCluster{Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfigGroup: "default"}}, BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ - "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", false)}}, + "default": {StorageConfigs: []v1beta1.StorageConfig{sc("/cache", false)}}, }, - }, + }}, expected: append(field.ErrorList{}, field.Forbidden( field.NewPath("spec").Child("brokerConfigGroups").Key("default").Child("storageConfigs").Index(0).Child("tieredStorageCache"), @@ -314,48 +326,43 @@ func TestCheckTieredStorageCacheImmutability(t *testing.T) { ), }, { - testName: "brokerConfigGroups: new entry with tieredStorageCache=true allowed", - oldSpec: v1beta1.KafkaClusterSpec{ - BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ - "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", false)}}, - }, - }, - newSpec: v1beta1.KafkaClusterSpec{ + // Bypass attempt: broker switches from groupA (where /cache=true) to groupB (where /cache=false). + // The raw-spec old→new comparison would miss this since groupA is unchanged. + // Status-based check catches it because committed status records /cache=active for broker 0. + testName: "group-switch bypass rejected", + oldCluster: committed(map[string]v1beta1.TieredCacheVolumeState{"/cache": v1beta1.TieredCacheVolumeActive}), + newCluster: &v1beta1.KafkaCluster{Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfigGroup: "groupB"}}, BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ - "default": {StorageConfigs: []v1beta1.StorageConfig{ - mp("/kafka-logs/0", false), - mp("/cache/0", true), - }}, + "groupA": {StorageConfigs: []v1beta1.StorageConfig{sc("/cache", true)}}, + "groupB": {StorageConfigs: []v1beta1.StorageConfig{sc("/cache", false)}}, }, - }, - expected: nil, + }}, + expected: append(field.ErrorList{}, + field.Forbidden( + field.NewPath("spec").Child("brokerConfigGroups").Key("groupB").Child("storageConfigs").Index(0).Child("tieredStorageCache"), + immutableTieredStorageCacheErrMsg, + ), + ), }, { - testName: "brokerConfigGroups: remove entry, then re-add with different value — allowed (no entry to match against in old at the same mountPath after removal scenario; test single-pass: removed entry simply not in new)", - oldSpec: v1beta1.KafkaClusterSpec{ - BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ - "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", false)}}, - }, - }, - newSpec: v1beta1.KafkaClusterSpec{ + // Bypass attempt: /cache was provisioned as a cache volume (via group); now an inline entry + // overrides it with TieredStorageCache=false. Inline takes priority in GetBrokerConfig so + // the effective value would flip — the check must reject the inline entry. + testName: "inline-shadow bypass rejected", + oldCluster: committed(map[string]v1beta1.TieredCacheVolumeState{"/cache": v1beta1.TieredCacheVolumeActive}), + newCluster: &v1beta1.KafkaCluster{Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{ + Id: 0, + BrokerConfigGroup: "default", + BrokerConfig: &v1beta1.BrokerConfig{ + StorageConfigs: []v1beta1.StorageConfig{sc("/cache", false)}, + }, + }}, BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ - "default": {StorageConfigs: []v1beta1.StorageConfig{}}, + "default": {StorageConfigs: []v1beta1.StorageConfig{sc("/cache", true)}}, }, - }, - expected: nil, - }, - { - testName: "brokers[].brokerConfig: flip false→true rejected", - oldSpec: v1beta1.KafkaClusterSpec{ - Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ - StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", false)}, - }}}, - }, - newSpec: v1beta1.KafkaClusterSpec{ - Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ - StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", true)}, - }}}, - }, + }}, expected: append(field.ErrorList{}, field.Forbidden( field.NewPath("spec").Child("brokers").Index(0).Child("brokerConfig").Child("storageConfigs").Index(0).Child("tieredStorageCache"), @@ -364,34 +371,43 @@ func TestCheckTieredStorageCacheImmutability(t *testing.T) { ), }, { - testName: "brokers[].brokerConfig: nil old config — no error (entry didn't exist)", - oldSpec: v1beta1.KafkaClusterSpec{ - Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: nil}}, - }, - newSpec: v1beta1.KafkaClusterSpec{ + // mountPath is removed from spec entirely — remove-and-re-add path is intentionally allowed. + testName: "remove mountPath from spec — allowed", + oldCluster: committed(map[string]v1beta1.TieredCacheVolumeState{"/cache": v1beta1.TieredCacheVolumeActive}), + newCluster: &v1beta1.KafkaCluster{Spec: v1beta1.KafkaClusterSpec{ Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ - StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", true)}, + StorageConfigs: []v1beta1.StorageConfig{sc("/data", false)}, }}}, - }, + }}, expected: nil, }, { - testName: "brokerConfigGroup not present in old — no error (whole group is new)", - oldSpec: v1beta1.KafkaClusterSpec{ - BrokerConfigGroups: map[string]v1beta1.BrokerConfig{}, - }, - newSpec: v1beta1.KafkaClusterSpec{ - BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ - "new-group": {StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", true)}}, - }, - }, + // New mountPath with no committed state — any value is allowed. + testName: "new mountPath not in committed state — allowed", + oldCluster: committed(map[string]v1beta1.TieredCacheVolumeState{"/data": v1beta1.TieredCacheVolumeActive}), + newCluster: &v1beta1.KafkaCluster{Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ + StorageConfigs: []v1beta1.StorageConfig{sc("/data", true), sc("/cache", true)}, + }}}, + }}, + expected: nil, + }, + { + // Value unchanged — no error. + testName: "unchanged value — no error", + oldCluster: committed(map[string]v1beta1.TieredCacheVolumeState{"/cache": v1beta1.TieredCacheVolumeActive}), + newCluster: &v1beta1.KafkaCluster{Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ + StorageConfigs: []v1beta1.StorageConfig{sc("/cache", true)}, + }}}, + }}, expected: nil, }, } for _, testCase := range testCases { t.Run(testCase.testName, func(t *testing.T) { - got := checkTieredStorageCacheImmutability(&testCase.oldSpec, &testCase.newSpec) + got := checkTieredStorageCacheImmutability(testCase.oldCluster, testCase.newCluster) require.Equal(t, testCase.expected, got) }) } diff --git a/tests/e2e/test_tiered_storage_cache_resize.go b/tests/e2e/test_tiered_storage_cache_resize.go index 5e6e1cfb3..00eaae5f7 100644 --- a/tests/e2e/test_tiered_storage_cache_resize.go +++ b/tests/e2e/test_tiered_storage_cache_resize.go @@ -181,6 +181,33 @@ func testTieredStorageCachePvcResize() bool { var err error var broker0PodUID string // UID of the broker-0 pod before the resize, used to detect recycling + // AfterAll ensures the cluster and its PVCs are removed even when a test step fails + // mid-run, preventing stale resources from blocking the next test run. + ginkgo.AfterAll(func() { + opts, e := kubectlOptionsForCurrentContext() + if e != nil { + return + } + opts.Namespace = koperatorLocalHelmDescriptor.Namespace + _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &opts, + "patch", kafkaKind, tsResizeClusterName, + "--type=merge", `--patch={"metadata":{"finalizers":[]}}`, + ) + _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &opts, + "delete", kafkaKind, tsResizeClusterName, "--ignore-not-found", + ) + _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &opts, + "delete", "persistentvolumeclaims", + "-l", fmt.Sprintf("%s=%s", kafkaCRLabelKey, tsResizeClusterName), + "--ignore-not-found", + ) + _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &opts, + "delete", "pods", + "-l", fmt.Sprintf("%s=%s", kafkaCRLabelKey, tsResizeClusterName), + "--ignore-not-found", "--grace-period=0", + ) + }) + ginkgo.It("Acquiring K8s config and context", func() { kubectlOptions, err = kubectlOptionsForCurrentContext() gomega.Expect(err).NotTo(gomega.HaveOccurred()) From 27b17ad70f4e2e9665fccffab630a922a2306499 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Fri, 8 May 2026 14:09:56 +0300 Subject: [PATCH 18/19] agent review- e2e tests --- pkg/resources/kafka/kafka.go | 6 ++- pkg/resources/kafka/kafka_test.go | 27 ++++++++----- tests/e2e/test_tiered_storage_cache_resize.go | 39 ++++++++++--------- 3 files changed, 44 insertions(+), 28 deletions(-) diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index 0d69ba612..00301705d 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -1672,7 +1672,11 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( if err := r.Create(ctx, desiredPvc); err != nil { return errorfactory.New(errorfactory.APIFailure{}, err, "creating resource failed", "kind", desiredType) } - if desiredPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == annotationTrue { + // Only write Active when this is a genuinely new PVC. If pending-deletion is already + // set (crash-recovery: prior Create timed out), preserve the in-flight state so + // handleBrokerCacheResizeCleanup can still delete the old PVC on the next cycle. + if desiredPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == annotationTrue && + r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes[mountPath] != banzaiv1beta1.TieredCacheVolumePendingDeletion { if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, map[string]banzaiv1beta1.TieredCacheVolumeState{mountPath: banzaiv1beta1.TieredCacheVolumeActive}, log); err != nil { return errorfactory.New(errorfactory.StatusUpdateError{}, err, diff --git a/pkg/resources/kafka/kafka_test.go b/pkg/resources/kafka/kafka_test.go index 9882441d2..292d1cdbc 100644 --- a/pkg/resources/kafka/kafka_test.go +++ b/pkg/resources/kafka/kafka_test.go @@ -1417,6 +1417,9 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { expectedCreatePvc bool expectedDeletePvc bool expectedError bool + // expectedTieredCacheState, when non-empty, asserts the final TieredCacheVolumes[mountPath] + // value after reconcileKafkaPvc returns. + expectedTieredCacheState v1beta1.TieredCacheVolumeState }{ { // Pod is up, no prior resize state: record TieredCacheVolumePendingDeletion in brokerState, @@ -1496,15 +1499,16 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { // failed before completing. Pod is still running, so the old PVC must NOT be // deleted and state must NOT be cleared. The reconciler must re-create the // replacement. - testName: "pending-deletion with running pod and no replacement — replacement re-staged", - existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), - desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), - existingPods: []corev1.Pod{runningPod}, - initialTieredCacheState: v1beta1.TieredCacheVolumePendingDeletion, - expectedUpdatePvc: false, - expectedCreatePvc: true, - expectedDeletePvc: false, - expectedError: false, + testName: "pending-deletion with running pod and no replacement — replacement re-staged", + existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), + existingPods: []corev1.Pod{runningPod}, + initialTieredCacheState: v1beta1.TieredCacheVolumePendingDeletion, + expectedUpdatePvc: false, + expectedCreatePvc: true, + expectedDeletePvc: false, + expectedError: false, + expectedTieredCacheState: v1beta1.TieredCacheVolumePendingDeletion, }, { // Idempotency (symmetric): pending-deletion state and a replacement at desired @@ -1646,6 +1650,11 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { } else { assert.Nil(t, err, "expected no error but got: %v", err) } + if test.expectedTieredCacheState != "" { + finalState := kafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes[mountPath] + assert.Equal(t, test.expectedTieredCacheState, finalState, + "TieredCacheVolumes[%s] after reconcile", mountPath) + } }) } } diff --git a/tests/e2e/test_tiered_storage_cache_resize.go b/tests/e2e/test_tiered_storage_cache_resize.go index 00eaae5f7..273858d63 100644 --- a/tests/e2e/test_tiered_storage_cache_resize.go +++ b/tests/e2e/test_tiered_storage_cache_resize.go @@ -45,13 +45,14 @@ const ( // pvcItem is a minimal representation of a PVC for assertion helpers. type pvcItem struct { - Name string - Annotations map[string]string - StorageSize string - Phase string + Name string + Annotations map[string]string + StorageSize string + Phase string + IsTerminating bool } -// getCacheResizeState returns the CacheVolumeStates entry for the given broker and mount path +// getCacheResizeState returns the tieredCacheVolumes entry for the given broker and mount path // from the KafkaCluster CR status, or an empty string if not set. func getCacheResizeState(kubectlOptions k8s.KubectlOptions, clusterName, brokerID, mountPath string) (string, error) { rawOutput, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, @@ -65,7 +66,7 @@ func getCacheResizeState(kubectlOptions k8s.KubectlOptions, clusterName, brokerI var cr struct { Status struct { BrokersState map[string]struct { - CacheVolumeStates map[string]string `json:"cacheVolumeStates"` + TieredCacheVolumes map[string]string `json:"tieredCacheVolumes"` } `json:"brokersState"` } `json:"status"` } @@ -77,7 +78,7 @@ func getCacheResizeState(kubectlOptions k8s.KubectlOptions, clusterName, brokerI if !ok { return "", nil } - return brokerState.CacheVolumeStates[mountPath], nil + return brokerState.TieredCacheVolumes[mountPath], nil } // listBrokerCachePVCs returns PVCs for broker tsResizeBrokerID that have the @@ -97,8 +98,9 @@ func listBrokerCachePVCs(kubectlOptions k8s.KubectlOptions) ([]pvcItem, error) { var pvcList struct { Items []struct { Metadata struct { - Name string `json:"name"` - Annotations map[string]string `json:"annotations"` + Name string `json:"name"` + Annotations map[string]string `json:"annotations"` + DeletionTimestamp *string `json:"deletionTimestamp"` } `json:"metadata"` Spec struct { Resources struct { @@ -120,10 +122,11 @@ func listBrokerCachePVCs(kubectlOptions k8s.KubectlOptions) ([]pvcItem, error) { for _, item := range pvcList.Items { if item.Metadata.Annotations["mountPath"] == tsResizeCacheMountPath { result = append(result, pvcItem{ - Name: item.Metadata.Name, - Annotations: item.Metadata.Annotations, - StorageSize: item.Spec.Resources.Requests.Storage, - Phase: item.Status.Phase, + Name: item.Metadata.Name, + Annotations: item.Metadata.Annotations, + StorageSize: item.Spec.Resources.Requests.Storage, + Phase: item.Status.Phase, + IsTerminating: item.Metadata.DeletionTimestamp != nil, }) } } @@ -325,7 +328,7 @@ func testTieredStorageCachePvcResize() bool { // Two acceptable success paths — the resize can complete in seconds, often faster // than the polling interval, so insisting on the intermediate "pending-deletion" // snapshot makes this phase flake-prone: - // A) In-progress: cacheVolumeStates[mp]=="pending-deletion" AND ≥2 cache PVCs. + // A) In-progress: tieredCacheVolumes[mp]=="pending-deletion" AND ≥2 cache PVCs. // B) Already completed: state cleared AND exactly one cache PVC at the shrunk size. // Either proves the resize was staged correctly; Phase 2/3 cover the remaining // invariants regardless of which path we observed. @@ -346,7 +349,7 @@ func testTieredStorageCachePvcResize() bool { if state == "" && len(pvcs) == 1 && pvcs[0].StorageSize == tsResizeShrunkSize { return nil } - return fmt.Errorf("not at expected staging state: cacheVolumeStates[%s]=%q, %d cache PVC(s)", + return fmt.Errorf("not at expected staging state: tieredCacheVolumes[%s]=%q, %d cache PVC(s)", tsResizeCacheMountPath, state, len(pvcs)) }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) }) @@ -378,7 +381,7 @@ func testTieredStorageCachePvcResize() bool { activePvcs := make([]pvcItem, 0, len(pvcs)) for _, pvc := range pvcs { // Ignore PVCs that are being deleted (DeletionTimestamp set but not yet gone). - if pvc.Phase != "" { + if !pvc.IsTerminating { activePvcs = append(activePvcs, pvc) } } @@ -393,7 +396,7 @@ func testTieredStorageCachePvcResize() bool { return err } if state != "" { - return fmt.Errorf("expected cacheVolumeStates[%s] to be cleared, got %q", + return fmt.Errorf("expected tieredCacheVolumes[%s] to be cleared, got %q", tsResizeCacheMountPath, state) } return nil @@ -414,7 +417,7 @@ func testTieredStorageCachePvcResize() bool { state, err := getCacheResizeState(kubectlOptions, tsResizeClusterName, fmt.Sprintf("%d", tsResizeBrokerID), tsResizeCacheMountPath) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - gomega.Expect(state).To(gomega.BeEmpty(), "cacheVolumeStates entry should be cleared after resize") + gomega.Expect(state).To(gomega.BeEmpty(), "tieredCacheVolumes entry should be cleared after resize") }) ginkgo.It("Verifying the surviving cache PVC has the new size "+tsResizeShrunkSize, func() { From 7887d90d6698dfa0b32fd3657b6ab6f75044e940 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Fri, 8 May 2026 17:22:11 +0300 Subject: [PATCH 19/19] agent review- e2e tests --- tests/e2e/test_tiered_storage_cache_resize.go | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/e2e/test_tiered_storage_cache_resize.go b/tests/e2e/test_tiered_storage_cache_resize.go index 273858d63..e826646cf 100644 --- a/tests/e2e/test_tiered_storage_cache_resize.go +++ b/tests/e2e/test_tiered_storage_cache_resize.go @@ -329,7 +329,9 @@ func testTieredStorageCachePvcResize() bool { // than the polling interval, so insisting on the intermediate "pending-deletion" // snapshot makes this phase flake-prone: // A) In-progress: tieredCacheVolumes[mp]=="pending-deletion" AND ≥2 cache PVCs. - // B) Already completed: state cleared AND exactly one cache PVC at the shrunk size. + // B) Already completed: state is "active" (no resize in flight) AND exactly one + // cache PVC at the shrunk size. "active" is the steady-state after a resize; + // "" (absent) would mean the mount path was removed entirely. // Either proves the resize was staged correctly; Phase 2/3 cover the remaining // invariants regardless of which path we observed. ginkgo.By("Waiting until the resize is observable as in-progress or completed") @@ -346,7 +348,7 @@ func testTieredStorageCachePvcResize() bool { if state == "pending-deletion" && len(pvcs) >= 2 { return nil } - if state == "" && len(pvcs) == 1 && pvcs[0].StorageSize == tsResizeShrunkSize { + if state == "active" && len(pvcs) == 1 && pvcs[0].StorageSize == tsResizeShrunkSize { return nil } return fmt.Errorf("not at expected staging state: tieredCacheVolumes[%s]=%q, %d cache PVC(s)", @@ -389,15 +391,15 @@ func testTieredStorageCachePvcResize() bool { return fmt.Errorf("expected 1 active cache PVC for broker %d after pod restart, got %d", tsResizeBrokerID, len(activePvcs)) } - // The resize state should be cleared once the old PVC is gone. + // After cleanup the resize state transitions back to "active" — no longer pending-deletion. state, err := getCacheResizeState(kubectlOptions, tsResizeClusterName, fmt.Sprintf("%d", tsResizeBrokerID), tsResizeCacheMountPath) if err != nil { return err } - if state != "" { - return fmt.Errorf("expected tieredCacheVolumes[%s] to be cleared, got %q", - tsResizeCacheMountPath, state) + if state == "pending-deletion" { + return fmt.Errorf("tieredCacheVolumes[%s] still pending-deletion after old PVC deleted", + tsResizeCacheMountPath) } return nil }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) @@ -417,7 +419,7 @@ func testTieredStorageCachePvcResize() bool { state, err := getCacheResizeState(kubectlOptions, tsResizeClusterName, fmt.Sprintf("%d", tsResizeBrokerID), tsResizeCacheMountPath) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - gomega.Expect(state).To(gomega.BeEmpty(), "tieredCacheVolumes entry should be cleared after resize") + gomega.Expect(state).To(gomega.Equal("active"), "tieredCacheVolumes entry should be active after resize completes") }) ginkgo.It("Verifying the surviving cache PVC has the new size "+tsResizeShrunkSize, func() {