From 617b21e55e7c76e72cc7beed2b05d0fae6d4a80c Mon Sep 17 00:00:00 2001
From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com>
Date: Wed, 22 Apr 2026 15:19:44 +0100
Subject: [PATCH 1/3] Expose storage-version migrator feature flag in operator
 helm chart

Adds operator.features.storageVersionMigrator (default true) plus the
ENABLE_STORAGE_VERSION_MIGRATOR env var wiring in the deployment
template, mirroring the existing server/registry/virtualMCP pattern.
Helm docs regenerated.

Admins who run kube-storage-version-migrator externally can disable
the in-operator controller by setting this flag to false.

Part of #4969.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 deploy/charts/operator/README.md                 | 3 ++-
 deploy/charts/operator/templates/deployment.yaml | 2 ++
 deploy/charts/operator/values.yaml               | 7 +++++++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/deploy/charts/operator/README.md b/deploy/charts/operator/README.md
index e90a12452a..3f354f27cb 100644
--- a/deploy/charts/operator/README.md
+++ b/deploy/charts/operator/README.md
@@ -46,7 +46,7 @@ The command removes all the Kubernetes components associated with the chart and
 |-----|------|---------|-------------|
 | fullnameOverride | string | `"toolhive-operator"` | Provide a fully-qualified name override for resources |
 | nameOverride | string | `""` | Override the name of the chart |
-| operator | object | `{"affinity":{},"autoscaling":{"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80},"containerSecurityContext":{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"readOnlyRootFilesystem":true,"runAsNonRoot":true,"runAsUser":1000,"seccompProfile":{"type":"RuntimeDefault"}},"defaultImagePullSecrets":[],"env":[],"features":{"experimental":false},"gc":{"gogc":75,"gomemlimit":"110MiB"},"image":"ghcr.io/stacklok/toolhive/operator:v0.28.3","imagePullPolicy":"IfNotPresent","imagePullSecrets":[],"leaderElectionRole":{"binding":{"name":"toolhive-operator-leader-election-rolebinding"},"name":"toolhive-operator-leader-election-role","rules":[{"apiGroups":[""],"resources":["configmaps"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["coordination.k8s.io"],"resources":["leases"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["events.k8s.io"],"resources":["events"],"verbs":["create","patch"]}]},"livenessProbe":{"httpGet":{"path":"/healthz","port":"health"},"initialDelaySeconds":15,"periodSeconds":20},"nodeSelector":{},"podAnnotations":{},"podLabels":{},"podSecurityContext":{"runAsNonRoot":true},"ports":[{"containerPort":8080,"name":"metrics","protocol":"TCP"},{"containerPort":8081,"name":"health","protocol":"TCP"}],"proxyHost":"0.0.0.0","rbac":{"allowedNamespaces":[],"scope":"cluster"},"readinessProbe":{"httpGet":{"path":"/readyz","port":"health"},"initialDelaySeconds":5,"periodSeconds":10},"replicaCount":1,"resources":{"limits":{"cpu":"500m","memory":"128Mi"},"requests":{"cpu":"10m","memory":"64Mi"}},"serviceAccount":{"annotations":{},"automountServiceAccountToken":true,"create":true,"labels":{},"name":"toolhive-operator"},"tolerations":[],"toolhiveRunnerImage":"ghcr.io/stacklok/toolhive/proxyrunner:v0.28.3","vmcpImage":"ghcr.io/stacklok/toolhive/vmcp:v0.28.3","volumeMounts":[],"volumes":[]}` | All values for the operator deployment and associated resources |
+| operator | object | `{"affinity":{},"autoscaling":{"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80},"containerSecurityContext":{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"readOnlyRootFilesystem":true,"runAsNonRoot":true,"runAsUser":1000,"seccompProfile":{"type":"RuntimeDefault"}},"defaultImagePullSecrets":[],"env":[],"features":{"experimental":false,"storageVersionMigrator":true},"gc":{"gogc":75,"gomemlimit":"110MiB"},"image":"ghcr.io/stacklok/toolhive/operator:v0.28.3","imagePullPolicy":"IfNotPresent","imagePullSecrets":[],"leaderElectionRole":{"binding":{"name":"toolhive-operator-leader-election-rolebinding"},"name":"toolhive-operator-leader-election-role","rules":[{"apiGroups":[""],"resources":["configmaps"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["coordination.k8s.io"],"resources":["leases"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["events.k8s.io"],"resources":["events"],"verbs":["create","patch"]}]},"livenessProbe":{"httpGet":{"path":"/healthz","port":"health"},"initialDelaySeconds":15,"periodSeconds":20},"nodeSelector":{},"podAnnotations":{},"podLabels":{},"podSecurityContext":{"runAsNonRoot":true},"ports":[{"containerPort":8080,"name":"metrics","protocol":"TCP"},{"containerPort":8081,"name":"health","protocol":"TCP"}],"proxyHost":"0.0.0.0","rbac":{"allowedNamespaces":[],"scope":"cluster"},"readinessProbe":{"httpGet":{"path":"/readyz","port":"health"},"initialDelaySeconds":5,"periodSeconds":10},"replicaCount":1,"resources":{"limits":{"cpu":"500m","memory":"128Mi"},"requests":{"cpu":"10m","memory":"64Mi"}},"serviceAccount":{"annotations":{},"automountServiceAccountToken":true,"create":true,"labels":{},"name":"toolhive-operator"},"tolerations":[],"toolhiveRunnerImage":"ghcr.io/stacklok/toolhive/proxyrunner:v0.28.3","vmcpImage":"ghcr.io/stacklok/toolhive/vmcp:v0.28.3","volumeMounts":[],"volumes":[]}` | All values for the operator deployment and associated resources |
 | operator.affinity | object | `{}` | Affinity settings for the operator pod |
 | operator.autoscaling | object | `{"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}` | Configuration for horizontal pod autoscaling |
 | operator.autoscaling.enabled | bool | `false` | Enable autoscaling for the operator |
@@ -57,6 +57,7 @@ The command removes all the Kubernetes components associated with the chart and
 | operator.defaultImagePullSecrets | list | `[]` | List of image pull secrets that the operator applies as defaults to every workload it spawns (proxy runners, vMCP servers, registry API, etc.). Per-CR `imagePullSecrets` take precedence on name collisions; chart-level entries are appended additively. The operator parses these once at startup from the TOOLHIVE_DEFAULT_IMAGE_PULL_SECRETS environment variable. The Secrets must exist in the namespace where each workload is created.  Each entry may be either a plain string (the Secret name) or an object with a `name` field, e.g.:   defaultImagePullSecrets:     - regcred     - name: otherscred The two shapes are equivalent; the object form matches `operator.imagePullSecrets` above for convenience. |
 | operator.env | list | `[]` | Environment variables to set in the operator container. Supported toolhive-specific variables include: - TOOLHIVE_SKIP_UPDATE_CHECK: set to "true" to disable the operator's   periodic update check against the ToolHive update API. Also disables   the usage-metrics collection that is gated on the same check. |
 | operator.features.experimental | bool | `false` | Enable experimental features |
+| operator.features.storageVersionMigrator | bool | `true` | Enable the StorageVersionMigrator controller, which auto-cleans status.storedVersions on opted-in toolhive.stacklok.dev CRDs so a future release can drop deprecated versions (e.g. v1alpha1) without orphaning etcd objects. Leave this on unless you are running kube-storage-version-migrator externally. This automatically sets ENABLE_STORAGE_VERSION_MIGRATOR environment variable. |
 | operator.gc | object | `{"gogc":75,"gomemlimit":"110MiB"}` | Go memory limits and garbage collection percentage for the operator container |
 | operator.gc.gogc | int | `75` | Go garbage collection percentage for the operator container |
 | operator.gc.gomemlimit | string | `"110MiB"` | Go memory limits for the operator container |
diff --git a/deploy/charts/operator/templates/deployment.yaml b/deploy/charts/operator/templates/deployment.yaml
index fa81ad160b..10c69c64e3 100644
--- a/deploy/charts/operator/templates/deployment.yaml
+++ b/deploy/charts/operator/templates/deployment.yaml
@@ -68,6 +68,8 @@ spec:
             value: "true"
           - name: ENABLE_EXPERIMENTAL_FEATURES
             value: {{ .Values.operator.features.experimental | quote }}
+          - name: ENABLE_STORAGE_VERSION_MIGRATOR
+            value: {{ .Values.operator.features.storageVersionMigrator | quote }}
           {{- if eq .Values.operator.rbac.scope "namespace" }}
           - name: WATCH_NAMESPACE
             value: "{{ .Values.operator.rbac.allowedNamespaces | join "," }}"
diff --git a/deploy/charts/operator/values.yaml b/deploy/charts/operator/values.yaml
index ca2f342d47..6864cfb051 100644
--- a/deploy/charts/operator/values.yaml
+++ b/deploy/charts/operator/values.yaml
@@ -8,6 +8,13 @@ operator:
   features:
     # -- Enable experimental features
     experimental: false
+    # -- Enable the StorageVersionMigrator controller, which auto-cleans
+    # status.storedVersions on opted-in toolhive.stacklok.dev CRDs so a
+    # future release can drop deprecated versions (e.g. v1alpha1) without
+    # orphaning etcd objects. Leave this on unless you are running
+    # kube-storage-version-migrator externally.
+    # This automatically sets ENABLE_STORAGE_VERSION_MIGRATOR environment variable.
+    storageVersionMigrator: true
   # -- Number of replicas for the operator deployment
   replicaCount: 1
 

From c47904aab6c017aa48fec405977ce4cdf0ec512f Mon Sep 17 00:00:00 2001
From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com>
Date: Wed, 22 Apr 2026 15:21:11 +0100
Subject: [PATCH 2/3] Document storage version migration for operator users

Covers what the controller does, the opt-in label contract, how to
disable the controller operator-wide, the per-CRD escape hatch and its
interaction with GitOps, how the migrator interacts with future
version-removal releases, and the required RBAC.

Part of #4969.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/operator/storage-version-migration.md | 114 +++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 docs/operator/storage-version-migration.md

diff --git a/docs/operator/storage-version-migration.md b/docs/operator/storage-version-migration.md
new file mode 100644
index 0000000000..547beeed62
--- /dev/null
+++ b/docs/operator/storage-version-migration.md
@@ -0,0 +1,114 @@
+# Storage Version Migration
+
+The ToolHive operator ships a `StorageVersionMigrator` controller that keeps every ToolHive CRD's `status.storedVersions` list clean, so a future operator release can drop deprecated API versions (e.g. `v1alpha1`) without orphaning objects in etcd.
+
+## Why this exists
+
+When a CRD graduates from, say, `v1alpha1` to `v1beta1` with both versions served and `v1beta1` as the storage version, existing objects continue to work — they are transparently converted on read/write. But the API server records every version that has ever been used for storage in `CustomResourceDefinition.status.storedVersions`. Until that list is trimmed, the Kubernetes API server refuses to let you remove a version from `spec.versions`, because doing so would orphan any etcd-stored objects encoded at that version.
+
+The cleanup is not automatic. Someone has to re-store every existing object at the current storage version, then explicitly patch `status.storedVersions` to drop the old entry. The `StorageVersionMigrator` controller does this for you, on every opted-in ToolHive CRD, continuously. See [upstream Kubernetes documentation](https://kubernetes.io/docs/tasks/manage-kubernetes-objects/storage-version-migration/) for the mechanism.
+
+## What the controller does
+
+For each opted-in CRD:
+
+1. Reads `spec.versions` to find the entry with `storage: true`.
+2. If `status.storedVersions` already equals `[<currentStorageVersion>]` and only one version is served, nothing to do.
+3. Otherwise, lists every Custom Resource of that kind and issues a metadata-only Server-Side Apply against the `/status` subresource with field manager `thv-storage-version-migrator`. This forces the API server to re-encode each object at the current storage version without triggering admission webhooks (SSA on `/status` typically bypasses webhooks registered on the main resource, and the empty apply owns no fields so it doesn't fight other controllers).
+4. Once every object has been re-stored, patches `CRD.status.storedVersions` to `[<currentStorageVersion>]` using an optimistic-lock merge — so concurrent API-server writes cause a clean retry rather than a silent overwrite.
+
+CRDs without a `/status` subresource fall back to main-resource SSA.
+
+## The opt-in label
+
+A CRD participates in migration only if it carries:
+
+```yaml
+metadata:
+  labels:
+    toolhive.stacklok.dev/auto-migrate-storage-version: "true"
+```
+
+The label is set at CRD-generation time via a kubebuilder marker on each Go type in `cmd/thv-operator/api/v1beta1/`:
+
+```go
+// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true
+type MCPServer struct { ... }
+```
+
+`task operator-manifests` bakes the label into the generated CRD YAML. All current ToolHive root types ship with the marker. A CI test (`TestV1beta1TypesMarkerCoverage`) fails the build if a root type is added without either this marker or an explicit `// +thv:storage-version-migrator:exclude` sibling marker — so the migrator cannot silently forget a new CRD.
+
+Adding a new CRD that should be migrated:
+
+```go
+// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true
+type NewShinyThing struct { ... }
+```
+
+Adding a new CRD that deliberately should NOT be migrated (e.g. an experimental kind that is still stabilising its schema):
+
+```go
+// +thv:storage-version-migrator:exclude
+type ExperimentalThing struct { ... }
+```
+
+## Disabling the controller
+
+Set the Helm feature flag:
+
+```yaml
+operator:
+  features:
+    storageVersionMigrator: false   # default: true
+```
+
+This sets `ENABLE_STORAGE_VERSION_MIGRATOR=false` on the operator Deployment, and the reconciler is not registered with the manager.
+
+Disable only if you are running an external migrator such as [kube-storage-version-migrator](https://github.com/kubernetes-sigs/kube-storage-version-migrator). Disabling without a replacement is a footgun: the next ToolHive release that removes a deprecated API version will refuse to apply its CRD update until `storedVersions` is cleaned, and you will have to clean it yourself.
+
+## Per-CRD emergency escape hatch
+
+Removing the label on a live cluster excludes that single CRD from migration immediately:
+
+```bash
+kubectl label crd/mcpservers.toolhive.stacklok.dev \
+  toolhive.stacklok.dev/auto-migrate-storage-version-
+```
+
+Intended for incident response only. If you deploy the operator via GitOps (Argo CD, Flux) or `helm upgrade`, the chart will re-apply the chart-set label within seconds. Use the `storageVersionMigrator` feature flag for long-term opt-out.
+
+## Interaction with version removal releases
+
+The `StorageVersionMigrator` must have had time to run against your cluster *before* an operator release that drops a deprecated CRD version ships. The typical sequence is:
+
+1. **Release N**: both versions served, newer version is storage, `StorageVersionMigrator` enabled. The controller quietly re-stores all objects and trims `storedVersions` on every cluster during this deprecation window.
+2. **Release N+1+**: the deprecated version is removed from `spec.versions`. Because every cluster's `storedVersions` was already cleaned in the previous release, the CRD update applies cleanly.
+
+If your cluster upgraded directly from a pre-migrator release to the version-removal release without ever running release N, you must clean `storedVersions` manually (or deploy `kube-storage-version-migrator` once) before the upgrade can succeed.
+
+## Verification
+
+For any ToolHive CRD in a cluster where the controller has run:
+
+```bash
+kubectl get crd mcpservers.toolhive.stacklok.dev \
+  -o jsonpath='{.status.storedVersions}'
+# ["v1beta1"]
+```
+
+If the list contains more than one entry, the controller has not yet finished migrating — check operator logs for reconcile errors and the `StorageVersionMigrationFailed` event on the CRD.
+
+## RBAC
+
+The controller requires (generated from kubebuilder markers, applied by the operator Helm chart):
+
+- `customresourcedefinitions.apiextensions.k8s.io`: `get`, `list`, `watch`
+- `customresourcedefinitions/status.apiextensions.k8s.io`: `update`, `patch`
+- `*.toolhive.stacklok.dev`: `get`, `list`, `patch`
+- `*/status.toolhive.stacklok.dev`: `patch`
+
+## Related
+
+- Issue: [stacklok/toolhive#4969](https://github.com/stacklok/toolhive/issues/4969)
+- Kubernetes CRD versioning: [official docs](https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definition-versioning/)
+- Reference implementation: [kubernetes-sigs/cluster-api `crdmigrator`](https://github.com/kubernetes-sigs/cluster-api/tree/main/controllers/crdmigrator)

From 45a541749323f8a626f0a98cb68e64a133db0cf3 Mon Sep 17 00:00:00 2001
From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com>
Date: Wed, 20 May 2026 23:58:15 +0100
Subject: [PATCH 3/3] Add upgrade walkthrough + CR fixtures to
 docs/operator/upgrade-guide
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

End-to-end reproducible verification for the StorageVersionMigrator:
deploy v0.21.0 (last v1alpha1-only release), create one CR of each of
the 12 toolhive kinds at v1alpha1, upgrade to multi-version CRDs +
this branch's operator with the migrator explicitly disabled, re-apply
at v1beta1, confirm storedVersions is stuck at [v1alpha1, v1beta1]
on all 12 CRDs. Then enable the migrator and watch all 12 converge to
[v1beta1] within ~1s.

Includes an optional step 11 that simulates the next release dropping
v1alpha1 from spec.versions — proving the migrator's storedVersions
trim is what unblocks that change.

This is what was actually run to validate PR #5011 against a real kind
cluster. Putting it in the docs tree (not the worktree root) so it's
discoverable and reproducible after the PR merges.

CR fixtures (crs-v1alpha1.yaml and crs-v1beta1.yaml) originate from
the graduate-crds upgrade-test fixtures; copying them in keeps this
guide self-contained.

Part of #4969.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/operator/upgrade-guide/README.md         | 323 ++++++++++++++++++
 docs/operator/upgrade-guide/crs-v1alpha1.yaml | 149 ++++++++
 docs/operator/upgrade-guide/crs-v1beta1.yaml  | 149 ++++++++
 3 files changed, 621 insertions(+)
 create mode 100644 docs/operator/upgrade-guide/README.md
 create mode 100644 docs/operator/upgrade-guide/crs-v1alpha1.yaml
 create mode 100644 docs/operator/upgrade-guide/crs-v1beta1.yaml

diff --git a/docs/operator/upgrade-guide/README.md b/docs/operator/upgrade-guide/README.md
new file mode 100644
index 0000000000..845a57bc2e
--- /dev/null
+++ b/docs/operator/upgrade-guide/README.md
@@ -0,0 +1,323 @@
+# Upgrade walkthrough — v1alpha1 → v1beta1 with StorageVersionMigrator
+
+End-to-end manual walkthrough that replays a real user upgrade against a local kind cluster: install the previous v0.21.0 release, create a CR of each of the 12 toolhive CRD kinds at `v1alpha1`, upgrade to the new multi-version chart, deploy this branch's operator with the migrator **disabled**, re-apply the CRs at `v1beta1`, and confirm `status.storedVersions` is stuck at `[v1alpha1, v1beta1]` on every CRD. Then enable the `StorageVersionMigrator` and confirm it converges every CRD to `[v1beta1]`.
+
+Total run time: ~30 minutes. The slow part is the first `ko build` of the operator + proxyrunner + vmcp images (~3 min); subsequent runs use the build cache and finish in ~30s.
+
+This guide is the canonical reproducible verification for the migrator. Companion reading:
+
+- [`docs/operator/storage-version-migration.md`](../storage-version-migration.md) — reference docs for the controller itself (label contract, opt-out, mechanism).
+- [Issue #4969](https://github.com/stacklok/toolhive/issues/4969) — the motivating problem.
+
+## Prerequisites
+
+- `kind`, `kubectl`, `helm`, `ko`, `task` on PATH (`go install github.com/google/ko@latest` for ko)
+- Working directory: the repo root (`task operator-deploy-local` and the relative chart paths assume this).
+- Cluster name is `toolhive`. If you already have a cluster with that name, delete it first or run from a different worktree.
+
+The CR fixtures used below ship alongside this doc:
+
+- [`crs-v1alpha1.yaml`](./crs-v1alpha1.yaml) — one CR of each of the 12 kinds at `v1alpha1`
+- [`crs-v1beta1.yaml`](./crs-v1beta1.yaml) — byte-identical to the v1alpha1 file except for `apiVersion`, simulating what `sed -i 's/v1alpha1/v1beta1/g'` would produce
+
+---
+
+## 0 · Set up the cluster
+
+```bash
+# If you already have a "toolhive" kind cluster from a previous run, delete it
+kind delete cluster --name toolhive 2>/dev/null
+
+kind create cluster --name toolhive --wait 60s
+kind get kubeconfig --name toolhive > kconfig.yaml
+export KUBECONFIG=$(pwd)/kconfig.yaml
+```
+
+## 1 · Install v0.21.0 (the last v1alpha1-only release)
+
+```bash
+helm install toolhive-operator-crds \
+  oci://ghcr.io/stacklok/toolhive/toolhive-operator-crds \
+  --version 0.21.0 --wait
+
+helm install toolhive-operator \
+  oci://ghcr.io/stacklok/toolhive/toolhive-operator \
+  --version 0.21.0 \
+  --namespace toolhive-system --create-namespace --wait
+
+kubectl get crd mcpservers.toolhive.stacklok.dev -o jsonpath='{.spec.versions[*].name}'
+# expected: v1alpha1   (only one version)
+
+kubectl wait --for=condition=Available deployment -n toolhive-system --all --timeout=180s
+```
+
+## 2 · Create one CR of each of the 12 kinds at v1alpha1
+
+```bash
+kubectl create namespace upgrade-test
+kubectl apply -f docs/operator/upgrade-guide/crs-v1alpha1.yaml
+
+# Confirm all 12 landed
+kubectl get \
+  mcpservers,mcpremoteproxies,mcptoolconfigs,mcpgroups,embeddingservers,mcpregistries,virtualmcpservers,virtualmcpcompositetooldefinitions,mcpoidcconfigs,mcptelemetryconfigs,mcpexternalauthconfigs,mcpserverentries \
+  -n upgrade-test --no-headers | wc -l
+# expected: 12
+```
+
+## 3 · Let the old operator reconcile + capture the baseline
+
+```bash
+sleep 60
+kubectl get deployments -n upgrade-test
+# expected: up to 5 Deployments — test-server, test-remote-proxy, test-virtual-server,
+# test-registry-api, and (sometimes) test-embedding shows as a StatefulSet not a Deployment
+
+# Snapshot the UIDs for later comparison
+kubectl get deployments -n upgrade-test \
+  -o jsonpath='{range .items[*]}{.metadata.name}={.metadata.uid}{"\n"}{end}' \
+  | sort > /tmp/before-upgrade.txt
+cat /tmp/before-upgrade.txt
+```
+
+These UIDs are the canary. If they change after the operator upgrade, a workload was recreated → downtime.
+
+## 4 · Upgrade the CRDs chart to multi-version
+
+```bash
+helm upgrade toolhive-operator-crds deploy/charts/operator-crds --wait --timeout 120s
+
+kubectl get crd mcpservers.toolhive.stacklok.dev -o jsonpath='{.spec.versions[*].name}'
+# expected: v1alpha1 v1beta1
+
+kubectl get crd mcpservers.toolhive.stacklok.dev -o jsonpath='{.spec.versions[?(@.storage==true)].name}'
+# expected: v1beta1
+```
+
+## 5 · Build the new operator + deploy with the migrator DISABLED
+
+This is the key deviation from `task operator-deploy-local` — we want to control the feature flag, so we run ko + helm manually rather than using the task.
+
+```bash
+# ~3 minutes on first run, ~30s with build cache
+OP=$(KO_DOCKER_REPO=kind.local ko build --local -B ./cmd/thv-operator | tail -1)
+PR=$(KO_DOCKER_REPO=kind.local ko build --local -B ./cmd/thv-proxyrunner | tail -1)
+VM=$(KO_DOCKER_REPO=kind.local ko build --local -B ./cmd/vmcp | tail -1)
+
+# Load all three into the kind cluster
+kind load docker-image --name toolhive "$OP"
+kind load docker-image --name toolhive "$PR"
+kind load docker-image --name toolhive "$VM"
+
+# Persist for later steps
+echo "$OP" > /tmp/img-op
+echo "$PR" > /tmp/img-pr
+echo "$VM" > /tmp/img-vm
+
+# Helm upgrade with migrator EXPLICITLY DISABLED
+helm upgrade toolhive-operator deploy/charts/operator \
+  --set operator.image="$OP" \
+  --set operator.toolhiveRunnerImage="$PR" \
+  --set operator.vmcpImage="$VM" \
+  --set operator.features.storageVersionMigrator=false \
+  --namespace toolhive-system
+
+kubectl rollout status deployment -n toolhive-system --timeout=180s
+
+# Confirm flag took effect
+NEW_POD=$(kubectl get pods -n toolhive-system --no-headers | grep "toolhive-operator-" | awk '{print $1}' | head -1)
+kubectl get pod "$NEW_POD" -n toolhive-system -o yaml | grep -A1 ENABLE_STORAGE_VERSION_MIGRATOR
+# expected: value: "false"
+
+kubectl logs "$NEW_POD" -n toolhive-system | grep "ENABLE_STORAGE_VERSION_MIGRATOR is disabled"
+# expected: one line — "ENABLE_STORAGE_VERSION_MIGRATOR is disabled, skipping StorageVersionMigrator controller"
+```
+
+## 6 · Verify zero downtime — Deployment UIDs unchanged
+
+```bash
+kubectl get deployments -n upgrade-test \
+  -o jsonpath='{range .items[*]}{.metadata.name}={.metadata.uid}{"\n"}{end}' \
+  | sort > /tmp/after-upgrade.txt
+
+diff /tmp/before-upgrade.txt /tmp/after-upgrade.txt && echo "All Deployment UIDs unchanged"
+```
+
+## 7 · Re-apply all 12 CRs at v1beta1 (the user migration step)
+
+```bash
+kubectl apply -f docs/operator/upgrade-guide/crs-v1beta1.yaml
+# expected: 12 "configured" lines (not "created")
+
+# Wait for the operator to observe each update
+sleep 10
+```
+
+## 8 · Confirm storedVersions is stuck at `[v1alpha1, v1beta1]` on all 12 CRDs (migrator is OFF)
+
+```bash
+echo "=== storedVersions on ALL 12 CRDs (migrator OFF) ==="
+for crd in $(kubectl get crd -o name | grep toolhive.stacklok.dev | sort); do
+  name=$(echo $crd | sed 's|.*/||')
+  stored=$(kubectl get $crd -o jsonpath='{.status.storedVersions}')
+  printf "  %-55s %s\n" "$name" "$stored"
+done
+```
+
+Expected: every line ends with `["v1alpha1","v1beta1"]`. This is the "post-graduation, pre-migration" state every cluster lands in after the v0.21.0 → multi-version upgrade. **Without the migrator, this state is permanent** — any future operator release that drops `v1alpha1` from `spec.versions` would fail with:
+
+```
+status.storedVersions[0]: Invalid value: "v1alpha1": must appear in spec.versions
+```
+
+## 9 · Enable the migrator + watch it converge
+
+Helm upgrade to flip the feature flag:
+
+```bash
+OP=$(cat /tmp/img-op)
+PR=$(cat /tmp/img-pr)
+VM=$(cat /tmp/img-vm)
+
+helm upgrade toolhive-operator deploy/charts/operator \
+  --set operator.image="$OP" \
+  --set operator.toolhiveRunnerImage="$PR" \
+  --set operator.vmcpImage="$VM" \
+  --namespace toolhive-system
+
+kubectl rollout status deployment -n toolhive-system --timeout=180s
+
+# Confirm flag is now true
+NEW_POD=$(kubectl get pods -n toolhive-system --no-headers | grep "toolhive-operator-" | awk '{print $1}' | head -1)
+kubectl get pod "$NEW_POD" -n toolhive-system -o yaml | grep -A1 ENABLE_STORAGE_VERSION_MIGRATOR
+# expected: value: "true"
+```
+
+Wait for convergence:
+
+```bash
+echo "=== waiting up to 60s for all 12 CRDs to converge ==="
+for i in $(seq 1 60); do
+  count=$(for c in $(kubectl get crd -o name | grep toolhive.stacklok.dev); do
+    kubectl get $c -o jsonpath='{.status.storedVersions}'
+    echo
+  done | grep -c '^\["v1beta1"\]$')
+  if [ "$count" = "12" ]; then
+    echo "All 12 CRDs converged after ${i}s"
+    break
+  fi
+  sleep 1
+done
+```
+
+In practice this completes in ~1-2 seconds once the new pod is ready.
+
+Verify:
+
+```bash
+echo "=== storedVersions on ALL 12 CRDs (migrator ON, post-converge) ==="
+for crd in $(kubectl get crd -o name | grep toolhive.stacklok.dev | sort); do
+  name=$(echo $crd | sed 's|.*/||')
+  stored=$(kubectl get $crd -o jsonpath='{.status.storedVersions}')
+  printf "  %-55s %s\n" "$name" "$stored"
+done
+# expected: every line ends with ["v1beta1"]
+
+echo "=== StorageVersionMigrationSucceeded events ==="
+kubectl get events -A --field-selector reason=StorageVersionMigrationSucceeded --no-headers | wc -l
+# expected: 12 — one event per CRD
+
+echo "=== StorageVersionMigrationFailed events (should be 0) ==="
+kubectl get events -A --field-selector reason=StorageVersionMigrationFailed --no-headers | wc -l
+# expected: 0
+```
+
+Confirm CRs still healthy (admission webhooks on MCPServer / MCPGroup / VirtualMCPServer all accepted the per-CR Updates):
+
+```bash
+kubectl get \
+  mcpservers,mcpremoteproxies,mcptoolconfigs,mcpgroups,embeddingservers,mcpregistries,virtualmcpservers,virtualmcpcompositetooldefinitions,mcpoidcconfigs,mcptelemetryconfigs,mcpexternalauthconfigs,mcpserverentries \
+  -n upgrade-test --no-headers | wc -l
+# expected: 12
+```
+
+## 10 · (Optional) Inspect migrator logs
+
+```bash
+NEW_POD=$(kubectl get pods -n toolhive-system --no-headers | grep "toolhive-operator-" | awk '{print $1}' | head -1)
+kubectl logs "$NEW_POD" -n toolhive-system | grep "storage version migration complete" | wc -l
+# expected: 12 lines — one per CRD
+```
+
+**If this prints 0**, the migration may have happened in a previous container instance (the operator pod can restart for unrelated reasons in a kind cluster — leases, OOM, etc.). Try the previous container's logs:
+
+```bash
+kubectl logs "$NEW_POD" -n toolhive-system --previous | grep "storage version migration complete" | wc -l
+```
+
+The migration is complete in either case — the events on the CRDs in step 9 are the authoritative signal.
+
+## 11 · (Optional) Simulate the next release that drops v1alpha1
+
+Once `storedVersions` is `[v1beta1]` everywhere, the apiserver will accept removal of `v1alpha1` from `spec.versions` — the safety interlock the migrator exists to satisfy. To demonstrate:
+
+```bash
+# Direct apiserver patch — the same end state a future "drop v1alpha1" chart would produce.
+for crd in $(kubectl get crd -o name | grep toolhive.stacklok.dev); do
+  name=$(echo $crd | sed 's|.*/||')
+  newversions=$(kubectl get $crd -o json | jq '.spec.versions | map(select(.name != "v1alpha1"))')
+  patch=$(jq -n --argjson v "$newversions" '{spec:{versions:$v}}')
+  printf "  %-55s " "$name"
+  kubectl patch $crd --type=merge -p "$patch" 2>&1 | tail -1
+done
+```
+
+Every line should say `... patched`. Verify v1alpha1 access now fails:
+
+```bash
+kubectl get mcpgroups.v1alpha1.toolhive.stacklok.dev test-group -n upgrade-test
+# expected: error — the server doesn't have a resource type "mcpgroups"
+
+kubectl get mcpgroups.v1beta1.toolhive.stacklok.dev test-group -n upgrade-test
+# expected: the resource
+```
+
+**Negative test**: if you skip step 9 (the migrator pass) and jump straight to step 11, every `kubectl patch` will fail with:
+
+```
+Invalid value: "v1alpha1": must appear in spec.versions
+```
+
+That's the apiserver wall the migrator exists to clear.
+
+## 12 · Cleanup
+
+```bash
+kind delete cluster --name toolhive
+rm -f kconfig.yaml /tmp/before-upgrade.txt /tmp/after-upgrade.txt \
+      /tmp/img-op /tmp/img-pr /tmp/img-vm
+```
+
+---
+
+## Summary of what's verified
+
+| Check | Validates | Step |
+|---|---|---|
+| Operator startup logs show migrator skipped when disabled | The `setupStorageVersionMigrator` branch in `app/app.go` honours `ENABLE_STORAGE_VERSION_MIGRATOR=false` | 5 |
+| Zero-downtime upgrade across operator chart upgrade | The PR's operator changes don't recreate any managed workload | 6 |
+| `storedVersions` is `[v1alpha1, v1beta1]` after re-apply with migrator OFF | Migrator did not run; baseline state is correctly preserved | 8 |
+| Helm-upgrade flip enables the migrator | Feature flag round-trips correctly | 9 |
+| `storedVersions` converges to `[v1beta1]` on all 12 CRDs | The actual migration logic works against real ToolHive CRDs | 9 |
+| 12 `StorageVersionMigrationSucceeded` events on the CRDs | The Recorder is correctly wired and per-CRD migrations are observable | 9 |
+| 0 `StorageVersionMigrationFailed` events | No CRD's per-CR Update loop returned a non-retriable error | 9 |
+| All 12 CRs still healthy after migration | Validating admission webhooks (MCPServer / MCPGroup / VirtualMCPServer) tolerate the per-CR Updates | 9 |
+| RBAC permits storedVersions trim | The ClusterRole has the correct verbs for `customresourcedefinitions/status` and `*.toolhive.stacklok.dev/*` | implicit — trim succeeds in step 9 |
+| Apiserver permits `v1alpha1` removal from `spec.versions` once `storedVersions` is clean | The deprecation chain end-to-end works | 11 |
+
+## What this does NOT cover
+
+- **Mid-migration crash recovery**: no test forces the operator to crash mid-loop. Envtest covers the conflict-handling and re-encode-failure paths separately.
+- **Pagination at real-cluster scale**: 12 CRs is well below the 500-default page size. The envtest suite explicitly tests the continue-token loop with 7 CRs at `PageSize=3`.
+- **Operator restart resilience under load**: kind clusters are resource-limited and the operator pod sometimes restarts during tests for unrelated reasons (the `--previous` log fallback in step 10 covers this).
+
+These are covered by the envtest suite at `cmd/thv-operator/test-integration/storageversionmigrator/`. This walkthrough covers what envtest can't: helm chart wiring, real ToolHive CRD schemas with their actual webhooks, and the full upgrade sequence a real user would run.
diff --git a/docs/operator/upgrade-guide/crs-v1alpha1.yaml b/docs/operator/upgrade-guide/crs-v1alpha1.yaml
new file mode 100644
index 0000000000..ec1a2d0aa5
--- /dev/null
+++ b/docs/operator/upgrade-guide/crs-v1alpha1.yaml
@@ -0,0 +1,149 @@
+# All 12 ToolHive CR kinds at v1alpha1.
+# Apply with:
+#   kubectl create namespace upgrade-test
+#   kubectl apply -f crs-v1alpha1.yaml
+#
+# Note: MCPServerEntry references MCPGroup (groupRef.name: test-group),
+# so MCPGroup must be applied first. kubectl handles ordering within a
+# single file by best-effort, and since all refs here are by name (not
+# UID), no strict ordering is required — missing refs just stay pending.
+---
+apiVersion: toolhive.stacklok.dev/v1alpha1
+kind: MCPGroup
+metadata:
+  name: test-group
+  namespace: upgrade-test
+spec:
+  description: Test group for upgrade validation
+---
+apiVersion: toolhive.stacklok.dev/v1alpha1
+kind: MCPServer
+metadata:
+  name: test-server
+  namespace: upgrade-test
+spec:
+  image: ghcr.io/github/github-mcp-server
+  transport: stdio
+---
+apiVersion: toolhive.stacklok.dev/v1alpha1
+kind: MCPRemoteProxy
+metadata:
+  name: test-remote-proxy
+  namespace: upgrade-test
+spec:
+  remoteUrl: https://example.com/mcp
+  transport: streamable-http
+---
+apiVersion: toolhive.stacklok.dev/v1alpha1
+kind: MCPToolConfig
+metadata:
+  name: test-tool-config
+  namespace: upgrade-test
+spec:
+  toolsFilter:
+  - allowed_tool
+---
+apiVersion: toolhive.stacklok.dev/v1alpha1
+kind: EmbeddingServer
+metadata:
+  name: test-embedding
+  namespace: upgrade-test
+spec:
+  image: ghcr.io/stacklok/test-embedding:latest
+  model: test-model
+---
+apiVersion: toolhive.stacklok.dev/v1alpha1
+kind: MCPRegistry
+metadata:
+  name: test-registry
+  namespace: upgrade-test
+spec:
+  displayName: Test Registry
+  configYAML: |
+    sources:
+      - name: k8s
+        format: upstream
+        kubernetes: {}
+    registries:
+      - name: default
+        sources: ["k8s"]
+    database:
+      host: postgres
+      port: 5432
+      user: db_app
+      database: registry
+    auth:
+      mode: anonymous
+---
+apiVersion: toolhive.stacklok.dev/v1alpha1
+kind: MCPOIDCConfig
+metadata:
+  name: test-oidc
+  namespace: upgrade-test
+spec:
+  type: inline
+  inline:
+    issuer: https://auth.example.com
+    jwksUrl: https://auth.example.com/.well-known/jwks.json
+---
+apiVersion: toolhive.stacklok.dev/v1alpha1
+kind: MCPTelemetryConfig
+metadata:
+  name: test-telemetry
+  namespace: upgrade-test
+spec:
+  openTelemetry:
+    endpoint: http://otel-collector:4317
+    metrics:
+      enabled: true
+    tracing:
+      enabled: true
+---
+apiVersion: toolhive.stacklok.dev/v1alpha1
+kind: MCPExternalAuthConfig
+metadata:
+  name: test-external-auth
+  namespace: upgrade-test
+spec:
+  type: tokenExchange
+  tokenExchange:
+    tokenUrl: https://auth.example.com/token
+    audience: https://api.example.com
+    clientId: test-client
+    clientSecretRef:
+      name: test-secret
+      key: client-secret
+---
+apiVersion: toolhive.stacklok.dev/v1alpha1
+kind: VirtualMCPServer
+metadata:
+  name: test-virtual-server
+  namespace: upgrade-test
+spec:
+  groupRef:
+    name: test-group
+  incomingAuth:
+    type: anonymous
+---
+apiVersion: toolhive.stacklok.dev/v1alpha1
+kind: VirtualMCPCompositeToolDefinition
+metadata:
+  name: test-composite-tool
+  namespace: upgrade-test
+spec:
+  name: test_workflow
+  description: Test composite workflow
+  steps:
+  - id: step-1
+    tool: placeholder.tool
+---
+apiVersion: toolhive.stacklok.dev/v1alpha1
+kind: MCPServerEntry
+metadata:
+  name: test-server-entry
+  namespace: upgrade-test
+spec:
+  groupRef:
+    name: test-group
+  transport: streamable-http
+  remoteUrl: https://example.com/mcp
diff --git a/docs/operator/upgrade-guide/crs-v1beta1.yaml b/docs/operator/upgrade-guide/crs-v1beta1.yaml
new file mode 100644
index 0000000000..b8570aa141
--- /dev/null
+++ b/docs/operator/upgrade-guide/crs-v1beta1.yaml
@@ -0,0 +1,149 @@
+# All 12 ToolHive CR kinds at v1beta1.
+# Apply with:
+#   kubectl create namespace upgrade-test
+#   kubectl apply -f crs-v1beta1.yaml
+#
+# Note: MCPServerEntry references MCPGroup (groupRef.name: test-group),
+# so MCPGroup must be applied first. kubectl handles ordering within a
+# single file by best-effort, and since all refs here are by name (not
+# UID), no strict ordering is required — missing refs just stay pending.
+---
+apiVersion: toolhive.stacklok.dev/v1beta1
+kind: MCPGroup
+metadata:
+  name: test-group
+  namespace: upgrade-test
+spec:
+  description: Test group for upgrade validation
+---
+apiVersion: toolhive.stacklok.dev/v1beta1
+kind: MCPServer
+metadata:
+  name: test-server
+  namespace: upgrade-test
+spec:
+  image: ghcr.io/github/github-mcp-server
+  transport: stdio
+---
+apiVersion: toolhive.stacklok.dev/v1beta1
+kind: MCPRemoteProxy
+metadata:
+  name: test-remote-proxy
+  namespace: upgrade-test
+spec:
+  remoteUrl: https://example.com/mcp
+  transport: streamable-http
+---
+apiVersion: toolhive.stacklok.dev/v1beta1
+kind: MCPToolConfig
+metadata:
+  name: test-tool-config
+  namespace: upgrade-test
+spec:
+  toolsFilter:
+  - allowed_tool
+---
+apiVersion: toolhive.stacklok.dev/v1beta1
+kind: EmbeddingServer
+metadata:
+  name: test-embedding
+  namespace: upgrade-test
+spec:
+  image: ghcr.io/stacklok/test-embedding:latest
+  model: test-model
+---
+apiVersion: toolhive.stacklok.dev/v1beta1
+kind: MCPRegistry
+metadata:
+  name: test-registry
+  namespace: upgrade-test
+spec:
+  displayName: Test Registry
+  configYAML: |
+    sources:
+      - name: k8s
+        format: upstream
+        kubernetes: {}
+    registries:
+      - name: default
+        sources: ["k8s"]
+    database:
+      host: postgres
+      port: 5432
+      user: db_app
+      database: registry
+    auth:
+      mode: anonymous
+---
+apiVersion: toolhive.stacklok.dev/v1beta1
+kind: MCPOIDCConfig
+metadata:
+  name: test-oidc
+  namespace: upgrade-test
+spec:
+  type: inline
+  inline:
+    issuer: https://auth.example.com
+    jwksUrl: https://auth.example.com/.well-known/jwks.json
+---
+apiVersion: toolhive.stacklok.dev/v1beta1
+kind: MCPTelemetryConfig
+metadata:
+  name: test-telemetry
+  namespace: upgrade-test
+spec:
+  openTelemetry:
+    endpoint: http://otel-collector:4317
+    metrics:
+      enabled: true
+    tracing:
+      enabled: true
+---
+apiVersion: toolhive.stacklok.dev/v1beta1
+kind: MCPExternalAuthConfig
+metadata:
+  name: test-external-auth
+  namespace: upgrade-test
+spec:
+  type: tokenExchange
+  tokenExchange:
+    tokenUrl: https://auth.example.com/token
+    audience: https://api.example.com
+    clientId: test-client
+    clientSecretRef:
+      name: test-secret
+      key: client-secret
+---
+apiVersion: toolhive.stacklok.dev/v1beta1
+kind: VirtualMCPServer
+metadata:
+  name: test-virtual-server
+  namespace: upgrade-test
+spec:
+  groupRef:
+    name: test-group
+  incomingAuth:
+    type: anonymous
+---
+apiVersion: toolhive.stacklok.dev/v1beta1
+kind: VirtualMCPCompositeToolDefinition
+metadata:
+  name: test-composite-tool
+  namespace: upgrade-test
+spec:
+  name: test_workflow
+  description: Test composite workflow
+  steps:
+  - id: step-1
+    tool: placeholder.tool
+---
+apiVersion: toolhive.stacklok.dev/v1beta1
+kind: MCPServerEntry
+metadata:
+  name: test-server-entry
+  namespace: upgrade-test
+spec:
+  groupRef:
+    name: test-group
+  transport: streamable-http
+  remoteUrl: https://example.com/mcp