From 617b21e55e7c76e72cc7beed2b05d0fae6d4a80c Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Wed, 22 Apr 2026 15:19:44 +0100 Subject: [PATCH 1/3] Expose storage-version migrator feature flag in operator helm chart Adds operator.features.storageVersionMigrator (default true) plus the ENABLE_STORAGE_VERSION_MIGRATOR env var wiring in the deployment template, mirroring the existing server/registry/virtualMCP pattern. Helm docs regenerated. Admins who run kube-storage-version-migrator externally can disable the in-operator controller by setting this flag to false. Part of #4969. Co-Authored-By: Claude Opus 4.7 (1M context) --- deploy/charts/operator/README.md | 3 ++- deploy/charts/operator/templates/deployment.yaml | 2 ++ deploy/charts/operator/values.yaml | 7 +++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/deploy/charts/operator/README.md b/deploy/charts/operator/README.md index e90a12452a..3f354f27cb 100644 --- a/deploy/charts/operator/README.md +++ b/deploy/charts/operator/README.md @@ -46,7 +46,7 @@ The command removes all the Kubernetes components associated with the chart and |-----|------|---------|-------------| | fullnameOverride | string | `"toolhive-operator"` | Provide a fully-qualified name override for resources | | nameOverride | string | `""` | Override the name of the chart | -| operator | object | `{"affinity":{},"autoscaling":{"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80},"containerSecurityContext":{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"readOnlyRootFilesystem":true,"runAsNonRoot":true,"runAsUser":1000,"seccompProfile":{"type":"RuntimeDefault"}},"defaultImagePullSecrets":[],"env":[],"features":{"experimental":false},"gc":{"gogc":75,"gomemlimit":"110MiB"},"image":"ghcr.io/stacklok/toolhive/operator:v0.28.3","imagePullPolicy":"IfNotPresent","imagePullSecrets":[],"leaderElectionRole":{"binding":{"name":"toolhive-operator-leader-election-rolebinding"},"name":"toolhive-operator-leader-election-role","rules":[{"apiGroups":[""],"resources":["configmaps"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["coordination.k8s.io"],"resources":["leases"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["events.k8s.io"],"resources":["events"],"verbs":["create","patch"]}]},"livenessProbe":{"httpGet":{"path":"/healthz","port":"health"},"initialDelaySeconds":15,"periodSeconds":20},"nodeSelector":{},"podAnnotations":{},"podLabels":{},"podSecurityContext":{"runAsNonRoot":true},"ports":[{"containerPort":8080,"name":"metrics","protocol":"TCP"},{"containerPort":8081,"name":"health","protocol":"TCP"}],"proxyHost":"0.0.0.0","rbac":{"allowedNamespaces":[],"scope":"cluster"},"readinessProbe":{"httpGet":{"path":"/readyz","port":"health"},"initialDelaySeconds":5,"periodSeconds":10},"replicaCount":1,"resources":{"limits":{"cpu":"500m","memory":"128Mi"},"requests":{"cpu":"10m","memory":"64Mi"}},"serviceAccount":{"annotations":{},"automountServiceAccountToken":true,"create":true,"labels":{},"name":"toolhive-operator"},"tolerations":[],"toolhiveRunnerImage":"ghcr.io/stacklok/toolhive/proxyrunner:v0.28.3","vmcpImage":"ghcr.io/stacklok/toolhive/vmcp:v0.28.3","volumeMounts":[],"volumes":[]}` | All values for the operator deployment and associated resources | +| operator | object | `{"affinity":{},"autoscaling":{"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80},"containerSecurityContext":{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"readOnlyRootFilesystem":true,"runAsNonRoot":true,"runAsUser":1000,"seccompProfile":{"type":"RuntimeDefault"}},"defaultImagePullSecrets":[],"env":[],"features":{"experimental":false,"storageVersionMigrator":true},"gc":{"gogc":75,"gomemlimit":"110MiB"},"image":"ghcr.io/stacklok/toolhive/operator:v0.28.3","imagePullPolicy":"IfNotPresent","imagePullSecrets":[],"leaderElectionRole":{"binding":{"name":"toolhive-operator-leader-election-rolebinding"},"name":"toolhive-operator-leader-election-role","rules":[{"apiGroups":[""],"resources":["configmaps"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["coordination.k8s.io"],"resources":["leases"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["events.k8s.io"],"resources":["events"],"verbs":["create","patch"]}]},"livenessProbe":{"httpGet":{"path":"/healthz","port":"health"},"initialDelaySeconds":15,"periodSeconds":20},"nodeSelector":{},"podAnnotations":{},"podLabels":{},"podSecurityContext":{"runAsNonRoot":true},"ports":[{"containerPort":8080,"name":"metrics","protocol":"TCP"},{"containerPort":8081,"name":"health","protocol":"TCP"}],"proxyHost":"0.0.0.0","rbac":{"allowedNamespaces":[],"scope":"cluster"},"readinessProbe":{"httpGet":{"path":"/readyz","port":"health"},"initialDelaySeconds":5,"periodSeconds":10},"replicaCount":1,"resources":{"limits":{"cpu":"500m","memory":"128Mi"},"requests":{"cpu":"10m","memory":"64Mi"}},"serviceAccount":{"annotations":{},"automountServiceAccountToken":true,"create":true,"labels":{},"name":"toolhive-operator"},"tolerations":[],"toolhiveRunnerImage":"ghcr.io/stacklok/toolhive/proxyrunner:v0.28.3","vmcpImage":"ghcr.io/stacklok/toolhive/vmcp:v0.28.3","volumeMounts":[],"volumes":[]}` | All values for the operator deployment and associated resources | | operator.affinity | object | `{}` | Affinity settings for the operator pod | | operator.autoscaling | object | `{"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}` | Configuration for horizontal pod autoscaling | | operator.autoscaling.enabled | bool | `false` | Enable autoscaling for the operator | @@ -57,6 +57,7 @@ The command removes all the Kubernetes components associated with the chart and | operator.defaultImagePullSecrets | list | `[]` | List of image pull secrets that the operator applies as defaults to every workload it spawns (proxy runners, vMCP servers, registry API, etc.). Per-CR `imagePullSecrets` take precedence on name collisions; chart-level entries are appended additively. The operator parses these once at startup from the TOOLHIVE_DEFAULT_IMAGE_PULL_SECRETS environment variable. The Secrets must exist in the namespace where each workload is created. Each entry may be either a plain string (the Secret name) or an object with a `name` field, e.g.: defaultImagePullSecrets: - regcred - name: otherscred The two shapes are equivalent; the object form matches `operator.imagePullSecrets` above for convenience. | | operator.env | list | `[]` | Environment variables to set in the operator container. Supported toolhive-specific variables include: - TOOLHIVE_SKIP_UPDATE_CHECK: set to "true" to disable the operator's periodic update check against the ToolHive update API. Also disables the usage-metrics collection that is gated on the same check. | | operator.features.experimental | bool | `false` | Enable experimental features | +| operator.features.storageVersionMigrator | bool | `true` | Enable the StorageVersionMigrator controller, which auto-cleans status.storedVersions on opted-in toolhive.stacklok.dev CRDs so a future release can drop deprecated versions (e.g. v1alpha1) without orphaning etcd objects. Leave this on unless you are running kube-storage-version-migrator externally. This automatically sets ENABLE_STORAGE_VERSION_MIGRATOR environment variable. | | operator.gc | object | `{"gogc":75,"gomemlimit":"110MiB"}` | Go memory limits and garbage collection percentage for the operator container | | operator.gc.gogc | int | `75` | Go garbage collection percentage for the operator container | | operator.gc.gomemlimit | string | `"110MiB"` | Go memory limits for the operator container | diff --git a/deploy/charts/operator/templates/deployment.yaml b/deploy/charts/operator/templates/deployment.yaml index fa81ad160b..10c69c64e3 100644 --- a/deploy/charts/operator/templates/deployment.yaml +++ b/deploy/charts/operator/templates/deployment.yaml @@ -68,6 +68,8 @@ spec: value: "true" - name: ENABLE_EXPERIMENTAL_FEATURES value: {{ .Values.operator.features.experimental | quote }} + - name: ENABLE_STORAGE_VERSION_MIGRATOR + value: {{ .Values.operator.features.storageVersionMigrator | quote }} {{- if eq .Values.operator.rbac.scope "namespace" }} - name: WATCH_NAMESPACE value: "{{ .Values.operator.rbac.allowedNamespaces | join "," }}" diff --git a/deploy/charts/operator/values.yaml b/deploy/charts/operator/values.yaml index ca2f342d47..6864cfb051 100644 --- a/deploy/charts/operator/values.yaml +++ b/deploy/charts/operator/values.yaml @@ -8,6 +8,13 @@ operator: features: # -- Enable experimental features experimental: false + # -- Enable the StorageVersionMigrator controller, which auto-cleans + # status.storedVersions on opted-in toolhive.stacklok.dev CRDs so a + # future release can drop deprecated versions (e.g. v1alpha1) without + # orphaning etcd objects. Leave this on unless you are running + # kube-storage-version-migrator externally. + # This automatically sets ENABLE_STORAGE_VERSION_MIGRATOR environment variable. + storageVersionMigrator: true # -- Number of replicas for the operator deployment replicaCount: 1 From c47904aab6c017aa48fec405977ce4cdf0ec512f Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Wed, 22 Apr 2026 15:21:11 +0100 Subject: [PATCH 2/3] Document storage version migration for operator users Covers what the controller does, the opt-in label contract, how to disable the controller operator-wide, the per-CRD escape hatch and its interaction with GitOps, how the migrator interacts with future version-removal releases, and the required RBAC. Part of #4969. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/operator/storage-version-migration.md | 114 +++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 docs/operator/storage-version-migration.md diff --git a/docs/operator/storage-version-migration.md b/docs/operator/storage-version-migration.md new file mode 100644 index 0000000000..547beeed62 --- /dev/null +++ b/docs/operator/storage-version-migration.md @@ -0,0 +1,114 @@ +# Storage Version Migration + +The ToolHive operator ships a `StorageVersionMigrator` controller that keeps every ToolHive CRD's `status.storedVersions` list clean, so a future operator release can drop deprecated API versions (e.g. `v1alpha1`) without orphaning objects in etcd. + +## Why this exists + +When a CRD graduates from, say, `v1alpha1` to `v1beta1` with both versions served and `v1beta1` as the storage version, existing objects continue to work — they are transparently converted on read/write. But the API server records every version that has ever been used for storage in `CustomResourceDefinition.status.storedVersions`. Until that list is trimmed, the Kubernetes API server refuses to let you remove a version from `spec.versions`, because doing so would orphan any etcd-stored objects encoded at that version. + +The cleanup is not automatic. Someone has to re-store every existing object at the current storage version, then explicitly patch `status.storedVersions` to drop the old entry. The `StorageVersionMigrator` controller does this for you, on every opted-in ToolHive CRD, continuously. See [upstream Kubernetes documentation](https://kubernetes.io/docs/tasks/manage-kubernetes-objects/storage-version-migration/) for the mechanism. + +## What the controller does + +For each opted-in CRD: + +1. Reads `spec.versions` to find the entry with `storage: true`. +2. If `status.storedVersions` already equals `[]` and only one version is served, nothing to do. +3. Otherwise, lists every Custom Resource of that kind and issues a metadata-only Server-Side Apply against the `/status` subresource with field manager `thv-storage-version-migrator`. This forces the API server to re-encode each object at the current storage version without triggering admission webhooks (SSA on `/status` typically bypasses webhooks registered on the main resource, and the empty apply owns no fields so it doesn't fight other controllers). +4. Once every object has been re-stored, patches `CRD.status.storedVersions` to `[]` using an optimistic-lock merge — so concurrent API-server writes cause a clean retry rather than a silent overwrite. + +CRDs without a `/status` subresource fall back to main-resource SSA. + +## The opt-in label + +A CRD participates in migration only if it carries: + +```yaml +metadata: + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" +``` + +The label is set at CRD-generation time via a kubebuilder marker on each Go type in `cmd/thv-operator/api/v1beta1/`: + +```go +// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true +type MCPServer struct { ... } +``` + +`task operator-manifests` bakes the label into the generated CRD YAML. All current ToolHive root types ship with the marker. A CI test (`TestV1beta1TypesMarkerCoverage`) fails the build if a root type is added without either this marker or an explicit `// +thv:storage-version-migrator:exclude` sibling marker — so the migrator cannot silently forget a new CRD. + +Adding a new CRD that should be migrated: + +```go +// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true +type NewShinyThing struct { ... } +``` + +Adding a new CRD that deliberately should NOT be migrated (e.g. an experimental kind that is still stabilising its schema): + +```go +// +thv:storage-version-migrator:exclude +type ExperimentalThing struct { ... } +``` + +## Disabling the controller + +Set the Helm feature flag: + +```yaml +operator: + features: + storageVersionMigrator: false # default: true +``` + +This sets `ENABLE_STORAGE_VERSION_MIGRATOR=false` on the operator Deployment, and the reconciler is not registered with the manager. + +Disable only if you are running an external migrator such as [kube-storage-version-migrator](https://github.com/kubernetes-sigs/kube-storage-version-migrator). Disabling without a replacement is a footgun: the next ToolHive release that removes a deprecated API version will refuse to apply its CRD update until `storedVersions` is cleaned, and you will have to clean it yourself. + +## Per-CRD emergency escape hatch + +Removing the label on a live cluster excludes that single CRD from migration immediately: + +```bash +kubectl label crd/mcpservers.toolhive.stacklok.dev \ + toolhive.stacklok.dev/auto-migrate-storage-version- +``` + +Intended for incident response only. If you deploy the operator via GitOps (Argo CD, Flux) or `helm upgrade`, the chart will re-apply the chart-set label within seconds. Use the `storageVersionMigrator` feature flag for long-term opt-out. + +## Interaction with version removal releases + +The `StorageVersionMigrator` must have had time to run against your cluster *before* an operator release that drops a deprecated CRD version ships. The typical sequence is: + +1. **Release N**: both versions served, newer version is storage, `StorageVersionMigrator` enabled. The controller quietly re-stores all objects and trims `storedVersions` on every cluster during this deprecation window. +2. **Release N+1+**: the deprecated version is removed from `spec.versions`. Because every cluster's `storedVersions` was already cleaned in the previous release, the CRD update applies cleanly. + +If your cluster upgraded directly from a pre-migrator release to the version-removal release without ever running release N, you must clean `storedVersions` manually (or deploy `kube-storage-version-migrator` once) before the upgrade can succeed. + +## Verification + +For any ToolHive CRD in a cluster where the controller has run: + +```bash +kubectl get crd mcpservers.toolhive.stacklok.dev \ + -o jsonpath='{.status.storedVersions}' +# ["v1beta1"] +``` + +If the list contains more than one entry, the controller has not yet finished migrating — check operator logs for reconcile errors and the `StorageVersionMigrationFailed` event on the CRD. + +## RBAC + +The controller requires (generated from kubebuilder markers, applied by the operator Helm chart): + +- `customresourcedefinitions.apiextensions.k8s.io`: `get`, `list`, `watch` +- `customresourcedefinitions/status.apiextensions.k8s.io`: `update`, `patch` +- `*.toolhive.stacklok.dev`: `get`, `list`, `patch` +- `*/status.toolhive.stacklok.dev`: `patch` + +## Related + +- Issue: [stacklok/toolhive#4969](https://github.com/stacklok/toolhive/issues/4969) +- Kubernetes CRD versioning: [official docs](https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definition-versioning/) +- Reference implementation: [kubernetes-sigs/cluster-api `crdmigrator`](https://github.com/kubernetes-sigs/cluster-api/tree/main/controllers/crdmigrator) From 45a541749323f8a626f0a98cb68e64a133db0cf3 Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Wed, 20 May 2026 23:58:15 +0100 Subject: [PATCH 3/3] Add upgrade walkthrough + CR fixtures to docs/operator/upgrade-guide MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit End-to-end reproducible verification for the StorageVersionMigrator: deploy v0.21.0 (last v1alpha1-only release), create one CR of each of the 12 toolhive kinds at v1alpha1, upgrade to multi-version CRDs + this branch's operator with the migrator explicitly disabled, re-apply at v1beta1, confirm storedVersions is stuck at [v1alpha1, v1beta1] on all 12 CRDs. Then enable the migrator and watch all 12 converge to [v1beta1] within ~1s. Includes an optional step 11 that simulates the next release dropping v1alpha1 from spec.versions — proving the migrator's storedVersions trim is what unblocks that change. This is what was actually run to validate PR #5011 against a real kind cluster. Putting it in the docs tree (not the worktree root) so it's discoverable and reproducible after the PR merges. CR fixtures (crs-v1alpha1.yaml and crs-v1beta1.yaml) originate from the graduate-crds upgrade-test fixtures; copying them in keeps this guide self-contained. Part of #4969. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/operator/upgrade-guide/README.md | 323 ++++++++++++++++++ docs/operator/upgrade-guide/crs-v1alpha1.yaml | 149 ++++++++ docs/operator/upgrade-guide/crs-v1beta1.yaml | 149 ++++++++ 3 files changed, 621 insertions(+) create mode 100644 docs/operator/upgrade-guide/README.md create mode 100644 docs/operator/upgrade-guide/crs-v1alpha1.yaml create mode 100644 docs/operator/upgrade-guide/crs-v1beta1.yaml diff --git a/docs/operator/upgrade-guide/README.md b/docs/operator/upgrade-guide/README.md new file mode 100644 index 0000000000..845a57bc2e --- /dev/null +++ b/docs/operator/upgrade-guide/README.md @@ -0,0 +1,323 @@ +# Upgrade walkthrough — v1alpha1 → v1beta1 with StorageVersionMigrator + +End-to-end manual walkthrough that replays a real user upgrade against a local kind cluster: install the previous v0.21.0 release, create a CR of each of the 12 toolhive CRD kinds at `v1alpha1`, upgrade to the new multi-version chart, deploy this branch's operator with the migrator **disabled**, re-apply the CRs at `v1beta1`, and confirm `status.storedVersions` is stuck at `[v1alpha1, v1beta1]` on every CRD. Then enable the `StorageVersionMigrator` and confirm it converges every CRD to `[v1beta1]`. + +Total run time: ~30 minutes. The slow part is the first `ko build` of the operator + proxyrunner + vmcp images (~3 min); subsequent runs use the build cache and finish in ~30s. + +This guide is the canonical reproducible verification for the migrator. Companion reading: + +- [`docs/operator/storage-version-migration.md`](../storage-version-migration.md) — reference docs for the controller itself (label contract, opt-out, mechanism). +- [Issue #4969](https://github.com/stacklok/toolhive/issues/4969) — the motivating problem. + +## Prerequisites + +- `kind`, `kubectl`, `helm`, `ko`, `task` on PATH (`go install github.com/google/ko@latest` for ko) +- Working directory: the repo root (`task operator-deploy-local` and the relative chart paths assume this). +- Cluster name is `toolhive`. If you already have a cluster with that name, delete it first or run from a different worktree. + +The CR fixtures used below ship alongside this doc: + +- [`crs-v1alpha1.yaml`](./crs-v1alpha1.yaml) — one CR of each of the 12 kinds at `v1alpha1` +- [`crs-v1beta1.yaml`](./crs-v1beta1.yaml) — byte-identical to the v1alpha1 file except for `apiVersion`, simulating what `sed -i 's/v1alpha1/v1beta1/g'` would produce + +--- + +## 0 · Set up the cluster + +```bash +# If you already have a "toolhive" kind cluster from a previous run, delete it +kind delete cluster --name toolhive 2>/dev/null + +kind create cluster --name toolhive --wait 60s +kind get kubeconfig --name toolhive > kconfig.yaml +export KUBECONFIG=$(pwd)/kconfig.yaml +``` + +## 1 · Install v0.21.0 (the last v1alpha1-only release) + +```bash +helm install toolhive-operator-crds \ + oci://ghcr.io/stacklok/toolhive/toolhive-operator-crds \ + --version 0.21.0 --wait + +helm install toolhive-operator \ + oci://ghcr.io/stacklok/toolhive/toolhive-operator \ + --version 0.21.0 \ + --namespace toolhive-system --create-namespace --wait + +kubectl get crd mcpservers.toolhive.stacklok.dev -o jsonpath='{.spec.versions[*].name}' +# expected: v1alpha1 (only one version) + +kubectl wait --for=condition=Available deployment -n toolhive-system --all --timeout=180s +``` + +## 2 · Create one CR of each of the 12 kinds at v1alpha1 + +```bash +kubectl create namespace upgrade-test +kubectl apply -f docs/operator/upgrade-guide/crs-v1alpha1.yaml + +# Confirm all 12 landed +kubectl get \ + mcpservers,mcpremoteproxies,mcptoolconfigs,mcpgroups,embeddingservers,mcpregistries,virtualmcpservers,virtualmcpcompositetooldefinitions,mcpoidcconfigs,mcptelemetryconfigs,mcpexternalauthconfigs,mcpserverentries \ + -n upgrade-test --no-headers | wc -l +# expected: 12 +``` + +## 3 · Let the old operator reconcile + capture the baseline + +```bash +sleep 60 +kubectl get deployments -n upgrade-test +# expected: up to 5 Deployments — test-server, test-remote-proxy, test-virtual-server, +# test-registry-api, and (sometimes) test-embedding shows as a StatefulSet not a Deployment + +# Snapshot the UIDs for later comparison +kubectl get deployments -n upgrade-test \ + -o jsonpath='{range .items[*]}{.metadata.name}={.metadata.uid}{"\n"}{end}' \ + | sort > /tmp/before-upgrade.txt +cat /tmp/before-upgrade.txt +``` + +These UIDs are the canary. If they change after the operator upgrade, a workload was recreated → downtime. + +## 4 · Upgrade the CRDs chart to multi-version + +```bash +helm upgrade toolhive-operator-crds deploy/charts/operator-crds --wait --timeout 120s + +kubectl get crd mcpservers.toolhive.stacklok.dev -o jsonpath='{.spec.versions[*].name}' +# expected: v1alpha1 v1beta1 + +kubectl get crd mcpservers.toolhive.stacklok.dev -o jsonpath='{.spec.versions[?(@.storage==true)].name}' +# expected: v1beta1 +``` + +## 5 · Build the new operator + deploy with the migrator DISABLED + +This is the key deviation from `task operator-deploy-local` — we want to control the feature flag, so we run ko + helm manually rather than using the task. + +```bash +# ~3 minutes on first run, ~30s with build cache +OP=$(KO_DOCKER_REPO=kind.local ko build --local -B ./cmd/thv-operator | tail -1) +PR=$(KO_DOCKER_REPO=kind.local ko build --local -B ./cmd/thv-proxyrunner | tail -1) +VM=$(KO_DOCKER_REPO=kind.local ko build --local -B ./cmd/vmcp | tail -1) + +# Load all three into the kind cluster +kind load docker-image --name toolhive "$OP" +kind load docker-image --name toolhive "$PR" +kind load docker-image --name toolhive "$VM" + +# Persist for later steps +echo "$OP" > /tmp/img-op +echo "$PR" > /tmp/img-pr +echo "$VM" > /tmp/img-vm + +# Helm upgrade with migrator EXPLICITLY DISABLED +helm upgrade toolhive-operator deploy/charts/operator \ + --set operator.image="$OP" \ + --set operator.toolhiveRunnerImage="$PR" \ + --set operator.vmcpImage="$VM" \ + --set operator.features.storageVersionMigrator=false \ + --namespace toolhive-system + +kubectl rollout status deployment -n toolhive-system --timeout=180s + +# Confirm flag took effect +NEW_POD=$(kubectl get pods -n toolhive-system --no-headers | grep "toolhive-operator-" | awk '{print $1}' | head -1) +kubectl get pod "$NEW_POD" -n toolhive-system -o yaml | grep -A1 ENABLE_STORAGE_VERSION_MIGRATOR +# expected: value: "false" + +kubectl logs "$NEW_POD" -n toolhive-system | grep "ENABLE_STORAGE_VERSION_MIGRATOR is disabled" +# expected: one line — "ENABLE_STORAGE_VERSION_MIGRATOR is disabled, skipping StorageVersionMigrator controller" +``` + +## 6 · Verify zero downtime — Deployment UIDs unchanged + +```bash +kubectl get deployments -n upgrade-test \ + -o jsonpath='{range .items[*]}{.metadata.name}={.metadata.uid}{"\n"}{end}' \ + | sort > /tmp/after-upgrade.txt + +diff /tmp/before-upgrade.txt /tmp/after-upgrade.txt && echo "All Deployment UIDs unchanged" +``` + +## 7 · Re-apply all 12 CRs at v1beta1 (the user migration step) + +```bash +kubectl apply -f docs/operator/upgrade-guide/crs-v1beta1.yaml +# expected: 12 "configured" lines (not "created") + +# Wait for the operator to observe each update +sleep 10 +``` + +## 8 · Confirm storedVersions is stuck at `[v1alpha1, v1beta1]` on all 12 CRDs (migrator is OFF) + +```bash +echo "=== storedVersions on ALL 12 CRDs (migrator OFF) ===" +for crd in $(kubectl get crd -o name | grep toolhive.stacklok.dev | sort); do + name=$(echo $crd | sed 's|.*/||') + stored=$(kubectl get $crd -o jsonpath='{.status.storedVersions}') + printf " %-55s %s\n" "$name" "$stored" +done +``` + +Expected: every line ends with `["v1alpha1","v1beta1"]`. This is the "post-graduation, pre-migration" state every cluster lands in after the v0.21.0 → multi-version upgrade. **Without the migrator, this state is permanent** — any future operator release that drops `v1alpha1` from `spec.versions` would fail with: + +``` +status.storedVersions[0]: Invalid value: "v1alpha1": must appear in spec.versions +``` + +## 9 · Enable the migrator + watch it converge + +Helm upgrade to flip the feature flag: + +```bash +OP=$(cat /tmp/img-op) +PR=$(cat /tmp/img-pr) +VM=$(cat /tmp/img-vm) + +helm upgrade toolhive-operator deploy/charts/operator \ + --set operator.image="$OP" \ + --set operator.toolhiveRunnerImage="$PR" \ + --set operator.vmcpImage="$VM" \ + --namespace toolhive-system + +kubectl rollout status deployment -n toolhive-system --timeout=180s + +# Confirm flag is now true +NEW_POD=$(kubectl get pods -n toolhive-system --no-headers | grep "toolhive-operator-" | awk '{print $1}' | head -1) +kubectl get pod "$NEW_POD" -n toolhive-system -o yaml | grep -A1 ENABLE_STORAGE_VERSION_MIGRATOR +# expected: value: "true" +``` + +Wait for convergence: + +```bash +echo "=== waiting up to 60s for all 12 CRDs to converge ===" +for i in $(seq 1 60); do + count=$(for c in $(kubectl get crd -o name | grep toolhive.stacklok.dev); do + kubectl get $c -o jsonpath='{.status.storedVersions}' + echo + done | grep -c '^\["v1beta1"\]$') + if [ "$count" = "12" ]; then + echo "All 12 CRDs converged after ${i}s" + break + fi + sleep 1 +done +``` + +In practice this completes in ~1-2 seconds once the new pod is ready. + +Verify: + +```bash +echo "=== storedVersions on ALL 12 CRDs (migrator ON, post-converge) ===" +for crd in $(kubectl get crd -o name | grep toolhive.stacklok.dev | sort); do + name=$(echo $crd | sed 's|.*/||') + stored=$(kubectl get $crd -o jsonpath='{.status.storedVersions}') + printf " %-55s %s\n" "$name" "$stored" +done +# expected: every line ends with ["v1beta1"] + +echo "=== StorageVersionMigrationSucceeded events ===" +kubectl get events -A --field-selector reason=StorageVersionMigrationSucceeded --no-headers | wc -l +# expected: 12 — one event per CRD + +echo "=== StorageVersionMigrationFailed events (should be 0) ===" +kubectl get events -A --field-selector reason=StorageVersionMigrationFailed --no-headers | wc -l +# expected: 0 +``` + +Confirm CRs still healthy (admission webhooks on MCPServer / MCPGroup / VirtualMCPServer all accepted the per-CR Updates): + +```bash +kubectl get \ + mcpservers,mcpremoteproxies,mcptoolconfigs,mcpgroups,embeddingservers,mcpregistries,virtualmcpservers,virtualmcpcompositetooldefinitions,mcpoidcconfigs,mcptelemetryconfigs,mcpexternalauthconfigs,mcpserverentries \ + -n upgrade-test --no-headers | wc -l +# expected: 12 +``` + +## 10 · (Optional) Inspect migrator logs + +```bash +NEW_POD=$(kubectl get pods -n toolhive-system --no-headers | grep "toolhive-operator-" | awk '{print $1}' | head -1) +kubectl logs "$NEW_POD" -n toolhive-system | grep "storage version migration complete" | wc -l +# expected: 12 lines — one per CRD +``` + +**If this prints 0**, the migration may have happened in a previous container instance (the operator pod can restart for unrelated reasons in a kind cluster — leases, OOM, etc.). Try the previous container's logs: + +```bash +kubectl logs "$NEW_POD" -n toolhive-system --previous | grep "storage version migration complete" | wc -l +``` + +The migration is complete in either case — the events on the CRDs in step 9 are the authoritative signal. + +## 11 · (Optional) Simulate the next release that drops v1alpha1 + +Once `storedVersions` is `[v1beta1]` everywhere, the apiserver will accept removal of `v1alpha1` from `spec.versions` — the safety interlock the migrator exists to satisfy. To demonstrate: + +```bash +# Direct apiserver patch — the same end state a future "drop v1alpha1" chart would produce. +for crd in $(kubectl get crd -o name | grep toolhive.stacklok.dev); do + name=$(echo $crd | sed 's|.*/||') + newversions=$(kubectl get $crd -o json | jq '.spec.versions | map(select(.name != "v1alpha1"))') + patch=$(jq -n --argjson v "$newversions" '{spec:{versions:$v}}') + printf " %-55s " "$name" + kubectl patch $crd --type=merge -p "$patch" 2>&1 | tail -1 +done +``` + +Every line should say `... patched`. Verify v1alpha1 access now fails: + +```bash +kubectl get mcpgroups.v1alpha1.toolhive.stacklok.dev test-group -n upgrade-test +# expected: error — the server doesn't have a resource type "mcpgroups" + +kubectl get mcpgroups.v1beta1.toolhive.stacklok.dev test-group -n upgrade-test +# expected: the resource +``` + +**Negative test**: if you skip step 9 (the migrator pass) and jump straight to step 11, every `kubectl patch` will fail with: + +``` +Invalid value: "v1alpha1": must appear in spec.versions +``` + +That's the apiserver wall the migrator exists to clear. + +## 12 · Cleanup + +```bash +kind delete cluster --name toolhive +rm -f kconfig.yaml /tmp/before-upgrade.txt /tmp/after-upgrade.txt \ + /tmp/img-op /tmp/img-pr /tmp/img-vm +``` + +--- + +## Summary of what's verified + +| Check | Validates | Step | +|---|---|---| +| Operator startup logs show migrator skipped when disabled | The `setupStorageVersionMigrator` branch in `app/app.go` honours `ENABLE_STORAGE_VERSION_MIGRATOR=false` | 5 | +| Zero-downtime upgrade across operator chart upgrade | The PR's operator changes don't recreate any managed workload | 6 | +| `storedVersions` is `[v1alpha1, v1beta1]` after re-apply with migrator OFF | Migrator did not run; baseline state is correctly preserved | 8 | +| Helm-upgrade flip enables the migrator | Feature flag round-trips correctly | 9 | +| `storedVersions` converges to `[v1beta1]` on all 12 CRDs | The actual migration logic works against real ToolHive CRDs | 9 | +| 12 `StorageVersionMigrationSucceeded` events on the CRDs | The Recorder is correctly wired and per-CRD migrations are observable | 9 | +| 0 `StorageVersionMigrationFailed` events | No CRD's per-CR Update loop returned a non-retriable error | 9 | +| All 12 CRs still healthy after migration | Validating admission webhooks (MCPServer / MCPGroup / VirtualMCPServer) tolerate the per-CR Updates | 9 | +| RBAC permits storedVersions trim | The ClusterRole has the correct verbs for `customresourcedefinitions/status` and `*.toolhive.stacklok.dev/*` | implicit — trim succeeds in step 9 | +| Apiserver permits `v1alpha1` removal from `spec.versions` once `storedVersions` is clean | The deprecation chain end-to-end works | 11 | + +## What this does NOT cover + +- **Mid-migration crash recovery**: no test forces the operator to crash mid-loop. Envtest covers the conflict-handling and re-encode-failure paths separately. +- **Pagination at real-cluster scale**: 12 CRs is well below the 500-default page size. The envtest suite explicitly tests the continue-token loop with 7 CRs at `PageSize=3`. +- **Operator restart resilience under load**: kind clusters are resource-limited and the operator pod sometimes restarts during tests for unrelated reasons (the `--previous` log fallback in step 10 covers this). + +These are covered by the envtest suite at `cmd/thv-operator/test-integration/storageversionmigrator/`. This walkthrough covers what envtest can't: helm chart wiring, real ToolHive CRD schemas with their actual webhooks, and the full upgrade sequence a real user would run. diff --git a/docs/operator/upgrade-guide/crs-v1alpha1.yaml b/docs/operator/upgrade-guide/crs-v1alpha1.yaml new file mode 100644 index 0000000000..ec1a2d0aa5 --- /dev/null +++ b/docs/operator/upgrade-guide/crs-v1alpha1.yaml @@ -0,0 +1,149 @@ +# All 12 ToolHive CR kinds at v1alpha1. +# Apply with: +# kubectl create namespace upgrade-test +# kubectl apply -f crs-v1alpha1.yaml +# +# Note: MCPServerEntry references MCPGroup (groupRef.name: test-group), +# so MCPGroup must be applied first. kubectl handles ordering within a +# single file by best-effort, and since all refs here are by name (not +# UID), no strict ordering is required — missing refs just stay pending. +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: MCPGroup +metadata: + name: test-group + namespace: upgrade-test +spec: + description: Test group for upgrade validation +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: MCPServer +metadata: + name: test-server + namespace: upgrade-test +spec: + image: ghcr.io/github/github-mcp-server + transport: stdio +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: MCPRemoteProxy +metadata: + name: test-remote-proxy + namespace: upgrade-test +spec: + remoteUrl: https://example.com/mcp + transport: streamable-http +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: MCPToolConfig +metadata: + name: test-tool-config + namespace: upgrade-test +spec: + toolsFilter: + - allowed_tool +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: EmbeddingServer +metadata: + name: test-embedding + namespace: upgrade-test +spec: + image: ghcr.io/stacklok/test-embedding:latest + model: test-model +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: MCPRegistry +metadata: + name: test-registry + namespace: upgrade-test +spec: + displayName: Test Registry + configYAML: | + sources: + - name: k8s + format: upstream + kubernetes: {} + registries: + - name: default + sources: ["k8s"] + database: + host: postgres + port: 5432 + user: db_app + database: registry + auth: + mode: anonymous +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: MCPOIDCConfig +metadata: + name: test-oidc + namespace: upgrade-test +spec: + type: inline + inline: + issuer: https://auth.example.com + jwksUrl: https://auth.example.com/.well-known/jwks.json +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: MCPTelemetryConfig +metadata: + name: test-telemetry + namespace: upgrade-test +spec: + openTelemetry: + endpoint: http://otel-collector:4317 + metrics: + enabled: true + tracing: + enabled: true +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: MCPExternalAuthConfig +metadata: + name: test-external-auth + namespace: upgrade-test +spec: + type: tokenExchange + tokenExchange: + tokenUrl: https://auth.example.com/token + audience: https://api.example.com + clientId: test-client + clientSecretRef: + name: test-secret + key: client-secret +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: VirtualMCPServer +metadata: + name: test-virtual-server + namespace: upgrade-test +spec: + groupRef: + name: test-group + incomingAuth: + type: anonymous +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: VirtualMCPCompositeToolDefinition +metadata: + name: test-composite-tool + namespace: upgrade-test +spec: + name: test_workflow + description: Test composite workflow + steps: + - id: step-1 + tool: placeholder.tool +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: MCPServerEntry +metadata: + name: test-server-entry + namespace: upgrade-test +spec: + groupRef: + name: test-group + transport: streamable-http + remoteUrl: https://example.com/mcp diff --git a/docs/operator/upgrade-guide/crs-v1beta1.yaml b/docs/operator/upgrade-guide/crs-v1beta1.yaml new file mode 100644 index 0000000000..b8570aa141 --- /dev/null +++ b/docs/operator/upgrade-guide/crs-v1beta1.yaml @@ -0,0 +1,149 @@ +# All 12 ToolHive CR kinds at v1beta1. +# Apply with: +# kubectl create namespace upgrade-test +# kubectl apply -f crs-v1beta1.yaml +# +# Note: MCPServerEntry references MCPGroup (groupRef.name: test-group), +# so MCPGroup must be applied first. kubectl handles ordering within a +# single file by best-effort, and since all refs here are by name (not +# UID), no strict ordering is required — missing refs just stay pending. +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: MCPGroup +metadata: + name: test-group + namespace: upgrade-test +spec: + description: Test group for upgrade validation +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: MCPServer +metadata: + name: test-server + namespace: upgrade-test +spec: + image: ghcr.io/github/github-mcp-server + transport: stdio +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: MCPRemoteProxy +metadata: + name: test-remote-proxy + namespace: upgrade-test +spec: + remoteUrl: https://example.com/mcp + transport: streamable-http +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: MCPToolConfig +metadata: + name: test-tool-config + namespace: upgrade-test +spec: + toolsFilter: + - allowed_tool +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: EmbeddingServer +metadata: + name: test-embedding + namespace: upgrade-test +spec: + image: ghcr.io/stacklok/test-embedding:latest + model: test-model +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: MCPRegistry +metadata: + name: test-registry + namespace: upgrade-test +spec: + displayName: Test Registry + configYAML: | + sources: + - name: k8s + format: upstream + kubernetes: {} + registries: + - name: default + sources: ["k8s"] + database: + host: postgres + port: 5432 + user: db_app + database: registry + auth: + mode: anonymous +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: MCPOIDCConfig +metadata: + name: test-oidc + namespace: upgrade-test +spec: + type: inline + inline: + issuer: https://auth.example.com + jwksUrl: https://auth.example.com/.well-known/jwks.json +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: MCPTelemetryConfig +metadata: + name: test-telemetry + namespace: upgrade-test +spec: + openTelemetry: + endpoint: http://otel-collector:4317 + metrics: + enabled: true + tracing: + enabled: true +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: MCPExternalAuthConfig +metadata: + name: test-external-auth + namespace: upgrade-test +spec: + type: tokenExchange + tokenExchange: + tokenUrl: https://auth.example.com/token + audience: https://api.example.com + clientId: test-client + clientSecretRef: + name: test-secret + key: client-secret +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: VirtualMCPServer +metadata: + name: test-virtual-server + namespace: upgrade-test +spec: + groupRef: + name: test-group + incomingAuth: + type: anonymous +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: VirtualMCPCompositeToolDefinition +metadata: + name: test-composite-tool + namespace: upgrade-test +spec: + name: test_workflow + description: Test composite workflow + steps: + - id: step-1 + tool: placeholder.tool +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: MCPServerEntry +metadata: + name: test-server-entry + namespace: upgrade-test +spec: + groupRef: + name: test-group + transport: streamable-http + remoteUrl: https://example.com/mcp