From 69f424a036d27280684b5db8577414adfbb6b546 Mon Sep 17 00:00:00 2001 From: christopherwharrop Date: Wed, 3 Jun 2026 12:33:41 -0600 Subject: [PATCH] Support ubuntu 24.04 and 26.04 and refactor CI workflow with matrix --- .env | 27 ++++ .github/workflows/docker.yml | 191 ++++++++++++++++--------- README.md | 168 ++++++++++++++++++---- docker-compose-test.yml | 223 +++++++++++++++-------------- docker-compose.yml | 213 +++++++++++++++------------- frontend/Dockerfile | 267 ++++++++++++++++++++++++++++++----- master/Dockerfile | 7 +- node/Dockerfile | 7 +- 8 files changed, 766 insertions(+), 337 deletions(-) create mode 100644 .env diff --git a/.env b/.env new file mode 100644 index 0000000..1a3b8d4 --- /dev/null +++ b/.env @@ -0,0 +1,27 @@ +# Variables consumed by docker-compose.yml and docker-compose-test.yml. +# docker compose reads this file automatically from the project root. + +# Ubuntu base version. Selects which base image variant we pull from the +# dockerslurmcluster registry (must match one of the published base tags), +# and is also embedded in our own published image tag. +# Currently supported values: '26.04' or '24.04' +UBUNTU_VERSION=26.04 + +# Slurm version baked into the base image tag we pull. Bump only when the +# base dockerslurmcluster repo publishes new tags with a different slurm. +# Does NOT appear in our own published image tag -- consumers care about +# spack-stack version, slurm is implicit via the base image. +SLURM_VERSION=25.11.5 + +# Spack-stack version. Drives both the git checkout branch of jcsda/spack-stack +# and the tag suffix on our published images (and the corresponding buildcache +# repo on GHCR). Bump in lockstep with a new spack-stack release. 
+SPACK_STACK_VERSION=2.1.0 + +# Composite tags produced from the above: +# base image: ghcr.io/.../slurm-<role>:ubuntu-${UBUNTU_VERSION}-slurm-${SLURM_VERSION} +# our image: ghcr.io/.../slurm-spack-stack-<role>:ubuntu-${UBUNTU_VERSION}-spack-stack-${SPACK_STACK_VERSION} +# buildcache: ghcr.io/.../buildcache-ubuntu-${UBUNTU_VERSION}-spack-stack-${SPACK_STACK_VERSION} +# +# To switch bases for one invocation without editing this file: +# UBUNTU_VERSION=24.04 docker compose build diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 51fed34..52a3e0f 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -16,32 +16,31 @@ on: workflow_dispatch: env: + # Slurm version baked into the base image tag (ubuntu-<ubuntu>-slurm-<slurm>). + # Bump only when the base dockerslurmcluster repo publishes new tags with a different slurm. + # NOT embedded in our own published image tags -- consumers care about spack-stack version. + SLURM_VERSION: 25.11.5 + # Spack-stack version. Drives the git checkout branch AND appears in our image tag suffix. SPACK_STACK_VERSION: 2.1.0 + # The ubuntu version that also gets the `latest` alias on our published images. + # Matches the dockerslurmcluster registry's `latest`-pointing convention. + LATEST_UBUNTU: '26.04' REGISTRY_FRONTEND_IMAGE: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend REGISTRY_MASTER_IMAGE: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-master REGISTRY_NODE_IMAGE: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node jobs: - resolve_versions: - name: Resolve Version Tags - runs-on: ubuntu-latest - outputs: - ubuntu_version: ${{ steps.resolve.outputs.ubuntu_version }} - steps: - - - name: Resolve concrete ubuntu:latest version - id: resolve - run: | - UBUNTU_VERSION=$(docker run --rm ubuntu:latest bash -lc '.
/etc/os-release && echo "$VERSION_ID"') - echo "ubuntu_version=${UBUNTU_VERSION}" >> "$GITHUB_OUTPUT" - echo "Resolved ubuntu:latest to VERSION_ID=${UBUNTU_VERSION}" - build-test-push-amd64: + name: amd64 (ubuntu-${{ matrix.ubuntu }}) runs-on: ubuntu2204-8c-32g-300ssd - needs: - - resolve_versions timeout-minutes: 360 + strategy: + fail-fast: false + matrix: + ubuntu: &ubuntu_versions + - '24.04' + - '26.04' permissions: packages: write contents: read @@ -76,13 +75,16 @@ jobs: context: ./frontend file: ./frontend/Dockerfile platforms: linux/amd64 - tags: ${{ env.REGISTRY_FRONTEND_IMAGE }}:latest + tags: ${{ env.REGISTRY_FRONTEND_IMAGE }}:ubuntu-${{ matrix.ubuntu }}-spack-stack-${{ env.SPACK_STACK_VERSION }} build-args: | SPACK_BUILD_JOBS=8 + BASE_IMAGE_TAG=ubuntu-${{ matrix.ubuntu }}-slurm-${{ env.SLURM_VERSION }} + UBUNTU_VERSION=${{ matrix.ubuntu }} + SPACK_STACK_VERSION=${{ env.SPACK_STACK_VERSION }} secrets: | "github_token=${{ secrets.GITHUB_TOKEN }}" - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-amd64:cache - cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-amd64:cache,mode=max + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-ubuntu-${{ matrix.ubuntu }}-amd64:cache + cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-ubuntu-${{ matrix.ubuntu }}-amd64:cache,mode=max load: true - name: Build master image @@ -91,9 +93,11 @@ jobs: context: ./master file: ./master/Dockerfile platforms: linux/amd64 - tags: ${{ env.REGISTRY_MASTER_IMAGE }}:latest - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-amd64:cache - cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-amd64:cache,mode=max + tags: ${{ env.REGISTRY_MASTER_IMAGE }}:ubuntu-${{ matrix.ubuntu }}-spack-stack-${{ env.SPACK_STACK_VERSION }} + build-args: | + 
BASE_IMAGE_TAG=ubuntu-${{ matrix.ubuntu }}-slurm-${{ env.SLURM_VERSION }} + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-ubuntu-${{ matrix.ubuntu }}-amd64:cache + cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-ubuntu-${{ matrix.ubuntu }}-amd64:cache,mode=max load: true - name: Build node image @@ -102,12 +106,18 @@ jobs: context: ./node file: ./node/Dockerfile platforms: linux/amd64 - tags: ${{ env.REGISTRY_NODE_IMAGE }}:latest - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-amd64:cache - cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-amd64:cache,mode=max + tags: ${{ env.REGISTRY_NODE_IMAGE }}:ubuntu-${{ matrix.ubuntu }}-spack-stack-${{ env.SPACK_STACK_VERSION }} + build-args: | + BASE_IMAGE_TAG=ubuntu-${{ matrix.ubuntu }}-slurm-${{ env.SLURM_VERSION }} + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-ubuntu-${{ matrix.ubuntu }}-amd64:cache + cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-ubuntu-${{ matrix.ubuntu }}-amd64:cache,mode=max load: true - name: Start containers for testing + env: + UBUNTU_VERSION: ${{ matrix.ubuntu }} + SLURM_VERSION: ${{ env.SLURM_VERSION }} + SPACK_STACK_VERSION: ${{ env.SPACK_STACK_VERSION }} run: docker compose -f docker-compose-test.yml up --pull never -d - name: Check cluster logs @@ -138,6 +148,10 @@ jobs: - name: Shut down Slurm cluster containers if: always() + env: + UBUNTU_VERSION: ${{ matrix.ubuntu }} + SLURM_VERSION: ${{ env.SLURM_VERSION }} + SPACK_STACK_VERSION: ${{ env.SPACK_STACK_VERSION }} run: docker compose -f docker-compose-test.yml down - name: Push frontend by digest @@ -149,9 +163,12 @@ jobs: platforms: linux/amd64 build-args: | SPACK_BUILD_JOBS=8 + BASE_IMAGE_TAG=ubuntu-${{ matrix.ubuntu }}-slurm-${{ env.SLURM_VERSION }} + UBUNTU_VERSION=${{ matrix.ubuntu }} + 
SPACK_STACK_VERSION=${{ env.SPACK_STACK_VERSION }} secrets: | "github_token=${{ secrets.GITHUB_TOKEN }}" - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-amd64:cache + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-ubuntu-${{ matrix.ubuntu }}-amd64:cache outputs: type=image,name=${{ env.REGISTRY_FRONTEND_IMAGE }},push-by-digest=true,name-canonical=true,push=true - name: Push master by digest @@ -161,7 +178,9 @@ jobs: context: ./master file: ./master/Dockerfile platforms: linux/amd64 - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-amd64:cache + build-args: | + BASE_IMAGE_TAG=ubuntu-${{ matrix.ubuntu }}-slurm-${{ env.SLURM_VERSION }} + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-ubuntu-${{ matrix.ubuntu }}-amd64:cache outputs: type=image,name=${{ env.REGISTRY_MASTER_IMAGE }},push-by-digest=true,name-canonical=true,push=true - name: Push node by digest @@ -171,7 +190,9 @@ jobs: context: ./node file: ./node/Dockerfile platforms: linux/amd64 - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-amd64:cache + build-args: | + BASE_IMAGE_TAG=ubuntu-${{ matrix.ubuntu }}-slurm-${{ env.SLURM_VERSION }} + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-ubuntu-${{ matrix.ubuntu }}-amd64:cache outputs: type=image,name=${{ env.REGISTRY_NODE_IMAGE }},push-by-digest=true,name-canonical=true,push=true - name: Export digests @@ -187,7 +208,7 @@ jobs: name: Upload frontend digest uses: actions/upload-artifact@v4 with: - name: frontend-digests-linux-amd64 + name: frontend-ubuntu-${{ matrix.ubuntu }}-digests-linux-amd64 path: /tmp/digests/frontend/* if-no-files-found: error retention-days: 1 @@ -195,7 +216,7 @@ jobs: name: Upload master digest uses: actions/upload-artifact@v4 with: - name: master-digests-linux-amd64 + name: master-ubuntu-${{ 
matrix.ubuntu }}-digests-linux-amd64 path: /tmp/digests/master/* if-no-files-found: error retention-days: 1 @@ -203,7 +224,7 @@ jobs: name: Upload node digest uses: actions/upload-artifact@v4 with: - name: node-digests-linux-amd64 + name: node-ubuntu-${{ matrix.ubuntu }}-digests-linux-amd64 path: /tmp/digests/node/* if-no-files-found: error retention-days: 1 @@ -216,10 +237,13 @@ jobs: limit-access-to-actor: true build-test-push-arm64: + name: arm64 (ubuntu-${{ matrix.ubuntu }}) runs-on: LinuxARM64-8core-32G-300Gb - needs: - - resolve_versions timeout-minutes: 360 + strategy: + fail-fast: false + matrix: + ubuntu: *ubuntu_versions permissions: packages: write contents: read @@ -280,13 +304,16 @@ jobs: context: ./frontend file: ./frontend/Dockerfile platforms: linux/arm64 - tags: ${{ env.REGISTRY_FRONTEND_IMAGE }}:latest + tags: ${{ env.REGISTRY_FRONTEND_IMAGE }}:ubuntu-${{ matrix.ubuntu }}-spack-stack-${{ env.SPACK_STACK_VERSION }} build-args: | SPACK_BUILD_JOBS=8 + BASE_IMAGE_TAG=ubuntu-${{ matrix.ubuntu }}-slurm-${{ env.SLURM_VERSION }} + UBUNTU_VERSION=${{ matrix.ubuntu }} + SPACK_STACK_VERSION=${{ env.SPACK_STACK_VERSION }} secrets: | "github_token=${{ secrets.GITHUB_TOKEN }}" - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-arm64:cache - cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-arm64:cache,mode=max + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-ubuntu-${{ matrix.ubuntu }}-arm64:cache + cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-ubuntu-${{ matrix.ubuntu }}-arm64:cache,mode=max load: true - name: Build master image @@ -295,9 +322,11 @@ jobs: context: ./master file: ./master/Dockerfile platforms: linux/arm64 - tags: ${{ env.REGISTRY_MASTER_IMAGE }}:latest - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-arm64:cache - cache-to: 
type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-arm64:cache,mode=max + tags: ${{ env.REGISTRY_MASTER_IMAGE }}:ubuntu-${{ matrix.ubuntu }}-spack-stack-${{ env.SPACK_STACK_VERSION }} + build-args: | + BASE_IMAGE_TAG=ubuntu-${{ matrix.ubuntu }}-slurm-${{ env.SLURM_VERSION }} + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-ubuntu-${{ matrix.ubuntu }}-arm64:cache + cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-ubuntu-${{ matrix.ubuntu }}-arm64:cache,mode=max load: true - name: Build node image @@ -306,12 +335,18 @@ jobs: context: ./node file: ./node/Dockerfile platforms: linux/arm64 - tags: ${{ env.REGISTRY_NODE_IMAGE }}:latest - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-arm64:cache - cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-arm64:cache,mode=max + tags: ${{ env.REGISTRY_NODE_IMAGE }}:ubuntu-${{ matrix.ubuntu }}-spack-stack-${{ env.SPACK_STACK_VERSION }} + build-args: | + BASE_IMAGE_TAG=ubuntu-${{ matrix.ubuntu }}-slurm-${{ env.SLURM_VERSION }} + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-ubuntu-${{ matrix.ubuntu }}-arm64:cache + cache-to: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-ubuntu-${{ matrix.ubuntu }}-arm64:cache,mode=max load: true - name: Start containers for testing + env: + UBUNTU_VERSION: ${{ matrix.ubuntu }} + SLURM_VERSION: ${{ env.SLURM_VERSION }} + SPACK_STACK_VERSION: ${{ env.SPACK_STACK_VERSION }} run: docker compose -f docker-compose-test.yml up --pull never -d - name: Check cluster logs @@ -342,6 +377,10 @@ jobs: - name: Shut down Slurm cluster containers if: always() + env: + UBUNTU_VERSION: ${{ matrix.ubuntu }} + SLURM_VERSION: ${{ env.SLURM_VERSION }} + SPACK_STACK_VERSION: ${{ env.SPACK_STACK_VERSION }} run: docker compose -f docker-compose-test.yml down - name: Push 
frontend by digest @@ -353,9 +392,12 @@ jobs: platforms: linux/arm64 build-args: | SPACK_BUILD_JOBS=8 + BASE_IMAGE_TAG=ubuntu-${{ matrix.ubuntu }}-slurm-${{ env.SLURM_VERSION }} + UBUNTU_VERSION=${{ matrix.ubuntu }} + SPACK_STACK_VERSION=${{ env.SPACK_STACK_VERSION }} secrets: | "github_token=${{ secrets.GITHUB_TOKEN }}" - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-arm64:cache + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend-cache-ubuntu-${{ matrix.ubuntu }}-arm64:cache outputs: type=image,name=${{ env.REGISTRY_FRONTEND_IMAGE }},push-by-digest=true,name-canonical=true,push=true - name: Push master by digest @@ -365,7 +407,9 @@ jobs: context: ./master file: ./master/Dockerfile platforms: linux/arm64 - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-arm64:cache + build-args: | + BASE_IMAGE_TAG=ubuntu-${{ matrix.ubuntu }}-slurm-${{ env.SLURM_VERSION }} + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/master-cache-ubuntu-${{ matrix.ubuntu }}-arm64:cache outputs: type=image,name=${{ env.REGISTRY_MASTER_IMAGE }},push-by-digest=true,name-canonical=true,push=true - name: Push node by digest @@ -375,7 +419,9 @@ jobs: context: ./node file: ./node/Dockerfile platforms: linux/arm64 - cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-arm64:cache + build-args: | + BASE_IMAGE_TAG=ubuntu-${{ matrix.ubuntu }}-slurm-${{ env.SLURM_VERSION }} + cache-from: type=registry,ref=ghcr.io/noaa-gsl/dockerspackstackslurmcluster/node-cache-ubuntu-${{ matrix.ubuntu }}-arm64:cache outputs: type=image,name=${{ env.REGISTRY_NODE_IMAGE }},push-by-digest=true,name-canonical=true,push=true - name: Export digests @@ -391,7 +437,7 @@ jobs: name: Upload frontend digest uses: actions/upload-artifact@v4 with: - name: frontend-digests-linux-arm64 + name: frontend-ubuntu-${{ matrix.ubuntu }}-digests-linux-arm64 path: 
/tmp/digests/frontend/* if-no-files-found: error retention-days: 1 @@ -399,7 +445,7 @@ jobs: name: Upload master digest uses: actions/upload-artifact@v4 with: - name: master-digests-linux-arm64 + name: master-ubuntu-${{ matrix.ubuntu }}-digests-linux-arm64 path: /tmp/digests/master/* if-no-files-found: error retention-days: 1 @@ -407,7 +453,7 @@ jobs: name: Upload node digest uses: actions/upload-artifact@v4 with: - name: node-digests-linux-arm64 + name: node-ubuntu-${{ matrix.ubuntu }}-digests-linux-arm64 path: /tmp/digests/node/* if-no-files-found: error retention-days: 1 @@ -420,11 +466,15 @@ jobs: limit-access-to-actor: true merge-frontend: + name: Merge frontend (ubuntu-${{ matrix.ubuntu }}) runs-on: ubuntu-latest needs: - build-test-push-amd64 - build-test-push-arm64 - - resolve_versions + strategy: + fail-fast: false + matrix: + ubuntu: *ubuntu_versions permissions: packages: write contents: read @@ -438,7 +488,7 @@ jobs: uses: actions/download-artifact@v4 with: path: /tmp/digests - pattern: frontend-digests-* + pattern: frontend-ubuntu-${{ matrix.ubuntu }}-digests-* merge-multiple: true - name: Set up Docker Buildx @@ -450,12 +500,10 @@ jobs: with: images: ${{ env.REGISTRY_FRONTEND_IMAGE }} tags: | - type=raw,value=latest - type=raw,value=ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} + type=raw,value=ubuntu-${{ matrix.ubuntu }}-spack-stack-${{ env.SPACK_STACK_VERSION }} + ${{ matrix.ubuntu == env.LATEST_UBUNTU && 'type=raw,value=latest' || '' }} flavor: | - latest=true - prefix= - suffix= + latest=false - name: Login to GHCR uses: docker/login-action@v3 @@ -476,11 +524,15 @@ jobs: docker buildx imagetools inspect ${{ env.REGISTRY_FRONTEND_IMAGE }}:${{ steps.meta.outputs.version }} merge-master: + name: Merge master (ubuntu-${{ matrix.ubuntu }}) runs-on: ubuntu-latest needs: - build-test-push-amd64 - build-test-push-arm64 - - resolve_versions + strategy: + fail-fast: false + matrix: + ubuntu: 
*ubuntu_versions permissions: packages: write contents: read @@ -494,7 +546,7 @@ jobs: uses: actions/download-artifact@v4 with: path: /tmp/digests - pattern: master-digests-* + pattern: master-ubuntu-${{ matrix.ubuntu }}-digests-* merge-multiple: true - name: Set up Docker Buildx @@ -506,12 +558,10 @@ jobs: with: images: ${{ env.REGISTRY_MASTER_IMAGE }} tags: | - type=raw,value=latest - type=raw,value=ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} + type=raw,value=ubuntu-${{ matrix.ubuntu }}-spack-stack-${{ env.SPACK_STACK_VERSION }} + ${{ matrix.ubuntu == env.LATEST_UBUNTU && 'type=raw,value=latest' || '' }} flavor: | - latest=true - prefix= - suffix= + latest=false - name: Login to GHCR uses: docker/login-action@v3 @@ -532,11 +582,15 @@ jobs: docker buildx imagetools inspect ${{ env.REGISTRY_MASTER_IMAGE }}:${{ steps.meta.outputs.version }} merge-node: + name: Merge node (ubuntu-${{ matrix.ubuntu }}) runs-on: ubuntu-latest needs: - build-test-push-amd64 - build-test-push-arm64 - - resolve_versions + strategy: + fail-fast: false + matrix: + ubuntu: *ubuntu_versions permissions: packages: write contents: read @@ -550,7 +604,7 @@ jobs: uses: actions/download-artifact@v4 with: path: /tmp/digests - pattern: node-digests-* + pattern: node-ubuntu-${{ matrix.ubuntu }}-digests-* merge-multiple: true - name: Set up Docker Buildx @@ -562,12 +616,10 @@ jobs: with: images: ${{ env.REGISTRY_NODE_IMAGE }} tags: | - type=raw,value=latest - type=raw,value=ubuntu-${{ needs.resolve_versions.outputs.ubuntu_version }}-spack-stack-${{ env.SPACK_STACK_VERSION }} + type=raw,value=ubuntu-${{ matrix.ubuntu }}-spack-stack-${{ env.SPACK_STACK_VERSION }} + ${{ matrix.ubuntu == env.LATEST_UBUNTU && 'type=raw,value=latest' || '' }} flavor: | - latest=true - prefix= - suffix= + latest=false - name: Login to GHCR uses: docker/login-action@v3 @@ -586,4 +638,3 @@ jobs: name: Inspect image run: | docker buildx imagetools inspect ${{ 
env.REGISTRY_NODE_IMAGE }}:${{ steps.meta.outputs.version }} - diff --git a/README.md b/README.md index d048711..2410052 100644 --- a/README.md +++ b/README.md @@ -26,52 +26,134 @@ sizes. The cluster behaves as if it were running on multiple nodes even if the containers are all running on the same host machine. +# Image tags and base selection + +Published images are tagged by Ubuntu version + spack-stack version: + +* `ubuntu-26.04-spack-stack-2.1.0` (also published as `latest`) +* `ubuntu-24.04-spack-stack-2.1.0` + +Internally each variant pulls from the +[NOAA-GSL/DockerSlurmCluster](https://github.com/NOAA-GSL/DockerSlurmCluster) +base registry at the matching `ubuntu-<ubuntu_version>-slurm-<slurm_version>` tag. +The base image's slurm version is implicit -- consumers of these images interact +with the slurm tooling that came with the base, plus the spack-stack scientific +software stack layered on top. + +A separate per-(ubuntu, spack-stack) OCI buildcache repo (e.g. +`ghcr.io/noaa-gsl/dockerspackstackslurmcluster/buildcache-ubuntu-26.04-spack-stack-2.1.0`) +holds binary artifacts so rebuilds reuse cached packages instead of recompiling +from source. Caches are split per OS to prevent cross-OS spec contamination +during concretization. + +## Configuring versions + +The project root contains a `.env` file consumed by `docker compose`: + +```bash +UBUNTU_VERSION=26.04 +SLURM_VERSION=25.11.5 +SPACK_STACK_VERSION=2.1.0 +``` + +To run against the 24.04 base for one invocation without editing the file: + +```bash +UBUNTU_VERSION=24.04 docker compose up -d --pull never +``` + # Building the Containers -To build the containers from source: +## Quickest path: docker compose -## Master and Node Containers +`docker compose build` reads `.env` and constructs the full set of build args +automatically.
To build all three containers (frontend, master, node) for the +default Ubuntu version: ```bash -docker build -t ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-master:latest -f master/Dockerfile master/ -docker build -t ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest -f node/Dockerfile node/ +docker compose build ``` -## Frontend Container +Or just one: + +```bash +docker compose build slurmfrontend +``` -The frontend container requires a GitHub personal access token (PAT) with package write permissions to push built packages to the GitHub Container Registry build cache. Set your token in an environment variable and pass it as a secret during build: +To build for a non-default Ubuntu version: ```bash -export GITHUB_TOKEN=your_github_pat_here -docker build --progress=plain \ - --secret id=github_token,env=GITHUB_TOKEN \ - -t ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend:latest \ - -f frontend/Dockerfile \ - frontend/ +UBUNTU_VERSION=24.04 docker compose build slurmfrontend ``` -**Note:** The `--progress=plain` flag shows full build output. The frontend build compiles 355+ scientific software packages from source and can take several hours on first build. Subsequent builds use the cached packages from GHCR. +### GitHub PAT for buildcache push -### Configuring Parallel Build Jobs +A GitHub personal access token (PAT) is only required if you want the build to +**push** newly-built spack packages back to the OCI buildcache (autopush) -- +which is what CI and the original maintainer's builds do to keep the cache +populated. For most local development, where you just want to *consume* +artifacts the cache already has, no PAT is needed. -The frontend Dockerfile uses the `SPACK_BUILD_JOBS` build argument to control the number of parallel make jobs (`-j` flag) used when building each package (default: 8). 
This should match the number of CPU cores available: +The frontend Dockerfile only configures autopush when the docker secret +`github_token` is present *and non-empty*. Compose accepts an unset or empty +`GITHUB_TOKEN` environment variable (the secret simply becomes an empty file +inside the build), so pull-only builds work without setting anything: -**For 8-core systems (default):** ```bash -docker build --build-arg SPACK_BUILD_JOBS=8 ... +# Pull-only build: reads from the public buildcache, never pushes +docker compose build slurmfrontend ``` -**For 16-core systems:** +For push-capable builds, set the PAT before invoking compose: + ```bash -docker build --build-arg SPACK_BUILD_JOBS=16 ... +export GITHUB_TOKEN=your_github_pat_here # PAT with write:packages on the GHCR registry +docker compose build slurmfrontend ``` -**With Docker Compose:** +Note: this assumes the buildcache repo on GHCR is **public** (which is the +case for the upstream NOAA-GSL caches). If you maintain a fork with a private +cache, you'll need a PAT with read permission on the cache repo even for +pull-only builds. + +## Direct buildx invocation + +Equivalent build command for the frontend, useful when you want full control +(`--no-cache`, `--progress=plain`, custom tags) without going through compose: + +```bash +export GITHUB_TOKEN=your_github_pat_here +docker buildx build \ + --progress=plain \ + --pull \ + --secret id=github_token,env=GITHUB_TOKEN \ + --build-arg SPACK_BUILD_JOBS=8 \ + --build-arg BASE_IMAGE_TAG=ubuntu-26.04-slurm-25.11.5 \ + --build-arg UBUNTU_VERSION=26.04 \ + --build-arg SPACK_STACK_VERSION=2.1.0 \ + -t ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend:ubuntu-26.04-spack-stack-2.1.0 \ + -f frontend/Dockerfile \ + frontend/ +``` + +The frontend build compiles ~355 scientific software packages and can take +many hours on first build from an empty buildcache. Subsequent builds reuse +cached packages from GHCR and finish much faster. 
+ +## Configuring Parallel Build Jobs + +`SPACK_BUILD_JOBS` controls the number of parallel make jobs (`-j` flag) used +when building each package (default: 8). Match it to the CPU count of your +build machine: + ```bash +docker buildx build --build-arg SPACK_BUILD_JOBS=16 ... +# or docker compose build --build-arg SPACK_BUILD_JOBS=16 ``` -You can also modify the default in `docker-compose.yml`: +You can also change the default in `docker-compose.yml`: + ```yaml services: slurmfrontend: @@ -80,27 +162,57 @@ services: SPACK_BUILD_JOBS: 16 # Change from default 8 ``` -**Performance note:** Higher values speed up compilation of individual packages, especially large ones like ESMF, JEDI components, and NetCDF. However, on 32GB RAM systems, values above 8 may cause memory pressure during compilation of memory-intensive Fortran packages, potentially leading to swapping or OOM errors. +**Performance note:** higher values speed up compilation of individual +packages, especially large ones like ESMF, JEDI components, and NetCDF. On +32GB RAM systems values above 8 may cause memory pressure during compilation +of memory-intensive Fortran packages, potentially leading to swapping or OOM +errors. # Quick Start -To start the slurm cluster environment: +To start the slurm cluster environment (default Ubuntu 26.04): ``` -docker-compose -f docker-compose.yml up -d +docker compose -f docker-compose.yml up -d --pull never ``` + +For 24.04: +``` +UBUNTU_VERSION=24.04 docker compose -f docker-compose.yml up -d --pull never +``` + +The frontend container takes several minutes on first launch (it populates the +shared `opt-vol` volume with the spack-stack install). Healthchecks ensure the +master and nodes wait for the frontend before starting. + +### Switching `UBUNTU_VERSION` between runs + +Docker named volumes are not auto-rebuilt when you change the image they're +attached to. 
To switch from 26.04 to 24.04 (or vice versa) on the same host, +you must explicitly remove the existing `home-vol` and `opt-vol` first: + +``` +docker compose down -v # the -v flag deletes the named volumes +UBUNTU_VERSION=24.04 docker compose up -d --pull never +``` + +Without `-v`, the new container will mount the previous run's `/opt`, which +contains spack-built binaries linked against the *previous* OS's glibc. The +cluster will appear to start fine but `srun` of any spack-built executable will +fail with `GLIBC_X.YZ not found`. + To stop the cluster: ``` -docker-compose -f docker-compose.yml stop +docker compose -f docker-compose.yml stop ``` To check the cluster logs: ``` -docker-compose -f docker-compose.yml logs -f +docker compose -f docker-compose.yml logs -f ``` -(stop logs with CTRL-c") +(stop logs with CTRL-c) To check status of the cluster containers: ``` -docker-compose -f docker-compose.yml ps +docker compose -f docker-compose.yml ps ``` To check status of Slurm: ``` diff --git a/docker-compose-test.yml b/docker-compose-test.yml index 6ab0d06..43f88ea 100644 --- a/docker-compose-test.yml +++ b/docker-compose-test.yml @@ -1,117 +1,128 @@ +x-node-common: &node-common + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:ubuntu-${UBUNTU_VERSION:-26.04}-spack-stack-${SPACK_STACK_VERSION:-2.1.0} + shm_size: '4g' + user: admin + volumes: + - home-vol:/home/admin + - opt-vol:/opt:ro + - ./test:/home/admin/test + depends_on: + slurmmaster: + condition: service_started + slurmfrontend: + condition: service_healthy + healthcheck: + test: ["CMD", "ssh", "-o", "StrictHostKeyChecking=no", "admin@localhost", "hostname"] + interval: 2s + timeout: 2s + retries: 10 + start_period: 5s + services: slurmfrontend: - build: - context: ./frontend - dockerfile: ./Dockerfile - args: - SPACK_BUILD_JOBS: 8 - secrets: - - github_token - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend:latest - container_name: 
spack-stack-frontend - hostname: slurmfrontend - user: admin - volumes: - - home-vol:/home/admin - - opt-vol:/opt - - ./test:/home/admin/test - ports: - - 8888:8888 + build: + context: ./frontend + args: + SPACK_BUILD_JOBS: 8 + BASE_IMAGE_TAG: ubuntu-${UBUNTU_VERSION:-26.04}-slurm-${SLURM_VERSION:-25.11.5} + UBUNTU_VERSION: ${UBUNTU_VERSION:-26.04} + SPACK_STACK_VERSION: ${SPACK_STACK_VERSION:-2.1.0} + secrets: + - github_token + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend:ubuntu-${UBUNTU_VERSION:-26.04}-spack-stack-${SPACK_STACK_VERSION:-2.1.0} + container_name: spack-stack-frontend + shm_size: '4g' + hostname: slurmfrontend + user: admin + volumes: + - home-vol:/home/admin + - opt-vol:/opt + - ./test:/home/admin/test + ports: + - 8888:8888 + healthcheck: + # /tmp/ssh-ready is created by the base image entrypoint after munged + sshd are up + # and the admin SSH keypair is written. The large `retries` budget absorbs the multi-GB + # initial copy of /opt (spack-stack install) into the empty opt-vol on first launch; + # subsequent restarts reuse the populated volume and finish quickly.
+ test: ["CMD", "test", "-f", "/tmp/ssh-ready"] + interval: 5s + timeout: 2s + retries: 250 + start_period: 60s + slurmmaster: - build: - context: ./master - dockerfile: ./Dockerfile - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-master:latest - container_name: spack-stack-master - hostname: slurmmaster - user: admin - volumes: - - home-vol:/home/admin:nocopy - - opt-vol:/opt:ro - - ./test:/home/admin/test - environment: - - SLURM_CPUS_ON_NODE=8 - ports: - - 6817:6817 - - 6818:6818 - - 6819:6819 + build: + context: ./master + args: + BASE_IMAGE_TAG: ubuntu-${UBUNTU_VERSION:-26.04}-slurm-${SLURM_VERSION:-25.11.5} + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-master:ubuntu-${UBUNTU_VERSION:-26.04}-spack-stack-${SPACK_STACK_VERSION:-2.1.0} + container_name: spack-stack-master + shm_size: '4g' + hostname: slurmmaster + user: admin + volumes: + - home-vol:/home/admin + - opt-vol:/opt:ro + - ./test:/home/admin/test + environment: + - SLURM_CPUS_ON_NODE=8 + ports: + - 6817:6817 + - 6818:6818 + - 6819:6819 + depends_on: + slurmfrontend: + condition: service_healthy + slurmnode1: - build: - context: ./node - dockerfile: ./Dockerfile - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest - container_name: spack-stack-node1 - hostname: slurmnode1 - user: admin - volumes: - - home-vol:/home/admin:nocopy - - opt-vol:/opt:ro - - ./test:/home/admin/test - environment: - - SLURM_NODENAME=slurmnode1 - - SLURM_CPUS_ON_NODE=8 - links: - - slurmmaster + <<: *node-common + build: + context: ./node + args: + BASE_IMAGE_TAG: ubuntu-${UBUNTU_VERSION:-26.04}-slurm-${SLURM_VERSION:-25.11.5} + container_name: spack-stack-node1 + hostname: slurmnode1 + environment: + - SLURM_NODENAME=slurmnode1 + - SLURM_CPUS_ON_NODE=8 + slurmnode2: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest - container_name: spack-stack-node2 - hostname: slurmnode2 - user: admin - volumes: - - 
home-vol:/home/admin:nocopy - - opt-vol:/opt:ro - - ./test:/home/admin/test - environment: - - SLURM_NODENAME=slurmnode2 - - SLURM_CPUS_ON_NODE=8 - links: - - slurmmaster + <<: *node-common + container_name: spack-stack-node2 + hostname: slurmnode2 + environment: + - SLURM_NODENAME=slurmnode2 + - SLURM_CPUS_ON_NODE=8 + slurmnode3: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest - container_name: spack-stack-node3 - hostname: slurmnode3 - user: admin - volumes: - - home-vol:/home/admin:nocopy - - opt-vol:/opt:ro - - ./test:/home/admin/test - environment: - - SLURM_NODENAME=slurmnode3 - - SLURM_CPUS_ON_NODE=8 - links: - - slurmmaster + <<: *node-common + container_name: spack-stack-node3 + hostname: slurmnode3 + environment: + - SLURM_NODENAME=slurmnode3 + - SLURM_CPUS_ON_NODE=8 + slurmnode4: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest - container_name: spack-stack-node4 - hostname: slurmnode4 - user: admin - volumes: - - home-vol:/home/admin:nocopy - - opt-vol:/opt:ro - - ./test:/home/admin/test - environment: - - SLURM_NODENAME=slurmnode4 - - SLURM_CPUS_ON_NODE=8 - links: - - slurmmaster + <<: *node-common + container_name: spack-stack-node4 + hostname: slurmnode4 + environment: + - SLURM_NODENAME=slurmnode4 + - SLURM_CPUS_ON_NODE=8 + slurmnode5: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest - container_name: spack-stack-node5 - hostname: slurmnode5 - user: admin - volumes: - - home-vol:/home/admin:nocopy - - opt-vol:/opt:ro - - ./test:/home/admin/test - environment: - - SLURM_NODENAME=slurmnode5 - - SLURM_CPUS_ON_NODE=8 - links: - - slurmmaster + <<: *node-common + container_name: spack-stack-node5 + hostname: slurmnode5 + environment: + - SLURM_NODENAME=slurmnode5 + - SLURM_CPUS_ON_NODE=8 + volumes: - home-vol: - opt-vol: + home-vol: + opt-vol: secrets: - github_token: - environment: GITHUB_TOKEN + github_token: + environment: GITHUB_TOKEN diff 
--git a/docker-compose.yml b/docker-compose.yml index 7ff97c6..701cf79 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,110 +1,125 @@ +x-node-common: &node-common + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:ubuntu-${UBUNTU_VERSION:-26.04}-spack-stack-${SPACK_STACK_VERSION:-2.1.0} + shm_size: '4g' + user: admin + volumes: + - home-vol:/home/admin + - opt-vol:/opt:ro + depends_on: + slurmmaster: + condition: service_started + slurmfrontend: + condition: service_healthy + healthcheck: + test: ["CMD", "ssh", "-o", "StrictHostKeyChecking=no", "admin@localhost", "hostname"] + interval: 2s + timeout: 2s + retries: 10 + start_period: 5s + services: slurmfrontend: - build: - context: ./frontend - dockerfile: ./Dockerfile - args: - SPACK_BUILD_JOBS: 8 - secrets: - - github_token - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend:latest - container_name: spack-stack-frontend - hostname: slurmfrontend - user: admin - volumes: - - home-vol:/home/admin - - opt-vol:/opt - ports: - - 8888:8888 + build: + context: ./frontend + args: + SPACK_BUILD_JOBS: 8 + BASE_IMAGE_TAG: ubuntu-${UBUNTU_VERSION:-26.04}-slurm-${SLURM_VERSION:-25.11.5} + UBUNTU_VERSION: ${UBUNTU_VERSION:-26.04} + SPACK_STACK_VERSION: ${SPACK_STACK_VERSION:-2.1.0} + secrets: + - github_token + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-frontend:ubuntu-${UBUNTU_VERSION:-26.04}-spack-stack-${SPACK_STACK_VERSION:-2.1.0} + container_name: spack-stack-frontend + shm_size: '4g' + hostname: slurmfrontend + user: admin + volumes: + - home-vol:/home/admin + - opt-vol:/opt + ports: + - 8888:8888 + healthcheck: + # /tmp/ssh-ready is created by the base image entrypoint after munged + sshd are up + # and the admin SSH keypair is written. 
The high retries count (250 probes x 5s interval, ~20 min beyond the 60s start_period) absorbs the multi-GB + # initial copy of /opt (spack-stack install) into the empty opt-vol on first launch; + # subsequent restarts reuse the populated volume and finish quickly. + test: ["CMD", "test", "-f", "/tmp/ssh-ready"] + interval: 5s + timeout: 2s + retries: 250 + start_period: 60s + slurmmaster: - build: - context: ./master - dockerfile: ./Dockerfile - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-master:latest - container_name: spack-stack-master - hostname: slurmmaster - user: admin - volumes: - - home-vol:/home/admin:nocopy - - opt-vol:/opt:ro - environment: - - SLURM_CPUS_ON_NODE=8 - ports: - - 6817:6817 - - 6818:6818 - - 6819:6819 + build: + context: ./master + args: + BASE_IMAGE_TAG: ubuntu-${UBUNTU_VERSION:-26.04}-slurm-${SLURM_VERSION:-25.11.5} + image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-master:ubuntu-${UBUNTU_VERSION:-26.04}-spack-stack-${SPACK_STACK_VERSION:-2.1.0} + container_name: spack-stack-master + shm_size: '4g' + hostname: slurmmaster + user: admin + volumes: + - home-vol:/home/admin + - opt-vol:/opt:ro + environment: + - SLURM_CPUS_ON_NODE=8 + ports: + - 6817:6817 + - 6818:6818 + - 6819:6819 + depends_on: + slurmfrontend: + condition: service_healthy + slurmnode1: - build: - context: ./node - dockerfile: ./Dockerfile - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest - container_name: spack-stack-node1 - hostname: slurmnode1 - user: admin - volumes: - - home-vol:/home/admin:nocopy - - opt-vol:/opt:ro - environment: - - SLURM_NODENAME=slurmnode1 - - SLURM_CPUS_ON_NODE=8 - links: - - slurmmaster + <<: *node-common + build: + context: ./node + args: + BASE_IMAGE_TAG: ubuntu-${UBUNTU_VERSION:-26.04}-slurm-${SLURM_VERSION:-25.11.5} + container_name: spack-stack-node1 + hostname: slurmnode1 + environment: + - SLURM_NODENAME=slurmnode1 + - SLURM_CPUS_ON_NODE=8 + slurmnode2: - image:
ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest - container_name: spack-stack-node2 - hostname: slurmnode2 - user: admin - volumes: - - home-vol:/home/admin:nocopy - - opt-vol:/opt:ro - environment: - - SLURM_NODENAME=slurmnode2 - - SLURM_CPUS_ON_NODE=8 - links: - - slurmmaster + <<: *node-common + container_name: spack-stack-node2 + hostname: slurmnode2 + environment: + - SLURM_NODENAME=slurmnode2 + - SLURM_CPUS_ON_NODE=8 + slurmnode3: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest - container_name: spack-stack-node3 - hostname: slurmnode3 - user: admin - volumes: - - home-vol:/home/admin:nocopy - - opt-vol:/opt:ro - environment: - - SLURM_NODENAME=slurmnode3 - - SLURM_CPUS_ON_NODE=8 - links: - - slurmmaster + <<: *node-common + container_name: spack-stack-node3 + hostname: slurmnode3 + environment: + - SLURM_NODENAME=slurmnode3 + - SLURM_CPUS_ON_NODE=8 + slurmnode4: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest - container_name: spack-stack-node4 - hostname: slurmnode4 - user: admin - volumes: - - home-vol:/home/admin:nocopy - - opt-vol:/opt:ro - environment: - - SLURM_NODENAME=slurmnode4 - - SLURM_CPUS_ON_NODE=8 - links: - - slurmmaster + <<: *node-common + container_name: spack-stack-node4 + hostname: slurmnode4 + environment: + - SLURM_NODENAME=slurmnode4 + - SLURM_CPUS_ON_NODE=8 + slurmnode5: - image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/slurm-spack-stack-node:latest - container_name: spack-stack-node5 - hostname: slurmnode5 - user: admin - volumes: - - home-vol:/home/admin:nocopy - - opt-vol:/opt:ro - environment: - - SLURM_NODENAME=slurmnode5 - - SLURM_CPUS_ON_NODE=8 - links: - - slurmmaster + <<: *node-common + container_name: spack-stack-node5 + hostname: slurmnode5 + environment: + - SLURM_NODENAME=slurmnode5 + - SLURM_CPUS_ON_NODE=8 + volumes: - home-vol: - opt-vol: + home-vol: + opt-vol: secrets: - github_token: - environment: GITHUB_TOKEN 
+ github_token: + environment: GITHUB_TOKEN diff --git a/frontend/Dockerfile b/frontend/Dockerfile index 801f1b7..93423af 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -1,8 +1,23 @@ -FROM ghcr.io/noaa-gsl/dockerslurmcluster/slurm-frontend:latest AS builder +# BASE_IMAGE_TAG is the full tag of the base image pulled from the dockerslurmcluster +# registry (e.g., ubuntu-26.04-slurm-25.11.5). When using docker compose this is +# constructed from UBUNTU_VERSION + SLURM_VERSION in .env; when invoking buildx +# directly it can be set via --build-arg BASE_IMAGE_TAG=... +ARG BASE_IMAGE_TAG=ubuntu-26.04-slurm-25.11.5 +FROM ghcr.io/noaa-gsl/dockerslurmcluster/slurm-frontend:${BASE_IMAGE_TAG} AS builder # Default to 8 build jobs; override with --build-arg SPACK_BUILD_JOBS=16 for larger runners ARG SPACK_BUILD_JOBS=8 +# UBUNTU_VERSION and SPACK_STACK_VERSION are needed in subsequent RUN commands -- both +# for the per-(ubuntu, spack-stack) buildcache mirror URL (so cross-OS / cross-stack +# cache pollution can't happen) and to drive the spack-stack git clone branch. ARGs +# declared before the first FROM only cover FROM substitution; declaring stage-scoped +# ARGs here plus matching ENVs makes them visible to bash inside the RUN heredocs. +ARG UBUNTU_VERSION=26.04 +ENV UBUNTU_VERSION=${UBUNTU_VERSION} +ARG SPACK_STACK_VERSION=2.1.0 +ENV SPACK_STACK_VERSION=${SPACK_STACK_VERSION} + ENV DEBIAN_FRONTEND=noninteractive ENV TZ=Etc/UTC @@ -55,9 +70,79 @@ RUN git clone --recursive https://github.com/TACC/Lmod.git \ && popd \ && rm -rf Lmod -# Clone spack-stack 2.1.0 +# Clone spack-stack at the version specified by SPACK_STACK_VERSION. The git tag must +# match exactly -- bumping the value should be paired with bumping the tag in any +# downstream consumer paths (e.g., the patch script's "0.5.15 anchor" comment if the +# new spack-stack ships a different newest munge version). 
RUN cd /opt \ - && git clone -b 2.1.0 --recurse-submodules https://github.com/jcsda/spack-stack.git + && git clone -b ${SPACK_STACK_VERSION} --recurse-submodules https://github.com/jcsda/spack-stack.git + +# Stage a patch script that the env-setup RUN below applies to spack's munge recipe source file. +# spack-stack 2.1.0 ships a munge recipe with two limitations we work around: +# 1. No `executables` / `determine_version` -- so `spack external find` cannot detect +# /usr/bin/munge automatically. The patch injects both. +# 2. Newest known version capped at 0.5.15. If the base image's munge is newer (e.g. +# Ubuntu 26.04 ships 0.5.16), the concretizer rejects the auto-detected external as +# an unknown version. The patch injects `version("<system-version>", sha256=<placeholder>)` +# ONLY IF the system version isn't already recipe-known. The fake sha is safe because +# `packages:munge:buildable:false` (set below) prevents spack from ever downloading. +# The script is idempotent on both injections, takes the system MUNGE_VERSION as argv[2], +# and is a no-op for the version injection on bases whose munge happens to be already known +# (e.g. 24.04 ships 0.5.15 which the recipe already lists). +# Remove this whole patch once spack-stack ships a munge recipe that knows the system +# version and carries detection logic. +# (Recipe lives in a Python distribution that spack pulls in, not in the spack git source, +# so the RUN below asks spack for the recipe path and patches it before the env snapshots it into .spack-env/repos/...) +RUN cat > /tmp/patch_munge_recipe.py <<'PY' +"""Patch spack's munge recipe in place to enable system-munge external detection. + +Usage: patch_munge_recipe.py <recipe_path> <munge_version> +""" +import re, sys +p = sys.argv[1] +munge_version = sys.argv[2] +src = open(p).read() + +# (1) Inject executables + determine_version (idempotent: skip if both already present).
+if 'executables' not in src or 'determine_version' not in src: + insert_detection = ( + '\n executables = [r"^munge$"]\n' + '\n @classmethod\n' + ' def determine_version(cls, exe):\n' + ' import subprocess, re as _re\n' + ' try:\n' + ' out = subprocess.check_output([exe, "-V"], text=True, stderr=subprocess.STDOUT)\n' + ' m = _re.search(r"munge-([0-9.]+)", out)\n' + ' return m.group(1) if m else None\n' + ' except Exception:\n' + ' return None\n' + ) + src, n = re.subn(r'(\n variant\(\n "localstatedir")', insert_detection + r'\1', src, count=1) + if n != 1: + raise SystemExit("FAIL: could not locate localstatedir variant for detection-logic injection") + print("Injected executables + determine_version into recipe") +else: + print("Recipe already has executables + determine_version; no detection injection needed") + +# (2) Inject version() with placeholder sha if the system version isn't already declared. +if 'version("' + munge_version + '"' in src: + print("Recipe already declares version " + munge_version + "; no version injection needed") +else: + new_version = ' version("' + munge_version + '", sha256="' + '0' * 64 + '")\n' + # Anchor before the highest known version in the recipe (currently 0.5.15 in spack-stack 2.1.0). + src, n = re.subn( + r'( version\("0\.5\.15", sha256="[a-f0-9]+"\)\n)', + new_version + r'\1', + src, + count=1, + ) + if n != 1: + raise SystemExit("FAIL: could not locate version 0.5.15 anchor for version injection") + print("Injected version(" + repr(munge_version) + ") into recipe") + +open(p, 'w').write(src) +print("Patched munge recipe at " + p) +PY # Create and configure the unified env using the container site RUN cd /opt/spack-stack \ @@ -65,6 +150,34 @@ RUN cd /opt/spack-stack \ && spack stack create env --site container --template unified-dev --name unified-env --compiler gcc \ && cd envs/unified-env \ && spack env activate . 
\ + # Detect the system munge version once, up-front, so the recipe patch and the \ + # `packages:munge:require:[@VERSION]` config_add below can both reference it. \ + && MUNGE_VERSION=$(/usr/bin/munge -V 2>&1 | head -1 | sed -n 's/^munge-\([0-9.]\+\).*/\1/p') \ + && [ -n "$MUNGE_VERSION" ] || { \ + echo "ERROR: could not parse munge version from /usr/bin/munge -V output:"; \ + /usr/bin/munge -V 2>&1; \ + exit 1; \ + } \ + && echo "Detected system munge version: $MUNGE_VERSION" \ + # Apply the munge recipe patch to the *source* recipe, before any external find / concretize \ + # runs (which is what triggers spack to lazily snapshot the recipe into .spack-env/repos/...). \ + # Ask spack itself for the recipe path -- robust to whether packages live in the spack git \ + # submodule, a pip-installed spack_repo distribution, or somewhere else entirely. \ + && MUNGE_PY=$(spack python -c "import spack.repo; print(spack.repo.PATH.get_pkg_class('munge').module.__file__)") \ + && [ -n "$MUNGE_PY" ] && [ -f "$MUNGE_PY" ] || { \ + echo "ERROR: spack could not locate munge/package.py (got: '$MUNGE_PY')"; \ + echo "Diagnostic find under /opt:"; \ + find /opt -path "*/packages/munge/package.py" 2>/dev/null; \ + exit 1; \ + } \ + && python3 /tmp/patch_munge_recipe.py "$MUNGE_PY" "$MUNGE_VERSION" \ + # Nuke any bytecode cache near the patched recipe so the next spack import re-compiles \ + # against the new source. Necessary because earlier spack commands in this RUN may have \ + # already imported and cached the unpatched recipe, and Python's mtime-based pyc validation \ + # can be unreliable on Docker overlayfs. 
\ + && rm -rf "$(dirname "$MUNGE_PY")/__pycache__" \ + && echo "Munge recipe patch verification (looking for version $MUNGE_VERSION + detection logic):" \ + && grep -nE "version\(\"${MUNGE_VERSION//./\\.}\"|executables = |def determine_version" "$MUNGE_PY" \ # Fix system external versions for Ubuntu 26.04 \ && spack external find --scope "env:/opt/spack-stack/envs/unified-env:/opt/spack-stack/envs/unified-env/site" \ --exclude cmake \ @@ -73,32 +186,38 @@ RUN cd /opt/spack-stack \ --exclude openssh \ --exclude python \ && spack external find --scope "env:/opt/spack-stack/envs/unified-env:/opt/spack-stack/envs/unified-env/site" wget \ + # Explicit munge detection. The bulk find above scans many packages at once and (as observed) \ + # sometimes fails to pick up newly-patched recipes; an explicit single-package find reliably \ + # uses the patched recipe. Writes to the env scope (spack.yaml), where our `buildable: false` \ + # config_add further below also lands -- both get merged at concretize time. 
\ + && spack external find munge \ # Add slurm as an external package \ && echo " slurm:" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ && echo " externals:" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ && echo " - spec: slurm@25.11.5" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ && echo " prefix: /usr" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ && echo " buildable: false" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ - # Add munge as an external package so spack uses the system munge (same one munged starts at boot) \ - # rather than building its own, which would cause LD_LIBRARY_PATH conflicts with system Slurm tools \ - && MUNGE_VERSION=$(dpkg -l munge 2>/dev/null | awk '/^ii/{print $3}' | cut -d: -f2 | cut -d- -f1 || echo "0.5.16") \ - && echo " munge:" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ - && echo " externals:" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ - && echo " - spec: munge@${MUNGE_VERSION}" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ - && echo " prefix: /usr" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ - && echo " buildable: false" >> /opt/spack-stack/envs/unified-env/site/packages.yaml \ + # Forbid building munge so the concretizer must use the system munge that the explicit \ + # `spack external find munge` above auto-detected via the patched recipe. (Auto-detection writes \ + # the external entry to the env scope but does not set buildable:false on its own.) \ + && spack -e . config add 'packages:munge:buildable:false' \ + # Force the concretizer to pin munge at the system version. Without this require, the \ + # solver's version-badness scoring downranks our patched version (placeholder sha = "less \ + # buildable") in favor of a recipe-default version with a real sha, and picks build over \ + # external -- even though buildable:false is set and our auto-detected external is the only \ + # valid candidate at the system version.
The require overrides the version score and forces \ + # the only feasible solution. MUNGE_VERSION is the value detected above. \ + && spack -e . config add "packages:munge:require:[\"@${MUNGE_VERSION}\"]" \ # Configure openmpi to use Slurm scheduler integration (OpenMPI 5 uses PMIx, not +pmi) \ && spack -e . config add 'packages:openmpi:require:[schedulers=slurm]' \ # Ensure PMIx includes munge security plugin to avoid psec/munge runtime warnings. # Use require (not just variants preference) so concretization cannot silently pick ~munge. \ && spack -e . config add 'packages:pmix:require:[+munge]' \ - # Normalize target selection so builds on different hosts can share buildcache artifacts. \ - # Use generic targets (x86_64 / aarch64) for maximum portability at the cost of SIMD optimizations. \ - && if [ "$(uname -m)" = "x86_64" ]; then \ - spack -e . config add 'packages:all:target:[x86_64]' ; \ - elif [ "$(uname -m)" = "aarch64" ]; then \ - spack -e . config add 'packages:all:target:[aarch64]' ; \ - fi \ + # Use the spack-stack site default target (x86_64_v3 on x86_64 hosts). \ + # Earlier this block added `target:[x86_64]` for buildcache portability, but spack merged \ + # that with the site default to `[x86_64, x86_64_v3]`, and the concretizer's choice between \ + # them caused manually-declared externals (notably munge) to be rejected on target mismatch. \ + # x86_64_v3 covers Haswell and newer, which is fine for our build/run hardware. \ # Configure lmod modules instead of tcl \ && sed -i 's/tcl/lmod/g' site/modules.yaml \ # Use unhashed module names like other spack-stack deployments, with suffix rules to avoid naming clashes \ @@ -139,15 +258,76 @@ RUN cd /opt/spack-stack \ # Force env module root so final stage can copy a stable path \ && spack -e . config add 'modules:default:roots:lmod:$env/modules' -# Add the build cache mirror and concretize -RUN cd /opt/spack-stack \ - && source setup.sh \ - && cd envs/unified-env \ - && spack env activate . 
\ - && spack mirror add --unsigned ghcr_buildcache oci://ghcr.io/noaa-gsl/dockerspackstackslurmcluster/buildcache \ - && spack concretize > /dev/null 2>&1 \ - && spack spec openmpi 2>&1 | grep -Eq 'schedulers(:=|=)slurm' \ - && spack spec pmix 2>&1 | grep -q '+munge' +# Register the per-(ubuntu, spack-stack) buildcache mirror, concretize the env, and +# verify that the four load-bearing configuration choices took effect (munge / slurm +# externals, pmix +munge, openmpi schedulers=slurm). Verification is non-fatal so a +# regression here surfaces as a visible FAIL line in the build log without breaking +# the build outright -- inspect the resulting image with `docker run --entrypoint +# bash` and `spack spec <pkg>` to dig in if anything ever fails here. +RUN <<'EOF' + set -eo pipefail + cd /opt/spack-stack + source setup.sh + cd envs/unified-env + spack env activate . + # Per-(ubuntu, spack-stack) buildcache repo. Splitting by ubuntu prevents cross-OS spec + # contamination -- spack's reuse:true scoring would otherwise pull e.g. 26.04-built + # gcc-runtime into a 24.04 concretization, producing a split-brain env with duplicate + # same-package specs at different OSes that crash module generation. Splitting by + # spack-stack version isolates recipe-change deltas between releases (different recipes + # produce different package hashes). Slurm version is NOT in the cache name because + # slurm is external and doesn't affect spec hashes. GHCR will auto-create the repo on + # first push (private by default; set visibility from the GHCR UI separately if you + # want to share read access). + spack mirror add --unsigned ghcr_buildcache "oci://ghcr.io/noaa-gsl/dockerspackstackslurmcluster/buildcache-ubuntu-${UBUNTU_VERSION}-spack-stack-${SPACK_STACK_VERSION}" + + spack concretize + + # Helpers parse `spack spec --json` and look only at the top-level node, so external + # deps (e.g. openssl) don't produce false positives in the is_external check.
+ is_external() { + python3 -c ' +import json, subprocess, sys +out = subprocess.check_output(["spack", "spec", "--json", sys.argv[1]], + text=True, stderr=subprocess.DEVNULL) +n = json.loads(out)["spec"]["nodes"][0] +ext = n.get("external_path") +if not ext: + e = n.get("external") + if isinstance(e, dict): + ext = e.get("path") +sys.exit(0 if ext else 1) +' "$1" + } + has_variant() { + # has_variant <pkg> <variant>; variant forms: +foo ~foo key=val + python3 -c ' +import json, subprocess, sys +out = subprocess.check_output(["spack", "spec", "--json", sys.argv[1]], + text=True, stderr=subprocess.DEVNULL) +n = json.loads(out)["spec"]["nodes"][0] +params = n.get("parameters", {}) +spec = sys.argv[2] +if spec.startswith("+"): + sys.exit(0 if params.get(spec[1:]) is True else 1) +if spec.startswith("~"): + sys.exit(0 if params.get(spec[1:]) is False else 1) +if "=" in spec: + k, v = spec.split("=", 1) + val = params.get(k) + if isinstance(val, list): + sys.exit(0 if v in val else 1) + sys.exit(0 if val == v else 1) +sys.exit(2) +' "$1" "$2" + } + + echo "===== Verification summary =====" + if is_external munge; then echo "PASS: munge is external"; else echo "FAIL: munge is NOT external"; fi + if is_external slurm; then echo "PASS: slurm is external"; else echo "FAIL: slurm is NOT external"; fi + if has_variant pmix +munge; then echo "PASS: pmix has +munge variant"; else echo "FAIL: pmix does NOT have +munge variant"; fi + if has_variant openmpi schedulers=slurm; then echo "PASS: openmpi has schedulers=slurm"; else echo "FAIL: openmpi does NOT have schedulers=slurm"; fi +EOF # Install the Spack environment RUN --mount=type=secret,id=github_token <&1 | tee log.install - # Update the build cache index if credentials were provided - if [ -f /run/secrets/github_token ]; then - spack buildcache update-index ghcr_buildcache + # Update the build cache index if credentials were provided.
+ # update-index makes many blob fetches against GHCR to rebuild the index, and we've + # observed transient SSL / connection-timeout failures on slow or congested links. + # Per-package artifacts are already published via the autopush mirror setting above, + # so the buildcache contents are populated regardless of whether the index refreshes. + # That makes index refresh best-effort: retry a few times, then warn and move on + # rather than failing the entire build (which would discard hours of install work). + # A stale index can be refreshed later from any container with GHCR credentials via + # `spack buildcache update-index ghcr_buildcache`. + if [ -s /run/secrets/github_token ]; then + INDEX_OK=0 + for i in 1 2 3 4 5; do + if spack buildcache update-index ghcr_buildcache; then + INDEX_OK=1 + echo "buildcache update-index succeeded on attempt $i" + break + fi + echo "buildcache update-index attempt $i failed, sleeping 30s before retry..." + sleep 30 + done + if [ "$INDEX_OK" -ne 1 ]; then + echo "WARN: buildcache update-index failed after 5 attempts." + echo "WARN: per-package artifacts ARE in the buildcache via autopush, but the" + echo "WARN: index is stale until 'spack buildcache update-index ghcr_buildcache'" + echo "WARN: is run successfully from a container with credentials." + fi fi EOF @@ -185,7 +388,7 @@ RUN cd /opt/spack-stack \ && rm -rf ~/.spack # Copy installed environment into final image -FROM ghcr.io/noaa-gsl/dockerslurmcluster/slurm-frontend:latest +FROM ghcr.io/noaa-gsl/dockerslurmcluster/slurm-frontend:${BASE_IMAGE_TAG} # Default Slurm MPI plugin so users do not need to pass --mpi=pmix on every srun. ENV SLURM_MPI_TYPE=pmix diff --git a/master/Dockerfile b/master/Dockerfile index 159e27e..7cd06a6 100644 --- a/master/Dockerfile +++ b/master/Dockerfile @@ -1,4 +1,9 @@ -FROM ghcr.io/noaa-gsl/dockerslurmcluster/slurm-master:latest +# BASE_IMAGE_TAG selects the base image variant (e.g. ubuntu-26.04-slurm-25.11.5 or +# ubuntu-24.04-slurm-25.11.5).
Default matches `latest` in the base registry. When +# using docker compose this is constructed from UBUNTU_VERSION + SLURM_VERSION in .env; +# when invoking buildx directly it can be set via --build-arg BASE_IMAGE_TAG=... +ARG BASE_IMAGE_TAG=ubuntu-26.04-slurm-25.11.5 +FROM ghcr.io/noaa-gsl/dockerslurmcluster/slurm-master:${BASE_IMAGE_TAG} ENV DEBIAN_FRONTEND=noninteractive ENV TZ=Etc/UTC diff --git a/node/Dockerfile b/node/Dockerfile index 928eeb7..acf4f40 100644 --- a/node/Dockerfile +++ b/node/Dockerfile @@ -1,4 +1,9 @@ -FROM ghcr.io/noaa-gsl/dockerslurmcluster/slurm-node:latest +# BASE_IMAGE_TAG selects the base image variant (e.g. ubuntu-26.04-slurm-25.11.5 or +# ubuntu-24.04-slurm-25.11.5). Default matches `latest` in the base registry. When +# using docker compose this is constructed from UBUNTU_VERSION + SLURM_VERSION in .env; +# when invoking buildx directly it can be set via --build-arg BASE_IMAGE_TAG=... +ARG BASE_IMAGE_TAG=ubuntu-26.04-slurm-25.11.5 +FROM ghcr.io/noaa-gsl/dockerslurmcluster/slurm-node:${BASE_IMAGE_TAG} ENV DEBIAN_FRONTEND=noninteractive ENV TZ=Etc/UTC