# Periodic LLM benchmarks #64
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Runs the `llm_benchmark` tool once per language, either on a daily
# schedule or on demand via workflow_dispatch with custom models,
# languages, and modes.
name: Periodic LLM benchmarks

on:
  schedule:
    # Daily at midnight UTC. Change to '0 */6 * * *' for every 6h,
    # or '0 */4 * * *' for every 4h.
    - cron: '0 0 * * *'
  workflow_dispatch:
    inputs:
      models:
        description: 'Models to run (provider:model format, comma-separated, or "all")'
        required: false
        default: 'all'
      languages:
        description: 'Languages to benchmark (comma-separated: rust,csharp,typescript)'
        required: false
        default: 'rust,csharp,typescript'
      modes:
        description: 'Modes to run (comma-separated: guidelines,no_context,docs,...)'
        required: false
        default: 'guidelines,no_context'

permissions:
  contents: read

# Only one benchmark run at a time: a newly triggered run (scheduled or
# manual) cancels whichever run of this group is still in progress.
concurrency:
  group: llm-benchmark-periodic
  cancel-in-progress: true

jobs:
  run-benchmarks:
    runs-on: spacetimedb-new-runner-2
    timeout-minutes: 180
    steps:
      # Always benchmarks master, regardless of the ref the workflow was
      # dispatched from.
      - name: Checkout master
        uses: actions/checkout@v4
        with:
          ref: master
          fetch-depth: 1

      - uses: dtolnay/rust-toolchain@stable

      - uses: Swatinem/rust-cache@v2

      - name: Setup .NET SDK
        uses: actions/setup-dotnet@v4
        with:
          dotnet-version: "8.0.x"

      - name: Install WASI workload
        env:
          DOTNET_MULTILEVEL_LOOKUP: "0"
          DOTNET_CLI_HOME: ${{ runner.temp }}/dotnet-home
          DOTNET_SKIP_FIRST_TIME_EXPERIENCE: "1"
        run: |
          dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: 22

      - name: Install pnpm
        uses: ./.github/actions/setup-pnpm

      - name: Build llm-benchmark tool
        run: cargo install --path tools/xtask-llm-benchmark --locked

      - name: Build SpacetimeDB server for benchmark harness
        run: |
          cargo ci smoketests prepare
          # Expose the CLI binary as `spacetime` on PATH for later steps.
          # NOTE(review): assumes the command above leaves the binary at
          # target/release/spacetimedb-cli - confirm.
          mkdir -p "$HOME/.local/bin"
          ln -sf "$GITHUB_WORKSPACE/target/release/spacetimedb-cli" "$HOME/.local/bin/spacetime"
          echo "$HOME/.local/bin" >> "$GITHUB_PATH"

      - name: Run benchmarks
        env:
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          LLM_BENCHMARK_API_KEY: ${{ secrets.LLM_BENCHMARK_API_KEY }}
          LLM_BENCHMARK_UPLOAD_URL: ${{ secrets.LLM_BENCHMARK_UPLOAD_URL }}
          # NOTE(review): presumably keeps MSBuild from leaving background
          # processes behind on the runner - confirm.
          MSBUILDDISABLENODEREUSE: "1"
          DOTNET_CLI_USE_MSBUILD_SERVER: "0"
          # Scheduled runs carry no inputs, so fall back to the same
          # defaults that workflow_dispatch declares. Inputs reach the
          # script through the environment instead of being interpolated
          # into the shell source.
          INPUT_LANGUAGES: ${{ inputs.languages || 'rust,csharp,typescript' }}
          INPUT_MODELS: ${{ inputs.models || 'all' }}
          INPUT_MODES: ${{ inputs.modes || 'guidelines,no_context' }}
        run: |
          LANGS="$INPUT_LANGUAGES"
          MODELS="$INPUT_MODELS"
          MODES="$INPUT_MODES"
          SUCCEEDED=0
          FAILED=0

          # The loop variable must NOT be named LANG: that is the POSIX
          # locale variable, and assigning "rust"/"csharp"/... to it would
          # change the locale of this shell and of every child process.
          for bench_lang in $(echo "$LANGS" | tr ',' ' '); do
            # Build the argument list once. --models is only passed for an
            # explicit selection; for "all" the flag is omitted entirely.
            set -- run --lang "$bench_lang" --modes "$MODES"
            run_desc="lang=$bench_lang"
            if [ "$MODELS" != "all" ]; then
              set -- "$@" --models "$MODELS"
              run_desc="$run_desc models=$MODELS"
            fi

            # A failing language is recorded and reported as a warning so
            # the remaining languages still run.
            if llm_benchmark "$@"; then
              SUCCEEDED=$((SUCCEEDED + 1))
            else
              echo "::warning::Benchmark run failed for $run_desc"
              FAILED=$((FAILED + 1))
            fi
          done

          echo "Benchmark runs: $SUCCEEDED succeeded, $FAILED failed"

          # Make an empty selection visible instead of passing silently.
          if [ "$SUCCEEDED" -eq 0 ] && [ "$FAILED" -eq 0 ]; then
            echo "::warning::No benchmark runs were executed (languages='$LANGS')"
          fi

          # The job fails only when every attempted run failed; partial
          # failures are surfaced through the warnings above.
          if [ "$SUCCEEDED" -eq 0 ] && [ "$FAILED" -gt 0 ]; then
            echo "::error::All benchmark runs failed"
            exit 1
          fi