Skip to content

Refactor dataset model for extensibility across evaluation categories #1950

Refactor dataset model for extensibility across evaluation categories

Refactor dataset model for extensibility across evaluation categories #1950

Workflow file for this run

# CI workflow: triggers, default token permissions, shared environment, jobs.
name: CI

# Runs for pushes and pull requests that target main, and on manual dispatch.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch:

# Workflow-wide default token scope; a job can declare its own permissions.
permissions:
  contents: read

# Results directory name, read by the mock-evaluation job for its output value
# and for its artifact upload path.
env:
  EVALUATION_RESULTS_DIR: evaluation_results

jobs:
# Lints the repository with Ruff and runs the pytest suite with coverage.
lint-and-test:
  runs-on: ubuntu-latest
  steps:
    - name: Checkout repository
      uses: actions/checkout@v5

    # Repository-local setup action, invoked with all-extras enabled.
    - name: Setup Python with UV
      uses: ./.github/actions/setup-python-uv
      with:
        all-extras: true

    # Ruff is asked for its `github` output format.
    - name: Run Ruff
      run: uv run ruff check --output-format=github .

    # Folded scalar: the three lines below are joined into one command line.
    - name: Run tests with coverage
      run: >-
        uv run pytest
        --cov=src/bcbench
        --cov-report=term-missing
# Chooses one evaluation category at random and publishes it as the job
# output `category` for downstream jobs.
select-category:
  runs-on: ubuntu-latest
  outputs:
    category: ${{ steps.random.outputs.category }}
  steps:
    - name: Select random category
      id: random
      shell: pwsh
      run: |
        $candidates = @("bug-fix", "test-generation")
        $choice = Get-Random -InputObject $candidates
        echo "category=$choice" >> $env:GITHUB_OUTPUT
# Calls the reusable get-entries workflow in test-run mode for the category
# chosen by select-category.
get-entries:
  needs:
    - select-category
  uses: ./.github/workflows/get-entries.yml
  with:
    category: ${{ needs.select-category.outputs.category }}
    test-run: true
# Runs one mock evaluation per entry returned by get-entries (matrix fan-out)
# and uploads each entry's JSONL results as a one-day artifact.
mock-evaluation:
  name: Test Run for ${{ matrix.entry }}
  runs-on: ubuntu-latest
  needs: [get-entries, select-category]
  # Skip the whole matrix when get-entries reported an empty list.
  if: needs.get-entries.outputs.entries != '[]'
  outputs:
    results-dir: ${{ env.EVALUATION_RESULTS_DIR }}
  strategy:
    # One failing entry must not cancel the remaining matrix legs.
    fail-fast: false
    matrix:
      entry: ${{ fromJson(needs.get-entries.outputs.entries) }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@v5

    - name: Setup Python with UV
      uses: ./.github/actions/setup-python-uv

    # Expression values are handed to the shell through environment variables
    # instead of being spliced into the script text, so an entry or category
    # containing shell metacharacters cannot alter the command.
    # The output directory comes from the workflow-level EVALUATION_RESULTS_DIR
    # so it always matches the upload path and the job output.
    - name: Run mock evaluation for ${{ matrix.entry }}
      env:
        ENTRY: ${{ matrix.entry }}
        CATEGORY: ${{ needs.select-category.outputs.category }}
        RUN_ID: ${{ github.run_id }}
      run: >-
        uv run bcbench evaluate mock "$ENTRY"
        --category "$CATEGORY"
        --output-dir "$EVALUATION_RESULTS_DIR"
        --run-id "$RUN_ID"

    - name: Upload mock evaluation results
      uses: actions/upload-artifact@v6
      with:
        name: ${{ matrix.entry }}
        path: ${{ env.EVALUATION_RESULTS_DIR }}/**/*.jsonl
        retention-days: 1
# Calls the reusable summarize-results workflow on the mock evaluation output,
# using the run id as the model name and flagging the run as a mock.
summarize-results:
  needs:
    - mock-evaluation
    - select-category
  uses: ./.github/workflows/summarize-results.yml
  # Job-level override of the workflow default (contents: read).
  permissions:
    contents: write
    id-token: write
  with:
    results-dir: ${{ needs.mock-evaluation.outputs.results-dir }}
    model: ${{ github.run_id }}
    agent: mock-agent
    mock: true
    category: ${{ needs.select-category.outputs.category }}
  secrets: inherit