Refactor dataset model for extensibility across evaluation categories #1950
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# CI workflow: lints and unit-tests the project, then exercises the evaluation
# pipeline end-to-end with a mock agent against one randomly chosen category.
name: CI

# Runs for pushes and pull requests targeting main, and on manual dispatch.
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
  workflow_dispatch:

# Read-only by default; summarize-results widens this for its own job only.
permissions:
  contents: read

env:
  # Single source of truth for where evaluation output is written, uploaded
  # from, and reported to the summarize job.
  EVALUATION_RESULTS_DIR: evaluation_results

jobs:
  # Static analysis (Ruff) and the pytest suite with coverage reporting.
  lint-and-test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v5

      # Local composite action; its inputs are defined outside this file.
      - name: Setup Python with UV
        uses: ./.github/actions/setup-python-uv
        with:
          all-extras: true

      - name: Run Ruff
        run: uv run ruff check --output-format=github .

      - name: Run tests with coverage
        run: uv run pytest --cov=src/bcbench --cov-report=term-missing

  # Picks one evaluation category at random and exposes it as a job output so
  # that every downstream job in this run uses the same category.
  select-category:
    runs-on: ubuntu-latest
    outputs:
      category: ${{ steps.random.outputs.category }}
    steps:
      - name: Select random category
        id: random
        shell: pwsh
        run: |
          $categories = @("bug-fix", "test-generation")
          $selected = $categories | Get-Random
          echo "category=$selected" >> $env:GITHUB_OUTPUT

  # Reusable workflow; its `entries` output is consumed below as a JSON array.
  get-entries:
    needs: select-category
    uses: ./.github/workflows/get-entries.yml
    with:
      test-run: true
      category: ${{ needs.select-category.outputs.category }}

  # Fans out one mock evaluation per entry; skipped when the entry list is
  # the empty JSON array.
  mock-evaluation:
    runs-on: ubuntu-latest
    needs: [get-entries, select-category]
    if: needs.get-entries.outputs.entries != '[]'
    outputs:
      results-dir: ${{ env.EVALUATION_RESULTS_DIR }}
    strategy:
      # Let the remaining entries finish even if one of them fails.
      fail-fast: false
      matrix:
        entry: ${{ fromJson(needs.get-entries.outputs.entries) }}
    name: Test Run for ${{ matrix.entry }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v5

      - name: Setup Python with UV
        uses: ./.github/actions/setup-python-uv

      # Expression values are handed to the shell through environment
      # variables instead of being expanded into the script text, so an entry
      # or category containing shell metacharacters cannot alter the command.
      # The output directory comes from the workflow-level env so it always
      # matches the upload path and the job output.
      - name: Run mock evaluation for ${{ matrix.entry }}
        env:
          ENTRY: ${{ matrix.entry }}
          CATEGORY: ${{ needs.select-category.outputs.category }}
          RUN_ID: ${{ github.run_id }}
        run: >-
          uv run bcbench evaluate mock "$ENTRY"
          --category "$CATEGORY"
          --output-dir "$EVALUATION_RESULTS_DIR"
          --run-id "$RUN_ID"

      - name: Upload mock evaluation results
        uses: actions/upload-artifact@v6
        with:
          name: ${{ matrix.entry }}
          path: ${{ env.EVALUATION_RESULTS_DIR }}/**/*.jsonl
          retention-days: 1

  # Reusable workflow that aggregates the mock results for this run.
  summarize-results:
    needs: [mock-evaluation, select-category]
    uses: ./.github/workflows/summarize-results.yml
    permissions:
      contents: write
      id-token: write
    with:
      results-dir: ${{ needs.mock-evaluation.outputs.results-dir }}
      # The run id is passed as the model value for mock runs.
      model: ${{ github.run_id }}
      agent: "mock-agent"
      mock: true
      category: ${{ needs.select-category.outputs.category }}
    secrets: inherit