Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ dependencies = [
"typing-extensions>=4.0",
"pyyaml>=6.0",
"pydantic>=2.0",
"github-copilot-sdk>=0.1.18",
"textual>=7.0",
]

Expand Down
42 changes: 39 additions & 3 deletions src/bcbench/agent/copilot/agent.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,24 @@
"""GitHub Copilot CLI Agent implementation."""

import asyncio
import shutil
import subprocess
import sys
from pathlib import Path

import yaml
from copilot import CopilotClient, MCPServerConfig, SessionConfig
from copilot.generated.session_events import SessionEventType

from bcbench.agent.copilot.metrics import parse_metrics
from bcbench.agent.shared import build_mcp_config, build_prompt
from bcbench.config import get_config
from bcbench.dataset import DatasetEntry
from bcbench.exceptions import AgentError, AgentTimeoutError
from bcbench.logger import get_logger
from bcbench.operations.skills_operations import setup_copilot_skills
from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config
from bcbench.types import AgentMetrics, EvaluationCategory, ExperimentConfiguration

Check failure on line 21 in src/bcbench/agent/copilot/agent.py

View workflow job for this annotation

GitHub Actions / lint-and-test

Ruff (I001)

src/bcbench/agent/copilot/agent.py:3:1: I001 Import block is un-sorted or un-formatted

logger = get_logger(__name__)
_config = get_config()
Expand All @@ -32,12 +36,14 @@
logger.info(f"Running GitHub Copilot CLI on: {entry.instance_id}")

prompt: str = build_prompt(entry, repo_path, copilot_config, category, al_mcp=al_mcp)
mcp_config_json, mcp_server_names = build_mcp_config(copilot_config, entry, repo_path, al_mcp=al_mcp)
mcp_config: dict[str, MCPServerConfig] | None = build_mcp_config(copilot_config, entry, repo_path, al_mcp=al_mcp)
instructions_enabled: bool = setup_instructions_from_config(copilot_config, entry, repo_path)
copilot_skills: list[str] | None = setup_copilot_skills(copilot_config, entry, repo_path)
config = ExperimentConfiguration(mcp_servers=list(mcp_config.keys()) if mcp_config else None, custom_instructions=instructions_enabled, custom_agent=custom_agent)

Check failure on line 42 in src/bcbench/agent/copilot/agent.py

View workflow job for this annotation

GitHub Actions / lint-and-test

Ruff (F821)

src/bcbench/agent/copilot/agent.py:42:154: F821 Undefined name `custom_agent`
skills_enabled: bool = setup_agent_skills(copilot_config, entry, repo_path)
custom_agent: str | None = setup_custom_agent(copilot_config, entry, repo_path)
config = ExperimentConfiguration(
mcp_servers=mcp_server_names,

Check failure on line 46 in src/bcbench/agent/copilot/agent.py

View workflow job for this annotation

GitHub Actions / lint-and-test

Ruff (F821)

src/bcbench/agent/copilot/agent.py:46:21: F821 Undefined name `mcp_server_names`
custom_instructions=instructions_enabled,
skills_enabled=skills_enabled,
custom_agent=custom_agent,
Expand All @@ -64,13 +70,43 @@
]
if not instructions_enabled:
cmd_args.append("--no-custom-instructions")
if mcp_config_json:
cmd_args.append(f"--additional-mcp-config={mcp_config_json}")
if custom_agent:
cmd_args.append(f"--agent={custom_agent}")

logger.debug(f"Copilot command args: {cmd_args}")

# Copilot SDK
async def run_copilot():
    """Run a single Copilot SDK session for the prompt, streaming the reply to stdout.

    Starts a ``CopilotClient`` using the resolved CLI path, opens a streaming
    session with the configured model, MCP servers and skill directories,
    sends the prompt and waits for completion up to the configured agent
    execution timeout.

    The client is stopped in a ``finally`` clause so that a failure or
    timeout while creating the session or waiting for the reply does not
    leave a started client behind.
    """
    client = CopilotClient({"cli_path": copilot_cmd})
    await client.start()
    try:
        session = await client.create_session(
            SessionConfig(
                model=model,
                # The SDK is given empty containers rather than None when
                # nothing is configured.
                mcp_servers=mcp_config or {},
                streaming=True,
                skill_directories=copilot_skills or [],
            )
        )

        def handle_event(event):
            # Echo assistant response chunks as they arrive; other event
            # types are ignored.
            if event.type == SessionEventType.ASSISTANT_MESSAGE_DELTA:
                sys.stdout.write(event.data.delta_content)
                sys.stdout.flush()

        session.on(handle_event)

        await session.send_and_wait(
            {"prompt": prompt},
            timeout=_config.timeout.agent_execution,
        )
        print()  # newline after streaming
    finally:
        # Always release the client, including on error or timeout.
        await client.stop()

asyncio.run(run_copilot())

result = subprocess.run(
cmd_args,
cwd=str(repo_path),
Expand Down
37 changes: 21 additions & 16 deletions src/bcbench/agent/shared/mcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pathlib import Path
from typing import Any

from copilot import MCPLocalServerConfig, MCPRemoteServerConfig, MCPServerConfig
from jinja2 import Template

from bcbench.dataset import DatasetEntry
Expand Down Expand Up @@ -39,33 +40,33 @@ def cleanup(self) -> None:
_mcp_server_manager = _ALMcpServerManager()


def _build_server_entry(server: dict[str, Any], template_context: dict[str, Any]) -> tuple[str, dict[str, Any]]:
def _build_server_entry(server: dict[str, Any], template_context: dict[str, Any]) -> dict[str, MCPServerConfig]:
server_type: str = server["type"]
server_name: str = server["name"]
tools: list[str] = server["tools"]

match server_type:
case "http":
return server_name, {
"type": server_type,
"url": server["url"],
"tools": tools,
}
return {server_name: MCPRemoteServerConfig(
tools=tools,
url=server["url"],
type=server_type
)}
case "local":
args: list[str] = server["args"]
rendered_args = [Template(arg).render(**template_context) for arg in args]
return server_name, {
"type": server_type,
"command": server["command"],
"args": rendered_args,
"tools": tools,
}
return {server_name: MCPLocalServerConfig(
tools=tools,
command=server["command"],
args=rendered_args,
type=server_type,
)}
case _:
logger.error(f"Unsupported MCP server type: {server_type}, {server}")
raise AgentError(f"Unsupported MCP server type: {server_type}")


def build_mcp_config(config: dict[str, Any], entry: DatasetEntry, repo_path: Path, al_mcp: bool = False) -> tuple[str | None, list[str] | None]:
def build_mcp_config(config: dict[str, Any], entry: DatasetEntry, repo_path: Path, al_mcp: bool = False) -> dict[str, MCPServerConfig] | None:
# following docs: https://docs.github.com/en/enterprise-cloud@latest/copilot/how-tos/use-copilot-agents/coding-agent/extend-coding-agent-with-mcp
mcp_servers: list[dict[str, Any]] = config.get("mcp", {}).get("servers", [])

Expand All @@ -78,11 +79,15 @@ def build_mcp_config(config: dict[str, Any], entry: DatasetEntry, repo_path: Pat
logger.info("AL MCP server enabled via --al-mcp flag")

if not mcp_servers:
return None, None
return None

template_context = {"repo_path": repo_path}
mcp_server_names: list[str] = [server["name"] for server in mcp_servers]
mcp_config = {"mcpServers": dict(map(lambda s: _build_server_entry(s, template_context), mcp_servers))}
# mcp_config = {"mcpServers": dict(map(lambda s: _build_server_entry(s, template_context), mcp_servers))}
mcp_config : dict[str, MCPServerConfig] = {}
for server in mcp_servers:
server_entry = _build_server_entry(server, template_context)
mcp_config.update(server_entry)

if al_mcp:
# Launch MCP server with all project paths separated by semicolons
Expand All @@ -92,4 +97,4 @@ def build_mcp_config(config: dict[str, Any], entry: DatasetEntry, repo_path: Pat
logger.info(f"Using MCP servers: {mcp_server_names}")
logger.debug(f"MCP configuration: {json.dumps(mcp_config, indent=2)}")

return json.dumps(mcp_config, separators=(",", ":")), mcp_server_names
return mcp_config
3 changes: 3 additions & 0 deletions src/bcbench/operations/skills_operations.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from pathlib import Path
from shutil import copytree

from bcbench.config import get_config

Check failure on line 4 in src/bcbench/operations/skills_operations.py

View workflow job for this annotation

GitHub Actions / lint-and-test

Ruff (F401)

src/bcbench/operations/skills_operations.py:4:28: F401 `bcbench.config.get_config` imported but unused
from shutil import copytree, rmtree

Check failure on line 5 in src/bcbench/operations/skills_operations.py

View workflow job for this annotation

GitHub Actions / lint-and-test

Ruff (F811)

src/bcbench/operations/skills_operations.py:5:20: F811 Redefinition of unused `copytree` from line 2

from bcbench.dataset.dataset_entry import DatasetEntry
from bcbench.logger import get_logger
Expand Down
16 changes: 16 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading