microsoft · Jiawen-CS · Jan 30, 2026 · Jan 30, 2026 · Jan 30, 2026 · Jan 30, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,6 +23,7 @@ dependencies = [
     "typing-extensions>=4.0",
     "pyyaml>=6.0",
     "pydantic>=2.0",
+    "github-copilot-sdk>=0.1.18",
     "textual>=7.0",
 ]
 

diff --git a/src/bcbench/agent/copilot/agent.py b/src/bcbench/agent/copilot/agent.py
@@ -1,20 +1,24 @@
 """GitHub Copilot CLI Agent implementation."""
 
+import asyncio
 import shutil
 import subprocess
 import sys
 from pathlib import Path
 
 import yaml
+from copilot import CopilotClient, MCPServerConfig, SessionConfig
+from copilot.generated.session_events import SessionEventType
 
 from bcbench.agent.copilot.metrics import parse_metrics
 from bcbench.agent.shared import build_mcp_config, build_prompt
 from bcbench.config import get_config
 from bcbench.dataset import DatasetEntry
 from bcbench.exceptions import AgentError, AgentTimeoutError
 from bcbench.logger import get_logger
+from bcbench.operations.skills_operations import setup_copilot_skills
 from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config
 from bcbench.types import AgentMetrics, EvaluationCategory, ExperimentConfiguration
 
 logger = get_logger(__name__)
 _config = get_config()
@@ -32,12 +36,14 @@
     logger.info(f"Running GitHub Copilot CLI on: {entry.instance_id}")
 
     prompt: str = build_prompt(entry, repo_path, copilot_config, category, al_mcp=al_mcp)
-    mcp_config_json, mcp_server_names = build_mcp_config(copilot_config, entry, repo_path, al_mcp=al_mcp)
+    mcp_config: dict[str, MCPServerConfig] | None = build_mcp_config(copilot_config, entry, repo_path, al_mcp=al_mcp)
     instructions_enabled: bool = setup_instructions_from_config(copilot_config, entry, repo_path)
+    copilot_skills: list[str] | None = setup_copilot_skills(copilot_config, entry, repo_path)
     skills_enabled: bool = setup_agent_skills(copilot_config, entry, repo_path)
     custom_agent: str | None = setup_custom_agent(copilot_config, entry, repo_path)
+    # build_mcp_config now returns a name -> config mapping (or None), so the server names are its keys.
     config = ExperimentConfiguration(
-        mcp_servers=mcp_server_names,
+        mcp_servers=list(mcp_config) if mcp_config else None,
         custom_instructions=instructions_enabled,
         skills_enabled=skills_enabled,
         custom_agent=custom_agent,
@@ -64,13 +70,43 @@
         ]
         if not instructions_enabled:
             cmd_args.append("--no-custom-instructions")
-        if mcp_config_json:
-            cmd_args.append(f"--additional-mcp-config={mcp_config_json}")
         if custom_agent:
             cmd_args.append(f"--agent={custom_agent}")
 
         logger.debug(f"Copilot command args: {cmd_args}")
 
+        # Copilot SDK
+        async def run_copilot():
+            """Run the prompt in one streaming Copilot SDK session, echoing output to stdout.
+
+            The client is always stopped, even if the session errors or times out.
+            """
+            client = CopilotClient({"cli_path": copilot_cmd})
+            await client.start()
+            try:
+                session = await client.create_session(
+                    SessionConfig(
+                        model=model,
+                        mcp_servers=mcp_config if mcp_config else {},
+                        streaming=True,
+                        skill_directories=copilot_skills if copilot_skills else [],
+                    )
+                )
+
+                # Write each assistant delta to stdout as soon as it arrives.
+                def handle_event(event):
+                    if event.type == SessionEventType.ASSISTANT_MESSAGE_DELTA:
+                        sys.stdout.write(event.data.delta_content)
+                        sys.stdout.flush()
+
+                session.on(handle_event)
+                await session.send_and_wait({"prompt": prompt}, timeout=_config.timeout.agent_execution)
+                print()  # newline after streaming
+            finally:
+                await client.stop()
+
+        asyncio.run(run_copilot())
+
         result = subprocess.run(
             cmd_args,
             cwd=str(repo_path),

diff --git a/src/bcbench/agent/shared/mcp.py b/src/bcbench/agent/shared/mcp.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 from typing import Any
 
+from copilot import MCPLocalServerConfig, MCPRemoteServerConfig, MCPServerConfig
 from jinja2 import Template
 
 from bcbench.dataset import DatasetEntry
@@ -39,33 +40,33 @@ def cleanup(self) -> None:
 _mcp_server_manager = _ALMcpServerManager()
 
 
-def _build_server_entry(server: dict[str, Any], template_context: dict[str, Any]) -> tuple[str, dict[str, Any]]:
+def _build_server_entry(server: dict[str, Any], template_context: dict[str, Any]) -> dict[str, MCPServerConfig]:
+    """Build the SDK config for one MCP server as a single-item mapping keyed by the server name.
+
+    `server` must provide "type", "name" and "tools"; "http" servers also need "url",
+    "local" servers also need "command" and "args". Each local arg is rendered as a
+    Jinja2 template with `template_context`.
+    Raises AgentError for any other server type.
+    """
     server_type: str = server["type"]
     server_name: str = server["name"]
     tools: list[str] = server["tools"]
 
     match server_type:
         case "http":
-            return server_name, {
-                "type": server_type,
-                "url": server["url"],
-                "tools": tools,
-            }
+            remote_config = MCPRemoteServerConfig(tools=tools, url=server["url"], type=server_type)
+            return {server_name: remote_config}
         case "local":
             args: list[str] = server["args"]
             rendered_args = [Template(arg).render(**template_context) for arg in args]
-            return server_name, {
-                "type": server_type,
-                "command": server["command"],
-                "args": rendered_args,
-                "tools": tools,
-            }
+            local_config = MCPLocalServerConfig(tools=tools, command=server["command"], args=rendered_args, type=server_type)
+            return {server_name: local_config}
         case _:
             logger.error(f"Unsupported MCP server type: {server_type}, {server}")
             raise AgentError(f"Unsupported MCP server type: {server_type}")
 
 
-def build_mcp_config(config: dict[str, Any], entry: DatasetEntry, repo_path: Path, al_mcp: bool = False) -> tuple[str | None, list[str] | None]:
+def build_mcp_config(config: dict[str, Any], entry: DatasetEntry, repo_path: Path, al_mcp: bool = False) -> dict[str, MCPServerConfig] | None:
     # following docs: https://docs.github.com/en/enterprise-cloud@latest/copilot/how-tos/use-copilot-agents/coding-agent/extend-coding-agent-with-mcp
     mcp_servers: list[dict[str, Any]] = config.get("mcp", {}).get("servers", [])
 
@@ -78,11 +79,15 @@ def build_mcp_config(config: dict[str, Any], entry: DatasetEntry, repo_path: Pat
         logger.info("AL MCP server enabled via --al-mcp flag")
 
     if not mcp_servers:
-        return None, None
+        return None
 
     template_context = {"repo_path": repo_path}
     mcp_server_names: list[str] = [server["name"] for server in mcp_servers]
-    mcp_config = {"mcpServers": dict(map(lambda s: _build_server_entry(s, template_context), mcp_servers))}
+    # Merge the single-item {name: config} mappings from _build_server_entry into one dict.
+    mcp_config : dict[str, MCPServerConfig] = {}
+    for server in mcp_servers:
+        server_entry = _build_server_entry(server, template_context)
+        mcp_config.update(server_entry)
 
     if al_mcp:
         # Launch MCP server with all project paths separated by semicolons
@@ -92,4 +97,4 @@ def build_mcp_config(config: dict[str, Any], entry: DatasetEntry, repo_path: Pat
     logger.info(f"Using MCP servers: {mcp_server_names}")
     logger.debug(f"MCP configuration: {json.dumps(mcp_config, indent=2)}")
 
-    return json.dumps(mcp_config, separators=(",", ":")), mcp_server_names
+    return mcp_config
diff --git a/src/bcbench/operations/skills_operations.py b/src/bcbench/operations/skills_operations.py
@@ -1,5 +1,8 @@
 from pathlib import Path
+from shutil import copytree
+
+from bcbench.config import get_config
 from shutil import copytree, rmtree
 
 from bcbench.dataset.dataset_entry import DatasetEntry
 from bcbench.logger import get_logger

diff --git a/uv.lock b/uv.lock