From 61b61eaaef0a61d9b8a27a8046663fb1d550a1e7 Mon Sep 17 00:00:00 2001
From: Jiawen Sun <t-jiawensun@microsoft.com>
Date: Fri, 30 Jan 2026 13:21:39 +0100
Subject: [PATCH 1/8] Build Copilot SDK

---
 pyproject.toml                     |  1 +
 src/bcbench/agent/copilot/agent.py | 81 +++++++++++++++++++++++-------
 src/bcbench/agent/shared/mcp.py    | 25 ++++-----
 uv.lock                            | 16 ++++++
 4 files changed, 93 insertions(+), 30 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 70386b0fc..69c999e0d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,6 +23,7 @@ dependencies = [
     "typing-extensions>=4.0",
     "pyyaml>=6.0",
     "pydantic>=2.0",
+    "github-copilot-sdk>=0.1.18",
 ]
 
 [project.scripts]
diff --git a/src/bcbench/agent/copilot/agent.py b/src/bcbench/agent/copilot/agent.py
index cec696189..a04d37354 100644
--- a/src/bcbench/agent/copilot/agent.py
+++ b/src/bcbench/agent/copilot/agent.py
@@ -1,11 +1,17 @@
 """GitHub Copilot CLI Agent implementation."""
 
+import asyncio
+import json
+import random
 import shutil
 import subprocess
 import sys
 from pathlib import Path
+from typing import cast
 
 import yaml
+from copilot import CopilotClient, MCPServerConfig
+from copilot.generated.session_events import SessionEventType
 
 from bcbench.agent.copilot.metrics import parse_metrics
 from bcbench.agent.shared import build_mcp_config, build_prompt
@@ -65,29 +71,68 @@ def run_copilot_agent(entry: DatasetEntry, model: str, category: EvaluationCateg
 
         logger.debug(f"Copilot command args: {cmd_args}")
 
-        result = subprocess.run(
-            cmd_args,
-            cwd=str(repo_path),
-            stderr=subprocess.PIPE,  # only capture stderr where metrics are printed
-            timeout=_config.timeout.agent_execution,
-            check=True,
-        )
+        # Copilot SDK
+        async def run_copilot():
+            client = CopilotClient({"cli_path": copilot_cmd})
+            await client.start()
 
-        if result.stderr:
-            sys.stdout.buffer.write(result.stderr)
-            sys.stdout.buffer.flush()
-        logger.info(f"Copilot CLI run complete for: {entry.instance_id}")
+            if mcp_config_json:
+                raw = json.loads(mcp_config_json)
+                raw_servers = raw.get("mcpServers", {})
+            else:
+                raw_servers = {}
 
-        stderr = result.stderr.decode("utf-8", errors="replace") if result.stderr else ""
-        stderr_lines = stderr.splitlines()
+            mcp_servers = cast(dict[str, MCPServerConfig], raw_servers or {})
 
-        # Find the most recent session log for tool usage parsing
-        session_logs = list(output_dir.glob("process-*.log"))
-        session_log_path = max(session_logs, key=lambda p: p.stat().st_mtime) if session_logs else None
+            session = await client.create_session(
+                {
+                    "model": model,
+                    "mcp_servers": mcp_servers,
+                    "streaming": True,
+                }
+            )
 
-        metrics = parse_metrics(stderr_lines, session_log_path=session_log_path)
+            # Listen for response chunks
+            def handle_event(event):
+                if event.type == SessionEventType.ASSISTANT_MESSAGE_DELTA:
+                    sys.stdout.write(event.data.delta_content)
+                    sys.stdout.flush()
 
-        return metrics, config
+            session.on(handle_event)
+
+            response = await session.send_and_wait(
+                {"prompt": prompt},
+                timeout=_config.timeout.agent_execution,
+            )
+            print()  # newline after streaming
+
+            await client.stop()
+
+        asyncio.run(run_copilot())
+
+        # result = subprocess.run(
+        #     cmd_args,
+        #     cwd=str(repo_path),
+        #     stderr=subprocess.PIPE,  # only capture stderr where metrics are printed
+        #     timeout=_config.timeout.agent_execution,
+        #     check=True,
+        # )
+
+        # if result.stderr:
+        #     sys.stdout.buffer.write(result.stderr)
+        #     sys.stdout.buffer.flush()
+        # logger.info(f"Copilot CLI run complete for: {entry.instance_id}")
+
+        # stderr = result.stderr.decode("utf-8", errors="replace") if result.stderr else ""
+        # stderr_lines = stderr.splitlines()
+
+        # # Find the most recent session log for tool usage parsing
+        # session_logs = list(output_dir.glob("process-*.log"))
+        # session_log_path = max(session_logs, key=lambda p: p.stat().st_mtime) if session_logs else None
+
+        # metrics = parse_metrics(stderr_lines, session_log_path=session_log_path)
+
+        return None, config
     except subprocess.TimeoutExpired:
         logger.error(f"Copilot CLI timed out after {_config.timeout.agent_execution} seconds")
         metrics = AgentMetrics(execution_time=_config.timeout.agent_execution)
diff --git a/src/bcbench/agent/shared/mcp.py b/src/bcbench/agent/shared/mcp.py
index 1ca948904..f43ae1ee0 100644
--- a/src/bcbench/agent/shared/mcp.py
+++ b/src/bcbench/agent/shared/mcp.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 from typing import Any
 
+from copilot import MCPLocalServerConfig, MCPRemoteServerConfig, MCPServerConfig
 from jinja2 import Template
 
 from bcbench.dataset import DatasetEntry
@@ -39,27 +40,27 @@ def cleanup(self) -> None:
 _mcp_server_manager = _ALMcpServerManager()
 
 
-def _build_server_entry(server: dict[str, Any], template_context: dict[str, Any]) -> tuple[str, dict[str, Any]]:
+def _build_server_entry(server: dict[str, Any], template_context: dict[str, Any]) -> tuple[str, dict[str, MCPServerConfig]]:
     server_type: str = server["type"]
     server_name: str = server["name"]
     tools: list[str] = server["tools"]
 
     match server_type:
         case "http":
-            return server_name, {
-                "type": server_type,
-                "url": server["url"],
-                "tools": tools,
-            }
+            return server_name, MCPRemoteServerConfig(
+                tools=tools,
+                url=server["url"],
+                type=server_type
+            )
         case "local":
             args: list[str] = server["args"]
             rendered_args = [Template(arg).render(**template_context) for arg in args]
-            return server_name, {
-                "type": server_type,
-                "command": server["command"],
-                "args": rendered_args,
-                "tools": tools,
-            }
+            return server_name, MCPLocalServerConfig(
+                tools=tools,
+                command=server["command"],
+                args=rendered_args,
+                type=server_type,
+            )
         case _:
             logger.error(f"Unsupported MCP server type: {server_type}, {server}")
             raise AgentError(f"Unsupported MCP server type: {server_type}")
diff --git a/uv.lock b/uv.lock
index 303f04538..32970d578 100644
--- a/uv.lock
+++ b/uv.lock
@@ -145,6 +145,7 @@ name = "bcbench"
 version = "0.1.0"
 source = { editable = "." }
 dependencies = [
+    { name = "github-copilot-sdk" },
     { name = "jsonschema" },
     { name = "mini-swe-agent" },
     { name = "pydantic" },
@@ -174,6 +175,7 @@ dev = [
 
 [package.metadata]
 requires-dist = [
+    { name = "github-copilot-sdk", specifier = ">=0.1.18" },
     { name = "jsonschema", specifier = ">=4.0" },
     { name = "mini-swe-agent", specifier = ">=1.0.0" },
     { name = "pydantic", specifier = ">=2.0" },
@@ -579,6 +581,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/eb/02/a6b21098b1d5d6249b7c5ab69dde30108a71e4e819d4a9778f1de1d5b70d/fsspec-2025.10.0-py3-none-any.whl", hash = "sha256:7c7712353ae7d875407f97715f0e1ffcc21e33d5b24556cb1e090ae9409ec61d", size = 200966, upload-time = "2025-10-30T14:58:42.53Z" },
 ]
 
+[[package]]
+name = "github-copilot-sdk"
+version = "0.1.18"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pydantic" },
+    { name = "python-dateutil" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/bf/00/be64b9b33015d5e79fb5e5e95d871484e79a907b3792935b855ab40308ce/github_copilot_sdk-0.1.18.tar.gz", hash = "sha256:b2d56d40c0f48e81f2899d32fb4a8d2b8df22620913547da93fddf9b2f368e9e", size = 81318, upload-time = "2026-01-24T18:09:57.617Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ae/0f/f832b32bca9d89a26a2b810c69fdc37ac925e34855ee93a11bb3d90ca2b7/github_copilot_sdk-0.1.18-py3-none-any.whl", hash = "sha256:99cfdf4d4d0da6d92d5bf36a952546157785df83d6b0783b3f7a8e93a2762171", size = 33740, upload-time = "2026-01-24T18:09:55.696Z" },
+]
+
 [[package]]
 name = "grpcio"
 version = "1.67.1"

From d298c1653ea30fe1a72570bd23abd8b47c865744 Mon Sep 17 00:00:00 2001
From: Jiawen Sun <t-jiawensun@microsoft.com>
Date: Fri, 30 Jan 2026 13:51:13 +0100
Subject: [PATCH 2/8] Remove unused package

---
 src/bcbench/agent/copilot/agent.py | 39 +++++++++++++++---------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/src/bcbench/agent/copilot/agent.py b/src/bcbench/agent/copilot/agent.py
index a04d37354..0bf395068 100644
--- a/src/bcbench/agent/copilot/agent.py
+++ b/src/bcbench/agent/copilot/agent.py
@@ -2,7 +2,6 @@
 
 import asyncio
 import json
-import random
 import shutil
 import subprocess
 import sys
@@ -100,7 +99,7 @@ def handle_event(event):
 
             session.on(handle_event)
 
-            response = await session.send_and_wait(
+            await session.send_and_wait(
                 {"prompt": prompt},
                 timeout=_config.timeout.agent_execution,
             )
@@ -110,29 +109,29 @@ def handle_event(event):
 
         asyncio.run(run_copilot())
 
-        # result = subprocess.run(
-        #     cmd_args,
-        #     cwd=str(repo_path),
-        #     stderr=subprocess.PIPE,  # only capture stderr where metrics are printed
-        #     timeout=_config.timeout.agent_execution,
-        #     check=True,
-        # )
+        result = subprocess.run(
+            cmd_args,
+            cwd=str(repo_path),
+            stderr=subprocess.PIPE,  # only capture stderr where metrics are printed
+            timeout=_config.timeout.agent_execution,
+            check=True,
+        )
 
-        # if result.stderr:
-        #     sys.stdout.buffer.write(result.stderr)
-        #     sys.stdout.buffer.flush()
-        # logger.info(f"Copilot CLI run complete for: {entry.instance_id}")
+        if result.stderr:
+            sys.stdout.buffer.write(result.stderr)
+            sys.stdout.buffer.flush()
+        logger.info(f"Copilot CLI run complete for: {entry.instance_id}")
 
-        # stderr = result.stderr.decode("utf-8", errors="replace") if result.stderr else ""
-        # stderr_lines = stderr.splitlines()
+        stderr = result.stderr.decode("utf-8", errors="replace") if result.stderr else ""
+        stderr_lines = stderr.splitlines()
 
-        # # Find the most recent session log for tool usage parsing
-        # session_logs = list(output_dir.glob("process-*.log"))
-        # session_log_path = max(session_logs, key=lambda p: p.stat().st_mtime) if session_logs else None
+        # Find the most recent session log for tool usage parsing
+        session_logs = list(output_dir.glob("process-*.log"))
+        session_log_path = max(session_logs, key=lambda p: p.stat().st_mtime) if session_logs else None
 
-        # metrics = parse_metrics(stderr_lines, session_log_path=session_log_path)
+        metrics = parse_metrics(stderr_lines, session_log_path=session_log_path)
 
-        return None, config
+        return metrics, config
     except subprocess.TimeoutExpired:
         logger.error(f"Copilot CLI timed out after {_config.timeout.agent_execution} seconds")
         metrics = AgentMetrics(execution_time=_config.timeout.agent_execution)

From d81c469298b8d84afe9822e57bfcfb479bd74fe0 Mon Sep 17 00:00:00 2001
From: Jiawen Sun <t-jiawensun@microsoft.com>
Date: Fri, 30 Jan 2026 14:23:04 +0100
Subject: [PATCH 3/8] Update mcp_config

---
 src/bcbench/agent/copilot/agent.py | 26 ++++++++------------------
 src/bcbench/agent/shared/mcp.py    | 22 +++++++++++++---------
 2 files changed, 21 insertions(+), 27 deletions(-)

diff --git a/src/bcbench/agent/copilot/agent.py b/src/bcbench/agent/copilot/agent.py
index 0bf395068..37b9a97ed 100644
--- a/src/bcbench/agent/copilot/agent.py
+++ b/src/bcbench/agent/copilot/agent.py
@@ -9,7 +9,7 @@
 from typing import cast
 
 import yaml
-from copilot import CopilotClient, MCPServerConfig
+from copilot import CopilotClient, MCPServerConfig, SessionConfig
 from copilot.generated.session_events import SessionEventType
 
 from bcbench.agent.copilot.metrics import parse_metrics
@@ -37,10 +37,10 @@ def run_copilot_agent(entry: DatasetEntry, model: str, category: EvaluationCateg
     logger.info(f"Running GitHub Copilot CLI on: {entry.instance_id}")
 
     prompt: str = build_prompt(entry, repo_path, copilot_config, category, al_mcp=al_mcp)
-    mcp_config_json, mcp_server_names = build_mcp_config(copilot_config, entry, repo_path, al_mcp=al_mcp)
+    mcp_config: dict[str, MCPServerConfig] | None = build_mcp_config(copilot_config, entry, repo_path, al_mcp=al_mcp)
     instructions_enabled: bool = setup_instructions_from_config(copilot_config, entry, repo_path)
     custom_agent: str | None = setup_custom_agent(copilot_config, entry, repo_path)
-    config = ExperimentConfiguration(mcp_servers=mcp_server_names, custom_instructions=instructions_enabled, custom_agent=custom_agent)
+    config = ExperimentConfiguration(mcp_servers=list(mcp_config.keys()) if mcp_config else None, custom_instructions=instructions_enabled, custom_agent=custom_agent)
 
     logger.info(f"Executing Copilot CLI in directory: {repo_path}")
     logger.debug(f"Using prompt:\n{prompt}")
@@ -63,8 +63,6 @@ def run_copilot_agent(entry: DatasetEntry, model: str, category: EvaluationCateg
         ]
         if not instructions_enabled:
             cmd_args.append("--no-custom-instructions")
-        if mcp_config_json:
-            cmd_args.append(f"--additional-mcp-config={mcp_config_json}")
         if custom_agent:
             cmd_args.append(f"--agent={custom_agent}")
 
@@ -75,20 +73,12 @@ async def run_copilot():
             client = CopilotClient({"cli_path": copilot_cmd})
             await client.start()
 
-            if mcp_config_json:
-                raw = json.loads(mcp_config_json)
-                raw_servers = raw.get("mcpServers", {})
-            else:
-                raw_servers = {}
-
-            mcp_servers = cast(dict[str, MCPServerConfig], raw_servers or {})
-
             session = await client.create_session(
-                {
-                    "model": model,
-                    "mcp_servers": mcp_servers,
-                    "streaming": True,
-                }
+                SessionConfig(
+                    model=model,
+                    mcp_servers=mcp_config if mcp_config else {},
+                    streaming=True,
+                )
             )
 
             # Listen for response chunks
diff --git a/src/bcbench/agent/shared/mcp.py b/src/bcbench/agent/shared/mcp.py
index f43ae1ee0..3e897a290 100644
--- a/src/bcbench/agent/shared/mcp.py
+++ b/src/bcbench/agent/shared/mcp.py
@@ -40,33 +40,33 @@ def cleanup(self) -> None:
 _mcp_server_manager = _ALMcpServerManager()
 
 
-def _build_server_entry(server: dict[str, Any], template_context: dict[str, Any]) -> tuple[str, dict[str, MCPServerConfig]]:
+def _build_server_entry(server: dict[str, Any], template_context: dict[str, Any]) -> dict[str, MCPServerConfig]:
     server_type: str = server["type"]
     server_name: str = server["name"]
     tools: list[str] = server["tools"]
 
     match server_type:
         case "http":
-            return server_name, MCPRemoteServerConfig(
+            return {server_name: MCPRemoteServerConfig(
                 tools=tools,
                 url=server["url"],
                 type=server_type
-            )
+            )}
         case "local":
             args: list[str] = server["args"]
             rendered_args = [Template(arg).render(**template_context) for arg in args]
-            return server_name, MCPLocalServerConfig(
+            return {server_name: MCPLocalServerConfig(
                 tools=tools,
                 command=server["command"],
                 args=rendered_args,
                 type=server_type,
-            )
+            )}
         case _:
             logger.error(f"Unsupported MCP server type: {server_type}, {server}")
             raise AgentError(f"Unsupported MCP server type: {server_type}")
 
 
-def build_mcp_config(config: dict[str, Any], entry: DatasetEntry, repo_path: Path, al_mcp: bool = False) -> tuple[str | None, list[str] | None]:
+def build_mcp_config(config: dict[str, Any], entry: DatasetEntry, repo_path: Path, al_mcp: bool = False) -> dict[str, MCPServerConfig] | None:
     # following docs: https://docs.github.com/en/enterprise-cloud@latest/copilot/how-tos/use-copilot-agents/coding-agent/extend-coding-agent-with-mcp
     mcp_servers: list[dict[str, Any]] = config.get("mcp", {}).get("servers", [])
 
@@ -79,11 +79,15 @@ def build_mcp_config(config: dict[str, Any], entry: DatasetEntry, repo_path: Pat
         logger.info("AL MCP server enabled via --al-mcp flag")
 
     if not mcp_servers:
-        return None, None
+        return None
 
     template_context = {"repo_path": repo_path}
     mcp_server_names: list[str] = [server["name"] for server in mcp_servers]
-    mcp_config = {"mcpServers": dict(map(lambda s: _build_server_entry(s, template_context), mcp_servers))}
+    # mcp_config = {"mcpServers": dict(map(lambda s: _build_server_entry(s, template_context), mcp_servers))}
+    mcp_config : dict[str, MCPServerConfig] = {}
+    for server in mcp_servers:
+        server_entry = _build_server_entry(server, template_context)
+        mcp_config.update(server_entry)
 
     if al_mcp:
         # Launch MCP server with all project paths separated by semicolons
@@ -93,4 +97,4 @@ def build_mcp_config(config: dict[str, Any], entry: DatasetEntry, repo_path: Pat
     logger.info(f"Using MCP servers: {mcp_server_names}")
     logger.debug(f"MCP configuration: {json.dumps(mcp_config, indent=2)}")
 
-    return json.dumps(mcp_config, separators=(",", ":")), mcp_server_names
+    return mcp_config

From 80e2f260620d37c00ceafc1898b8fb8bdb760915 Mon Sep 17 00:00:00 2001
From: Jiawen Sun <t-jiawensun@microsoft.com>
Date: Fri, 30 Jan 2026 16:19:14 +0100
Subject: [PATCH 4/8] add agent skills

---
 src/bcbench/agent/copilot/agent.py            |  3 ++-
 .../skills/github-actions-debugging/SKILL.md  | 23 ++++++++++++++++
 src/bcbench/agent/shared/config.yaml          |  6 +++++
 src/bcbench/operations/__init__.py            |  2 +-
 .../operations/instruction_operations.py      | 26 +++++++++++++++++++
 5 files changed, 58 insertions(+), 2 deletions(-)
 create mode 100644 src/bcbench/agent/copilot/instructions/microsoftInternal-NAV/skills/github-actions-debugging/SKILL.md

diff --git a/src/bcbench/agent/copilot/agent.py b/src/bcbench/agent/copilot/agent.py
index 37b9a97ed..39d576bd7 100644
--- a/src/bcbench/agent/copilot/agent.py
+++ b/src/bcbench/agent/copilot/agent.py
@@ -18,7 +18,7 @@
 from bcbench.dataset import DatasetEntry
 from bcbench.exceptions import AgentError, AgentTimeoutError
 from bcbench.logger import get_logger
-from bcbench.operations import setup_custom_agent, setup_instructions_from_config
+from bcbench.operations import setup_custom_agent, setup_instructions_from_config, setup_copilot_skills
 from bcbench.types import AgentMetrics, EvaluationCategory, ExperimentConfiguration
 
 logger = get_logger(__name__)
@@ -39,6 +39,7 @@ def run_copilot_agent(entry: DatasetEntry, model: str, category: EvaluationCateg
     prompt: str = build_prompt(entry, repo_path, copilot_config, category, al_mcp=al_mcp)
     mcp_config: dict[str, MCPServerConfig] | None = build_mcp_config(copilot_config, entry, repo_path, al_mcp=al_mcp)
     instructions_enabled: bool = setup_instructions_from_config(copilot_config, entry, repo_path)
+    copilot_skills: str | None = setup_copilot_skills(copilot_config, entry, repo_path)
     custom_agent: str | None = setup_custom_agent(copilot_config, entry, repo_path)
     config = ExperimentConfiguration(mcp_servers=list(mcp_config.keys()) if mcp_config else None, custom_instructions=instructions_enabled, custom_agent=custom_agent)
 
diff --git a/src/bcbench/agent/copilot/instructions/microsoftInternal-NAV/skills/github-actions-debugging/SKILL.md b/src/bcbench/agent/copilot/instructions/microsoftInternal-NAV/skills/github-actions-debugging/SKILL.md
new file mode 100644
index 000000000..ac026cad2
--- /dev/null
+++ b/src/bcbench/agent/copilot/instructions/microsoftInternal-NAV/skills/github-actions-debugging/SKILL.md
@@ -0,0 +1,23 @@
+---
+name: github-actions-debugging
+description: Guide for debugging failing GitHub Actions workflows. Use this when asked to debug failing GitHub Actions workflows.
+---
+
+# GitHub Actions Debugging
+
+This skill helps you debug failing GitHub Actions workflows in pull requests.
+
+## Process
+
+1. Use the `list_workflow_runs` tool to look up recent workflow runs for the pull request and their status
+2. Use the `summarize_job_log_failures` tool to get an AI summary of the logs for failed jobs
+3. If you need more information, use the `get_job_logs` or `get_workflow_run_logs` tool to get the full failure logs
+4. Try to reproduce the failure locally in your environment
+5. Fix the failing build and verify the fix before committing changes
+
+## Common issues
+
+- **Missing environment variables**: Check that all required secrets are configured
+- **Version mismatches**: Verify action versions and dependencies are compatible
+- **Permission issues**: Ensure the workflow has the necessary permissions
+- **Timeout issues**: Consider splitting long-running jobs or increasing timeout values
diff --git a/src/bcbench/agent/shared/config.yaml b/src/bcbench/agent/shared/config.yaml
index c53792c05..9f886e080 100644
--- a/src/bcbench/agent/shared/config.yaml
+++ b/src/bcbench/agent/shared/config.yaml
@@ -58,6 +58,12 @@ prompt:
 instructions:
   enabled: false
 
+# controls:
+# 1. whether to copy custom agents (`src/bcbench/agent/copilot/instructions/<sanitized-repo>/skills/`) into the repo
+# 2. whether to pass --agent=<agent-name> to copilot
+skills:
+  enabled: false
+
 # controls:
 # 1. whether to copy custom agents (`src/bcbench/agent/copilot/instructions/<sanitized-repo>/agents/`) into the repo
 # 2. whether to pass --agent=<agent-name> to copilot
diff --git a/src/bcbench/operations/__init__.py b/src/bcbench/operations/__init__.py
index 61b1f83d9..bc197e54a 100644
--- a/src/bcbench/operations/__init__.py
+++ b/src/bcbench/operations/__init__.py
@@ -14,7 +14,7 @@
     clean_repo,
     stage_and_get_diff,
 )
-from bcbench.operations.instruction_operations import copy_problem_statement_folder, setup_custom_agent, setup_instructions_from_config
+from bcbench.operations.instruction_operations import copy_problem_statement_folder, setup_copilot_skills, setup_custom_agent, setup_instructions_from_config
 from bcbench.operations.project_operations import categorize_projects
 from bcbench.operations.setup_operations import setup_repo_postbuild, setup_repo_prebuild
 from bcbench.operations.test_operations import extract_tests_from_patch
diff --git a/src/bcbench/operations/instruction_operations.py b/src/bcbench/operations/instruction_operations.py
index 7ac9d9ab0..d5c94c275 100644
--- a/src/bcbench/operations/instruction_operations.py
+++ b/src/bcbench/operations/instruction_operations.py
@@ -37,6 +37,32 @@ def setup_instructions_from_config(copilot_config: dict, entry: DatasetEntry, re
     return instructions_enabled
 
 
+def setup_copilot_skills(copilot_config: dict, entry: DatasetEntry, repo_path: Path) -> str | None:
+    """
+    Setup skills in the repository if available.
+    """
+    skills_config: dict = copilot_config["skills"]
+    skills_enabled: bool = skills_config["enabled"]
+
+    if skills_enabled:
+        source_skills: Path = _get_source_instructions_path(entry.repo)
+        source_skills_dir = source_skills / "skills"
+
+        # Skip if skills folder doesn't exist for this repo
+        if not source_skills_dir.exists():
+            logger.info(f"No skills folder found at {source_skills_dir}, skipping")
+            return None
+
+        github_dir: Path = repo_path / ".github"
+        skills_dir = github_dir / "skills"
+        copytree(source_skills_dir, skills_dir, dirs_exist_ok=True)
+
+        logger.info(f"Skills are set up from {source_skills_dir}")
+        return skills_config.get("name")
+
+    return None
+
+
 def setup_custom_agent(copilot_config: dict, entry: DatasetEntry, repo_path: Path) -> str | None:
     """
     Setup custom agents in the repository if available.

From e908e207f956714265e0668a35eed374f866edc3 Mon Sep 17 00:00:00 2001
From: Jiawen Sun <t-jiawensun@microsoft.com>
Date: Fri, 30 Jan 2026 16:25:53 +0100
Subject: [PATCH 5/8] remove unused package

---
 src/bcbench/agent/copilot/agent.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/bcbench/agent/copilot/agent.py b/src/bcbench/agent/copilot/agent.py
index 39d576bd7..07190e3aa 100644
--- a/src/bcbench/agent/copilot/agent.py
+++ b/src/bcbench/agent/copilot/agent.py
@@ -1,12 +1,10 @@
 """GitHub Copilot CLI Agent implementation."""
 
 import asyncio
-import json
 import shutil
 import subprocess
 import sys
 from pathlib import Path
-from typing import cast
 
 import yaml
 from copilot import CopilotClient, MCPServerConfig, SessionConfig

From 351adddc51ca1f29b70b1fff7072159998b35ca1 Mon Sep 17 00:00:00 2001
From: Jiawen Sun <t-jiawensun@microsoft.com>
Date: Mon, 2 Feb 2026 09:57:48 +0100
Subject: [PATCH 6/8] Try to update packages

---
 src/bcbench/agent/copilot/agent.py | 4 +++-
 src/bcbench/operations/__init__.py | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/bcbench/agent/copilot/agent.py b/src/bcbench/agent/copilot/agent.py
index 07190e3aa..cbbb3c098 100644
--- a/src/bcbench/agent/copilot/agent.py
+++ b/src/bcbench/agent/copilot/agent.py
@@ -16,7 +16,7 @@
 from bcbench.dataset import DatasetEntry
 from bcbench.exceptions import AgentError, AgentTimeoutError
 from bcbench.logger import get_logger
-from bcbench.operations import setup_custom_agent, setup_instructions_from_config, setup_copilot_skills
+from bcbench.operations import setup_copilot_skills, setup_custom_agent, setup_instructions_from_config
 from bcbench.types import AgentMetrics, EvaluationCategory, ExperimentConfiguration
 
 logger = get_logger(__name__)
@@ -62,6 +62,8 @@ def run_copilot_agent(entry: DatasetEntry, model: str, category: EvaluationCateg
         ]
         if not instructions_enabled:
             cmd_args.append("--no-custom-instructions")
+        if copilot_skills:
+            cmd_args.append(f"--skills-dir={copilot_skills}")
         if custom_agent:
             cmd_args.append(f"--agent={custom_agent}")
 
diff --git a/src/bcbench/operations/__init__.py b/src/bcbench/operations/__init__.py
index bc197e54a..881eed9b8 100644
--- a/src/bcbench/operations/__init__.py
+++ b/src/bcbench/operations/__init__.py
@@ -32,6 +32,7 @@
     "copy_problem_statement_folder",
     "extract_tests_from_patch",
     "run_tests",
+    "setup_copilot_skills",
     "setup_custom_agent",
     "setup_instructions_from_config",
     "setup_repo_postbuild",

From 806a3d222bc72e3fab11ffbd29ee03d1a74ee6c8 Mon Sep 17 00:00:00 2001
From: Jiawen Sun <t-jiawensun@microsoft.com>
Date: Mon, 2 Feb 2026 12:56:48 +0100
Subject: [PATCH 7/8] Add another skill

---
 src/bcbench/agent/copilot/agent.py            |  8 +-
 .../skills/al-test-generation/SKILL.md        | 93 +++++++++++++++++++
 .../skills/al-test-generation/SKILL.md        | 93 +++++++++++++++++++
 .../skills/github-actions-debugging/SKILL.md  | 23 -----
 src/bcbench/agent/shared/config.yaml          |  8 +-
 src/bcbench/operations/__init__.py            |  3 +-
 .../operations/instruction_operations.py      | 26 ------
 src/bcbench/operations/skills_operations.py   | 44 +++++++++
 8 files changed, 241 insertions(+), 57 deletions(-)
 create mode 100644 src/bcbench/agent/copilot/instructions/microsoft-BCApps/skills/al-test-generation/SKILL.md
 create mode 100644 src/bcbench/agent/copilot/instructions/microsoftInternal-NAV/skills/al-test-generation/SKILL.md
 delete mode 100644 src/bcbench/agent/copilot/instructions/microsoftInternal-NAV/skills/github-actions-debugging/SKILL.md
 create mode 100644 src/bcbench/operations/skills_operations.py

diff --git a/src/bcbench/agent/copilot/agent.py b/src/bcbench/agent/copilot/agent.py
index cbbb3c098..45619c258 100644
--- a/src/bcbench/agent/copilot/agent.py
+++ b/src/bcbench/agent/copilot/agent.py
@@ -16,7 +16,8 @@
 from bcbench.dataset import DatasetEntry
 from bcbench.exceptions import AgentError, AgentTimeoutError
 from bcbench.logger import get_logger
-from bcbench.operations import setup_copilot_skills, setup_custom_agent, setup_instructions_from_config
+from bcbench.operations import setup_custom_agent, setup_instructions_from_config
+from bcbench.operations.skills_operations import setup_copilot_skills
 from bcbench.types import AgentMetrics, EvaluationCategory, ExperimentConfiguration
 
 logger = get_logger(__name__)
@@ -37,7 +38,7 @@ def run_copilot_agent(entry: DatasetEntry, model: str, category: EvaluationCateg
     prompt: str = build_prompt(entry, repo_path, copilot_config, category, al_mcp=al_mcp)
     mcp_config: dict[str, MCPServerConfig] | None = build_mcp_config(copilot_config, entry, repo_path, al_mcp=al_mcp)
     instructions_enabled: bool = setup_instructions_from_config(copilot_config, entry, repo_path)
-    copilot_skills: str | None = setup_copilot_skills(copilot_config, entry, repo_path)
+    copilot_skills: list[str] | None = setup_copilot_skills(copilot_config, entry, repo_path)
     custom_agent: str | None = setup_custom_agent(copilot_config, entry, repo_path)
     config = ExperimentConfiguration(mcp_servers=list(mcp_config.keys()) if mcp_config else None, custom_instructions=instructions_enabled, custom_agent=custom_agent)
 
@@ -62,8 +63,6 @@ def run_copilot_agent(entry: DatasetEntry, model: str, category: EvaluationCateg
         ]
         if not instructions_enabled:
             cmd_args.append("--no-custom-instructions")
-        if copilot_skills:
-            cmd_args.append(f"--skills-dir={copilot_skills}")
         if custom_agent:
             cmd_args.append(f"--agent={custom_agent}")
 
@@ -79,6 +78,7 @@ async def run_copilot():
                     model=model,
                     mcp_servers=mcp_config if mcp_config else {},
                     streaming=True,
+                    skill_directories=copilot_skills if copilot_skills else [],
                 )
             )
 
diff --git a/src/bcbench/agent/copilot/instructions/microsoft-BCApps/skills/al-test-generation/SKILL.md b/src/bcbench/agent/copilot/instructions/microsoft-BCApps/skills/al-test-generation/SKILL.md
new file mode 100644
index 000000000..6817e6319
--- /dev/null
+++ b/src/bcbench/agent/copilot/instructions/microsoft-BCApps/skills/al-test-generation/SKILL.md
@@ -0,0 +1,93 @@
+---
+name: al-test-generation
+description: Guide for creating AL tests for Microsoft Dynamics 365 Business Central. Use this when asked to write, create, or generate AL test codeunits, test procedures, or test automation for Business Central.
+---
+
+To create AL tests for Microsoft Dynamics 365 Business Central, follow this process:
+
+## 1. Analyze the Code Under Test
+
+Before writing any test code:
+1. Read and understand the procedure or functionality being tested
+2. Trace through all code paths to identify UI interactions
+3. Examine table definitions for TableRelation constraints
+
+## 2. Identify Required Handler Methods
+
+**CRITICAL: Tests fail with "Unhandled UI" errors when handlers are missing.**
+
+Look for these patterns in the code under test:
+
+| Code Pattern                          | Required Handler            |
+| ------------------------------------- | --------------------------- |
+| `Confirm()`                           | `[ConfirmHandler]`          |
+| `Message()`                           | `[MessageHandler]`          |
+| `StrMenu()`                           | `[StrMenuHandler]`          |
+| `Page.Run()`                          | `[PageHandler]`             |
+| `Page.RunModal()`                     | `[ModalPageHandler]`        |
+| `Report.Run()` or `Report.RunModal()` | `[ReportHandler]`           |
+| Report request page                   | `[RequestPageHandler]`      |
+| `Hyperlink()`                         | `[HyperlinkHandler]`        |
+| `Notification.Send()`                 | `[SendNotificationHandler]` |
+
+## 3. Analyze TableRelation Constraints
+
+**CRITICAL: Tests fail with validation errors when inserting data that violates TableRelation constraints.**
+
+Before inserting test data:
+1. Read the table definition for all fields receiving values
+2. Identify fields with `TableRelation` properties
+3. Ensure related records exist before inserting test data
+4. Use Library functions (e.g., `LibrarySales`, `LibraryPurchase`) to create prerequisite data
+
+## 4. Write Test Structure
+
+Follow the AAA pattern (Arrange-Act-Assert), marking each phase with a `[GIVEN]`, `[WHEN]`, or `[THEN]` comment:
+
+```AL
+[Test]
+[HandlerFunctions('RequiredHandlers')]
+procedure TestProcedureName()
+begin
+    // [GIVEN] Setup test data and preconditions
+    Initialize();
+    CreateTestData();
+
+    // [WHEN] Execute the action being tested
+    ExecuteAction();
+
+    // [THEN] Verify the expected results
+    VerifyResults();
+end;
+```
+
+## 5. Handler Method Signatures
+
+```AL
+[ConfirmHandler]
+procedure ConfirmHandlerYes(Question: Text[1024]; var Reply: Boolean)
+begin
+    Reply := true;
+end;
+
+[MessageHandler]
+procedure MessageHandler(Message: Text[1024])
+begin
+    // Empty - suppresses message display
+end;
+
+[ModalPageHandler]
+procedure ModalPageHandler(var TestPage: TestPage "Page Name")
+begin
+    TestPage.OK().Invoke();
+end;
+```
+
+## 6. Best Practices
+
+- Use descriptive test procedure names that explain what is being tested
+- One assertion concept per test
+- Use Library Variable Storage to pass data between handlers and tests
+- Do NOT verify values inside handler procedures
+- Clean up test data in teardown or use transaction rollback
+- Use `Initialize()` procedure to set up common test fixtures
diff --git a/src/bcbench/agent/copilot/instructions/microsoftInternal-NAV/skills/al-test-generation/SKILL.md b/src/bcbench/agent/copilot/instructions/microsoftInternal-NAV/skills/al-test-generation/SKILL.md
new file mode 100644
index 000000000..6817e6319
--- /dev/null
+++ b/src/bcbench/agent/copilot/instructions/microsoftInternal-NAV/skills/al-test-generation/SKILL.md
@@ -0,0 +1,93 @@
+---
+name: al-test-generation
+description: Guide for creating AL tests for Microsoft Dynamics 365 Business Central. Use this when asked to write, create, or generate AL test codeunits, test procedures, or test automation for Business Central.
+---
+
+To create AL tests for Microsoft Dynamics 365 Business Central, follow this process:
+
+## 1. Analyze the Code Under Test
+
+Before writing any test code:
+1. Read and understand the procedure or functionality being tested
+2. Trace through all code paths to identify UI interactions
+3. Examine table definitions for TableRelation constraints
+
+## 2. Identify Required Handler Methods
+
+**CRITICAL: Tests fail with "Unhandled UI" errors when handlers are missing.**
+
+Look for these patterns in the code under test:
+
+| Code Pattern                          | Required Handler            |
+| ------------------------------------- | --------------------------- |
+| `Confirm()`                           | `[ConfirmHandler]`          |
+| `Message()`                           | `[MessageHandler]`          |
+| `StrMenu()`                           | `[StrMenuHandler]`          |
+| `Page.Run()`                          | `[PageHandler]`             |
+| `Page.RunModal()`                     | `[ModalPageHandler]`        |
+| `Report.Run()` or `Report.RunModal()` | `[ReportHandler]`           |
+| Report request page                   | `[RequestPageHandler]`      |
+| `Hyperlink()`                         | `[HyperlinkHandler]`        |
+| `Notification.Send()`                 | `[SendNotificationHandler]` |
+
+## 3. Analyze TableRelation Constraints
+
+**CRITICAL: Tests fail with validation errors when inserting data that violates TableRelation constraints.**
+
+Before inserting test data:
+1. Read the table definition for all fields receiving values
+2. Identify fields with `TableRelation` properties
+3. Ensure related records exist before inserting test data
+4. Use Library functions (e.g., `LibrarySales`, `LibraryPurchase`) to create prerequisite data
+
+## 4. Write Test Structure
+
+Follow the AAA pattern (Arrange-Act-Assert), marking each phase with a `[GIVEN]`, `[WHEN]`, or `[THEN]` comment:
+
+```AL
+[Test]
+[HandlerFunctions('RequiredHandlers')]
+procedure TestProcedureName()
+begin
+    // [GIVEN] Setup test data and preconditions
+    Initialize();
+    CreateTestData();
+
+    // [WHEN] Execute the action being tested
+    ExecuteAction();
+
+    // [THEN] Verify the expected results
+    VerifyResults();
+end;
+```
+
+## 5. Handler Method Signatures
+
+```AL
+[ConfirmHandler]
+procedure ConfirmHandlerYes(Question: Text[1024]; var Reply: Boolean)
+begin
+    Reply := true;
+end;
+
+[MessageHandler]
+procedure MessageHandler(Message: Text[1024])
+begin
+    // Empty - suppresses message display
+end;
+
+[ModalPageHandler]
+procedure ModalPageHandler(var TestPage: TestPage "Page Name")
+begin
+    TestPage.OK().Invoke();
+end;
+```
+
+## 6. Best Practices
+
+- Use descriptive test procedure names that explain what is being tested
+- One assertion concept per test
+- Use Library Variable Storage to pass data between handlers and tests
+- Do NOT verify values inside handler procedures
+- Clean up test data in teardown or use transaction rollback
+- Use `Initialize()` procedure to set up common test fixtures
diff --git a/src/bcbench/agent/copilot/instructions/microsoftInternal-NAV/skills/github-actions-debugging/SKILL.md b/src/bcbench/agent/copilot/instructions/microsoftInternal-NAV/skills/github-actions-debugging/SKILL.md
deleted file mode 100644
index ac026cad2..000000000
--- a/src/bcbench/agent/copilot/instructions/microsoftInternal-NAV/skills/github-actions-debugging/SKILL.md
+++ /dev/null
@@ -1,23 +0,0 @@
----
-name: github-actions-debugging
-description: Guide for debugging failing GitHub Actions workflows. Use this when asked to debug failing GitHub Actions workflows.
----
-
-# GitHub Actions Debugging
-
-This skill helps you debug failing GitHub Actions workflows in pull requests.
-
-## Process
-
-1. Use the `list_workflow_runs` tool to look up recent workflow runs for the pull request and their status
-2. Use the `summarize_job_log_failures` tool to get an AI summary of the logs for failed jobs
-3. If you need more information, use the `get_job_logs` or `get_workflow_run_logs` tool to get the full failure logs
-4. Try to reproduce the failure locally in your environment
-5. Fix the failing build and verify the fix before committing changes
-
-## Common issues
-
-- **Missing environment variables**: Check that all required secrets are configured
-- **Version mismatches**: Verify action versions and dependencies are compatible
-- **Permission issues**: Ensure the workflow has the necessary permissions
-- **Timeout issues**: Consider splitting long-running jobs or increasing timeout values
diff --git a/src/bcbench/agent/shared/config.yaml b/src/bcbench/agent/shared/config.yaml
index 9f886e080..827f49c5a 100644
--- a/src/bcbench/agent/shared/config.yaml
+++ b/src/bcbench/agent/shared/config.yaml
@@ -59,10 +59,12 @@ instructions:
   enabled: false
 
 # controls:
-# 1. whether to copy custom agents (`src/bcbench/agent/copilot/instructions/<sanitized-repo>/skills/`) into the repo
-# 2. whether to pass --agent=<agent-name> to copilot
+# 1. whether to copy skills (`src/bcbench/agent/copilot/instructions/<sanitized-repo>/skills/`) into repo/.github/skills/
+# 2. whether the copied directory is passed to the SDK via SessionConfig skill_directories (the CLI auto-discovers .github/skills/ by itself)
 skills:
-  enabled: false
+  enabled: true
+  # name is for documentation only - all skills in the skills/ folder are loaded
+  name: al-test-generation
 
 # controls:
 # 1. whether to copy custom agents (`src/bcbench/agent/copilot/instructions/<sanitized-repo>/agents/`) into the repo
diff --git a/src/bcbench/operations/__init__.py b/src/bcbench/operations/__init__.py
index 881eed9b8..5ab4d4ab4 100644
--- a/src/bcbench/operations/__init__.py
+++ b/src/bcbench/operations/__init__.py
@@ -14,7 +14,8 @@
     clean_repo,
     stage_and_get_diff,
 )
-from bcbench.operations.instruction_operations import copy_problem_statement_folder, setup_copilot_skills, setup_custom_agent, setup_instructions_from_config
+from bcbench.operations.instruction_operations import copy_problem_statement_folder, setup_custom_agent, setup_instructions_from_config
+from bcbench.operations.skills_operations import setup_copilot_skills
 from bcbench.operations.project_operations import categorize_projects
 from bcbench.operations.setup_operations import setup_repo_postbuild, setup_repo_prebuild
 from bcbench.operations.test_operations import extract_tests_from_patch
diff --git a/src/bcbench/operations/instruction_operations.py b/src/bcbench/operations/instruction_operations.py
index d5c94c275..7ac9d9ab0 100644
--- a/src/bcbench/operations/instruction_operations.py
+++ b/src/bcbench/operations/instruction_operations.py
@@ -37,32 +37,6 @@ def setup_instructions_from_config(copilot_config: dict, entry: DatasetEntry, re
     return instructions_enabled
 
 
-def setup_copilot_skills(copilot_config: dict, entry: DatasetEntry, repo_path: Path) -> str | None:
-    """
-    Setup skills in the repository if available.
-    """
-    skills_config: dict = copilot_config["skills"]
-    skills_enabled: bool = skills_config["enabled"]
-
-    if skills_enabled:
-        source_skills: Path = _get_source_instructions_path(entry.repo)
-        source_skills_dir = source_skills / "skills"
-
-        # Skip if skills folder doesn't exist for this repo
-        if not source_skills_dir.exists():
-            logger.info(f"No skills folder found at {source_skills_dir}, skipping")
-            return None
-
-        github_dir: Path = repo_path / ".github"
-        skills_dir = github_dir / "skills"
-        copytree(source_skills_dir, skills_dir, dirs_exist_ok=True)
-
-        logger.info(f"Skills are set up from {source_skills_dir}")
-        return skills_config.get("name")
-
-    return None
-
-
 def setup_custom_agent(copilot_config: dict, entry: DatasetEntry, repo_path: Path) -> str | None:
     """
     Setup custom agents in the repository if available.
diff --git a/src/bcbench/operations/skills_operations.py b/src/bcbench/operations/skills_operations.py
new file mode 100644
index 000000000..38bfa4ca2
--- /dev/null
+++ b/src/bcbench/operations/skills_operations.py
@@ -0,0 +1,44 @@
+from pathlib import Path
+import yaml
+from shutil import copytree
+
+from bcbench.config import get_config
+from bcbench.dataset.dataset_entry import DatasetEntry
+from bcbench.operations.instruction_operations import _get_source_instructions_path
+from bcbench.logger import get_logger
+
+logger = get_logger(__name__)
+_config = get_config()
+
+
+def setup_copilot_skills(copilot_config: dict, entry: DatasetEntry, repo_path: Path) -> list[str] | None:
+    """
+    Copy the skills folder for ``entry.repo`` into ``repo_path/.github/skills`` when ``copilot_config["skills"]["enabled"]`` is truthy.
+
+    Returns:
+        A one-element list holding the copied skills directory path as a string; None when skills are disabled or the source skills folder does not exist.
+    """
+    skills_config: dict = copilot_config["skills"]
+    skills_enabled: bool = skills_config["enabled"]
+
+    if skills_enabled:
+        source_skills: Path = _get_source_instructions_path(entry.repo)
+        source_skills_dir = source_skills / "skills"
+
+        # Skip if skills folder doesn't exist for this repo
+        if not source_skills_dir.exists():
+            logger.info(f"No skills folder found at {source_skills_dir}, skipping")
+            return None
+
+        github_dir: Path = repo_path / ".github"
+        skills_dir = github_dir / "skills"
+        copytree(source_skills_dir, skills_dir, dirs_exist_ok=True)  # dirs_exist_ok: do not fail when .github/skills already exists
+
+        logger.info(f"Skills copied from {source_skills_dir} to {skills_dir}")
+        return [str(skills_dir)]  # the parent skills directory itself, not one entry per skill
+
+    return None
+
+
+
+

From 95270bbfa6ad6cd4cc8a39e65c4995ad6b7f329f Mon Sep 17 00:00:00 2001
From: Jiawen Sun <t-jiawensun@microsoft.com>
Date: Mon, 2 Feb 2026 13:03:48 +0100
Subject: [PATCH 8/8] remove unused package

---
 src/bcbench/operations/__init__.py          | 2 +-
 src/bcbench/operations/skills_operations.py | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/bcbench/operations/__init__.py b/src/bcbench/operations/__init__.py
index 5ab4d4ab4..9348a510d 100644
--- a/src/bcbench/operations/__init__.py
+++ b/src/bcbench/operations/__init__.py
@@ -15,9 +15,9 @@
     stage_and_get_diff,
 )
 from bcbench.operations.instruction_operations import copy_problem_statement_folder, setup_custom_agent, setup_instructions_from_config
-from bcbench.operations.skills_operations import setup_copilot_skills
 from bcbench.operations.project_operations import categorize_projects
 from bcbench.operations.setup_operations import setup_repo_postbuild, setup_repo_prebuild
+from bcbench.operations.skills_operations import setup_copilot_skills
 from bcbench.operations.test_operations import extract_tests_from_patch
 
 __all__ = [
diff --git a/src/bcbench/operations/skills_operations.py b/src/bcbench/operations/skills_operations.py
index 38bfa4ca2..9b60c14f1 100644
--- a/src/bcbench/operations/skills_operations.py
+++ b/src/bcbench/operations/skills_operations.py
@@ -1,11 +1,10 @@
 from pathlib import Path
-import yaml
 from shutil import copytree
 
 from bcbench.config import get_config
 from bcbench.dataset.dataset_entry import DatasetEntry
-from bcbench.operations.instruction_operations import _get_source_instructions_path
 from bcbench.logger import get_logger
+from bcbench.operations.instruction_operations import _get_source_instructions_path
 
 logger = get_logger(__name__)
 _config = get_config()