microsoft · Jiawen-CS · Jan 30, 2026 · Jan 30, 2026 · Jan 30, 2026 · Jan 30, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,6 +23,7 @@ dependencies = [
     "typing-extensions>=4.0",
     "pyyaml>=6.0",
     "pydantic>=2.0",
+    "github-copilot-sdk>=0.1.18",
     "textual>=7.0",
 ]
 

diff --git a/src/bcbench/agent/copilot/agent.py b/src/bcbench/agent/copilot/agent.py
@@ -1,11 +1,14 @@
 """GitHub Copilot CLI Agent implementation."""
 
+import asyncio
 import shutil
 import subprocess
 import sys
 from pathlib import Path
 
 import yaml
+from copilot import CopilotClient, MCPServerConfig, SessionConfig
+from copilot.generated.session_events import SessionEventType
 
 from bcbench.agent.copilot.metrics import parse_metrics
 from bcbench.agent.shared import build_mcp_config, build_prompt
@@ -14,6 +17,7 @@
 from bcbench.exceptions import AgentError, AgentTimeoutError
 from bcbench.logger import get_logger
 from bcbench.operations import setup_custom_agent, setup_instructions_from_config
+from bcbench.operations.skills_operations import setup_copilot_skills
 from bcbench.types import AgentMetrics, EvaluationCategory, ExperimentConfiguration
 
 logger = get_logger(__name__)
@@ -32,10 +36,11 @@ def run_copilot_agent(entry: DatasetEntry, model: str, category: EvaluationCateg
     logger.info(f"Running GitHub Copilot CLI on: {entry.instance_id}")
 
     prompt: str = build_prompt(entry, repo_path, copilot_config, category, al_mcp=al_mcp)
-    mcp_config_json, mcp_server_names = build_mcp_config(copilot_config, entry, repo_path, al_mcp=al_mcp)
+    mcp_config: dict[str, MCPServerConfig] | None = build_mcp_config(copilot_config, entry, repo_path, al_mcp=al_mcp)
     instructions_enabled: bool = setup_instructions_from_config(copilot_config, entry, repo_path)
+    copilot_skills: list[str] | None = setup_copilot_skills(copilot_config, entry, repo_path)
     custom_agent: str | None = setup_custom_agent(copilot_config, entry, repo_path)
-    config = ExperimentConfiguration(mcp_servers=mcp_server_names, custom_instructions=instructions_enabled, custom_agent=custom_agent)
+    config = ExperimentConfiguration(mcp_servers=list(mcp_config.keys()) if mcp_config else None, custom_instructions=instructions_enabled, custom_agent=custom_agent)
 
     logger.info(f"Executing Copilot CLI in directory: {repo_path}")
     logger.debug(f"Using prompt:\n{prompt}")
@@ -58,13 +63,43 @@ def run_copilot_agent(entry: DatasetEntry, model: str, category: EvaluationCateg
         ]
         if not instructions_enabled:
             cmd_args.append("--no-custom-instructions")
-        if mcp_config_json:
-            cmd_args.append(f"--additional-mcp-config={mcp_config_json}")
         if custom_agent:
             cmd_args.append(f"--agent={custom_agent}")
 
         logger.debug(f"Copilot command args: {cmd_args}")
 
+        # Copilot SDK
+        async def run_copilot() -> None:
+            """Send the prompt through the Copilot SDK and stream the reply to stdout.
+
+            Starts a ``CopilotClient`` pointed at ``copilot_cmd``, opens a streaming
+            session configured with the selected model, the MCP servers and the skill
+            directories, echoes every assistant message delta to stdout, and waits for
+            the turn to finish (bounded by ``_config.timeout.agent_execution``).
+
+            The client is stopped even when session creation or the request fails,
+            so a started client is not left running after an error.
+            """
+            client = CopilotClient({"cli_path": copilot_cmd})
+            await client.start()
+
+            try:
+                session = await client.create_session(
+                    SessionConfig(
+                        model=model,
+                        mcp_servers=mcp_config or {},
+                        streaming=True,
+                        skill_directories=copilot_skills or [],
+                    )
+                )
+
+                # Listen for response chunks and forward them to stdout as they arrive
+                def handle_event(event):
+                    if event.type == SessionEventType.ASSISTANT_MESSAGE_DELTA:
+                        sys.stdout.write(event.data.delta_content)
+                        sys.stdout.flush()
+
+                session.on(handle_event)
+
+                await session.send_and_wait(
+                    {"prompt": prompt},
+                    timeout=_config.timeout.agent_execution,
+                )
+                print()  # newline after streaming
+            finally:
+                # Always release the client, including on timeout or SDK errors
+                await client.stop()
+
+        asyncio.run(run_copilot())
+
         result = subprocess.run(
             cmd_args,
             cwd=str(repo_path),

diff --git a/.../agent/copilot/instructions/microsoft-BCApps/skills/al-test-generation/SKILL.md b/.../agent/copilot/instructions/microsoft-BCApps/skills/al-test-generation/SKILL.md
@@ -0,0 +1,93 @@
+---
+name: al-test-generation
+description: Guide for creating AL tests for Microsoft Dynamics 365 Business Central. Use this when asked to write, create, or generate AL test codeunits, test procedures, or test automation for Business Central.
+---
+
+To create AL tests for Microsoft Dynamics 365 Business Central, follow this process:
+
+## 1. Analyze the Code Under Test
+
+Before writing any test code:
+1. Read and understand the procedure or functionality being tested
+2. Trace through all code paths to identify UI interactions
+3. Examine table definitions for TableRelation constraints
+
+## 2. Identify Required Handler Methods
+
+**CRITICAL: Tests fail with "Unhandled UI" errors when handlers are missing.**
+
+Look for these patterns in the code under test:
+
+| Code Pattern                          | Required Handler            |
+| ------------------------------------- | --------------------------- |
+| `Confirm()`                           | `[ConfirmHandler]`          |
+| `Message()`                           | `[MessageHandler]`          |
+| `StrMenu()`                           | `[StrMenuHandler]`          |
+| `Page.Run()`                          | `[PageHandler]`             |
+| `Page.RunModal()`                     | `[ModalPageHandler]`        |
+| `Report.Run()` or `Report.RunModal()` | `[ReportHandler]`           |
+| Report request page                   | `[RequestPageHandler]`      |
+| `Hyperlink()`                         | `[HyperlinkHandler]`        |
+| `Notification.Send()`                 | `[SendNotificationHandler]` |
+
+## 3. Analyze TableRelation Constraints
+
+**CRITICAL: Tests fail with validation errors when inserting data that violates TableRelation constraints.**
+
+Before inserting test data:
+1. Read the table definition for all fields receiving values
+2. Identify fields with `TableRelation` properties
+3. Ensure related records exist before inserting test data
+4. Use Library functions (e.g., `LibrarySales`, `LibraryPurchase`) to create prerequisite data
+
+## 4. Write Test Structure
+
+Follow the AAA pattern (Arrange-Act-Assert):
+
+```AL
+[Test]
+[HandlerFunctions('RequiredHandlers')]
+procedure TestProcedureName()
+begin
+    // [GIVEN] Set up test data and preconditions
+    Initialize();
+    CreateTestData();
+
+    // [WHEN] Execute the action being tested
+    ExecuteAction();
+
+    // [THEN] Verify the expected results
+    VerifyResults();
+end;
+```
+
+## 5. Handler Method Signatures
+
+```AL
+[ConfirmHandler]
+procedure ConfirmHandlerYes(Question: Text[1024]; var Reply: Boolean)
+begin
+    Reply := true;
+end;
+
+[MessageHandler]
+procedure MessageHandler(Message: Text[1024])
+begin
+    // Empty - suppresses message display
+end;
+
+[ModalPageHandler]
+procedure ModalPageHandler(var TestPage: TestPage "Page Name")
+begin
+    TestPage.OK().Invoke();
+end;
+```
+
+## 6. Best Practices
+
+- Use descriptive test procedure names that explain what is being tested
+- One assertion concept per test
+- Use Library Variable Storage to pass data between handlers and tests
+- Do NOT verify values inside handler procedures
+- Clean up test data in teardown or use transaction rollback
+- Use `Initialize()` procedure to set up common test fixtures
diff --git a/...t/copilot/instructions/microsoftInternal-NAV/skills/al-test-generation/SKILL.md b/...t/copilot/instructions/microsoftInternal-NAV/skills/al-test-generation/SKILL.md
@@ -0,0 +1,93 @@
+---
+name: al-test-generation
+description: Guide for creating AL tests for Microsoft Dynamics 365 Business Central. Use this when asked to write, create, or generate AL test codeunits, test procedures, or test automation for Business Central.
+---
+
+To create AL tests for Microsoft Dynamics 365 Business Central, follow this process:
+
+## 1. Analyze the Code Under Test
+
+Before writing any test code:
+1. Read and understand the procedure or functionality being tested
+2. Trace through all code paths to identify UI interactions
+3. Examine table definitions for TableRelation constraints
+
+## 2. Identify Required Handler Methods
+
+**CRITICAL: Tests fail with "Unhandled UI" errors when handlers are missing.**
+
+Look for these patterns in the code under test:
+
+| Code Pattern                          | Required Handler            |
+| ------------------------------------- | --------------------------- |
+| `Confirm()`                           | `[ConfirmHandler]`          |
+| `Message()`                           | `[MessageHandler]`          |
+| `StrMenu()`                           | `[StrMenuHandler]`          |
+| `Page.Run()`                          | `[PageHandler]`             |
+| `Page.RunModal()`                     | `[ModalPageHandler]`        |
+| `Report.Run()` or `Report.RunModal()` | `[ReportHandler]`           |
+| Report request page                   | `[RequestPageHandler]`      |
+| `Hyperlink()`                         | `[HyperlinkHandler]`        |
+| `Notification.Send()`                 | `[SendNotificationHandler]` |
+
+## 3. Analyze TableRelation Constraints
+
+**CRITICAL: Tests fail with validation errors when inserting data that violates TableRelation constraints.**
+
+Before inserting test data:
+1. Read the table definition for all fields receiving values
+2. Identify fields with `TableRelation` properties
+3. Ensure related records exist before inserting test data
+4. Use Library functions (e.g., `LibrarySales`, `LibraryPurchase`) to create prerequisite data
+
+## 4. Write Test Structure
+
+Follow the AAA pattern (Arrange-Act-Assert):
+
+```AL
+[Test]
+[HandlerFunctions('RequiredHandlers')]
+procedure TestProcedureName()
+begin
+    // [GIVEN] Set up test data and preconditions
+    Initialize();
+    CreateTestData();
+
+    // [WHEN] Execute the action being tested
+    ExecuteAction();
+
+    // [THEN] Verify the expected results
+    VerifyResults();
+end;
+```
+
+## 5. Handler Method Signatures
+
+```AL
+[ConfirmHandler]
+procedure ConfirmHandlerYes(Question: Text[1024]; var Reply: Boolean)
+begin
+    Reply := true;
+end;
+
+[MessageHandler]
+procedure MessageHandler(Message: Text[1024])
+begin
+    // Empty - suppresses message display
+end;
+
+[ModalPageHandler]
+procedure ModalPageHandler(var TestPage: TestPage "Page Name")
+begin
+    TestPage.OK().Invoke();
+end;
+```
+
+## 6. Best Practices
+
+- Use descriptive test procedure names that explain what is being tested
+- One assertion concept per test
+- Use Library Variable Storage to pass data between handlers and tests
+- Do NOT verify values inside handler procedures
+- Clean up test data in teardown or use transaction rollback
+- Use `Initialize()` procedure to set up common test fixtures
diff --git a/src/bcbench/agent/shared/config.yaml b/src/bcbench/agent/shared/config.yaml
@@ -58,6 +58,14 @@ prompt:
 instructions:
   enabled: false
 
+# controls:
+# 1. whether to copy skills (`src/bcbench/agent/copilot/instructions/<sanitized-repo>/skills/`) into repo/.github/skills/
+# 2. whether the SDK passes skill_directories to SessionConfig (the CLI auto-discovers skills from .github/skills/)
+skills:
+  enabled: true
+  # name is for documentation only - all skills in the skills/ folder are loaded
+  name: al-test-generation
+
 # controls:
 # 1. whether to copy custom agents (`src/bcbench/agent/copilot/instructions/<sanitized-repo>/agents/`) into the repo
 # 2. whether to pass --agent=<agent-name> to copilot

diff --git a/src/bcbench/agent/shared/mcp.py b/src/bcbench/agent/shared/mcp.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 from typing import Any
 
+from copilot import MCPLocalServerConfig, MCPRemoteServerConfig, MCPServerConfig
 from jinja2 import Template
 
 from bcbench.dataset import DatasetEntry
@@ -39,33 +40,33 @@ def cleanup(self) -> None:
 _mcp_server_manager = _ALMcpServerManager()
 
 
-def _build_server_entry(server: dict[str, Any], template_context: dict[str, Any]) -> tuple[str, dict[str, Any]]:
+def _build_server_entry(server: dict[str, Any], template_context: dict[str, Any]) -> dict[str, MCPServerConfig]:
+    """Build the MCP configuration entry for a single server definition.
+
+    Args:
+        server: Server definition; must provide "type", "name" and "tools",
+            plus "url" for "http" servers or "command" and "args" for
+            "local" servers.
+        template_context: Values made available when rendering each local
+            server argument as a Jinja2 template.
+
+    Returns:
+        A one-item dict mapping the server name to an ``MCPRemoteServerConfig``
+        (type "http") or an ``MCPLocalServerConfig`` (type "local").
+
+    Raises:
+        AgentError: If the server type is neither "http" nor "local".
+    """
     server_type: str = server["type"]
     server_name: str = server["name"]
     tools: list[str] = server["tools"]
 
     match server_type:
         case "http":
-            return server_name, {
-                "type": server_type,
-                "url": server["url"],
-                "tools": tools,
-            }
+            return {server_name: MCPRemoteServerConfig(
+                tools=tools,
+                url=server["url"],
+                type=server_type
+            )}
         case "local":
             args: list[str] = server["args"]
+            # Each argument is a Jinja2 template rendered against template_context
             rendered_args = [Template(arg).render(**template_context) for arg in args]
-            return server_name, {
-                "type": server_type,
-                "command": server["command"],
-                "args": rendered_args,
-                "tools": tools,
-            }
+            return {server_name: MCPLocalServerConfig(
+                tools=tools,
+                command=server["command"],
+                args=rendered_args,
+                type=server_type,
+            )}
         case _:
             logger.error(f"Unsupported MCP server type: {server_type}, {server}")
             raise AgentError(f"Unsupported MCP server type: {server_type}")
 
 
-def build_mcp_config(config: dict[str, Any], entry: DatasetEntry, repo_path: Path, al_mcp: bool = False) -> tuple[str | None, list[str] | None]:
+def build_mcp_config(config: dict[str, Any], entry: DatasetEntry, repo_path: Path, al_mcp: bool = False) -> dict[str, MCPServerConfig] | None:
     # following docs: https://docs.github.com/en/enterprise-cloud@latest/copilot/how-tos/use-copilot-agents/coding-agent/extend-coding-agent-with-mcp
     mcp_servers: list[dict[str, Any]] = config.get("mcp", {}).get("servers", [])
 
@@ -78,11 +79,15 @@ def build_mcp_config(config: dict[str, Any], entry: DatasetEntry, repo_path: Pat
         logger.info("AL MCP server enabled via --al-mcp flag")
 
     if not mcp_servers:
-        return None, None
+        return None
 
     template_context = {"repo_path": repo_path}
     mcp_server_names: list[str] = [server["name"] for server in mcp_servers]
-    mcp_config = {"mcpServers": dict(map(lambda s: _build_server_entry(s, template_context), mcp_servers))}
+    # mcp_config = {"mcpServers": dict(map(lambda s: _build_server_entry(s, template_context), mcp_servers))}
+    mcp_config : dict[str, MCPServerConfig] = {}
+    for server in mcp_servers:
+        server_entry = _build_server_entry(server, template_context)
+        mcp_config.update(server_entry)
 
     if al_mcp:
         # Launch MCP server with all project paths separated by semicolons
@@ -92,4 +97,4 @@ def build_mcp_config(config: dict[str, Any], entry: DatasetEntry, repo_path: Pat
     logger.info(f"Using MCP servers: {mcp_server_names}")
     logger.debug(f"MCP configuration: {json.dumps(mcp_config, indent=2)}")
 
-    return json.dumps(mcp_config, separators=(",", ":")), mcp_server_names
+    return mcp_config
diff --git a/src/bcbench/operations/__init__.py b/src/bcbench/operations/__init__.py
@@ -1,23 +1,24 @@
 """Operations for Business Central and Git."""

 from bcbench.operations.bc_operations import (
    build_and_publish_projects,
    build_ps_app_build_and_publish,
    build_ps_dataset_tests_script,
    build_ps_test_script,
    run_tests,
 )
 from bcbench.operations.git_operations import (
    apply_patch,
    checkout_commit,
    clean_project_paths,
    clean_repo,
     stage_and_get_diff,
 )
 from bcbench.operations.instruction_operations import copy_problem_statement_folder, setup_custom_agent, setup_instructions_from_config
+from bcbench.operations.skills_operations import setup_copilot_skills
 from bcbench.operations.project_operations import categorize_projects
 from bcbench.operations.setup_operations import setup_repo_postbuild, setup_repo_prebuild
 from bcbench.operations.test_operations import extract_tests_from_patch

 __all__ = [
    "apply_patch",
@@ -32,6 +33,7 @@
     "copy_problem_statement_folder",
     "extract_tests_from_patch",
     "run_tests",
+    "setup_copilot_skills",
     "setup_custom_agent",
     "setup_instructions_from_config",
     "setup_repo_postbuild",