From e7dc9a71a73d063b7271570d3da3a4ed6b858ef6 Mon Sep 17 00:00:00 2001
From: Sean <liuyuxin@pjlab.org.cn>
Date: Fri, 5 Jun 2026 09:52:24 +0800
Subject: [PATCH 1/2] feat(agent-eval): add LLMAgent* evaluators for agent
 trace monitoring
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add 8 LLM evaluators + 3 rule evaluators for AI agent trace quality assessment,
aligned with DeepEval 3-layer taxonomy (Execution/Action/Reasoning) + Recovery.

LLM evaluators (dingo/model/llm/agent_eval/):
- LLMAgentTaskCompletion, LLMAgentStepEfficiency (Execution)
- LLMAgentToolCorrectness, LLMAgentArgumentCorrectness (Action)
- LLMAgentPlanQuality, LLMAgentPlanAdherence (Reasoning)
- LLMAgentErrorRecovery (Recovery), LLMAgentTraceConclusion (synthesis)

Rule evaluators: RuleAgentTraceLoopDetection, RuleAgentTraceTokenBudget,
RuleAgentTraceLatencyAnomaly

Shared BaseLLMAgentEval with 0-10→0.0-1.0 normalization, CJK language
detection, and configurable threshold.
---
 .pre-commit-config.yaml                       |  46 ++--
 dingo/model/llm/agent_eval/__init__.py        |  10 +
 .../llm/agent_eval/base_llm_agent_eval.py     | 112 +++++++++
 .../llm_agent_argument_correctness.py         |  82 +++++++
 .../agent_eval/llm_agent_error_recovery.py    | 150 ++++++++++++
 .../agent_eval/llm_agent_plan_adherence.py    |  85 +++++++
 .../llm/agent_eval/llm_agent_plan_quality.py  | 160 ++++++++++++
 .../agent_eval/llm_agent_step_efficiency.py   |  75 ++++++
 .../agent_eval/llm_agent_task_completion.py   |  72 ++++++
 .../agent_eval/llm_agent_tool_correctness.py  |  82 +++++++
 .../agent_eval/llm_agent_trace_conclusion.py  | 119 +++++++++
 dingo/model/rule/rule_agent.py                | 230 ++++++++++++++++++
 12 files changed, 1200 insertions(+), 23 deletions(-)
 create mode 100644 dingo/model/llm/agent_eval/__init__.py
 create mode 100644 dingo/model/llm/agent_eval/base_llm_agent_eval.py
 create mode 100644 dingo/model/llm/agent_eval/llm_agent_argument_correctness.py
 create mode 100644 dingo/model/llm/agent_eval/llm_agent_error_recovery.py
 create mode 100644 dingo/model/llm/agent_eval/llm_agent_plan_adherence.py
 create mode 100644 dingo/model/llm/agent_eval/llm_agent_plan_quality.py
 create mode 100644 dingo/model/llm/agent_eval/llm_agent_step_efficiency.py
 create mode 100644 dingo/model/llm/agent_eval/llm_agent_task_completion.py
 create mode 100644 dingo/model/llm/agent_eval/llm_agent_tool_correctness.py
 create mode 100644 dingo/model/llm/agent_eval/llm_agent_trace_conclusion.py
 create mode 100644 dingo/model/rule/rule_agent.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c1fd47c1..6b50e489 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,23 +1,23 @@
-# See https://pre-commit.com for more information
-# See https://pre-commit.com/hooks.html for more hooks
-repos:
--   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
-    hooks:
-    -   id: trailing-whitespace
-        exclude: '^README.*\.md$'
-    -   id: end-of-file-fixer
-        exclude: 'docs/metrics\.md'
-    -   id: check-yaml
-    -   id: check-added-large-files
--   repo: https://github.com/PyCQA/isort
-    rev: 6.0.1
-    hooks:
-    -   id: isort
-        args: [ "-l", "200", "-m", "0", "-p", "dingo" ]
--   repo: https://github.com/PyCQA/flake8
-    rev: 7.2.0
-    hooks:
-    -   id: flake8
-        args: [ "--max-line-length=2200", "--ignore=E121,E131,E125,W503,W504,W604,E203,E231,E702,E128,F541,F401,E266" ]
-        exclude: 'app/'
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+    -   id: trailing-whitespace
+        exclude: '^README.*\.md$'
+    -   id: end-of-file-fixer
+        exclude: 'docs/metrics\.md'
+    -   id: check-yaml
+    -   id: check-added-large-files
+-   repo: https://github.com/PyCQA/isort
+    rev: 6.0.1
+    hooks:
+    -   id: isort
+        args: [ "-l", "200", "-m", "0", "-p", "dingo" ]
+-   repo: https://github.com/PyCQA/flake8
+    rev: 7.2.0
+    hooks:
+    -   id: flake8
+        args: [ "--max-line-length=2200", "--ignore=E121,E131,E125,W503,W504,W604,E203,E231,E702,E128,F541,F401,E266" ]
+        exclude: 'app/'
diff --git a/dingo/model/llm/agent_eval/__init__.py b/dingo/model/llm/agent_eval/__init__.py
new file mode 100644
index 00000000..a6e8dbbb
--- /dev/null
+++ b/dingo/model/llm/agent_eval/__init__.py
@@ -0,0 +1,10 @@
+"""
+Agent Trace Evaluation metrics (LLMAgent* series).
+
+These evaluators assess agent trace quality using LLM-as-Judge methodology.
+They are distinct from the agent/ directory which contains agent-framework-based
+evaluators (AgentFactCheck, AgentHallucination) that USE agent frameworks to DO evaluation.
+
+Evaluators in this package EVALUATE agent traces — task completion, plan quality,
+tool correctness, error recovery, etc.
+"""
diff --git a/dingo/model/llm/agent_eval/base_llm_agent_eval.py b/dingo/model/llm/agent_eval/base_llm_agent_eval.py
new file mode 100644
index 00000000..5669cf1e
--- /dev/null
+++ b/dingo/model/llm/agent_eval/base_llm_agent_eval.py
@@ -0,0 +1,112 @@
+"""
+Base class for all Agent evaluation metrics.
+
+Provides standardized:
+- 0~10 score → 0.0~1.0 normalization
+- Configurable threshold via dynamic_config.model_extra
+- Unified JSON response parsing with {"score": 0-10, "reason": "...", ...}
+- Error fallback handling
+- `eval_layer` and `input_data_type` declarations for orchestrator integration
+
+Subclasses only need to define:
+- `prompt`: the evaluation prompt template
+- `build_messages()`: how to format input data into LLM messages
+- Optionally override `process_response()` for custom parsing
+"""
+
+import json
+from typing import Optional
+
+from dingo.io.input import Data, RequiredField
+from dingo.io.output.eval_detail import EvalDetail, QualityLabel
+from dingo.model.llm.base_openai import BaseOpenAI
+from dingo.utils import log
+from dingo.utils.exception import ConvertJsonError
+
+
+class BaseLLMAgentEval(BaseOpenAI):
+    """Shared base class for all Agent evaluation metrics."""
+
+    eval_layer: str = ""
+    input_data_type: str = "trace_summary"
+    default_threshold: float = 0.6
+
+    @classmethod
+    def _detect_language_hint(cls, text: str) -> str:
+        """Detect if text contains CJK characters and return a language instruction."""
+        if not text:
+            return ""
+        import re
+        cjk_count = len(re.findall(r'[一-鿿㐀-䶿]', text[:500]))
+        if cjk_count > 5:
+            return '\n\n注意：请用中文回答 "reason" 字段。'
+        return ""
+
+    @classmethod
+    def _get_threshold(cls) -> float:
+        if cls.dynamic_config and cls.dynamic_config.model_extra:
+            return float(cls.dynamic_config.model_extra.get(
+                "threshold", cls.default_threshold
+            ))
+        return cls.default_threshold
+
+    @classmethod
+    def _strip_json_fences(cls, response: str) -> str:
+        """Trim whitespace and a surrounding ``` / ```json code fence from the reply."""
+        response = response.strip()
+        for fence in ("```json", "```"):
+            if response.startswith(fence):
+                response = response[len(fence):]
+        return (response[:-3] if response.endswith("```") else response).strip()
+
+    @classmethod
+    def _parse_json_response(cls, response: str) -> dict:
+        """Return the reply as a dict; raise ConvertJsonError if it is not a JSON object."""
+        cleaned = cls._strip_json_fences(response)
+        try:
+            data = json.loads(cleaned)
+        except json.JSONDecodeError:
+            data = None  # reported below, same as a valid-but-non-object payload
+        if not isinstance(data, dict):
+            raise ConvertJsonError(f"Failed to parse agent eval JSON: {cleaned[:500]}")
+        return data
+
+    @classmethod
+    def process_response(cls, response: str) -> EvalDetail:
+        """Standardized response processing for agent evaluators.
+
+        Expected LLM output: {"score": 0-10, "reason": "...", ...extra fields...}
+        Score is normalized to 0.0~1.0 and compared against threshold.
+        Extra fields are preserved in EvalDetail.reason as JSON.
+        """
+        log.info(response)
+        data = cls._parse_json_response(response)
+
+        raw_score = data.get("score", data.get("overall_score", 0))
+        try:
+            raw_score = float(raw_score)
+        except (TypeError, ValueError):
+            raw_score = 0.0
+
+        normalized_score = max(0.0, min(1.0, raw_score / 10.0))
+        threshold = cls._get_threshold()
+
+        reason_text = data.get("reason", "")
+        details = {k: v for k, v in data.items() if k not in ("score", "reason")}
+
+        result = EvalDetail(metric=cls.__name__)
+        result.score = normalized_score
+
+        if normalized_score >= threshold:
+            result.status = False
+            result.label = [QualityLabel.QUALITY_GOOD]
+        else:
+            result.status = True
+            result.label = [f"AGENT_QUALITY.{cls.__name__}"]
+
+        reason_parts = [reason_text] if reason_text else []
+        if details:
+            reason_parts.append(json.dumps(details, ensure_ascii=False, default=str))
+        result.reason = reason_parts if reason_parts else None
+
+        return result
diff --git a/dingo/model/llm/agent_eval/llm_agent_argument_correctness.py b/dingo/model/llm/agent_eval/llm_agent_argument_correctness.py
new file mode 100644
index 00000000..3ef3b9ca
--- /dev/null
+++ b/dingo/model/llm/agent_eval/llm_agent_argument_correctness.py
@@ -0,0 +1,82 @@
+"""
+LLMAgentArgumentCorrectness: Evaluates whether the agent passed correct arguments to each tool call.
+
+Performs referenceless LLM-judge evaluation — no ground-truth arguments are required.
+The judge assesses argument quality based on the task objective and the expected
+semantics of each tool.
+"""
+
+from typing import List
+
+from dingo.io.input import Data, RequiredField
+from dingo.model import Model
+from dingo.model.llm.agent_eval.base_llm_agent_eval import BaseLLMAgentEval
+
+
+@Model.llm_register("LLMAgentArgumentCorrectness")
+class LLMAgentArgumentCorrectness(BaseLLMAgentEval):
+    """
+    Evaluates the correctness of tool arguments in an agent's execution.
+
+    Input:
+        prompt  - The task objective or user request
+        content - JSON-formatted sequence of tool calls with their arguments
+
+    Performs referenceless evaluation: the LLM judge determines whether
+    each tool received correct, well-formed, and contextually appropriate
+    arguments, without requiring ground-truth argument values.
+    """
+
+    eval_layer = "action"
+    input_data_type = "tool_calls"
+    default_threshold = 0.6
+
+    _required_fields = [RequiredField.PROMPT, RequiredField.CONTENT]
+
+    prompt = """You are an expert evaluator assessing whether an AI agent passed correct arguments to its tool calls.
+
+For each tool call in the sequence, evaluate the arguments:
+- Are the argument values correct and appropriate for the task context?
+- Are required arguments present and non-null?
+- Are argument types and formats valid?
+- Do the arguments make semantic sense given what the tool does?
+
+Count:
+- **correct_args**: Tool calls where arguments were fully correct
+- **total_calls**: Total number of tool calls evaluated
+
+List specific argument issues found (wrong value, missing required argument, type mismatch, semantically incorrect argument, etc.).
+
+A score of 10 means all tool calls had perfectly correct arguments.
+A score of 0 means all tool calls had wrong or missing arguments.
+
+Respond in the same language as the input content for the "reason" field.
+
+Return your evaluation as a JSON object with this exact schema:
+{
+  "correct_args": <integer>,
+  "total_calls": <integer>,
+  "issues": ["<issue description, e.g. tool X received wrong value for param Y>", ...],
+  "score": <integer 0-10>,
+  "reason": "<concise summary of argument correctness across all tool calls>"
+}
+
+Do not include any text outside the JSON object."""
+
+    @classmethod
+    def build_messages(cls, input_data: Data) -> List[dict]:
+        """Build LLM messages for argument correctness evaluation."""
+        lang_hint = cls._detect_language_hint(
+            str(input_data.prompt) + str(input_data.content)
+        )
+        user_content = f"""{cls.prompt}
+
+## Task Objective
+{input_data.prompt}
+
+## Tool Call Sequence with Arguments
+{input_data.content}
+
+Evaluate the tool arguments and return the JSON evaluation.{lang_hint}"""
+
+        return [{"role": "user", "content": user_content}]
diff --git a/dingo/model/llm/agent_eval/llm_agent_error_recovery.py b/dingo/model/llm/agent_eval/llm_agent_error_recovery.py
new file mode 100644
index 00000000..b7f260ab
--- /dev/null
+++ b/dingo/model/llm/agent_eval/llm_agent_error_recovery.py
@@ -0,0 +1,150 @@
+"""
+LLMAgentErrorRecovery: Evaluates the agent's ability to recover from errors encountered during execution.
+
+If no error events are present in the input, returns score=1.0 (pass) immediately,
+since perfect execution with no errors requires no recovery.
+"""
+
+import time
+from typing import List
+
+from dingo.io.input import Data, RequiredField
+from dingo.io.output.eval_detail import EvalDetail, QualityLabel
+from dingo.model import Model
+from dingo.model.llm.agent_eval.base_llm_agent_eval import BaseLLMAgentEval
+from dingo.utils import log
+from dingo.utils.exception import ConvertJsonError, ExceedMaxTokens
+
+try:
+    from pydantic import ValidationError
+except ImportError:
+    ValidationError = Exception
+
+
+@Model.llm_register("LLMAgentErrorRecovery")
+class LLMAgentErrorRecovery(BaseLLMAgentEval):
+    """
+    Evaluates the error recovery capability of an agent.
+
+    Input:
+        prompt  - The task objective or user request
+        content - The error events or failure log from the agent execution
+
+    If no errors are found in the content, the evaluator short-circuits
+    and returns score=1.0 (pass) without calling the LLM.
+    """
+
+    eval_layer = "recovery"
+    input_data_type = "error_events"
+    default_threshold = 0.5
+
+    _required_fields = [RequiredField.PROMPT, RequiredField.CONTENT]
+
+    _NO_ERROR_INDICATORS = [
+        "no error", "no errors", "no failures", "no exception",
+        "0 errors", "zero errors", "none", "n/a", "[]", "{}",
+    ]
+
+    prompt = """You are an expert evaluator assessing how well an AI agent recovered from errors during task execution.
+
+For each error event, evaluate:
+- Did the agent detect the error?
+- Did the agent attempt recovery?
+- Was the recovery successful?
+- Was the recovery strategy appropriate?
+
+Count:
+- **errors_encountered**: Total number of distinct errors or failures
+- **recovered_count**: Number of errors from which the agent successfully recovered
+
+Assess overall **recovery_quality** from 0 to 10:
+- 10: Agent recovered from all errors with optimal strategies
+- 7-9: Agent recovered from most errors with reasonable strategies
+- 4-6: Agent recovered from some errors but used suboptimal approaches
+- 1-3: Agent attempted recovery but largely failed
+- 0: Agent did not attempt recovery or made errors worse
+
+Respond in the same language as the input content for the "reason" field.
+
+Return your evaluation as a JSON object with this exact schema:
+{
+  "errors_encountered": <integer>,
+  "recovered_count": <integer>,
+  "recovery_quality": <integer 0-10>,
+  "score": <integer 0-10>,
+  "reason": "<concise explanation of recovery behavior for each error type>"
+}
+
+Do not include any text outside the JSON object."""
+
+    @classmethod
+    def _has_error_events(cls, content: str) -> bool:
+        """Check if the content contains actual error events."""
+        if not content or not content.strip():
+            return False
+        stripped = content.strip().lower()
+        for indicator in cls._NO_ERROR_INDICATORS:
+            if stripped == indicator:
+                return False
+        return True
+
+    @classmethod
+    def build_messages(cls, input_data: Data) -> List[dict]:
+        """Build LLM messages for error recovery evaluation."""
+        lang_hint = cls._detect_language_hint(
+            str(input_data.prompt) + str(input_data.content)
+        )
+        user_content = f"""{cls.prompt}
+
+## Task Objective
+{input_data.prompt}
+
+## Error Events / Failure Log
+{input_data.content}
+
+Evaluate the agent's error recovery and return the JSON evaluation.{lang_hint}"""
+
+        return [{"role": "user", "content": user_content}]
+
+    @classmethod
+    def eval(cls, input_data: Data) -> EvalDetail:
+        """Score error recovery, passing immediately when the input holds no error events."""
+        content = getattr(input_data, "content", "") or ""
+
+        if not cls._has_error_events(content):
+            log.info(f"{cls.__name__}: No error events detected, returning pass")
+            result = EvalDetail(metric=cls.__name__)
+            result.status = False
+            result.label = [QualityLabel.QUALITY_GOOD]
+            result.score = 1.0
+            result.reason = ["No error events found in execution trace; recovery evaluation skipped."]
+            return result
+
+        if cls.client is None:
+            cls.create_client()
+
+        messages = cls.build_messages(input_data)
+
+        attempts = 0
+        except_msg = ""
+        # Name of the exception class itself; ``Exception.__class__`` is ``type``.
+        except_name = Exception.__name__
+        while attempts < 3:
+            try:
+                return cls.process_response(cls.send_messages(messages))
+            except (ValidationError, ExceedMaxTokens, ConvertJsonError) as e:
+                except_msg = str(e)
+                except_name = e.__class__.__name__
+                break
+            except Exception as e:
+                # Anything else is retried, up to three attempts, one second apart.
+                attempts += 1
+                time.sleep(1)
+                except_msg = str(e)
+                except_name = e.__class__.__name__
+
+        res = EvalDetail(metric=cls.__name__)
+        res.status = True
+        res.label = [f"QUALITY_BAD.{except_name}"]
+        res.reason = [except_msg]
+        return res
diff --git a/dingo/model/llm/agent_eval/llm_agent_plan_adherence.py b/dingo/model/llm/agent_eval/llm_agent_plan_adherence.py
new file mode 100644
index 00000000..0755b47b
--- /dev/null
+++ b/dingo/model/llm/agent_eval/llm_agent_plan_adherence.py
@@ -0,0 +1,85 @@
+"""
+LLMAgentPlanAdherence: Evaluates how closely the agent followed its stated plan during execution.
+
+Compares the original plan (prompt) against the actual execution steps (content),
+with the task goal available as context. Justified deviations are scored more
+leniently than unjustified ones.
+"""
+
+from typing import List
+
+from dingo.io.input import Data, RequiredField
+from dingo.model import Model
+from dingo.model.llm.agent_eval.base_llm_agent_eval import BaseLLMAgentEval
+
+
+@Model.llm_register("LLMAgentPlanAdherence")
+class LLMAgentPlanAdherence(BaseLLMAgentEval):
+    """
+    Evaluates how closely the agent adhered to its original plan.
+
+    Input:
+        prompt  - The agent's original plan (steps or strategy)
+        content - The actual execution steps taken by the agent
+        context - The overarching task goal or user objective
+
+    Deviations are classified as justified (e.g., adapting to unexpected
+    obstacles) or unjustified (e.g., skipping steps without reason).
+    """
+
+    eval_layer = "reasoning"
+    input_data_type = "plan_vs_execution"
+    default_threshold = 0.5
+
+    _required_fields = [RequiredField.PROMPT, RequiredField.CONTENT, RequiredField.CONTEXT]
+
+    prompt = """You are an expert evaluator assessing how closely an AI agent followed its original plan during execution.
+
+Compare the original plan against the actual execution steps and classify any deviations:
+- **Justified deviations**: The agent deviated from the plan for a valid reason (e.g., encountered an obstacle, discovered new information, adapted to dynamic conditions).
+- **Unjustified deviations**: The agent deviated from the plan without apparent reason (e.g., skipped steps, added unplanned steps unrelated to the goal).
+
+Count:
+- **followed_steps**: Number of planned steps that were executed as intended
+- **total_planned**: Total number of steps in the original plan
+- **justified_deviations**: Deviations with valid justification
+- **unjustified_deviations**: Deviations without justification
+
+A score of 10 means perfect adherence (or all deviations were justified).
+A score of 0 means the agent completely ignored its plan without justification.
+
+Respond in the same language as the input content for the "reason" field.
+
+Return your evaluation as a JSON object with this exact schema:
+{
+  "followed_steps": <integer>,
+  "total_planned": <integer>,
+  "justified_deviations": <integer>,
+  "unjustified_deviations": <integer>,
+  "score": <integer 0-10>,
+  "reason": "<concise explanation of adherence quality and notable deviations>"
+}
+
+Do not include any text outside the JSON object."""
+
+    @classmethod
+    def build_messages(cls, input_data: Data) -> List[dict]:
+        """Build LLM messages for plan adherence evaluation."""
+        task_goal = getattr(input_data, "context", "") or ""
+        lang_hint = cls._detect_language_hint(
+            str(input_data.prompt) + str(input_data.content)
+        )
+        user_content = f"""{cls.prompt}
+
+## Task Goal
+{task_goal}
+
+## Original Plan
+{input_data.prompt}
+
+## Actual Execution Steps
+{input_data.content}
+
+Evaluate how closely the agent followed its plan and return the JSON evaluation.{lang_hint}"""
+
+        return [{"role": "user", "content": user_content}]
diff --git a/dingo/model/llm/agent_eval/llm_agent_plan_quality.py b/dingo/model/llm/agent_eval/llm_agent_plan_quality.py
new file mode 100644
index 00000000..be4f33da
--- /dev/null
+++ b/dingo/model/llm/agent_eval/llm_agent_plan_quality.py
@@ -0,0 +1,160 @@
+"""
+LLMAgentPlanQuality: Evaluates the quality of an agent's reasoning plan.
+
+Assesses coherence, completeness, and feasibility of the agent's plan.
+If no planning content is found in the trace, defaults to passing (score=1.0).
+"""
+
+import time
+from typing import List
+
+from dingo.io.input import Data, RequiredField
+from dingo.io.output.eval_detail import EvalDetail, QualityLabel
+from dingo.model import Model
+from dingo.model.llm.agent_eval.base_llm_agent_eval import BaseLLMAgentEval
+from dingo.utils import log
+from dingo.utils.exception import ConvertJsonError, ExceedMaxTokens
+
+try:
+    from pydantic import ValidationError
+except ImportError:
+    ValidationError = Exception
+
+
+@Model.llm_register("LLMAgentPlanQuality")
+class LLMAgentPlanQuality(BaseLLMAgentEval):
+    """
+    Evaluates the quality of an agent's reasoning plan.
+
+    Input:
+        prompt  - The task objective or user request
+        content - The agent trace or plan description
+
+    If no planning content is detected in the trace, the evaluator
+    returns score=1.0 (pass) because absence of planning may be
+    acceptable for simple tasks.
+    """
+
+    eval_layer = "reasoning"
+    input_data_type = "trace_summary"
+    default_threshold = 0.6
+
+    _required_fields = [RequiredField.PROMPT, RequiredField.CONTENT]
+
+    _NO_PLAN_KEYWORDS = [
+        "no plan", "no planning", "no explicit plan",
+        "did not plan", "skipped planning", "planning not found",
+    ]
+
+    prompt = """You are an expert evaluator assessing the quality of an AI agent's reasoning plan.
+
+First, determine whether the trace contains any planning content (explicit steps, strategy, or reasoning about how to approach the task). If there is no planning content at all, set score to -1 as a sentinel value.
+
+If planning content exists, evaluate it on three dimensions:
+1. **Coherence** (1-5): Is the plan logically structured and internally consistent?
+2. **Completeness** (1-5): Does the plan cover all necessary steps to achieve the goal?
+3. **Feasibility** (1-5): Are the planned steps realistic and achievable?
+
+Compute an overall score from 0 to 10.
+
+IMPORTANT: The "reason" field MUST be in the same language as the Task Objective. If the task objective is in Chinese, respond in Chinese. If in English, respond in English.
+
+Return your evaluation as a JSON object with this exact schema:
+{
+  "coherence": <integer 1-5, or null if no plan>,
+  "completeness": <integer 1-5, or null if no plan>,
+  "feasibility": <integer 1-5, or null if no plan>,
+  "score": <integer -1 if no plan found, otherwise 0-10>,
+  "reason": "<explanation; if no plan, state that planning content was not found>"
+}
+
+Do not include any text outside the JSON object."""
+
+    @classmethod
+    def build_messages(cls, input_data: Data) -> List[dict]:
+        """Build LLM messages for plan quality evaluation."""
+        lang_hint = cls._detect_language_hint(
+            str(input_data.prompt) + str(input_data.content)
+        )
+        user_content = f"""{cls.prompt}
+
+## Task Objective
+{input_data.prompt}
+
+## Agent Trace / Plan
+{input_data.content}
+
+Evaluate the plan quality and return the JSON evaluation.{lang_hint}"""
+
+        return [{"role": "user", "content": user_content}]
+
+    @classmethod
+    def eval(cls, input_data: Data) -> EvalDetail:
+        """Evaluate plan quality, treating a "no plan" verdict from the judge as a pass.
+
+        The prompt asks the judge to answer with ``score = -1`` when the trace
+        contains no planning content. That sentinel is intercepted here and
+        mapped to a passing result with score 1.0; every other reply is handed
+        to the shared ``process_response()`` so normalization, thresholding and
+        reason formatting stay identical to the other agent evaluators.
+
+        Args:
+            input_data: Record whose ``prompt`` and ``content`` are formatted
+                into the judge message by ``build_messages()``.
+
+        Returns:
+            EvalDetail with ``status=False`` and a QUALITY_GOOD label on pass,
+            ``status=True`` and an ``AGENT_QUALITY.*`` label when the score is
+            below the threshold, or ``status=True`` and a ``QUALITY_BAD.<error>``
+            label when no usable reply could be obtained.
+        """
+        if cls.client is None:
+            cls.create_client()
+
+        messages = cls.build_messages(input_data)
+
+        attempts = 0
+        except_msg = ""
+        # Name of the exception class itself; ``Exception.__class__`` is ``type``.
+        except_name = Exception.__name__
+        while attempts < 3:
+            try:
+                response = cls.send_messages(messages)
+
+                # Peek at the raw score only to detect the "no plan" sentinel.
+                data = cls._parse_json_response(response)
+                try:
+                    raw_score = float(data.get("score", 0))
+                except (TypeError, ValueError):
+                    raw_score = 0.0
+
+                if raw_score < 0:
+                    # Sentinel value: no planning content found, treat as pass
+                    log.info(f"{cls.__name__}: No planning content found in trace, defaulting to pass")
+                    result = EvalDetail(metric=cls.__name__)
+                    result.status = False
+                    result.label = [QualityLabel.QUALITY_GOOD]
+                    result.score = 1.0
+                    result.reason = [data.get("reason", "No planning content found; evaluation skipped.")]
+                    return result
+
+                # Regular reply: reuse the shared scoring path rather than a local copy of it.
+                return cls.process_response(response)
+
+            except (ValidationError, ExceedMaxTokens, ConvertJsonError) as e:
+                # Treated as non-retryable: record the failure and stop.
+                except_msg = str(e)
+                except_name = e.__class__.__name__
+                break
+            except Exception as e:
+                # Anything else is retried, up to three attempts, one second apart.
+                attempts += 1
+                time.sleep(1)
+                except_msg = str(e)
+                except_name = e.__class__.__name__
+
+        res = EvalDetail(metric=cls.__name__)
+        res.status = True
+        res.label = [f"QUALITY_BAD.{except_name}"]
+        res.reason = [except_msg]
+        return res
diff --git a/dingo/model/llm/agent_eval/llm_agent_step_efficiency.py b/dingo/model/llm/agent_eval/llm_agent_step_efficiency.py
new file mode 100644
index 00000000..cb3ab959
--- /dev/null
+++ b/dingo/model/llm/agent_eval/llm_agent_step_efficiency.py
@@ -0,0 +1,75 @@
+"""
+LLMAgentStepEfficiency: Evaluates whether the agent executed its task with minimal redundant steps.
+
+Detects wasted steps, execution loops, and unnecessary operations in the agent trace,
+scoring higher for lean and purposeful execution.
+"""
+
+from typing import List
+
+from dingo.io.input import Data, RequiredField
+from dingo.model import Model
+from dingo.model.llm.agent_eval.base_llm_agent_eval import BaseLLMAgentEval
+
+
+@Model.llm_register("LLMAgentStepEfficiency")
+class LLMAgentStepEfficiency(BaseLLMAgentEval):
+    """
+    Evaluates the step efficiency of an agent's execution trace.
+
+    Input:
+        prompt  - The task objective or user request
+        content - The agent execution trace or step-by-step summary
+
+    Output score reflects how efficiently the agent reached its goal,
+    penalizing redundant steps, loops, and unnecessary operations.
+    """
+
+    eval_layer = "execution"
+    input_data_type = "trace_summary"
+    default_threshold = 0.5
+
+    _required_fields = [RequiredField.PROMPT, RequiredField.CONTENT]
+
+    prompt = """You are an expert evaluator assessing the step efficiency of an AI agent's execution.
+
+Analyze the agent's execution trace and identify:
+- **Total steps**: Count all steps/actions taken by the agent
+- **Necessary steps**: Steps that directly contribute to completing the task
+- **Wasted steps**: Redundant, repeated, or unnecessary steps
+- **Loops detected**: Whether the agent got stuck in a repetitive pattern
+
+A score of 10 means perfectly efficient execution with no wasted steps.
+A score of 0 means the agent was completely stuck in loops or took entirely unnecessary actions.
+
+Respond in the same language as the input content for the "reason" field.
+
+Return your evaluation as a JSON object with this exact schema:
+{
+  "total_steps": <integer>,
+  "necessary_steps": <integer>,
+  "wasted_steps": <integer>,
+  "loops_detected": <boolean>,
+  "score": <integer 0-10>,
+  "reason": "<concise explanation including specific examples of inefficiency if found>"
+}
+
+Do not include any text outside the JSON object."""
+
+    @classmethod
+    def build_messages(cls, input_data: Data) -> List[dict]:
+        """Build LLM messages for step efficiency evaluation."""
+        lang_hint = cls._detect_language_hint(
+            str(input_data.prompt) + str(input_data.content)
+        )
+        user_content = f"""{cls.prompt}
+
+## Task Objective
+{input_data.prompt}
+
+## Agent Execution Trace
+{input_data.content}
+
+Analyze the execution efficiency and return the JSON evaluation.{lang_hint}"""
+
+        return [{"role": "user", "content": user_content}]
diff --git a/dingo/model/llm/agent_eval/llm_agent_task_completion.py b/dingo/model/llm/agent_eval/llm_agent_task_completion.py
new file mode 100644
index 00000000..ec21019b
--- /dev/null
+++ b/dingo/model/llm/agent_eval/llm_agent_task_completion.py
@@ -0,0 +1,72 @@
+"""
+LLMAgentTaskCompletion: Evaluates whether an Agent successfully completed its assigned task.
+
+Compares the task objective (prompt) against the execution result summary (content)
+and scores on goal achievement, accuracy, and completeness.
+"""
+
+from typing import List
+
+from dingo.io.input import Data, RequiredField
+from dingo.model import Model
+from dingo.model.llm.agent_eval.base_llm_agent_eval import BaseLLMAgentEval
+
+
+@Model.llm_register("LLMAgentTaskCompletion")
+class LLMAgentTaskCompletion(BaseLLMAgentEval):
+    """
+    LLM judge that scores how completely an agent fulfilled its task.
+
+    Input:
+        prompt  - the task objective or user request
+        content - summary of the result the agent's execution produced
+
+    The judge rates goal achievement, accuracy and completeness (1-5 each)
+    and reports one overall 0-10 score together with a short reason.
+    """
+
+    eval_layer = "execution"
+    input_data_type = "trace_summary"
+    default_threshold = 0.6
+
+    _required_fields = [RequiredField.PROMPT, RequiredField.CONTENT]
+
+    prompt = """You are an expert evaluator assessing whether an AI agent successfully completed its assigned task.
+
+Evaluate the agent's performance across three dimensions:
+1. **Goal Achievement** (1-5): Did the agent accomplish the main objective?
+2. **Accuracy** (1-5): Is the result correct and free of errors?
+3. **Completeness** (1-5): Did the agent address all aspects of the task?
+
+Then compute an overall score from 0 to 10 reflecting the combined quality.
+
+Respond in the same language as the input content for the "reason" field.
+
+Return your evaluation as a JSON object with this exact schema:
+{
+  "goal_achievement": <integer 1-5>,
+  "accuracy": <integer 1-5>,
+  "completeness": <integer 1-5>,
+  "score": <integer 0-10>,
+  "reason": "<concise explanation of the evaluation>"
+}
+
+Do not include any text outside the JSON object."""
+
+    @classmethod
+    def build_messages(cls, input_data: Data) -> List[dict]:
+        """Assemble the single user message for the task completion judge."""
+        # The language hint is derived from the objective and result together.
+        combined_text = str(input_data.prompt) + str(input_data.content)
+        language_hint = cls._detect_language_hint(combined_text)
+        message_body = f"""{cls.prompt}
+
+## Task Objective
+{input_data.prompt}
+
+## Agent Execution Result
+{input_data.content}
+
+Evaluate whether the agent completed its task and return the JSON evaluation.{language_hint}"""
+
+        return [{"role": "user", "content": message_body}]
diff --git a/dingo/model/llm/agent_eval/llm_agent_tool_correctness.py b/dingo/model/llm/agent_eval/llm_agent_tool_correctness.py
new file mode 100644
index 00000000..2d0f5232
--- /dev/null
+++ b/dingo/model/llm/agent_eval/llm_agent_tool_correctness.py
@@ -0,0 +1,82 @@
+"""
+LLMAgentToolCorrectness: Evaluates whether the agent selected the correct tools for each step.
+
+Performs referenceless evaluation — no expected tool sequence is required.
+The LLM judge assesses tool choices based on the task objective and the
+context of each tool invocation.
+"""
+
+from typing import List
+
+from dingo.io.input import Data, RequiredField
+from dingo.model import Model
+from dingo.model.llm.agent_eval.base_llm_agent_eval import BaseLLMAgentEval
+
+
+@Model.llm_register("LLMAgentToolCorrectness")
+class LLMAgentToolCorrectness(BaseLLMAgentEval):
+    """
+    LLM judge for the appropriateness of an agent's tool selections.
+
+    Input:
+        prompt  - the task objective or user request
+        content - JSON-formatted sequence of the agent's tool calls
+
+    The evaluation is referenceless: no ground-truth tool sequence is
+    supplied, so the judge decides from the task and the call sequence
+    alone whether each call was the right tool and was actually needed.
+    """
+
+    eval_layer = "action"
+    input_data_type = "tool_calls"
+    default_threshold = 0.6
+
+    _required_fields = [RequiredField.PROMPT, RequiredField.CONTENT]
+
+    prompt = """You are an expert evaluator assessing whether an AI agent selected the correct tools during task execution.
+
+For each tool call in the sequence, determine:
+- Was this the right tool for the situation?
+- Was this tool call necessary, or was it redundant?
+
+Count:
+- **correct_calls**: Tool calls that were appropriate and necessary
+- **total_calls**: Total number of tool calls made
+- **redundant_calls**: Tool calls that were unnecessary or duplicated without reason
+
+List specific issues (wrong tool chosen, tool used out of order, missing tool that should have been called, etc.).
+
+A score of 10 means every tool call was correct and necessary.
+A score of 0 means all tool calls were wrong or the agent failed to use required tools.
+
+Respond in the same language as the input content for the "reason" field.
+
+Return your evaluation as a JSON object with this exact schema:
+{
+  "correct_calls": <integer>,
+  "total_calls": <integer>,
+  "redundant_calls": <integer>,
+  "score": <integer 0-10>,
+  "issues": ["<issue description>", ...],
+  "reason": "<concise summary of tool selection quality>"
+}
+
+Do not include any text outside the JSON object."""
+
+    @classmethod
+    def build_messages(cls, input_data: Data) -> List[dict]:
+        """Assemble the single user message for the tool correctness judge."""
+        # The language hint is derived from the objective and calls together.
+        combined_text = str(input_data.prompt) + str(input_data.content)
+        language_hint = cls._detect_language_hint(combined_text)
+        message_body = f"""{cls.prompt}
+
+## Task Objective
+{input_data.prompt}
+
+## Tool Call Sequence
+{input_data.content}
+
+Evaluate the tool selections and return the JSON evaluation.{language_hint}"""
+
+        return [{"role": "user", "content": message_body}]
diff --git a/dingo/model/llm/agent_eval/llm_agent_trace_conclusion.py b/dingo/model/llm/agent_eval/llm_agent_trace_conclusion.py
new file mode 100644
index 00000000..5cb21869
--- /dev/null
+++ b/dingo/model/llm/agent_eval/llm_agent_trace_conclusion.py
@@ -0,0 +1,119 @@
+"""
+LLMAgentTraceConclusion — synthesizes all evaluation results into a structured diagnosis.
+
+Called after all other evaluators complete for a trace. Takes the full set of
+evaluation scores as input and produces:
+- Overall severity (critical / warning / good)
+- Root cause analysis
+- Actionable recommendations
+- A single aggregate score (0-10)
+
+This is NOT an evaluator in the traditional sense — it's a diagnostic synthesizer.
+"""
+
+from typing import List
+
+from dingo.io.input import Data, RequiredField
+from dingo.io.output.eval_detail import EvalDetail, QualityLabel
+from dingo.model.llm.agent_eval.base_llm_agent_eval import BaseLLMAgentEval
+from dingo.model.model import Model
+
+# str.format() template: {objective}, {eval_results}, {trace_summary}; literal braces are doubled.
+CONCLUSION_PROMPT = """You are an AI agent quality analyst. Given the evaluation results from multiple evaluators that assessed an agent's execution trace, synthesize a comprehensive diagnosis.
+
+## Task Objective
+{objective}
+
+## Evaluation Results
+{eval_results}
+
+## Trace Summary
+{trace_summary}
+
+## Instructions
+Analyze all evaluation scores and produce a structured JSON diagnosis:
+
+1. **severity**: "critical" (any score < 0.3), "warning" (any score < 0.6), or "good" (all scores >= 0.6)
+2. **root_causes**: List the primary reasons for any failures or low scores
+3. **recommendations**: Actionable suggestions to improve the agent's performance
+4. **highlights**: What the agent did well
+5. **score**: Overall quality score from 0 to 10, weighing task completion most heavily
+
+Output STRICTLY as JSON:
+```json
+{{
+    "severity": "critical|warning|good",
+    "root_causes": ["cause 1", "cause 2"],
+    "recommendations": ["rec 1", "rec 2"],
+    "highlights": ["highlight 1"],
+    "score": 0-10,
+    "summary": "One-paragraph overall assessment in the same language as the task objective"
+}}
+```"""
+
+
+@Model.llm_register("LLMAgentTraceConclusion")
+class LLMAgentTraceConclusion(BaseLLMAgentEval):
+    """Synthesize evaluation results into a structured trace-level diagnosis.
+
+    Input: prompt = task objective, content = evaluator results, context =
+    optional trace summary. Status and label follow the judge's severity.
+    """
+
+    eval_layer = "conclusion"
+    input_data_type = "eval_synthesis"
+    default_threshold = 0.5
+    _required_fields = [RequiredField.PROMPT, RequiredField.CONTENT]
+
+    @classmethod
+    def build_messages(cls, input_data: Data) -> List[dict]:
+        """Fill CONCLUSION_PROMPT from the input and wrap it as one user message."""
+        objective = getattr(input_data, "prompt", "") or "Agent trace"
+        eval_results = getattr(input_data, "content", "") or "{}"
+        trace_summary = getattr(input_data, "context", "") or ""
+        lang_hint = cls._detect_language_hint(str(input_data.prompt) + str(input_data.content))
+        prompt_text = CONCLUSION_PROMPT.format(
+            objective=objective,
+            eval_results=eval_results,
+            trace_summary=trace_summary,
+        ) + lang_hint
+        return [{"role": "user", "content": prompt_text}]
+
+    @classmethod
+    def process_response(cls, response: str) -> EvalDetail:
+        """Turn the judge's JSON reply into an EvalDetail.
+
+        A missing, non-numeric or NaN score falls back to 5; severity is matched
+        case-insensitively and anything but "good"/"critical" is a warning.
+        """
+        from dingo.utils import log
+        log.info(response)
+
+        data = cls._parse_json_response(response)
+        try:
+            raw_score = float(data.get("score", 5))
+        except (TypeError, ValueError):
+            raw_score = 5.0
+        if raw_score != raw_score:  # NaN; the clamp below would report it as 1.0
+            raw_score = 5.0
+        # Judges vary in casing and padding ("Good", " CRITICAL "), so normalize.
+        severity = str(data.get("severity", "warning")).strip().lower()
+        result = EvalDetail(metric=cls.__name__)
+        result.score = max(0.0, min(1.0, raw_score / 10.0))
+        result.status = severity != "good"
+        if severity == "good":
+            result.label = [QualityLabel.QUALITY_GOOD]
+        elif severity == "critical":
+            result.label = ["AGENT_QUALITY.TraceConclusion.CRITICAL"]
+        else:
+            result.label = ["AGENT_QUALITY.TraceConclusion.WARNING"]
+
+        import json
+        diagnosis = {
+            "severity": severity,
+            "root_causes": data.get("root_causes", []),
+            "recommendations": data.get("recommendations", []),
+            "highlights": data.get("highlights", []),
+        }
+        result.reason = [data.get("summary", ""), json.dumps(diagnosis, ensure_ascii=False)]
+        return result
diff --git a/dingo/model/rule/rule_agent.py b/dingo/model/rule/rule_agent.py
new file mode 100644
index 00000000..2606a81c
--- /dev/null
+++ b/dingo/model/rule/rule_agent.py
@@ -0,0 +1,230 @@
+"""
+Agent-specific rule evaluators for deterministic quality checks.
+
+These rules run without LLM calls, checking structural properties
+of agent execution traces (loops, token budget, latency anomalies).
+"""
+
+import json
+import statistics
+from typing import List, Optional
+
+from dingo.io.input import Data, RequiredField
+from dingo.io.output.eval_detail import EvalDetail, QualityLabel
+from dingo.model.model import Model
+from dingo.model.rule.base import BaseRule
+
+
+@Model.rule_register("AGENT_TRACE_QUALITY", ["agent_trace_basic"])
+class RuleAgentTraceLoopDetection(BaseRule):
+    """Detect repetitive tool call patterns indicating infinite loops.
+
+    Input: content = JSON array of tool call objects with 'tool_name' field.
+    Detection: n-gram analysis on tool name sequences.
+    A loop is detected when the same subsequence of 2+ tool names
+    repeats 3 or more consecutive times.
+    """
+
+    _required_fields = [RequiredField.CONTENT]
+
+    @classmethod
+    def eval(cls, input_data: Data) -> EvalDetail:
+        """Flag the trace when its tool-name sequence contains a repeated pattern."""
+        result = EvalDetail(metric=cls.__name__)
+
+        tool_names = cls._extract_tool_names(input_data.content)
+        # Fewer than 6 names cannot hold a 2-name pattern repeated 3 times.
+        loop_info = cls._detect_loops(tool_names) if len(tool_names) >= 6 else None
+        if not loop_info:
+            result.label = [QualityLabel.QUALITY_GOOD]
+            return result
+
+        result.status = True
+        result.label = [f"{cls.metric_type}.{cls.__name__}"]
+        result.reason = [
+            f"Loop detected: pattern {loop_info['pattern']} "
+            f"repeats {loop_info['count']} times at position {loop_info['position']}"
+        ]
+        return result
+
+    @classmethod
+    def _extract_tool_names(cls, content: str) -> List[str]:
+        """Return the non-empty tool names found in *content*, in call order.
+
+        Accepts a JSON string or already-parsed data: a list of call objects,
+        or a dict holding that list under "tool_calls" (else "steps"). Each
+        name is read from "tool_name", falling back to "name". Unparseable
+        input, non-list containers and non-dict entries yield no names.
+        """
+        try:
+            data = json.loads(content) if isinstance(content, str) else content
+        except (json.JSONDecodeError, TypeError):
+            return []
+
+        if isinstance(data, dict):
+            data = data.get("tool_calls", data.get("steps", []))
+        if not isinstance(data, (list, tuple)):
+            return []
+
+        # The isinstance() filter must guard every .get(): the unparenthesised
+        # `a and b or c` form evaluated `c` for non-dict entries and raised.
+        names = (
+            item.get("tool_name") or item.get("name")
+            for item in data
+            if isinstance(item, dict)
+        )
+        return [name for name in names if name]
+
+    @classmethod
+    def _detect_loops(
+        cls, names: List[str], min_pattern_len: int = 2, min_repeats: int = 3
+    ) -> Optional[dict]:
+        """Return the first pattern repeated >= min_repeats times in a row, else None."""
+        for pattern_len in range(min_pattern_len, len(names) // min_repeats + 1):
+            for start in range(len(names) - pattern_len * min_repeats + 1):
+                pattern = names[start : start + pattern_len]
+                count, pos = 1, start + pattern_len
+                # A short tail slice never equals the pattern, so this stops at the end.
+                while names[pos : pos + pattern_len] == pattern:
+                    count += 1
+                    pos += pattern_len
+                if count >= min_repeats:
+                    return {"pattern": pattern, "count": count, "position": start}
+        return None
+
+
+@Model.rule_register("AGENT_TRACE_QUALITY", ["agent_trace_basic"])
+class RuleAgentTraceTokenBudget(BaseRule):
+    """Flag traces whose reported total token count is above a budget.
+
+    Input: content = JSON with 'total_tokens' field, or metadata with token info.
+    The budget is 500,000 tokens unless dynamic_config.threshold overrides it.
+    """
+
+    _required_fields = [RequiredField.CONTENT]
+    dynamic_config = None
+
+    @classmethod
+    def eval(cls, input_data: Data) -> EvalDetail:
+        """Compare the extracted token total with the budget and label the trace."""
+        detail = EvalDetail(metric=cls.__name__)
+        limit = 500_000
+        if cls.dynamic_config and hasattr(cls.dynamic_config, "threshold"):
+            try:
+                limit = int(cls.dynamic_config.threshold)
+            except (TypeError, ValueError):
+                pass  # unusable override: keep the default budget
+
+        used = cls._extract_tokens(input_data)
+        if used is None:
+            # No token figure was found, so there is nothing to flag.
+            detail.label = [QualityLabel.QUALITY_GOOD]
+            return detail
+        if used <= limit:
+            detail.label = [QualityLabel.QUALITY_GOOD]
+            detail.score = 1.0
+            return detail
+
+        detail.status = True
+        detail.label = [f"{cls.metric_type}.{cls.__name__}"]
+        detail.reason = [f"Token usage {used:,} exceeds budget {limit:,}"]
+        # The ratio shrinks as the overrun grows; non-positive usage scores 0.0.
+        detail.score = min(1.0, limit / used) if used > 0 else 0.0
+        return detail
+
+    @classmethod
+    def _extract_tokens(cls, input_data: Data) -> Optional[int]:
+        """Return the first usable 'total_tokens' from content, then metadata."""
+        for raw in (input_data.content, getattr(input_data, "metadata", None)):
+            if raw is None:
+                continue
+            try:
+                parsed = json.loads(raw) if isinstance(raw, str) else raw
+            except (json.JSONDecodeError, TypeError):
+                continue
+            value = parsed.get("total_tokens") if isinstance(parsed, dict) else None
+            if value is None:
+                continue
+            try:
+                return int(value)
+            except (TypeError, ValueError):
+                continue
+        return None
+
+
+@Model.rule_register("AGENT_TRACE_QUALITY", ["agent_trace_basic"])
+class RuleAgentTraceLatencyAnomaly(BaseRule):
+    """Detect abnormally slow steps using statistical outlier analysis.
+
+    Input: content = JSON array of step objects with 'duration' or 'duration_seconds' field.
+    A step is flagged if its duration exceeds mean + 3*stddev.
+    """
+
+    _required_fields = [RequiredField.CONTENT]
+
+    @classmethod
+    def eval(cls, input_data: Data) -> EvalDetail:
+        """Flag steps whose duration exceeds mean + 3 * stdev of the positive durations."""
+        result = EvalDetail(metric=cls.__name__)
+        steps = cls._extract_steps(input_data.content)
+        durations = [s["duration"] for s in steps if s["duration"] is not None and s["duration"] > 0]
+        if len(durations) < 3:
+            result.label = [QualityLabel.QUALITY_GOOD]
+            return result
+
+        # NOTE(review): statistics.stdev is the sample deviation, so by Samuelson's
+        # inequality no value can exceed mean + 3*stdev with fewer than 11 samples.
+        mean = statistics.mean(durations)
+        threshold = mean + 3 * statistics.stdev(durations)
+        anomalies = [s for s in steps if s["duration"] is not None and s["duration"] > threshold]
+        if not anomalies:
+            result.label = [QualityLabel.QUALITY_GOOD]
+            return result
+
+        result.status = True
+        result.label = [f"{cls.metric_type}.{cls.__name__}"]
+        result.reason = [
+            f"Step '{a['name']}' took {a['duration']:.2f}s "
+            f"(threshold: {threshold:.2f}s, mean: {mean:.2f}s)"
+            for a in anomalies[:5]  # report at most five steps
+        ]
+        return result
+
+    @classmethod
+    def _extract_steps(cls, content: str) -> List[dict]:
+        """Return a {"name", "duration"} record for every dict entry in *content*.
+
+        Accepts a JSON string or parsed data: a list of steps, or a dict holding
+        that list under "steps" (else "tool_calls"). The name is read from "name",
+        then "tool_name", then defaults to "unknown"; a duration that is missing
+        or not numeric becomes None.
+        """
+        try:
+            data = json.loads(content) if isinstance(content, str) else content
+        except (json.JSONDecodeError, TypeError):
+            return []
+
+        if isinstance(data, dict):
+            data = data.get("steps", data.get("tool_calls", []))
+        if not isinstance(data, (list, tuple)):
+            return []  # e.g. {"steps": null}; iterating it would raise
+
+        return [
+            {
+                # Tool-call entries may carry "tool_name" instead of "name".
+                "name": item.get("name") or item.get("tool_name") or "unknown",
+                "duration": cls._safe_float(
+                    item.get("duration", item.get("duration_seconds"))
+                ),
+            }
+            for item in data
+            if isinstance(item, dict)
+        ]
+
+    @classmethod
+    def _safe_float(cls, val) -> Optional[float]:
+        """Return *val* as a float, or None when it is missing or not numeric."""
+        try:
+            return float(val)
+        except (TypeError, ValueError):
+            return None  # float(None) raises TypeError, so None maps to None too

From 0965f01cd44d94eeff25aecc8b9db1269ffb6c6f Mon Sep 17 00:00:00 2001
From: Sean <liuyuxin@pjlab.org.cn>
Date: Mon, 8 Jun 2026 14:12:00 +0800
Subject: [PATCH 2/2] chore: update .gitignore with AI artifacts, benchmarks,
 and downloads

---
 .gitignore | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/.gitignore b/.gitignore
index 91b2065f..e45c88dc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,3 +46,20 @@ outputs/
 coverage.xml
 
 dingo_python.egg-info/*
+
+# Claude Code / AI assistant artifacts
+.claude/
+CLAUDE.md
+CLAUDE.local.md
+claude_docs/
+
+# Benchmarks and experiment data
+benchmarks/
+meta_rater/
+
+# Downloaded models / large binary files
+downloads/
+
+# Temporary docs (plans, specs)
+docs/plans/
+docs/superpowers/