MigoXLab · seancoding-day · Jun 5, 2026 · Jun 8, 2026 · gemini-code-assist · Jun 12, 2026
diff --git a/.gitignore b/.gitignore
@@ -46,3 +46,20 @@ outputs/
 coverage.xml
 
 dingo_python.egg-info/*
+
+# Claude Code / AI assistant artifacts
+.claude/
+CLAUDE.md
+CLAUDE.local.md
+claude_docs/
+
+# Benchmarks and experiment data
+benchmarks/
+meta_rater/
+
+# Downloaded models / large binary files
+downloads/
+
+# Temporary docs (plans, specs)
+docs/plans/
+docs/superpowers/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,23 +1,23 @@
-# See https://pre-commit.com for more information
-# See https://pre-commit.com/hooks.html for more hooks
-repos:
--   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
-    hooks:
-    -   id: trailing-whitespace
-        exclude: '^README.*\.md$'
-    -   id: end-of-file-fixer
-        exclude: 'docs/metrics\.md'
-    -   id: check-yaml
-    -   id: check-added-large-files
--   repo: https://github.com/PyCQA/isort
-    rev: 6.0.1
-    hooks:
-    -   id: isort
-        args: [ "-l", "200", "-m", "0", "-p", "dingo" ]
--   repo: https://github.com/PyCQA/flake8
-    rev: 7.2.0
-    hooks:
-    -   id: flake8
-        args: [ "--max-line-length=2200", "--ignore=E121,E131,E125,W503,W504,W604,E203,E231,E702,E128,F541,F401,E266" ]
-        exclude: 'app/'
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+    -   id: trailing-whitespace
+        exclude: '^README.*\.md$'
+    -   id: end-of-file-fixer
+        exclude: 'docs/metrics\.md'
+    -   id: check-yaml
+    -   id: check-added-large-files
+-   repo: https://github.com/PyCQA/isort
+    rev: 6.0.1
+    hooks:
+    -   id: isort
+        args: [ "-l", "200", "-m", "0", "-p", "dingo" ]
+-   repo: https://github.com/PyCQA/flake8
+    rev: 7.2.0
+    hooks:
+    -   id: flake8
+        args: [ "--max-line-length=2200", "--ignore=E121,E131,E125,W503,W504,W604,E203,E231,E702,E128,F541,F401,E266" ]
+        exclude: 'app/'
diff --git a/dingo/model/llm/agent_eval/__init__.py b/dingo/model/llm/agent_eval/__init__.py
@@ -0,0 +1,10 @@
+"""
+Agent Trace Evaluation metrics (LLMAgent* series).
+
+These evaluators assess agent trace quality using LLM-as-Judge methodology.
+They are distinct from the agent/ directory which contains agent-framework-based
+evaluators (AgentFactCheck, AgentHallucination) that USE agent frameworks to DO evaluation.
+
+Evaluators in this package EVALUATE agent traces — task completion, plan quality,
+tool correctness, error recovery, etc.
+"""
diff --git a/dingo/model/llm/agent_eval/base_llm_agent_eval.py b/dingo/model/llm/agent_eval/base_llm_agent_eval.py
@@ -0,0 +1,112 @@
+"""
+Base class for all Agent evaluation metrics.
+
+Provides standardized:
+- 0~10 score → 0.0~1.0 normalization
+- Configurable threshold via dynamic_config.model_extra
+- Unified JSON response parsing with {"score": 0-10, "reason": "...", ...}
+- Error fallback handling
+- `eval_layer` and `input_data_type` declarations for orchestrator integration
+
+Subclasses only need to define:
+- `prompt`: the evaluation prompt template
+- `build_messages()`: how to format input data into LLM messages
+- Optionally override `process_response()` for custom parsing
+"""
+
+import json
+from typing import Optional
+
+from dingo.io.input import Data, RequiredField
+from dingo.io.output.eval_detail import EvalDetail, QualityLabel
+from dingo.model.llm.base_openai import BaseOpenAI
+from dingo.utils import log
+from dingo.utils.exception import ConvertJsonError
+
+
+class BaseLLMAgentEval(BaseOpenAI):
+    """Shared base class for all Agent evaluation metrics.
+
+    Parses the judge's JSON reply, normalizes its 0-10 score to 0.0-1.0 and
+    compares it against a configurable threshold to build an ``EvalDetail``.
+    """
+
+    # Declarations read by the orchestrator; concrete evaluators override them.
+    eval_layer: str = ""
+    input_data_type: str = "trace_summary"
+    # Minimum normalized score (0.0-1.0) for a sample to be labelled good.
+    default_threshold: float = 0.6
+
+    @classmethod
+    def _detect_language_hint(cls, text: str) -> str:
+        """Return a Chinese-language instruction when ``text`` contains CJK.
+
+        Only the first 500 characters are inspected. More than five CJK
+        ideographs yields the instruction; otherwise "" is returned.
+        """
+        if not text:
+            return ""
+        import re
+        cjk_count = len(re.findall(r'[一-鿿㐀-䶿]', text[:500]))
+        if cjk_count > 5:
+            return '\n\n注意：请用中文回答 "reason" 字段。'
+        return ""
+
+    @classmethod
+    def _get_threshold(cls) -> float:
+        """Return the pass threshold as a float.
+
+        Reads ``threshold`` from ``dynamic_config.model_extra`` when present.
+        Falls back to ``default_threshold`` when there is no config, no extra
+        fields, or the configured value is ``None``.
+
+        Raises:
+            ValueError / TypeError: if the configured value cannot be converted
+                to ``float``.
+        """
+        extra = cls.dynamic_config.model_extra if cls.dynamic_config else None
+        if not extra:
+            return cls.default_threshold
+        configured = extra.get("threshold")
+        # An explicit null means "not configured", not "crash in float()".
+        if configured is None:
+            return cls.default_threshold
+        return float(configured)
+
+    @classmethod
+    def _strip_json_fences(cls, response: str) -> str:
+        """Remove a surrounding Markdown code fence from ``response``."""
+        response = response.strip()
+        # Match the language tag case-insensitively so "```JSON" is handled
+        # exactly like "```json".
+        if response[:7].lower() == "```json":
+            response = response[7:]
+        if response.startswith("```"):
+            response = response[3:]
+        if response.endswith("```"):
+            response = response[:-3]
+        return response.strip()
+
+    @classmethod
+    def _parse_json_response(cls, response: str) -> dict:
+        """Parse the judge reply into a dict.
+
+        Raises:
+            ConvertJsonError: if the reply is not valid JSON, or is valid JSON
+                but not an object (e.g. a list or a bare number).
+        """
+        cleaned = cls._strip_json_fences(response)
+        try:
+            parsed = json.loads(cleaned)
+        except json.JSONDecodeError as exc:
+            # Chain the decoder error so the position of the failure is kept.
+            raise ConvertJsonError(
+                f"Failed to parse agent eval JSON: {cleaned[:500]}"
+            ) from exc
+        # Callers use ``.get``; reject non-object JSON here with the same
+        # exception type instead of failing later with AttributeError.
+        if not isinstance(parsed, dict):
+            raise ConvertJsonError(
+                f"Agent eval JSON is not an object: {cleaned[:500]}"
+            )
+        return parsed
+
+    @classmethod
+    def process_response(cls, response: str) -> EvalDetail:
+        """Standardized response processing for agent evaluators.
+
+        Expected LLM output: {"score": 0-10, "reason": "...", ...extra fields...}
+        ``overall_score`` is accepted when ``score`` is absent. The score is
+        normalized to 0.0~1.0 (clamped) and compared against the threshold.
+        A missing, non-numeric or NaN score counts as 0.
+        Extra fields are preserved in EvalDetail.reason as JSON.
+
+        Returns:
+            EvalDetail with ``status=False`` and the QUALITY_GOOD label when the
+            normalized score reaches the threshold, otherwise ``status=True``
+            and an ``AGENT_QUALITY.<ClassName>`` label.
+
+        Raises:
+            ConvertJsonError: if the response is not a JSON object.
+        """
+        import math
+
+        log.info(response)
+        data = cls._parse_json_response(response)
+
+        raw_score = data.get("score", data.get("overall_score", 0))
+        try:
+            raw_score = float(raw_score)
+        except (TypeError, ValueError):
+            raw_score = 0.0
+        # NaN compares false against everything, so the min()/max() clamp
+        # below would turn it into a perfect 1.0; treat it as unparseable.
+        if math.isnan(raw_score):
+            raw_score = 0.0
+
+        normalized_score = max(0.0, min(1.0, raw_score / 10.0))
+        threshold = cls._get_threshold()
+
+        reason_text = data.get("reason", "")
+        details = {k: v for k, v in data.items() if k not in ("score", "reason")}
+
+        result = EvalDetail(metric=cls.__name__)
+        result.score = normalized_score
+
+        if normalized_score >= threshold:
+            result.status = False
+            result.label = [QualityLabel.QUALITY_GOOD]
+        else:
+            result.status = True
+            result.label = [f"AGENT_QUALITY.{cls.__name__}"]
+
+        # Reason list: free-text reason first, then remaining fields as JSON.
+        reason_parts = [reason_text] if reason_text else []
+        if details:
+            reason_parts.append(json.dumps(details, ensure_ascii=False, default=str))
+        result.reason = reason_parts if reason_parts else None
+
+        return result
diff --git a/dingo/model/llm/agent_eval/llm_agent_argument_correctness.py b/dingo/model/llm/agent_eval/llm_agent_argument_correctness.py
@@ -0,0 +1,82 @@
+"""
+LLMAgentArgumentCorrectness: Evaluates whether the agent passed correct arguments to each tool call.
+
+Performs referenceless LLM-judge evaluation — no ground-truth arguments are required.
+The judge assesses argument quality based on the task objective and the expected
+semantics of each tool.
+"""
+
+from typing import List
+
+from dingo.io.input import Data, RequiredField
+from dingo.model import Model
+from dingo.model.llm.agent_eval.base_llm_agent_eval import BaseLLMAgentEval
+
+
+@Model.llm_register("LLMAgentArgumentCorrectness")
+class LLMAgentArgumentCorrectness(BaseLLMAgentEval):
+    """
+    LLM-judge metric for the arguments an agent passed to its tool calls.
+
+    Input:
+        prompt  - The task objective or user request
+        content - JSON-formatted sequence of tool calls with their arguments
+
+    The evaluation is referenceless: no ground-truth argument values are
+    supplied. The judge decides from the task objective and each tool's
+    semantics whether the arguments were correct, well-formed and appropriate.
+    """
+
+    _required_fields = [RequiredField.PROMPT, RequiredField.CONTENT]
+
+    eval_layer = "action"
+    input_data_type = "tool_calls"
+    default_threshold = 0.6
+
+    prompt = """You are an expert evaluator assessing whether an AI agent passed correct arguments to its tool calls.
+
+For each tool call in the sequence, evaluate the arguments:
+- Are the argument values correct and appropriate for the task context?
+- Are required arguments present and non-null?
+- Are argument types and formats valid?
+- Do the arguments make semantic sense given what the tool does?
+
+Count:
+- **correct_args**: Tool calls where arguments were fully correct
+- **total_calls**: Total number of tool calls evaluated
+
+List specific argument issues found (wrong value, missing required argument, type mismatch, semantically incorrect argument, etc.).
+
+A score of 10 means all tool calls had perfectly correct arguments.
+A score of 0 means all tool calls had wrong or missing arguments.
+
+Respond in the same language as the input content for the "reason" field.
+
+Return your evaluation as a JSON object with this exact schema:
+{
+  "correct_args": <integer>,
+  "total_calls": <integer>,
+  "issues": ["<issue description, e.g. tool X received wrong value for param Y>", ...],
+  "score": <integer 0-10>,
+  "reason": "<concise summary of argument correctness across all tool calls>"
+}
+
+Do not include any text outside the JSON object."""
+
+    @classmethod
+    def build_messages(cls, input_data: Data) -> List[dict]:
+        """Assemble the single user message sent to the judge.
+
+        The message is the class prompt, then the task objective and the tool
+        call sequence under their own headings, then a closing instruction
+        with the language hint (if any) appended.
+        """
+        task = input_data.prompt
+        tool_calls = input_data.content
+        # The hint is derived from the task and tool-call text together.
+        hint = cls._detect_language_hint(str(task) + str(tool_calls))
+
+        sections = [
+            f"{cls.prompt}",
+            "",
+            "## Task Objective",
+            f"{task}",
+            "",
+            "## Tool Call Sequence with Arguments",
+            f"{tool_calls}",
+            "",
+            "Evaluate the tool arguments and return the JSON evaluation." + hint,
+        ]
+        message = {"role": "user", "content": "\n".join(sections)}
+        return [message]