From e7dc9a71a73d063b7271570d3da3a4ed6b858ef6 Mon Sep 17 00:00:00 2001 From: Sean Date: Fri, 5 Jun 2026 09:52:24 +0800 Subject: [PATCH 1/2] feat(agent-eval): add LLMAgent* evaluators for agent trace monitoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add 8 LLM evaluators + 3 rule evaluators for AI agent trace quality assessment, aligned with DeepEval 3-layer taxonomy (Execution/Action/Reasoning) + Recovery. LLM evaluators (dingo/model/llm/agent_eval/): - LLMAgentTaskCompletion, LLMAgentStepEfficiency (Execution) - LLMAgentToolCorrectness, LLMAgentArgumentCorrectness (Action) - LLMAgentPlanQuality, LLMAgentPlanAdherence (Reasoning) - LLMAgentErrorRecovery (Recovery), LLMAgentTraceConclusion (synthesis) Rule evaluators: RuleAgentTraceLoopDetection, RuleAgentTraceTokenBudget, RuleAgentTraceLatencyAnomaly Shared BaseLLMAgentEval with 0-10→0.0-1.0 normalization, CJK language detection, and configurable threshold. --- .pre-commit-config.yaml | 46 ++-- dingo/model/llm/agent_eval/__init__.py | 10 + .../llm/agent_eval/base_llm_agent_eval.py | 112 +++++++++ .../llm_agent_argument_correctness.py | 82 +++++++ .../agent_eval/llm_agent_error_recovery.py | 150 ++++++++++++ .../agent_eval/llm_agent_plan_adherence.py | 85 +++++++ .../llm/agent_eval/llm_agent_plan_quality.py | 160 ++++++++++++ .../agent_eval/llm_agent_step_efficiency.py | 75 ++++++ .../agent_eval/llm_agent_task_completion.py | 72 ++++++ .../agent_eval/llm_agent_tool_correctness.py | 82 +++++++ .../agent_eval/llm_agent_trace_conclusion.py | 119 +++++++++ dingo/model/rule/rule_agent.py | 230 ++++++++++++++++++ 12 files changed, 1200 insertions(+), 23 deletions(-) create mode 100644 dingo/model/llm/agent_eval/__init__.py create mode 100644 dingo/model/llm/agent_eval/base_llm_agent_eval.py create mode 100644 dingo/model/llm/agent_eval/llm_agent_argument_correctness.py create mode 100644 dingo/model/llm/agent_eval/llm_agent_error_recovery.py create mode 100644 
dingo/model/llm/agent_eval/llm_agent_plan_adherence.py create mode 100644 dingo/model/llm/agent_eval/llm_agent_plan_quality.py create mode 100644 dingo/model/llm/agent_eval/llm_agent_step_efficiency.py create mode 100644 dingo/model/llm/agent_eval/llm_agent_task_completion.py create mode 100644 dingo/model/llm/agent_eval/llm_agent_tool_correctness.py create mode 100644 dingo/model/llm/agent_eval/llm_agent_trace_conclusion.py create mode 100644 dingo/model/rule/rule_agent.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c1fd47c1..6b50e489 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,23 +1,23 @@ -# See https://pre-commit.com for more information -# See https://pre-commit.com/hooks.html for more hooks -repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 - hooks: - - id: trailing-whitespace - exclude: '^README.*\.md$' - - id: end-of-file-fixer - exclude: 'docs/metrics\.md' - - id: check-yaml - - id: check-added-large-files -- repo: https://github.com/PyCQA/isort - rev: 6.0.1 - hooks: - - id: isort - args: [ "-l", "200", "-m", "0", "-p", "dingo" ] -- repo: https://github.com/PyCQA/flake8 - rev: 7.2.0 - hooks: - - id: flake8 - args: [ "--max-line-length=2200", "--ignore=E121,E131,E125,W503,W504,W604,E203,E231,E702,E128,F541,F401,E266" ] - exclude: 'app/' +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: trailing-whitespace + exclude: '^README.*\.md$' + - id: end-of-file-fixer + exclude: 'docs/metrics\.md' + - id: check-yaml + - id: check-added-large-files +- repo: https://github.com/PyCQA/isort + rev: 6.0.1 + hooks: + - id: isort + args: [ "-l", "200", "-m", "0", "-p", "dingo" ] +- repo: https://github.com/PyCQA/flake8 + rev: 7.2.0 + hooks: + - id: flake8 + args: [ "--max-line-length=2200", 
"--ignore=E121,E131,E125,W503,W504,W604,E203,E231,E702,E128,F541,F401,E266" ] + exclude: 'app/' diff --git a/dingo/model/llm/agent_eval/__init__.py b/dingo/model/llm/agent_eval/__init__.py new file mode 100644 index 00000000..a6e8dbbb --- /dev/null +++ b/dingo/model/llm/agent_eval/__init__.py @@ -0,0 +1,10 @@ +""" +Agent Trace Evaluation metrics (LLMAgent* series). + +These evaluators assess agent trace quality using LLM-as-Judge methodology. +They are distinct from the agent/ directory which contains agent-framework-based +evaluators (AgentFactCheck, AgentHallucination) that USE agent frameworks to DO evaluation. + +Evaluators in this package EVALUATE agent traces — task completion, plan quality, +tool correctness, error recovery, etc. +""" diff --git a/dingo/model/llm/agent_eval/base_llm_agent_eval.py b/dingo/model/llm/agent_eval/base_llm_agent_eval.py new file mode 100644 index 00000000..5669cf1e --- /dev/null +++ b/dingo/model/llm/agent_eval/base_llm_agent_eval.py @@ -0,0 +1,112 @@ +""" +Base class for all Agent evaluation metrics. 
class BaseLLMAgentEval(BaseOpenAI):
    """Shared base class for the LLMAgent* trace evaluators.

    Provides the helpers the concrete metrics build on: a language hint for
    prompts, threshold lookup, Markdown-fence stripping, JSON parsing, and the
    standard ``process_response`` that converts a judge reply of the form
    ``{"score": 0-10, "reason": "...", ...}`` into an ``EvalDetail`` whose score
    is on a 0.0-1.0 scale.
    """

    # Layer tag of the metric; empty here and overridden by each subclass.
    eval_layer: str = ""
    # Declared input shape of the metric (declared for orchestrator integration).
    input_data_type: str = "trace_summary"
    # Pass mark on the normalized 0.0-1.0 scale when the config supplies none.
    default_threshold: float = 0.6

    @classmethod
    def _detect_language_hint(cls, text: str) -> str:
        """Return a Chinese-language instruction for the judge, or ``""``.

        Only the first 500 characters of *text* are inspected; the hint is
        returned when more than five of them fall in the CJK ranges below.
        """
        if not text:
            return ""
        import re
        cjk_count = len(re.findall(r'[一-鿿㐀-䶿]', text[:500]))
        if cjk_count > 5:
            return '\n\n注意:请用中文回答 "reason" 字段。'
        return ""

    @classmethod
    def _get_threshold(cls) -> float:
        """Return the pass threshold.

        Reads ``threshold`` from ``dynamic_config.model_extra`` when that mapping
        is present and non-empty, otherwise falls back to ``default_threshold``.
        """
        if cls.dynamic_config and cls.dynamic_config.model_extra:
            return float(cls.dynamic_config.model_extra.get(
                "threshold", cls.default_threshold
            ))
        return cls.default_threshold

    @classmethod
    def _strip_json_fences(cls, response: str) -> str:
        """Remove a leading ```` ```json ````/```` ``` ```` fence and a trailing ```` ``` ```` fence."""
        response = response.strip()
        if response.startswith("```json"):
            response = response[7:]
        if response.startswith("```"):
            response = response[3:]
        if response.endswith("```"):
            response = response[:-3]
        return response.strip()

    @classmethod
    def _parse_json_response(cls, response: str) -> dict:
        """Parse the judge reply into a dict.

        Raises:
            ConvertJsonError: if the reply is not valid JSON, or is valid JSON
                but not an object (a list or scalar has no ``score``/``reason``
                keys, so callers could not use it).
        """
        cleaned = cls._strip_json_fences(response)
        try:
            parsed = json.loads(cleaned)
        except json.JSONDecodeError as err:
            # Keep the decoder error as the cause so the position info survives.
            raise ConvertJsonError(
                f"Failed to parse agent eval JSON: {cleaned[:500]}"
            ) from err
        if not isinstance(parsed, dict):
            raise ConvertJsonError(
                f"Failed to parse agent eval JSON: {cleaned[:500]}"
            )
        return parsed

    @classmethod
    def _normalize_score(cls, raw_score) -> float:
        """Map a raw 0-10 judge score onto 0.0-1.0, clamping out-of-range values.

        Anything that cannot be read as a finite number counts as 0.0. The
        finiteness check matters: ``min(1.0, nan)`` evaluates to 1.0, so a NaN
        score would otherwise be reported as a perfect result.
        """
        import math
        try:
            value = float(raw_score)
        except (TypeError, ValueError, OverflowError):
            return 0.0
        if not math.isfinite(value):
            return 0.0
        return max(0.0, min(1.0, value / 10.0))

    @classmethod
    def process_response(cls, response: str) -> EvalDetail:
        """Turn the raw judge reply into an ``EvalDetail``.

        Expected LLM output: ``{"score": 0-10, "reason": "...", ...extra fields...}``.
        The score is normalized to 0.0-1.0 and compared against the threshold:
        at or above it the result is labelled good (``status`` False), below it
        the result is flagged (``status`` True) with an ``AGENT_QUALITY.<class>``
        label. ``reason`` becomes a list holding the reason text followed by the
        remaining fields serialized as one JSON string, or ``None`` if both are
        empty.

        Raises:
            ConvertJsonError: if the reply cannot be parsed into a JSON object.
        """
        log.info(response)
        data = cls._parse_json_response(response)

        # "overall_score" is accepted as a fallback key when "score" is absent.
        normalized_score = cls._normalize_score(
            data.get("score", data.get("overall_score", 0))
        )
        threshold = cls._get_threshold()

        reason_text = data.get("reason", "")
        details = {k: v for k, v in data.items() if k not in ("score", "reason")}

        result = EvalDetail(metric=cls.__name__)
        result.score = normalized_score

        if normalized_score >= threshold:
            result.status = False
            result.label = [QualityLabel.QUALITY_GOOD]
        else:
            result.status = True
            result.label = [f"AGENT_QUALITY.{cls.__name__}"]

        reason_parts = [reason_text] if reason_text else []
        if details:
            reason_parts.append(json.dumps(details, ensure_ascii=False, default=str))
        result.reason = reason_parts if reason_parts else None

        return result
+""" + +from typing import List + +from dingo.io.input import Data, RequiredField +from dingo.model import Model +from dingo.model.llm.agent_eval.base_llm_agent_eval import BaseLLMAgentEval + + +@Model.llm_register("LLMAgentArgumentCorrectness") +class LLMAgentArgumentCorrectness(BaseLLMAgentEval): + """ + Evaluates the correctness of tool arguments in an agent's execution. + + Input: + prompt - The task objective or user request + content - JSON-formatted sequence of tool calls with their arguments + + Performs referenceless evaluation: the LLM judge determines whether + each tool received correct, well-formed, and contextually appropriate + arguments, without requiring ground-truth argument values. + """ + + eval_layer = "action" + input_data_type = "tool_calls" + default_threshold = 0.6 + + _required_fields = [RequiredField.PROMPT, RequiredField.CONTENT] + + prompt = """You are an expert evaluator assessing whether an AI agent passed correct arguments to its tool calls. + +For each tool call in the sequence, evaluate the arguments: +- Are the argument values correct and appropriate for the task context? +- Are required arguments present and non-null? +- Are argument types and formats valid? +- Do the arguments make semantic sense given what the tool does? + +Count: +- **correct_args**: Tool calls where arguments were fully correct +- **total_calls**: Total number of tool calls evaluated + +List specific argument issues found (wrong value, missing required argument, type mismatch, semantically incorrect argument, etc.). + +A score of 10 means all tool calls had perfectly correct arguments. +A score of 0 means all tool calls had wrong or missing arguments. + +Respond in the same language as the input content for the "reason" field. 
@classmethod
def build_messages(cls, input_data: Data) -> List[dict]:
    """Assemble the single user message for the argument-correctness judge.

    The message is the class prompt, then the task objective and the tool-call
    sequence under their own headings, then a closing instruction with the
    language hint from ``_detect_language_hint`` appended (empty when none
    applies).
    """
    detection_sample = str(input_data.prompt) + str(input_data.content)
    hint = cls._detect_language_hint(detection_sample)

    # One entry per output line; "" entries are the blank separator lines.
    sections = [
        f"{cls.prompt}",
        "",
        "## Task Objective",
        f"{input_data.prompt}",
        "",
        "## Tool Call Sequence with Arguments",
        f"{input_data.content}",
        "",
        f"Evaluate the tool arguments and return the JSON evaluation.{hint}",
    ]
    message = {"role": "user", "content": "\n".join(sections)}
    return [message]
+ + Input: + prompt - The task objective or user request + content - The error events or failure log from the agent execution + + If no errors are found in the content, the evaluator short-circuits + and returns score=1.0 (pass) without calling the LLM. + """ + + eval_layer = "recovery" + input_data_type = "error_events" + default_threshold = 0.5 + + _required_fields = [RequiredField.PROMPT, RequiredField.CONTENT] + + _NO_ERROR_INDICATORS = [ + "no error", "no errors", "no failures", "no exception", + "0 errors", "zero errors", "none", "n/a", "[]", "{}", + ] + + prompt = """You are an expert evaluator assessing how well an AI agent recovered from errors during task execution. + +For each error event, evaluate: +- Did the agent detect the error? +- Did the agent attempt recovery? +- Was the recovery successful? +- Was the recovery strategy appropriate? + +Count: +- **errors_encountered**: Total number of distinct errors or failures +- **recovered_count**: Number of errors from which the agent successfully recovered + +Assess overall **recovery_quality** from 0 to 10: +- 10: Agent recovered from all errors with optimal strategies +- 7-9: Agent recovered from most errors with reasonable strategies +- 4-6: Agent recovered from some errors but used suboptimal approaches +- 1-3: Agent attempted recovery but largely failed +- 0: Agent did not attempt recovery or made errors worse + +Respond in the same language as the input content for the "reason" field. 
@classmethod
def eval(cls, input_data: Data) -> EvalDetail:
    """Evaluate error recovery, short-circuiting when there is nothing to recover from.

    If ``_has_error_events`` finds no error events in ``input_data.content``
    the LLM is not called and a passing result (score 1.0) is returned.
    Otherwise the judge is queried up to three times:

    * ``ValidationError``, ``ExceedMaxTokens`` and ``ConvertJsonError`` are not
      retried; the loop is abandoned on the first occurrence.
    * Any other exception is retried after a one-second pause.

    Returns:
        The ``EvalDetail`` from ``process_response`` on success, or a failing
        ``EvalDetail`` labelled ``QUALITY_BAD.<ExceptionName>`` whose reason is
        the last exception message.
    """
    content = getattr(input_data, "content", "") or ""

    # _has_error_events() calls str methods, so hand it text even if the
    # content field arrives as a non-string value.
    if not cls._has_error_events(str(content)):
        log.info(f"{cls.__name__}: No error events detected, returning pass")
        result = EvalDetail(metric=cls.__name__)
        result.status = False
        result.label = [QualityLabel.QUALITY_GOOD]
        result.score = 1.0
        result.reason = ["No error events found in execution trace; recovery evaluation skipped."]
        return result

    if cls.client is None:
        cls.create_client()

    messages = cls.build_messages(input_data)

    max_attempts = 3
    except_msg = ""
    # Exception.__name__ is "Exception"; Exception.__class__.__name__ would be "type".
    except_name = Exception.__name__
    for attempt in range(1, max_attempts + 1):
        try:
            response = cls.send_messages(messages)
            res: EvalDetail = cls.process_response(response)
            return res
        except (ValidationError, ExceedMaxTokens, ConvertJsonError) as e:
            except_msg = str(e)
            except_name = e.__class__.__name__
            break
        except Exception as e:
            except_msg = str(e)
            except_name = e.__class__.__name__
            # Pause only when another attempt will actually follow.
            if attempt < max_attempts:
                time.sleep(1)

    res = EvalDetail(metric=cls.__name__)
    res.status = True
    res.label = [f"QUALITY_BAD.{except_name}"]
    res.reason = [except_msg]
    return res
@classmethod
def build_messages(cls, input_data: Data) -> List[dict]:
    """Assemble the single user message for the plan-adherence judge.

    The message is the class prompt followed by three headed sections: the
    task goal (``context``, empty string when missing or falsy), the original
    plan (``prompt``) and the actual execution steps (``content``). It ends
    with a closing instruction plus the language hint from
    ``_detect_language_hint``. The hint is derived from the plan and the
    execution steps only, not from the task goal.
    """
    goal = getattr(input_data, "context", "") or ""
    hint = cls._detect_language_hint(
        str(input_data.prompt) + str(input_data.content)
    )

    # One entry per output line; "" entries are the blank separator lines.
    sections = [
        f"{cls.prompt}",
        "",
        "## Task Goal",
        f"{goal}",
        "",
        "## Original Plan",
        f"{input_data.prompt}",
        "",
        "## Actual Execution Steps",
        f"{input_data.content}",
        "",
        f"Evaluate how closely the agent followed its plan and return the JSON evaluation.{hint}",
    ]
    message = {"role": "user", "content": "\n".join(sections)}
    return [message]
b/dingo/model/llm/agent_eval/llm_agent_plan_quality.py @@ -0,0 +1,160 @@ +""" +LLMAgentPlanQuality: Evaluates the quality of an agent's reasoning plan. + +Assesses coherence, completeness, and feasibility of the agent's plan. +If no planning content is found in the trace, defaults to passing (score=1.0). +""" + +import time +from typing import List + +from dingo.io.input import Data, RequiredField +from dingo.io.output.eval_detail import EvalDetail, QualityLabel +from dingo.model import Model +from dingo.model.llm.agent_eval.base_llm_agent_eval import BaseLLMAgentEval +from dingo.utils import log +from dingo.utils.exception import ConvertJsonError, ExceedMaxTokens + +try: + from pydantic import ValidationError +except ImportError: + ValidationError = Exception + + +@Model.llm_register("LLMAgentPlanQuality") +class LLMAgentPlanQuality(BaseLLMAgentEval): + """ + Evaluates the quality of an agent's reasoning plan. + + Input: + prompt - The task objective or user request + content - The agent trace or plan description + + If no planning content is detected in the trace, the evaluator + returns score=1.0 (pass) because absence of planning may be + acceptable for simple tasks. + """ + + eval_layer = "reasoning" + input_data_type = "trace_summary" + default_threshold = 0.6 + + _required_fields = [RequiredField.PROMPT, RequiredField.CONTENT] + + _NO_PLAN_KEYWORDS = [ + "no plan", "no planning", "no explicit plan", + "did not plan", "skipped planning", "planning not found", + ] + + prompt = """You are an expert evaluator assessing the quality of an AI agent's reasoning plan. + +First, determine whether the trace contains any planning content (explicit steps, strategy, or reasoning about how to approach the task). If there is no planning content at all, set score to -1 as a sentinel value. + +If planning content exists, evaluate it on three dimensions: +1. **Coherence** (1-5): Is the plan logically structured and internally consistent? +2. 
@classmethod
def eval(cls, input_data: Data) -> EvalDetail:
    """Evaluate plan quality, treating a negative score as "no plan present".

    The judge is queried up to three times. A negative ``score`` in its reply
    is the sentinel for "no planning content" and yields a passing result
    (score 1.0). Otherwise the 0-10 score is normalized to 0.0-1.0 and compared
    with the threshold: at or above it the result is labelled good, below it
    the result is flagged with an ``AGENT_QUALITY.<class>`` label. A score that
    is missing, unparseable or not finite counts as 0.

    * ``ValidationError``, ``ExceedMaxTokens`` and ``ConvertJsonError`` are not
      retried; the loop is abandoned on the first occurrence. A reply that is
      valid JSON but not an object is reported as ``ConvertJsonError``.
    * Any other exception is retried after a one-second pause.

    Returns:
        The populated ``EvalDetail`` on success, or a failing ``EvalDetail``
        labelled ``QUALITY_BAD.<ExceptionName>`` whose reason is the last
        exception message.
    """
    # Imported once here instead of on every retry iteration.
    import json
    import math

    if cls.client is None:
        cls.create_client()

    messages = cls.build_messages(input_data)

    max_attempts = 3
    except_msg = ""
    # Exception.__name__ is "Exception"; Exception.__class__.__name__ would be "type".
    except_name = Exception.__name__
    for attempt in range(1, max_attempts + 1):
        try:
            response = cls.send_messages(messages)

            data = cls._parse_json_response(response)
            if not isinstance(data, dict):
                # A list/scalar has no "score" key; surface it as a parse
                # failure rather than an AttributeError that gets retried.
                raise ConvertJsonError(
                    f"Failed to parse agent eval JSON: {str(response)[:500]}"
                )

            try:
                raw_score = float(data.get("score", 0))
            except (TypeError, ValueError, OverflowError):
                raw_score = 0.0
            # min(1.0, nan) is 1.0, so a NaN score would otherwise pass as perfect.
            if not math.isfinite(raw_score):
                raw_score = 0.0

            result = EvalDetail(metric=cls.__name__)

            if raw_score < 0:
                # Sentinel value: no planning content found, treat as pass
                log.info(f"{cls.__name__}: No planning content found in trace, defaulting to pass")
                result.status = False
                result.label = [QualityLabel.QUALITY_GOOD]
                result.score = 1.0
                result.reason = [data.get("reason", "No planning content found; evaluation skipped.")]
                return result

            normalized_score = max(0.0, min(1.0, raw_score / 10.0))
            threshold = cls._get_threshold()
            reason_text = data.get("reason", "")
            details = {k: v for k, v in data.items() if k not in ("score", "reason")}

            result.score = normalized_score
            if normalized_score >= threshold:
                result.status = False
                result.label = [QualityLabel.QUALITY_GOOD]
            else:
                result.status = True
                result.label = [f"AGENT_QUALITY.{cls.__name__}"]

            reason_parts = [reason_text] if reason_text else []
            if details:
                reason_parts.append(json.dumps(details, ensure_ascii=False, default=str))
            result.reason = reason_parts if reason_parts else None

            return result

        except (ValidationError, ExceedMaxTokens, ConvertJsonError) as e:
            except_msg = str(e)
            except_name = e.__class__.__name__
            break
        except Exception as e:
            except_msg = str(e)
            except_name = e.__class__.__name__
            # Pause only when another attempt will actually follow.
            if attempt < max_attempts:
                time.sleep(1)

    res = EvalDetail(metric=cls.__name__)
    res.status = True
    res.label = [f"QUALITY_BAD.{except_name}"]
    res.reason = [except_msg]
    return res
+ + Input: + prompt - The task objective or user request + content - The agent execution trace or step-by-step summary + + Output score reflects how efficiently the agent reached its goal, + penalizing redundant steps, loops, and unnecessary operations. + """ + + eval_layer = "execution" + input_data_type = "trace_summary" + default_threshold = 0.5 + + _required_fields = [RequiredField.PROMPT, RequiredField.CONTENT] + + prompt = """You are an expert evaluator assessing the step efficiency of an AI agent's execution. + +Analyze the agent's execution trace and identify: +- **Total steps**: Count all steps/actions taken by the agent +- **Necessary steps**: Steps that directly contribute to completing the task +- **Wasted steps**: Redundant, repeated, or unnecessary steps +- **Loops detected**: Whether the agent got stuck in a repetitive pattern + +A score of 10 means perfectly efficient execution with no wasted steps. +A score of 0 means the agent was completely stuck in loops or took entirely unnecessary actions. + +Respond in the same language as the input content for the "reason" field. 
@classmethod
def build_messages(cls, input_data: Data) -> List[dict]:
    """Assemble the single user message for the step-efficiency judge.

    The message is the class prompt, then the task objective and the agent
    execution trace under their own headings, then a closing instruction with
    the language hint from ``_detect_language_hint`` appended (empty when none
    applies).
    """
    detection_sample = str(input_data.prompt) + str(input_data.content)
    hint = cls._detect_language_hint(detection_sample)

    # One entry per output line; "" entries are the blank separator lines.
    sections = [
        f"{cls.prompt}",
        "",
        "## Task Objective",
        f"{input_data.prompt}",
        "",
        "## Agent Execution Trace",
        f"{input_data.content}",
        "",
        f"Analyze the execution efficiency and return the JSON evaluation.{hint}",
    ]
    message = {"role": "user", "content": "\n".join(sections)}
    return [message]
@classmethod
def build_messages(cls, input_data: Data) -> List[dict]:
    """Assemble the single user message for the task-completion judge.

    The message is the class prompt, then the task objective and the agent
    execution result under their own headings, then a closing instruction with
    the language hint from ``_detect_language_hint`` appended (empty when none
    applies).
    """
    detection_sample = str(input_data.prompt) + str(input_data.content)
    hint = cls._detect_language_hint(detection_sample)

    # One entry per output line; "" entries are the blank separator lines.
    sections = [
        f"{cls.prompt}",
        "",
        "## Task Objective",
        f"{input_data.prompt}",
        "",
        "## Agent Execution Result",
        f"{input_data.content}",
        "",
        f"Evaluate whether the agent completed its task and return the JSON evaluation.{hint}",
    ]
    message = {"role": "user", "content": "\n".join(sections)}
    return [message]
@Model.llm_register("LLMAgentToolCorrectness")
class LLMAgentToolCorrectness(BaseLLMAgentEval):
    """
    Evaluates the correctness of tool selections in an agent's execution.

    Input:
        prompt  - The task objective or user request
        content - JSON-formatted sequence of tool calls made by the agent

    Performs referenceless evaluation: the LLM judge determines whether
    each tool choice was appropriate given the task and execution context,
    without requiring a ground-truth tool sequence.
    """

    eval_layer = "action"
    input_data_type = "tool_calls"
    default_threshold = 0.6

    _required_fields = [RequiredField.PROMPT, RequiredField.CONTENT]

    # The schema below spells out a placeholder for every value so the judge
    # knows the expected type/range of each field; a schema with empty value
    # slots is not valid JSON and leaves the 0-10 range of "score" implicit.
    prompt = """You are an expert evaluator assessing whether an AI agent selected the correct tools during task execution.

For each tool call in the sequence, determine:
- Was this the right tool for the situation?
- Was this tool call necessary, or was it redundant?

Count:
- **correct_calls**: Tool calls that were appropriate and necessary
- **total_calls**: Total number of tool calls made
- **redundant_calls**: Tool calls that were unnecessary or duplicated without reason

List specific issues (wrong tool chosen, tool used out of order, missing tool that should have been called, etc.).

A score of 10 means every tool call was correct and necessary.
A score of 0 means all tool calls were wrong or the agent failed to use required tools.

Respond in the same language as the input content for the "reason" field.

Return your evaluation as a JSON object with this exact schema:
{
  "correct_calls": <integer>,
  "total_calls": <integer>,
  "redundant_calls": <integer>,
  "score": <number from 0 to 10>,
  "issues": ["<short description of one issue>", ...],
  "reason": "<explanation of the score>"
}

Do not include any text outside the JSON object."""

    @classmethod
    def build_messages(cls, input_data: Data) -> List[dict]:
        """Build LLM messages for tool correctness evaluation.

        Args:
            input_data: Record whose ``prompt`` is the task objective and whose
                ``content`` is the tool call sequence to judge.

        Returns:
            A one-element list with a single ``user`` message made of the class
            prompt, the objective, the tool call sequence and a closing
            instruction.
        """
        # Hint is derived from objective + tool calls; helper comes from the
        # base class (its implementation is not visible in this file).
        lang_hint = cls._detect_language_hint(
            str(input_data.prompt) + str(input_data.content)
        )
        user_content = f"""{cls.prompt}

## Task Objective
{input_data.prompt}

## Tool Call Sequence
{input_data.content}

Evaluate the tool selections and return the JSON evaluation.{lang_hint}"""

        return [{"role": "user", "content": user_content}]
@Model.llm_register("LLMAgentTraceConclusion")
class LLMAgentTraceConclusion(BaseLLMAgentEval):
    """Synthesize evaluation results into a structured trace-level diagnosis.

    Input fields read from ``Data``:
        prompt  - task objective (falls back to "Agent trace" when empty)
        content - evaluation results to synthesize (falls back to "{}")
        context - optional trace summary (falls back to "")

    The judge's ``severity`` verdict decides ``status`` and ``label``; its
    0-10 ``score`` is normalized to 0.0-1.0.
    """

    eval_layer = "conclusion"
    input_data_type = "eval_synthesis"
    default_threshold = 0.5
    _required_fields = [RequiredField.PROMPT, RequiredField.CONTENT]

    # Raw (0-10) score used when the judge omits the score or returns
    # something that is not a usable number.
    _FALLBACK_RAW_SCORE = 5.0

    @classmethod
    def build_messages(cls, input_data: Data) -> List:
        """Render the conclusion prompt into a single user message.

        Args:
            input_data: Record carrying the objective, the evaluation results
                and (optionally) a trace summary.

        Returns:
            A one-element list with a ``{"role": "user", "content": ...}`` dict.
        """
        objective = getattr(input_data, "prompt", "") or "Agent trace"
        eval_results = getattr(input_data, "content", "") or "{}"
        trace_summary = getattr(input_data, "context", "") or ""

        # Detect the language on the resolved values so a missing/None field
        # never contributes the literal text "None" (and never raises on a
        # missing attribute, matching the defensive getattr reads above).
        lang_hint = cls._detect_language_hint(str(objective) + str(eval_results))

        prompt_text = CONCLUSION_PROMPT.format(
            objective=objective,
            eval_results=eval_results,
            trace_summary=trace_summary,
        ) + lang_hint
        return [{"role": "user", "content": prompt_text}]

    @classmethod
    def process_response(cls, response: str) -> EvalDetail:
        """Turn the judge's JSON reply into an ``EvalDetail``.

        Args:
            response: Raw text returned by the judge model.

        Returns:
            An ``EvalDetail`` whose ``score`` is the judge score divided by 10
            and clamped to [0.0, 1.0]; whose ``status`` is False only for
            severity "good"; and whose ``reason`` holds the summary text plus a
            JSON string with severity, root causes, recommendations and
            highlights.
        """
        import json
        import math

        from dingo.utils import log

        log.info(response)

        data = cls._parse_json_response(response)
        if not isinstance(data, dict):
            # NOTE(review): the parser's return contract is not visible here;
            # a non-object reply (e.g. a JSON array) is treated as "no fields".
            data = {}

        try:
            raw_score = float(data.get("score", cls._FALLBACK_RAW_SCORE))
        except (TypeError, ValueError):
            raw_score = cls._FALLBACK_RAW_SCORE
        if math.isnan(raw_score):
            # min()/max() clamping would silently turn NaN into a perfect 1.0.
            raw_score = cls._FALLBACK_RAW_SCORE

        normalized_score = max(0.0, min(1.0, raw_score / 10.0))

        # Judges do not reliably preserve case/whitespace ("Good", " good ").
        severity = str(data.get("severity") or "warning").strip().lower()
        if severity not in ("good", "critical", "warning"):
            severity = "warning"

        result = EvalDetail(metric=cls.__name__)
        result.score = normalized_score

        if severity == "good":
            result.status = False
            result.label = [QualityLabel.QUALITY_GOOD]
        elif severity == "critical":
            result.status = True
            result.label = ["AGENT_QUALITY.TraceConclusion.CRITICAL"]
        else:
            result.status = True
            result.label = ["AGENT_QUALITY.TraceConclusion.WARNING"]

        result.reason = [
            data.get("summary") or "",
            json.dumps({
                "severity": severity,
                "root_causes": data.get("root_causes", []),
                "recommendations": data.get("recommendations", []),
                "highlights": data.get("highlights", []),
            }, ensure_ascii=False),
        ]

        return result
@classmethod
def eval(cls, input_data: Data) -> EvalDetail:
    """Flag the trace when its tool-name sequence contains a repeating loop.

    Args:
        input_data: Record whose ``content`` holds the tool calls (JSON text
            or an already-parsed list/dict).

    Returns:
        An ``EvalDetail`` labelled QUALITY_GOOD when no loop is found, or with
        ``status=True`` and a reason describing the repeated pattern otherwise.
    """
    result = EvalDetail(metric=cls.__name__)

    tool_names = cls._extract_tool_names(input_data.content)
    # 6 = default min_pattern_len (2) * default min_repeats (3): a shorter
    # sequence cannot contain a qualifying loop.
    if len(tool_names) < 6:
        result.label = [QualityLabel.QUALITY_GOOD]
        return result

    loop_info = cls._detect_loops(tool_names)
    if loop_info:
        result.status = True
        result.label = [f"{cls.metric_type}.{cls.__name__}"]
        result.reason = [
            f"Loop detected: pattern {loop_info['pattern']} "
            f"repeats {loop_info['count']} times at position {loop_info['position']}"
        ]
    else:
        result.label = [QualityLabel.QUALITY_GOOD]

    return result


@classmethod
def _extract_tool_names(cls, content: str) -> List[str]:
    """Return the ordered tool names found in ``content``.

    Accepts JSON text or an already-parsed value. A dict is searched under
    ``"tool_calls"`` (then ``"steps"``); a list is used directly. For each
    dict item the name is ``tool_name`` when non-empty, else ``name``.
    Non-dict items and items without a usable name are skipped.

    Args:
        content: JSON string, list or dict describing the tool calls.

    Returns:
        Tool names in call order; ``[]`` when nothing can be extracted.
    """
    try:
        data = json.loads(content) if isinstance(content, str) else content
    except (json.JSONDecodeError, TypeError):
        return []

    if isinstance(data, dict):
        items = data.get("tool_calls", data.get("steps", []))
    elif isinstance(data, list):
        items = data
    else:
        return []

    # e.g. {"tool_calls": null} — nothing to iterate.
    if not isinstance(items, list):
        return []

    names = []
    for item in items:
        # Check the type first: "a and b or c" groups as "(a and b) or c",
        # which would call .get() on non-dict items.
        if not isinstance(item, dict):
            continue
        name = item.get("tool_name") or item.get("name")
        if name:
            names.append(name)
    return names


@classmethod
def _detect_loops(
    cls, names: List[str], min_pattern_len: int = 2, min_repeats: int = 3
) -> Optional[dict]:
    """Find the first back-to-back repetition of a tool-name subsequence.

    Pattern lengths are tried from ``min_pattern_len`` upward and, for each
    length, start positions from left to right; the first pattern that
    repeats at least ``min_repeats`` times consecutively wins.

    Args:
        names: Tool names in call order.
        min_pattern_len: Shortest subsequence length considered.
        min_repeats: Consecutive occurrences required to report a loop.

    Returns:
        ``{"pattern": [...], "count": int, "position": int}`` for the first
        loop found, or ``None`` when there is none.
    """
    for pattern_len in range(min_pattern_len, len(names) // min_repeats + 1):
        for start in range(len(names) - pattern_len * min_repeats + 1):
            pattern = names[start : start + pattern_len]
            count = 1
            pos = start + pattern_len
            # Walk forward one pattern-width at a time while it keeps matching.
            while pos + pattern_len <= len(names):
                if names[pos : pos + pattern_len] == pattern:
                    count += 1
                    pos += pattern_len
                else:
                    break
            if count >= min_repeats:
                return {
                    "pattern": pattern,
                    "count": count,
                    "position": start,
                }
    return None
@classmethod
def eval(cls, input_data: Data) -> EvalDetail:
    """Compare the trace's total token count with the configured budget.

    The budget defaults to 500,000 and is overridden by
    ``cls.dynamic_config.threshold`` when that value converts to ``int``.

    Args:
        input_data: Record whose ``content`` (or ``metadata``) carries a
            ``total_tokens`` value.

    Returns:
        An ``EvalDetail`` labelled QUALITY_GOOD when the count is missing or
        within budget (score 1.0 in the latter case); otherwise
        ``status=True`` with score ``budget / total_tokens`` clamped to
        [0.0, 1.0].
    """
    result = EvalDetail(metric=cls.__name__)

    budget = 500_000
    if cls.dynamic_config and hasattr(cls.dynamic_config, "threshold"):
        try:
            budget = int(cls.dynamic_config.threshold)
        except (TypeError, ValueError, OverflowError):
            # Best effort: an unusable threshold (None, text, NaN, infinity)
            # keeps the default budget.
            pass

    total_tokens = cls._extract_tokens(input_data)
    if total_tokens is None:
        result.label = [QualityLabel.QUALITY_GOOD]
        return result

    if total_tokens > budget:
        result.status = True
        result.label = [f"{cls.metric_type}.{cls.__name__}"]
        result.reason = [
            f"Token usage {total_tokens:,} exceeds budget {budget:,}"
        ]
        # Lower clamp keeps the score non-negative for a negative budget.
        result.score = (
            max(0.0, min(1.0, budget / total_tokens)) if total_tokens > 0 else 0.0
        )
    else:
        result.label = [QualityLabel.QUALITY_GOOD]
        result.score = 1.0

    return result


@classmethod
def _extract_tokens(cls, input_data: Data) -> Optional[int]:
    """Return the first usable ``total_tokens`` value from content or metadata.

    ``input_data.content`` is inspected first, then ``input_data.metadata``
    (when present). Each source may be JSON text or an already-parsed dict.

    Args:
        input_data: Record to inspect.

    Returns:
        The token count as ``int``, or ``None`` when neither source provides a
        value that converts to an integer.
    """
    for source in [input_data.content, getattr(input_data, "metadata", None)]:
        if source is None:
            continue
        try:
            data = json.loads(source) if isinstance(source, str) else source
        except (json.JSONDecodeError, TypeError):
            continue
        if isinstance(data, dict):
            val = data.get("total_tokens")
            if val is not None:
                try:
                    return int(val)
                except (TypeError, ValueError, OverflowError):
                    # OverflowError: json.loads accepts Infinity, and
                    # int(float("inf")) raises it. Try the next source.
                    pass
    return None
@classmethod
def eval(cls, input_data: Data) -> EvalDetail:
    """Flag steps whose duration exceeds mean + 3 * stdev of the trace.

    Mean and sample standard deviation are computed over the positive
    durations; at least three such values are required.

    NOTE(review): because the candidate outlier is part of the sample, a
    value can be at most (n - 1) / sqrt(n) sample standard deviations above
    the mean, which is below 3 for n <= 10 — so traces with ten or fewer
    timed steps can never be flagged. Confirm whether a robust baseline
    (e.g. median-based) is wanted.

    Args:
        input_data: Record whose ``content`` holds the steps (JSON text or an
            already-parsed list/dict).

    Returns:
        An ``EvalDetail`` labelled QUALITY_GOOD when nothing is anomalous, or
        with ``status=True`` and up to five reasons, one per slow step.
    """
    result = EvalDetail(metric=cls.__name__)

    steps = cls._extract_steps(input_data.content)
    durations = [s["duration"] for s in steps if s["duration"] is not None and s["duration"] > 0]

    if len(durations) < 3:
        result.label = [QualityLabel.QUALITY_GOOD]
        return result

    mean = statistics.mean(durations)
    stdev = statistics.stdev(durations)
    threshold = mean + 3 * stdev

    anomalies = [
        s for s in steps
        if s["duration"] is not None and s["duration"] > threshold
    ]

    if anomalies:
        result.status = True
        result.label = [f"{cls.metric_type}.{cls.__name__}"]
        # Report at most five steps to keep the reason list short.
        result.reason = [
            f"Step '{a['name']}' took {a['duration']:.2f}s "
            f"(threshold: {threshold:.2f}s, mean: {mean:.2f}s)"
            for a in anomalies[:5]
        ]
    else:
        result.label = [QualityLabel.QUALITY_GOOD]

    return result


@classmethod
def _extract_steps(cls, content: str) -> List[dict]:
    """Normalize ``content`` into ``{"name", "duration"}`` step records.

    Accepts JSON text or an already-parsed value. A dict is searched under
    ``"steps"`` (then ``"tool_calls"``); a list is used directly. Non-dict
    items are skipped.

    Args:
        content: JSON string, list or dict describing the steps.

    Returns:
        One dict per step. ``name`` is the item's ``name``, else its
        ``tool_name``, else ``"unknown"``. ``duration`` is a finite float
        taken from ``duration`` or, failing that, ``duration_seconds``;
        ``None`` when neither is usable.
    """
    try:
        data = json.loads(content) if isinstance(content, str) else content
    except (json.JSONDecodeError, TypeError):
        return []

    if isinstance(data, dict):
        items = data.get("steps", data.get("tool_calls", []))
    elif isinstance(data, list):
        items = data
    else:
        return []

    # e.g. {"steps": null} — nothing to iterate.
    if not isinstance(items, list):
        return []

    steps = []
    for item in items:
        if not isinstance(item, dict):
            continue
        duration = cls._safe_float(item.get("duration"))
        if duration is None:
            # A null/invalid "duration" must not hide a valid fallback field.
            duration = cls._safe_float(item.get("duration_seconds"))
        steps.append(
            {
                # Tool-call shaped items carry "tool_name" rather than "name".
                "name": item.get("name") or item.get("tool_name") or "unknown",
                "duration": duration,
            }
        )
    return steps


@classmethod
def _safe_float(cls, val) -> Optional[float]:
    """Convert ``val`` to a finite float, or return ``None``.

    Args:
        val: Any value; typically a number or numeric string.

    Returns:
        ``float(val)`` when the conversion succeeds and the result is finite;
        ``None`` for ``None``, unconvertible values, NaN and +/-infinity.
    """
    import math

    if val is None:
        return None
    try:
        number = float(val)
    except (TypeError, ValueError, OverflowError):
        return None
    # NaN/inf (both accepted by json.loads) would corrupt mean/stdev.
    return number if math.isfinite(number) else None
91b2065f..e45c88dc 100644 --- a/.gitignore +++ b/.gitignore @@ -46,3 +46,20 @@ outputs/ coverage.xml dingo_python.egg-info/* + +# Claude Code / AI assistant artifacts +.claude/ +CLAUDE.md +CLAUDE.local.md +claude_docs/ + +# Benchmarks and experiment data +benchmarks/ +meta_rater/ + +# Downloaded models / large binary files +downloads/ + +# Temporary docs (plans, specs) +docs/plans/ +docs/superpowers/