feat(agent-eval): add LLMAgent* evaluators for agent trace monitoring #432
feat(agent-eval): add LLMAgent* evaluators for agent trace monitoring #432 · seancoding-day wants to merge 2 commits into the base branch
Conversation
Add 8 LLM evaluators + 3 rule evaluators for AI agent trace quality assessment, aligned with the DeepEval 3-layer taxonomy (Execution/Action/Reasoning) + Recovery.

LLM evaluators (dingo/model/llm/agent_eval/):
- LLMAgentTaskCompletion, LLMAgentStepEfficiency (Execution)
- LLMAgentToolCorrectness, LLMAgentArgumentCorrectness (Action)
- LLMAgentPlanQuality, LLMAgentPlanAdherence (Reasoning)
- LLMAgentErrorRecovery (Recovery), LLMAgentTraceConclusion (synthesis)

Rule evaluators: RuleAgentTraceLoopDetection, RuleAgentTraceTokenBudget, RuleAgentTraceLatencyAnomaly

Shared BaseLLMAgentEval with 0-10 → 0.0-1.0 score normalization, CJK language detection, and a configurable threshold.
There was a problem hiding this comment.
Code Review
This pull request introduces a comprehensive suite of LLM-as-Judge and rule-based evaluators for assessing AI agent execution traces, including a base class for standardized scoring and a synthesizer for overall trace diagnosis. The review feedback highlights several critical opportunities to improve code quality and robustness. Specifically, it recommends avoiding boilerplate duplication in LLMAgentErrorRecovery and LLMAgentPlanQuality by leveraging inheritance and calling super(). Additionally, the feedback points out potential TypeError and AttributeError vulnerabilities in JSON parsing and attribute access across multiple files, and advises removing an overriding dynamic_config = None statement that could break configuration inheritance.
Important
The consumer version of Gemini Code Assist on GitHub is being sunset. Starting June 18, 2026, new organization installations will be blocked, and all code review activity will officially cease on July 17, 2026.
For more details on the timeline and next steps, please review the Help Documentation.
| def eval(cls, input_data: Data) -> EvalDetail: | ||
| """Override eval() to handle the no-error special case.""" | ||
| content = getattr(input_data, "content", "") or "" | ||
|
|
||
| if not cls._has_error_events(content): | ||
| log.info(f"{cls.__name__}: No error events detected, returning pass") | ||
| result = EvalDetail(metric=cls.__name__) | ||
| result.status = False | ||
| result.label = [QualityLabel.QUALITY_GOOD] | ||
| result.score = 1.0 | ||
| result.reason = ["No error events found in execution trace; recovery evaluation skipped."] | ||
| return result | ||
|
|
||
| if cls.client is None: | ||
| cls.create_client() | ||
|
|
||
| messages = cls.build_messages(input_data) | ||
|
|
||
| attempts = 0 | ||
| except_msg = "" | ||
| except_name = Exception.__class__.__name__ | ||
| while attempts < 3: | ||
| try: | ||
| response = cls.send_messages(messages) | ||
| res: EvalDetail = cls.process_response(response) | ||
| return res | ||
| except (ValidationError, ExceedMaxTokens, ConvertJsonError) as e: | ||
| except_msg = str(e) | ||
| except_name = e.__class__.__name__ | ||
| break | ||
| except Exception as e: | ||
| attempts += 1 | ||
| time.sleep(1) | ||
| except_msg = str(e) | ||
| except_name = e.__class__.__name__ | ||
|
|
||
| res = EvalDetail(metric=cls.__name__) | ||
| res.status = True | ||
| res.label = [f"QUALITY_BAD.{except_name}"] | ||
| res.reason = [except_msg] | ||
| return res |
There was a problem hiding this comment.
Avoid duplicating the entire eval retry loop and exception handling boilerplate from BaseOpenAI.eval. Instead, call super().eval(input_data) after performing the short-circuit check.
| def eval(cls, input_data: Data) -> EvalDetail: | |
| """Override eval() to handle the no-error special case.""" | |
| content = getattr(input_data, "content", "") or "" | |
| if not cls._has_error_events(content): | |
| log.info(f"{cls.__name__}: No error events detected, returning pass") | |
| result = EvalDetail(metric=cls.__name__) | |
| result.status = False | |
| result.label = [QualityLabel.QUALITY_GOOD] | |
| result.score = 1.0 | |
| result.reason = ["No error events found in execution trace; recovery evaluation skipped."] | |
| return result | |
| if cls.client is None: | |
| cls.create_client() | |
| messages = cls.build_messages(input_data) | |
| attempts = 0 | |
| except_msg = "" | |
| except_name = Exception.__class__.__name__ | |
| while attempts < 3: | |
| try: | |
| response = cls.send_messages(messages) | |
| res: EvalDetail = cls.process_response(response) | |
| return res | |
| except (ValidationError, ExceedMaxTokens, ConvertJsonError) as e: | |
| except_msg = str(e) | |
| except_name = e.__class__.__name__ | |
| break | |
| except Exception as e: | |
| attempts += 1 | |
| time.sleep(1) | |
| except_msg = str(e) | |
| except_name = e.__class__.__name__ | |
| res = EvalDetail(metric=cls.__name__) | |
| res.status = True | |
| res.label = [f"QUALITY_BAD.{except_name}"] | |
| res.reason = [except_msg] | |
| return res | |
| @classmethod | |
| def eval(cls, input_data: Data) -> EvalDetail: | |
| """Override eval() to handle the no-error special case.""" | |
| content = getattr(input_data, "content", "") or "" | |
| if not cls._has_error_events(content): | |
| log.info(f"{cls.__name__}: No error events detected, returning pass") | |
| result = EvalDetail(metric=cls.__name__) | |
| result.status = False | |
| result.label = [QualityLabel.QUALITY_GOOD] | |
| result.score = 1.0 | |
| result.reason = ["No error events found in execution trace; recovery evaluation skipped."] | |
| return result | |
| return super().eval(input_data) |
| def eval(cls, input_data: Data) -> EvalDetail: | ||
| """Override eval() to handle the no-planning special case.""" | ||
| if cls.client is None: | ||
| cls.create_client() | ||
|
|
||
| messages = cls.build_messages(input_data) | ||
|
|
||
| attempts = 0 | ||
| except_msg = "" | ||
| except_name = Exception.__class__.__name__ | ||
| while attempts < 3: | ||
| try: | ||
| response = cls.send_messages(messages) | ||
|
|
||
| data = cls._parse_json_response(response) | ||
| raw_score = data.get("score", 0) | ||
|
|
||
| try: | ||
| raw_score = float(raw_score) | ||
| except (TypeError, ValueError): | ||
| raw_score = 0.0 | ||
|
|
||
| result = EvalDetail(metric=cls.__name__) | ||
|
|
||
| if raw_score < 0: | ||
| # Sentinel value: no planning content found, treat as pass | ||
| log.info(f"{cls.__name__}: No planning content found in trace, defaulting to pass") | ||
| result.status = False | ||
| result.label = [QualityLabel.QUALITY_GOOD] | ||
| result.score = 1.0 | ||
| result.reason = [data.get("reason", "No planning content found; evaluation skipped.")] | ||
| return result | ||
|
|
||
| normalized_score = max(0.0, min(1.0, raw_score / 10.0)) | ||
| threshold = cls._get_threshold() | ||
| reason_text = data.get("reason", "") | ||
| details = {k: v for k, v in data.items() if k not in ("score", "reason")} | ||
|
|
||
| import json | ||
| result.score = normalized_score | ||
| if normalized_score >= threshold: | ||
| result.status = False | ||
| result.label = [QualityLabel.QUALITY_GOOD] | ||
| else: | ||
| result.status = True | ||
| result.label = [f"AGENT_QUALITY.{cls.__name__}"] | ||
|
|
||
| reason_parts = [reason_text] if reason_text else [] | ||
| if details: | ||
| reason_parts.append(json.dumps(details, ensure_ascii=False, default=str)) | ||
| result.reason = reason_parts if reason_parts else None | ||
|
|
||
| return result | ||
|
|
||
| except (ValidationError, ExceedMaxTokens, ConvertJsonError) as e: | ||
| except_msg = str(e) | ||
| except_name = e.__class__.__name__ | ||
| break | ||
| except Exception as e: | ||
| attempts += 1 | ||
| time.sleep(1) | ||
| except_msg = str(e) | ||
| except_name = e.__class__.__name__ | ||
|
|
||
| res = EvalDetail(metric=cls.__name__) | ||
| res.status = True | ||
| res.label = [f"QUALITY_BAD.{except_name}"] | ||
| res.reason = [except_msg] | ||
| return res |
There was a problem hiding this comment.
Instead of overriding the entire eval method and duplicating the retry loop and exception handling boilerplate, override process_response to handle the sentinel value raw_score < 0 and delegate to super().process_response(response) otherwise.
```python
@classmethod
def process_response(cls, response: str) -> EvalDetail:
    """Process response and handle the no-planning special case."""
    data = cls._parse_json_response(response)
    raw_score = data.get("score", 0)
    try:
        raw_score = float(raw_score)
    except (TypeError, ValueError):
        raw_score = 0.0
    if raw_score < 0:
        log.info(f"{cls.__name__}: No planning content found in trace, defaulting to pass")
        result = EvalDetail(metric=cls.__name__)
        result.status = False
        result.label = [QualityLabel.QUALITY_GOOD]
        result.score = 1.0
        result.reason = [data.get("reason", "No planning content found; evaluation skipped.")]
        return result
    return super().process_response(response)
```
| def _extract_tool_names(cls, content: str) -> List[str]: | ||
| try: | ||
| data = json.loads(content) if isinstance(content, str) else content | ||
| except (json.JSONDecodeError, TypeError): | ||
| return [] | ||
|
|
||
| if isinstance(data, dict): | ||
| items = data.get("tool_calls", data.get("steps", [])) | ||
| elif isinstance(data, list): | ||
| items = data | ||
| else: | ||
| return [] | ||
|
|
||
| return [ | ||
| item.get("tool_name", item.get("name", "")) | ||
| for item in items | ||
| if isinstance(item, dict) and item.get("tool_name") or item.get("name") | ||
| ] |
There was a problem hiding this comment.
Fix potential TypeError if tool_calls or steps is None (e.g., when parsed from JSON null), and fix operator precedence in the list comprehension filter to prevent AttributeError on non-dict items.
```python
@classmethod
def _extract_tool_names(cls, content: str) -> List[str]:
    try:
        data = json.loads(content) if isinstance(content, str) else content
    except (json.JSONDecodeError, TypeError):
        return []
    items = []
    if isinstance(data, dict):
        items = data.get("tool_calls") or data.get("steps") or []
    elif isinstance(data, list):
        items = data
    if not isinstance(items, list):
        return []
    return [
        item.get("tool_name", item.get("name", ""))
        for item in items
        if isinstance(item, dict) and (item.get("tool_name") or item.get("name"))
    ]
```
| _required_fields = [RequiredField.CONTENT] | ||
| dynamic_config = None |
There was a problem hiding this comment.
Remove dynamic_config = None to prevent AttributeError: 'NoneType' object has no attribute 'model_copy' when Model.set_config_rule is called. It will correctly inherit the default EvaluatorRuleArgs from BaseRule.
| _required_fields = [RequiredField.CONTENT] | |
| dynamic_config = None | |
| _required_fields = [RequiredField.CONTENT] |
| def _extract_steps(cls, content: str) -> List[dict]: | ||
| try: | ||
| data = json.loads(content) if isinstance(content, str) else content | ||
| except (json.JSONDecodeError, TypeError): | ||
| return [] | ||
|
|
||
| if isinstance(data, dict): | ||
| items = data.get("steps", data.get("tool_calls", [])) | ||
| elif isinstance(data, list): | ||
| items = data | ||
| else: | ||
| return [] | ||
|
|
||
| return [ | ||
| { | ||
| "name": item.get("name", "unknown"), | ||
| "duration": cls._safe_float( | ||
| item.get("duration", item.get("duration_seconds")) | ||
| ), | ||
| } | ||
| for item in items | ||
| if isinstance(item, dict) | ||
| ] | ||
|
|
There was a problem hiding this comment.
Fix potential TypeError if steps or tool_calls is None (e.g., when parsed from JSON null) by using safe fallback checks.
```python
@classmethod
def _extract_steps(cls, content: str) -> List[dict]:
    try:
        data = json.loads(content) if isinstance(content, str) else content
    except (json.JSONDecodeError, TypeError):
        return []
    items = []
    if isinstance(data, dict):
        items = data.get("steps") or data.get("tool_calls") or []
    elif isinstance(data, list):
        items = data
    if not isinstance(items, list):
        return []
    return [
        {
            "name": item.get("name", "unknown"),
            "duration": cls._safe_float(
                item.get("duration", item.get("duration_seconds"))
            ),
        }
        for item in items
        if isinstance(item, dict)
    ]
```
| def _parse_json_response(cls, response: str) -> dict: | ||
| cleaned = cls._strip_json_fences(response) | ||
| try: | ||
| return json.loads(cleaned) | ||
| except json.JSONDecodeError: | ||
| raise ConvertJsonError( | ||
| f"Failed to parse agent eval JSON: {cleaned[:500]}" | ||
| ) |
There was a problem hiding this comment.
Ensure that the parsed JSON response is a dictionary before returning it. If the LLM returns a JSON array or primitive value, calling .get() on the parsed object in process_response will raise an AttributeError.
```python
@classmethod
def _parse_json_response(cls, response: str) -> dict:
    cleaned = cls._strip_json_fences(response)
    try:
        data = json.loads(cleaned)
        if not isinstance(data, dict):
            raise ConvertJsonError(
                f"Failed to parse agent eval JSON: expected JSON object, got {type(data).__name__}"
            )
        return data
    except json.JSONDecodeError:
        raise ConvertJsonError(
            f"Failed to parse agent eval JSON: {cleaned[:500]}"
        )
```
| def build_messages(cls, input_data: Data) -> List: | ||
| objective = getattr(input_data, "prompt", "") or "Agent trace" | ||
| eval_results = getattr(input_data, "content", "") or "{}" | ||
| trace_summary = getattr(input_data, "context", "") or "" | ||
| lang_hint = cls._detect_language_hint( | ||
| str(input_data.prompt) + str(input_data.content) | ||
| ) | ||
|
|
||
| prompt_text = CONCLUSION_PROMPT.format( | ||
| objective=objective, | ||
| eval_results=eval_results, | ||
| trace_summary=trace_summary, | ||
| ) + lang_hint | ||
| return [{"role": "user", "content": prompt_text}] |
There was a problem hiding this comment.
Re-use the extracted objective and eval_results variables when calling _detect_language_hint to avoid potential AttributeError if prompt or content are missing on input_data.
| def build_messages(cls, input_data: Data) -> List: | |
| objective = getattr(input_data, "prompt", "") or "Agent trace" | |
| eval_results = getattr(input_data, "content", "") or "{}" | |
| trace_summary = getattr(input_data, "context", "") or "" | |
| lang_hint = cls._detect_language_hint( | |
| str(input_data.prompt) + str(input_data.content) | |
| ) | |
| prompt_text = CONCLUSION_PROMPT.format( | |
| objective=objective, | |
| eval_results=eval_results, | |
| trace_summary=trace_summary, | |
| ) + lang_hint | |
| return [{"role": "user", "content": prompt_text}] | |
| @classmethod | |
| def build_messages(cls, input_data: Data) -> List: | |
| objective = getattr(input_data, "prompt", "") or "Agent trace" | |
| eval_results = getattr(input_data, "content", "") or "{}" | |
| trace_summary = getattr(input_data, "context", "") or "" | |
| lang_hint = cls._detect_language_hint( | |
| str(objective) + str(eval_results) | |
| ) | |
| prompt_text = CONCLUSION_PROMPT.format( | |
| objective=objective, | |
| eval_results=eval_results, | |
| trace_summary=trace_summary, | |
| ) + lang_hint | |
| return [{"role": "user", "content": prompt_text}] |
No description provided.