diff --git a/README.md b/README.md
index d8578827..24362229 100755
--- a/README.md
+++ b/README.md
@@ -8,6 +8,7 @@

[](https://hits.seeyoufarm.com)
[](https://github.com/BerriAI/litellm)
+[](https://www.minimax.io)
[Project Credits](https://github.com/Luodian/Otter/blob/main/docs/credits.md) | [Otter Paper](https://arxiv.org/abs/2305.03726) | [OtterHD Paper](https://arxiv.org/abs/2311.04219) | [MIMIC-IT Paper](https://arxiv.org/abs/2306.05425)
@@ -41,6 +42,7 @@ For who in the mainland China: [
debug: true # put debug=true will save the model response in log file.
- name: mme
split: test
@@ -70,6 +72,18 @@ For who in the mainland China: [ with your selected model and set your API keys in the environment. For more information see [LiteLLM](https://github.com/BerriAI/litellm/)
+2. Added [MiniMax](https://www.minimax.io) as a supported LLM provider for both the Syphus data generation pipeline and benchmark evaluation. Configure via environment variables:
+ ```bash
+ # For Syphus data generation (via liteLLM)
+ export MINIMAX_API_KEY="your-minimax-key"
+ export OPENAI_API_ENGINE="openai/MiniMax-M3"
+ export OPENAI_API_BASE="https://api.minimax.io/v1"
+
+ # For benchmark evaluation (MagnifierBench, MathVista, MM-Vet)
+ export EVAL_LLM_PROVIDER="minimax"
+ export MINIMAX_API_KEY="your-minimax-key"
+ ```
+ MiniMax M3 offers a 512K context window, up to 128K max output, and image input support. M2.7 and M2.7-highspeed remain available as alternatives. See `pipeline/benchmarks/utils/eval_llm.py` for details.
**[2023-07]: Anouncing MIMIC-IT dataset for multiple interleaved image-text/video instruction tuning.**
diff --git a/mimic-it/syphus/file_utils.py b/mimic-it/syphus/file_utils.py
index ec1870ef..8ac1d351 100755
--- a/mimic-it/syphus/file_utils.py
+++ b/mimic-it/syphus/file_utils.py
@@ -1,5 +1,18 @@
"""
file utils
+
+Supports multiple LLM providers via liteLLM. Configure via environment variables:
+
+OpenAI (default):
+ export OPENAI_API_KEY="your-openai-key"
+ export OPENAI_API_ENGINE="gpt-4"
+
+MiniMax:
+ export MINIMAX_API_KEY="your-minimax-key"
+ export OPENAI_API_ENGINE="openai/MiniMax-M3"
+ export OPENAI_API_BASE="https://api.minimax.io/v1"
+
+See https://docs.litellm.ai/docs/providers for all supported providers.
"""
import json
@@ -13,11 +26,15 @@
engine = os.environ.get("OPENAI_API_ENGINE", "davinci")
-def query_gpt(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str]:
+def query_llm(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str]:
"""
- Query the GPT API with the given inputs.
+ Query the LLM API with the given inputs.
+
+ Supports multiple providers via liteLLM (OpenAI, MiniMax, Anthropic, etc.).
+ Configure via OPENAI_API_ENGINE and OPENAI_API_BASE environment variables.
+
Returns:
- Response (dict[str, str]): the response from GPT API.
+ Response (dict[str, str]): the response from the LLM API.
Input ID (str): the id that specifics the input.
"""
if dataset_name == "3d.SceneNavigation":
@@ -47,13 +64,19 @@ def query_gpt(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str
"content": inputs["query_input"]["sentences"],
},
)
+
+ # Clamp temperature for MiniMax (requires (0.0, 1.0])
+ temperature = 0.7
+ if os.environ.get("MINIMAX_API_KEY"):
+ temperature = max(temperature, 0.01)
+
succuss = True
while succuss:
try:
response = completion(
- engine=engine, # defined by os.environ, default engine="chatgpt0301",
+ engine=engine, # defined by os.environ, default engine="davinci"
messages=messages,
- temperature=0.7,
+ temperature=temperature,
max_tokens=3200,
top_p=0.95,
frequency_penalty=0,
@@ -73,6 +96,10 @@ def query_gpt(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str
return response, inputs["query_input"]["id"]
+# Backward-compatible alias
+query_gpt = query_llm
+
+
def split_question_and_answer(pair_of_answer: str, file_id: str) -> tuple[bool, dict[str, str]]:
"""
Split the question and answer from the pair of question and answer.
diff --git a/pipeline/benchmarks/datasets/magnifierbench.py b/pipeline/benchmarks/datasets/magnifierbench.py
index a0c4ed97..7eb99fd4 100644
--- a/pipeline/benchmarks/datasets/magnifierbench.py
+++ b/pipeline/benchmarks/datasets/magnifierbench.py
@@ -16,58 +16,40 @@
import time
import requests
+from pipeline.benchmarks.utils.eval_llm import get_eval_llm_client
+
utc_plus_8 = pytz.timezone("Asia/Singapore") # You can also use 'Asia/Shanghai', 'Asia/Taipei', etc.
utc_now = pytz.utc.localize(datetime.datetime.utcnow())
utc_plus_8_time = utc_now.astimezone(utc_plus_8)
-def get_chat_response(promot, api_key, model="gpt-4-0613", temperature=0, max_tokens=256, n=1, patience=5, sleep_time=5):
- headers = {
- "Authorization": f"Bearer {api_key}",
- "Content-Type": "application/json",
- }
+def get_chat_response(promot, api_key=None, model="gpt-4-0613", temperature=0, max_tokens=256, n=1, patience=5, sleep_time=5, eval_llm_client=None):
+ if eval_llm_client is None:
+ eval_llm_client = get_eval_llm_client(api_key=api_key, model=model)
messages = [
{"role": "system", "content": "You are a helpful AI assistant. Your task is to judge whether the model response is correct to answer the given question or not."},
{"role": "user", "content": promot},
]
- payload = {"model": model, "messages": messages}
-
- while patience > 0:
- patience -= 1
- try:
- response = requests.post(
- "https://api.openai.com/v1/chat/completions",
- headers=headers,
- data=json.dumps(payload),
- timeout=30,
- )
- response.raise_for_status()
- response_data = response.json()
-
- prediction = response_data["choices"][0]["message"]["content"].strip()
- if prediction != "" and prediction is not None:
- return prediction
-
- except Exception as e:
- if "Rate limit" not in str(e):
- print(e)
- time.sleep(sleep_time)
-
- return ""
+ return eval_llm_client.chat_completion(
+ messages=messages,
+ temperature=temperature,
+ max_tokens=max_tokens,
+ patience=patience,
+ sleep_time=sleep_time,
+ )
-def prepare_query(model_answer_item, api_key):
+def prepare_query(model_answer_item, api_key=None, eval_llm_client=None):
freeform_question = model_answer_item["freeform_question"]
freeform_response = model_answer_item["freeform_response"]
correct_answer = model_answer_item["freeform_answer"]
- # Formulating the prompt for ChatGPT
+ # Formulating the prompt for evaluation LLM
prompt = f"Question: {freeform_question}\nModel Response: {freeform_response}\nGround Truth: {correct_answer}\nWill the model response be considered correct? You should only answer yes or no."
- # Querying ChatGPT
- chat_response = get_chat_response(prompt, api_key)
+ chat_response = get_chat_response(prompt, api_key=api_key, eval_llm_client=eval_llm_client)
return chat_response
@@ -83,6 +65,8 @@ def __init__(
debug: bool = False,
prompt="",
api_key=None,
+ eval_provider=None,
+ eval_model=None,
):
super().__init__("MagnifierBench", data_path)
@@ -95,6 +79,11 @@ def __init__(
self.debug = debug
self.prompt = prompt
self.api_key = api_key
+ self.eval_llm_client = get_eval_llm_client(
+ provider=eval_provider,
+ api_key=api_key,
+ model=eval_model,
+ )
def parse_pred_ans(self, pred_ans, question):
match = re.search(r"The answer is ([A-D])", pred_ans)
@@ -122,10 +111,6 @@ def parse_pred_ans(self, pred_ans, question):
def _evaluate(self, model):
model_score_dict = {}
- # output_path = os.path.join(self.default_output_path, f"{model.name}_{self.cur_datetime}")
- # if not os.path.exists(output_path):
- # os.makedirs(output_path)
- # model_path: str = "Salesforce/instructblip-vicuna-7b"
model_version = model.name.split("/")[-1]
model_answer_path = os.path.join(self.default_output_path, f"{model_version}_{self.cur_datetime}_answer.json")
result_path = os.path.join(self.default_output_path, f"{model_version}_{self.cur_datetime}_score.json")
@@ -186,16 +171,16 @@ def _evaluate(self, model):
model_score_dict["total"] = len(self.data)
model_score_dict["accuracy"] = score / len(self.data)
- print(f"Start query GPT-4 for free-form evaluation...")
- for data_id in tqdm(model_answer.keys(), desc="Querying GPT-4"):
+ print(f"Start query evaluation LLM for free-form evaluation...")
+ for data_id in tqdm(model_answer.keys(), desc="Querying evaluation LLM"):
model_answer_item = model_answer[data_id]
- gpt_response = prepare_query(model_answer_item, self.api_key)
+ gpt_response = prepare_query(model_answer_item, eval_llm_client=self.eval_llm_client)
if gpt_response.lower() == "yes":
ff_score += 1
elif gpt_response.lower() == "no":
ff_score += 0
else:
- print(f"Warning: {data_id} has invalid GPT-4 response: {gpt_response}")
+ print(f"Warning: {data_id} has invalid evaluation LLM response: {gpt_response}")
print(f"Skipping {data_id}")
continue
diff --git a/pipeline/benchmarks/datasets/mathvista.py b/pipeline/benchmarks/datasets/mathvista.py
index 939f7bb4..2851cc2b 100644
--- a/pipeline/benchmarks/datasets/mathvista.py
+++ b/pipeline/benchmarks/datasets/mathvista.py
@@ -15,6 +15,8 @@
import io
from Levenshtein import distance
+from pipeline.benchmarks.utils.eval_llm import get_eval_llm_client
+
utc_plus_8 = pytz.timezone("Asia/Singapore") # You can also use 'Asia/Shanghai', 'Asia/Taipei', etc.
utc_now = pytz.utc.localize(datetime.datetime.utcnow())
utc_plus_8_time = utc_now.astimezone(utc_plus_8)
@@ -65,41 +67,22 @@
import ast
-def get_chat_response(promot, api_key, model="gpt-3.5-turbo", temperature=0, max_tokens=256, n=1, patience=5, sleep_time=5):
- headers = {
- "Authorization": f"Bearer {api_key}",
- "Content-Type": "application/json",
- }
+def get_chat_response(promot, api_key=None, model="gpt-3.5-turbo", temperature=0, max_tokens=256, n=1, patience=5, sleep_time=5, eval_llm_client=None):
+ if eval_llm_client is None:
+ eval_llm_client = get_eval_llm_client(api_key=api_key, model=model)
messages = [
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": promot},
]
- payload = {"model": model, "messages": messages}
-
- while patience > 0:
- patience -= 1
- try:
- response = requests.post(
- "https://api.openai.com/v1/chat/completions",
- headers=headers,
- data=json.dumps(payload),
- timeout=30,
- )
- response.raise_for_status()
- response_data = response.json()
-
- prediction = response_data["choices"][0]["message"]["content"].strip()
- if prediction != "" and prediction is not None:
- return prediction
-
- except Exception as e:
- if "Rate limit" not in str(e):
- print(e)
- time.sleep(sleep_time)
-
- return ""
+ return eval_llm_client.chat_completion(
+ messages=messages,
+ temperature=temperature,
+ max_tokens=max_tokens,
+ patience=patience,
+ sleep_time=sleep_time,
+ )
def create_test_prompt(demo_prompt, query, response):
@@ -109,7 +92,7 @@ def create_test_prompt(demo_prompt, query, response):
return full_prompt
-def extract_answer(response, problem, quick_extract=False, api_key=None, pid=None, gpt_model="gpt-4-0613"):
+def extract_answer(response, problem, quick_extract=False, api_key=None, pid=None, gpt_model="gpt-4-0613", eval_llm_client=None):
question_type = problem["question_type"]
answer_type = problem["answer_type"]
choices = problem["choices"]
@@ -150,7 +133,7 @@ def extract_answer(response, problem, quick_extract=False, api_key=None, pid=Non
# general extraction
try:
full_prompt = create_test_prompt(demo_prompt, query, response)
- extraction = get_chat_response(full_prompt, api_key=api_key, model=gpt_model, n=1, patience=5, sleep_time=5)
+ extraction = get_chat_response(full_prompt, api_key=api_key, model=gpt_model, n=1, patience=5, sleep_time=5, eval_llm_client=eval_llm_client)
return extraction
except Exception as e:
print(e)
@@ -271,15 +254,14 @@ def __init__(
gpt_model="gpt-4-0613",
debug=False,
quick_extract=False,
+ eval_provider=None,
+ eval_model=None,
):
super().__init__("MathVistaDataset", data_path)
name_converter = {"dev": "validation", "test": "test"}
self.data = load_dataset("Otter-AI/MathVista", split=name_converter[split], cache_dir=cache_dir).to_pandas()
if debug:
self.data = self.data.sample(5)
- # data_path = "/home/luodian/projects/Otter/archived/testmini_image_inside.json"
- # with open(data_path, "r", encoding="utf-8") as f:
- # self.data = json.load(f)
self.debug = debug
self.quick_extract = quick_extract
@@ -290,6 +272,11 @@ def __init__(
self.cur_datetime = utc_plus_8_time.strftime("%Y-%m-%d_%H-%M-%S")
self.api_key = api_key
self.gpt_model = gpt_model
+ self.eval_llm_client = get_eval_llm_client(
+ provider=eval_provider,
+ api_key=api_key,
+ model=eval_model or gpt_model,
+ )
def create_query(self, problem, shot_type):
### [2] Test query
@@ -393,6 +380,7 @@ def _evaluate(self, model):
api_key=self.api_key,
pid=idx_key,
gpt_model=self.gpt_model,
+ eval_llm_client=self.eval_llm_client,
)
results[idx_key].update({"extraction": extraction})
answer = results[idx_key]["answer"]
diff --git a/pipeline/benchmarks/datasets/mmvet.py b/pipeline/benchmarks/datasets/mmvet.py
index d27c01d8..f85b8549 100644
--- a/pipeline/benchmarks/datasets/mmvet.py
+++ b/pipeline/benchmarks/datasets/mmvet.py
@@ -15,6 +15,8 @@
import datetime
from Levenshtein import distance
+from pipeline.benchmarks.utils.eval_llm import get_eval_llm_client
+
utc_plus_8 = pytz.timezone("Asia/Singapore") # You can also use 'Asia/Shanghai', 'Asia/Taipei', etc.
utc_now = pytz.utc.localize(datetime.datetime.utcnow())
utc_plus_8_time = utc_now.astimezone(utc_plus_8)
@@ -47,6 +49,8 @@ def __init__(
prompt: str = MM_VET_PROMPT,
decimail_places: int = 1, # number of decimal places to round to
debug: bool = False,
+ eval_provider: str = None,
+ eval_model: str = None,
):
super().__init__("MMVetDataset", data_path)
self.df = load_dataset(data_path, split=split, cache_dir=cache_dir).to_pandas()
@@ -58,8 +62,12 @@ def __init__(
self.api_key = api_key
self.cur_datetime = utc_plus_8_time.strftime("%Y-%m-%d_%H-%M-%S")
self.debug = debug
+ self.eval_llm_client = get_eval_llm_client(
+ provider=eval_provider,
+ api_key=api_key,
+ model=eval_model or gpt_model,
+ )
self.prepare()
- self.client = OpenAI(api_key=api_key)
def prepare(self):
self.counter = Counter()
@@ -183,8 +191,12 @@ def need_more_runs():
while not grade_sample_run_complete:
try:
- response = self.client.chat.completions.create(model=self.gpt_model, max_tokens=3, temperature=temperature, messages=messages, timeout=15)
- content = response["choices"][0]["message"]["content"]
+ content, response_data = self.eval_llm_client.chat_completion_raw(
+ messages=messages,
+ temperature=temperature,
+ max_tokens=3,
+ timeout=15,
+ )
flag = True
try_time = 1
while flag:
@@ -211,8 +223,12 @@ def need_more_runs():
messages = [
{"role": "user", "content": question},
]
- response = self.client.chat.completions.create(model=self.gpt_model, max_tokens=3, temperature=temperature, messages=messages, timeout=15)
- content = response["choices"][0]["message"]["content"]
+ content, response_data = self.eval_llm_client.chat_completion_raw(
+ messages=messages,
+ temperature=temperature,
+ max_tokens=3,
+ timeout=15,
+ )
try_time += 1
temperature += 0.5
print(f"{id} try {try_time} times")
@@ -222,17 +238,17 @@ def need_more_runs():
flag = False
grade_sample_run_complete = True
except Exception as e:
- # gpt4 may have token rate limit
+ # evaluation LLM may have token rate limit
print(e)
print("sleep 15s")
time.sleep(15)
if len(sample_grade["model"]) >= j + 1:
- sample_grade["model"][j] = response["model"]
+ sample_grade["model"][j] = response_data.get("model", self.eval_llm_client.model)
sample_grade["content"][j] = content
sample_grade["score"][j] = score
else:
- sample_grade["model"].append(response["model"])
+ sample_grade["model"].append(response_data.get("model", self.eval_llm_client.model))
sample_grade["content"].append(content)
sample_grade["score"].append(score)
sample_grade["query"] = line["instruction"]
diff --git a/pipeline/benchmarks/utils/__init__.py b/pipeline/benchmarks/utils/__init__.py
new file mode 100644
index 00000000..0b0af5b4
--- /dev/null
+++ b/pipeline/benchmarks/utils/__init__.py
@@ -0,0 +1 @@
+from .eval_llm import EvalLLMClient, get_eval_llm_client
diff --git a/pipeline/benchmarks/utils/eval_llm.py b/pipeline/benchmarks/utils/eval_llm.py
new file mode 100644
index 00000000..1f0d14ab
--- /dev/null
+++ b/pipeline/benchmarks/utils/eval_llm.py
@@ -0,0 +1,210 @@
+"""
+Configurable LLM client for benchmark evaluation.
+
+Supports multiple LLM providers (OpenAI, MiniMax) for evaluation judging tasks
+such as answer extraction and correctness scoring.
+
+Usage:
+ # Auto-detect provider from environment variables
+ client = get_eval_llm_client()
+
+ # Explicit provider selection
+ client = EvalLLMClient(provider="minimax", api_key="your-key")
+
+ # Chat completion
+ content = client.chat_completion(
+ messages=[{"role": "user", "content": "Hello"}],
+ temperature=0,
+ max_tokens=256,
+ )
+
+Environment variables:
+ EVAL_LLM_PROVIDER: Provider name ("openai" or "minimax")
+ OPENAI_API_KEY: API key for OpenAI
+ MINIMAX_API_KEY: API key for MiniMax
+"""
+
+import json
+import os
+import re
+import time
+from typing import Dict, List, Optional, Tuple
+
+import requests
+
+
+PROVIDER_CONFIGS: Dict[str, Dict[str, str]] = {
+ "openai": {
+ "api_base": "https://api.openai.com/v1",
+ "default_model": "gpt-4-0613",
+ "api_key_env": "OPENAI_API_KEY",
+ },
+ "minimax": {
+ "api_base": "https://api.minimax.io/v1",
+ # Default model: MiniMax-M3 (latest, 512K context, 128K max output, image input support).
+ # Other supported models: MiniMax-M2.7, MiniMax-M2.7-highspeed.
+ "default_model": "MiniMax-M3",
+ "api_key_env": "MINIMAX_API_KEY",
+ },
+}
+
+
+class EvalLLMClient:
+ """Configurable LLM client for evaluation tasks.
+
+ Supports OpenAI and MiniMax providers with automatic handling of
+ provider-specific quirks (temperature clamping, think-tag stripping).
+ """
+
+    def __init__(
+        self,
+        provider: Optional[str] = None,
+        api_key: Optional[str] = None,
+        model: Optional[str] = None,
+        api_base: Optional[str] = None,
+    ):
+        # Resolution order: explicit argument, EVAL_LLM_PROVIDER, MINIMAX_API_KEY auto-detect, OpenAI.
+        if provider is None:
+            provider = os.environ.get("EVAL_LLM_PROVIDER", "").strip().lower()
+            if not provider:
+                provider = "minimax" if os.environ.get("MINIMAX_API_KEY") else "openai"
+        else:
+            # Normalize explicit names too, so "MiniMax" still enables the MiniMax-specific handling.
+            provider = provider.strip().lower()
+
+        self.provider = provider
+        config = PROVIDER_CONFIGS.get(provider, PROVIDER_CONFIGS["openai"])  # unknown names use OpenAI defaults
+        self.api_base = api_base or config["api_base"]
+        self.model = model or config["default_model"]
+        self.api_key = api_key or os.environ.get(config["api_key_env"], "")
+
+    def _clamp_temperature(self, temperature: float) -> float:
+        """Clamp temperature into [0.01, 1.0] for MiniMax, which requires (0.0, 1.0]; other providers pass through."""
+        if self.provider == "minimax":
+            return min(max(temperature, 0.01), 1.0)
+        return temperature
+
+    def _strip_think_tags(self, content: str) -> str:
+        """Strip <think>...</think> reasoning blocks from MiniMax responses; other providers are returned unchanged."""
+        if self.provider == "minimax" and "<think>" in content:
+            content = re.sub(r"<think>.*?</think>\s*", "", content, flags=re.DOTALL).strip()
+        return content
+
+ def chat_completion(
+ self,
+ messages: List[Dict[str, str]],
+ temperature: float = 0,
+ max_tokens: int = 256,
+ patience: int = 5,
+ sleep_time: int = 5,
+ timeout: int = 30,
+ ) -> str:
+ """Send a chat completion request and return the response content.
+
+ Args:
+ messages: List of message dicts with 'role' and 'content'.
+ temperature: Sampling temperature.
+ max_tokens: Maximum tokens in response.
+ patience: Number of retries on failure.
+ sleep_time: Seconds to wait between retries.
+ timeout: Request timeout in seconds.
+
+ Returns:
+ The response content string, or empty string on failure.
+ """
+ headers = {
+ "Authorization": f"Bearer {self.api_key}",
+ "Content-Type": "application/json",
+ }
+
+ payload = {
+ "model": self.model,
+ "messages": messages,
+ "temperature": self._clamp_temperature(temperature),
+ "max_tokens": max_tokens,
+ }
+
+ while patience > 0:
+ patience -= 1
+ try:
+ response = requests.post(
+ f"{self.api_base}/chat/completions",
+ headers=headers,
+ data=json.dumps(payload),
+ timeout=timeout,
+ )
+ response.raise_for_status()
+ response_data = response.json()
+
+ content = response_data["choices"][0]["message"]["content"].strip()
+ content = self._strip_think_tags(content)
+ if content:
+ return content
+
+ except Exception as e:
+ if "Rate limit" not in str(e):
+ print(e)
+ time.sleep(sleep_time)
+
+ return ""
+
+ def chat_completion_raw(
+ self,
+ messages: List[Dict[str, str]],
+ temperature: float = 0,
+ max_tokens: int = 256,
+ timeout: int = 15,
+ ) -> Tuple[str, dict]:
+ """Send a chat completion request and return both content and raw response.
+
+ Used by evaluation datasets that need the full response object
+ (e.g., MMVet which tracks the model name).
+
+ Returns:
+ Tuple of (content_string, raw_response_dict).
+ """
+ headers = {
+ "Authorization": f"Bearer {self.api_key}",
+ "Content-Type": "application/json",
+ }
+
+ payload = {
+ "model": self.model,
+ "messages": messages,
+ "temperature": self._clamp_temperature(temperature),
+ "max_tokens": max_tokens,
+ }
+
+ response = requests.post(
+ f"{self.api_base}/chat/completions",
+ headers=headers,
+ data=json.dumps(payload),
+ timeout=timeout,
+ )
+ response.raise_for_status()
+ response_data = response.json()
+
+ content = response_data["choices"][0]["message"]["content"].strip()
+ content = self._strip_think_tags(content)
+ return content, response_data
+
+
+def get_eval_llm_client(
+ provider: Optional[str] = None,
+ api_key: Optional[str] = None,
+ model: Optional[str] = None,
+ api_base: Optional[str] = None,
+) -> EvalLLMClient:
+ """Factory function to create an EvalLLMClient.
+
+ Auto-detects provider from environment variables if not specified:
+ - EVAL_LLM_PROVIDER: Explicit provider name
+ - MINIMAX_API_KEY: Auto-selects MiniMax if set
+ - Falls back to OpenAI otherwise
+ """
+ return EvalLLMClient(
+ provider=provider,
+ api_key=api_key,
+ model=model,
+ api_base=api_base,
+ )
diff --git a/unit_tests/test_eval_llm.py b/unit_tests/test_eval_llm.py
new file mode 100644
index 00000000..3937da30
--- /dev/null
+++ b/unit_tests/test_eval_llm.py
@@ -0,0 +1,260 @@
+"""Unit tests for the configurable evaluation LLM client."""
+
+import json
+import os
+import unittest
+from unittest.mock import patch, MagicMock
+
+from pipeline.benchmarks.utils.eval_llm import EvalLLMClient, get_eval_llm_client, PROVIDER_CONFIGS
+
+
+class TestProviderConfigs(unittest.TestCase):
+ """Test provider configuration constants."""
+
+ def test_openai_config_exists(self):
+ self.assertIn("openai", PROVIDER_CONFIGS)
+ self.assertEqual(PROVIDER_CONFIGS["openai"]["api_base"], "https://api.openai.com/v1")
+ self.assertEqual(PROVIDER_CONFIGS["openai"]["api_key_env"], "OPENAI_API_KEY")
+
+ def test_minimax_config_exists(self):
+ self.assertIn("minimax", PROVIDER_CONFIGS)
+ self.assertEqual(PROVIDER_CONFIGS["minimax"]["api_base"], "https://api.minimax.io/v1")
+ self.assertEqual(PROVIDER_CONFIGS["minimax"]["default_model"], "MiniMax-M3")
+ self.assertEqual(PROVIDER_CONFIGS["minimax"]["api_key_env"], "MINIMAX_API_KEY")
+
+
+class TestEvalLLMClientInit(unittest.TestCase):
+ """Test EvalLLMClient initialization."""
+
+ def test_explicit_openai_provider(self):
+ client = EvalLLMClient(provider="openai", api_key="test-key")
+ self.assertEqual(client.provider, "openai")
+ self.assertEqual(client.api_base, "https://api.openai.com/v1")
+ self.assertEqual(client.model, "gpt-4-0613")
+ self.assertEqual(client.api_key, "test-key")
+
+ def test_explicit_minimax_provider(self):
+ client = EvalLLMClient(provider="minimax", api_key="test-key")
+ self.assertEqual(client.provider, "minimax")
+ self.assertEqual(client.api_base, "https://api.minimax.io/v1")
+ self.assertEqual(client.model, "MiniMax-M3")
+ self.assertEqual(client.api_key, "test-key")
+
+ def test_custom_model_override(self):
+ client = EvalLLMClient(provider="minimax", api_key="key", model="MiniMax-M2.7")
+ self.assertEqual(client.model, "MiniMax-M2.7")
+
+ def test_custom_api_base_override(self):
+ client = EvalLLMClient(provider="openai", api_key="key", api_base="https://custom.api.com/v1")
+ self.assertEqual(client.api_base, "https://custom.api.com/v1")
+
+ @patch.dict(os.environ, {"MINIMAX_API_KEY": "env-minimax-key"}, clear=False)
+ def test_auto_detect_minimax_from_env(self):
+ client = EvalLLMClient()
+ self.assertEqual(client.provider, "minimax")
+ self.assertEqual(client.api_key, "env-minimax-key")
+
+ @patch.dict(os.environ, {"EVAL_LLM_PROVIDER": "minimax", "MINIMAX_API_KEY": "env-key"}, clear=False)
+ def test_explicit_env_provider(self):
+ client = EvalLLMClient()
+ self.assertEqual(client.provider, "minimax")
+
+ @patch.dict(os.environ, {"OPENAI_API_KEY": "env-openai-key"}, clear=False)
+ def test_default_to_openai(self):
+ env = os.environ.copy()
+ env.pop("MINIMAX_API_KEY", None)
+ env.pop("EVAL_LLM_PROVIDER", None)
+ with patch.dict(os.environ, env, clear=True):
+ client = EvalLLMClient()
+ self.assertEqual(client.provider, "openai")
+
+
+class TestTemperatureClamping(unittest.TestCase):
+ """Test temperature clamping for MiniMax."""
+
+ def test_minimax_clamps_zero_temperature(self):
+ client = EvalLLMClient(provider="minimax", api_key="key")
+ self.assertEqual(client._clamp_temperature(0.0), 0.01)
+
+ def test_minimax_preserves_nonzero_temperature(self):
+ client = EvalLLMClient(provider="minimax", api_key="key")
+ self.assertEqual(client._clamp_temperature(0.7), 0.7)
+
+ def test_openai_preserves_zero_temperature(self):
+ client = EvalLLMClient(provider="openai", api_key="key")
+ self.assertEqual(client._clamp_temperature(0.0), 0.0)
+
+
+class TestThinkTagStripping(unittest.TestCase):
+    """Test <think>...</think> tag stripping for MiniMax."""
+
+    def test_minimax_strips_think_tags(self):
+        client = EvalLLMClient(provider="minimax", api_key="key")
+        content = "<think>Let me think about this...</think>\nThe answer is yes."
+        self.assertEqual(client._strip_think_tags(content), "The answer is yes.")
+
+    def test_minimax_strips_multiline_think_tags(self):
+        client = EvalLLMClient(provider="minimax", api_key="key")
+        content = "<think>\nStep 1: analyze\nStep 2: conclude\n</think>\n0.8"
+        self.assertEqual(client._strip_think_tags(content), "0.8")
+
+    def test_minimax_preserves_content_without_think_tags(self):
+        client = EvalLLMClient(provider="minimax", api_key="key")
+        content = "The answer is yes."
+        self.assertEqual(client._strip_think_tags(content), "The answer is yes.")
+
+    def test_openai_preserves_all_content(self):
+        client = EvalLLMClient(provider="openai", api_key="key")
+        content = "<think>some content</think>\nThe answer is yes."
+        self.assertEqual(client._strip_think_tags(content), content)
+
+
+class TestChatCompletion(unittest.TestCase):
+ """Test chat completion with mocked HTTP responses."""
+
+ @patch("pipeline.benchmarks.utils.eval_llm.requests.post")
+ def test_successful_openai_completion(self, mock_post):
+ mock_response = MagicMock()
+ mock_response.status_code = 200
+ mock_response.raise_for_status = MagicMock()
+ mock_response.json.return_value = {
+ "choices": [{"message": {"content": "yes"}}],
+ "model": "gpt-4-0613",
+ }
+ mock_post.return_value = mock_response
+
+ client = EvalLLMClient(provider="openai", api_key="test-key")
+ result = client.chat_completion(
+ messages=[{"role": "user", "content": "Is this correct?"}],
+ temperature=0,
+ max_tokens=256,
+ )
+
+ self.assertEqual(result, "yes")
+ mock_post.assert_called_once()
+ call_args = mock_post.call_args
+ self.assertIn("api.openai.com", call_args[0][0])
+
+    @patch("pipeline.benchmarks.utils.eval_llm.requests.post")
+    def test_successful_minimax_completion(self, mock_post):
+        """MiniMax replies have their <think> block stripped and temperature 0 is clamped to 0.01."""
+        mock_response = MagicMock()
+        mock_response.status_code = 200
+        mock_response.raise_for_status = MagicMock()
+        mock_response.json.return_value = {
+            "choices": [{"message": {"content": "<think>analyzing...</think>\n0.8"}}],
+            "model": "MiniMax-M3",
+        }
+        mock_post.return_value = mock_response
+
+        client = EvalLLMClient(provider="minimax", api_key="test-key")
+        result = client.chat_completion(
+            messages=[{"role": "user", "content": "Score this answer"}],
+            temperature=0, max_tokens=3,
+        )
+
+        self.assertEqual(result, "0.8")
+        call_args = mock_post.call_args
+        payload = json.loads(call_args[1]["data"])
+        self.assertEqual(payload["temperature"], 0.01)  # clamped
+        self.assertIn("api.minimax.io", call_args[0][0])
+
+ @patch("pipeline.benchmarks.utils.eval_llm.requests.post")
+ @patch("pipeline.benchmarks.utils.eval_llm.time.sleep")
+ def test_retry_on_failure(self, mock_sleep, mock_post):
+ mock_fail = MagicMock()
+ mock_fail.raise_for_status.side_effect = Exception("Rate limit exceeded")
+
+ mock_success = MagicMock()
+ mock_success.raise_for_status = MagicMock()
+ mock_success.json.return_value = {
+ "choices": [{"message": {"content": "yes"}}],
+ }
+
+ mock_post.side_effect = [mock_fail, mock_success]
+
+ client = EvalLLMClient(provider="openai", api_key="test-key")
+ result = client.chat_completion(
+ messages=[{"role": "user", "content": "test"}],
+ patience=3,
+ sleep_time=1,
+ )
+
+ self.assertEqual(result, "yes")
+ self.assertEqual(mock_post.call_count, 2)
+
+ @patch("pipeline.benchmarks.utils.eval_llm.requests.post")
+ @patch("pipeline.benchmarks.utils.eval_llm.time.sleep")
+ def test_returns_empty_on_exhausted_retries(self, mock_sleep, mock_post):
+ mock_fail = MagicMock()
+ mock_fail.raise_for_status.side_effect = Exception("Server error")
+ mock_post.return_value = mock_fail
+
+ client = EvalLLMClient(provider="openai", api_key="test-key")
+ result = client.chat_completion(
+ messages=[{"role": "user", "content": "test"}],
+ patience=2,
+ sleep_time=0,
+ )
+
+ self.assertEqual(result, "")
+ self.assertEqual(mock_post.call_count, 2)
+
+
+class TestChatCompletionRaw(unittest.TestCase):
+ """Test raw chat completion that returns response dict."""
+
+ @patch("pipeline.benchmarks.utils.eval_llm.requests.post")
+ def test_returns_content_and_response_data(self, mock_post):
+ response_data = {
+ "choices": [{"message": {"content": "0.7"}}],
+ "model": "gpt-4-0613",
+ }
+ mock_response = MagicMock()
+ mock_response.raise_for_status = MagicMock()
+ mock_response.json.return_value = response_data
+ mock_post.return_value = mock_response
+
+ client = EvalLLMClient(provider="openai", api_key="test-key")
+ content, raw = client.chat_completion_raw(
+ messages=[{"role": "user", "content": "test"}],
+ )
+
+ self.assertEqual(content, "0.7")
+ self.assertEqual(raw["model"], "gpt-4-0613")
+
+    @patch("pipeline.benchmarks.utils.eval_llm.requests.post")
+    def test_minimax_strips_think_tags_in_raw(self, mock_post):
+        """chat_completion_raw strips the <think> block from content but returns the raw payload intact."""
+        response_data = {
+            "choices": [{"message": {"content": "<think>thinking</think>\n0.9"}}],
+            "model": "MiniMax-M3",
+        }
+        mock_response = MagicMock()
+        mock_response.raise_for_status = MagicMock()
+        mock_response.json.return_value = response_data
+        mock_post.return_value = mock_response
+
+        client = EvalLLMClient(provider="minimax", api_key="test-key")
+        content, raw = client.chat_completion_raw(messages=[{"role": "user", "content": "test"}])
+
+        self.assertEqual(content, "0.9")
+        self.assertEqual(raw["model"], "MiniMax-M3")
+
+
+class TestGetEvalLLMClient(unittest.TestCase):
+ """Test factory function."""
+
+ def test_creates_client_with_defaults(self):
+ client = get_eval_llm_client(provider="openai", api_key="key")
+ self.assertIsInstance(client, EvalLLMClient)
+ self.assertEqual(client.provider, "openai")
+
+ def test_creates_minimax_client(self):
+ client = get_eval_llm_client(provider="minimax", api_key="key", model="MiniMax-M2.7")
+ self.assertEqual(client.provider, "minimax")
+ self.assertEqual(client.model, "MiniMax-M2.7")
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/unit_tests/test_eval_llm_integration.py b/unit_tests/test_eval_llm_integration.py
new file mode 100644
index 00000000..dfa98c21
--- /dev/null
+++ b/unit_tests/test_eval_llm_integration.py
@@ -0,0 +1,79 @@
+"""Integration tests for MiniMax evaluation LLM provider.
+
+These tests make real API calls to the MiniMax API.
+Set MINIMAX_API_KEY environment variable to run.
+
+Usage:
+ MINIMAX_API_KEY=your-key python -m pytest unit_tests/test_eval_llm_integration.py -v
+"""
+
+import os
+import unittest
+
+from pipeline.benchmarks.utils.eval_llm import EvalLLMClient, get_eval_llm_client
+
+
+MINIMAX_API_KEY = os.environ.get("MINIMAX_API_KEY", "")
+
+
+@unittest.skipUnless(MINIMAX_API_KEY, "MINIMAX_API_KEY not set")
+class TestMiniMaxIntegration(unittest.TestCase):
+ """Integration tests against the live MiniMax API."""
+
+ def setUp(self):
+ self.client = EvalLLMClient(
+ provider="minimax",
+ api_key=MINIMAX_API_KEY,
+ model="MiniMax-M3",
+ )
+
+ def test_basic_chat_completion(self):
+ result = self.client.chat_completion(
+ messages=[
+ {"role": "system", "content": "You are a helpful assistant. Answer briefly."},
+ {"role": "user", "content": "What is 2 + 2? Answer with just the number."},
+ ],
+ temperature=0.01,
+ max_tokens=256,
+ )
+ self.assertIn("4", result)
+
+ def test_evaluation_judge_yes_no(self):
+ result = self.client.chat_completion(
+ messages=[
+ {"role": "system", "content": "You are a helpful AI assistant. Your task is to judge whether the model response is correct to answer the given question or not."},
+ {"role": "user", "content": "Question: What color is the sky?\nModel Response: The sky is blue.\nGround Truth: blue\nWill the model response be considered correct? You should only answer yes or no."},
+ ],
+ temperature=0.01,
+ max_tokens=256,
+ )
+ self.assertIn("yes", result.lower())
+
+ def test_scoring_correctness(self):
+ result = self.client.chat_completion(
+ messages=[
+ {"role": "user", "content": "Compare the ground truth and prediction, give a correctness score from 0.0 to 1.0.\n\nQuestion: What is 2+2?\nGround Truth: 4\nPrediction: 4\n\nJust output the score number."},
+ ],
+ temperature=0.01,
+ max_tokens=256,
+ )
+ self.assertTrue(len(result) > 0, "Response should not be empty")
+ # Should contain a high score
+ self.assertTrue(
+ any(s in result for s in ["1.0", "1", "0.9", "0.8"]),
+ f"Expected high score in response: {result}",
+ )
+
+
+@unittest.skipUnless(MINIMAX_API_KEY, "MINIMAX_API_KEY not set")
+class TestMiniMaxAutoDetect(unittest.TestCase):
+ """Test auto-detection of MiniMax provider."""
+
+ def test_auto_detect_creates_minimax_client(self):
+ client = get_eval_llm_client()
+ self.assertEqual(client.provider, "minimax")
+ self.assertEqual(client.api_base, "https://api.minimax.io/v1")
+
+
+if __name__ == "__main__":
+ unittest.main()