diff --git a/README.md b/README.md
index d8578827..24362229 100755
--- a/README.md
+++ b/README.md
@@ -8,6 +8,7 @@
 ![](https://img.shields.io/github/stars/luodian/otter?style=social)
 [![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2FLuodian%2Fotter&count_bg=%23FFA500&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=visitors&edge_flat=false)](https://hits.seeyoufarm.com)
 [![litellm](https://img.shields.io/badge/%20%F0%9F%9A%85%20liteLLM-OpenAI%7CAzure%7CAnthropic%7CPalm%7CCohere-blue?color=green)](https://github.com/BerriAI/litellm)
+[![MiniMax](https://img.shields.io/badge/MiniMax-M3-blue)](https://www.minimax.io)
 
 [Project Credits](https://github.com/Luodian/Otter/blob/main/docs/credits.md) | [Otter Paper](https://arxiv.org/abs/2305.03726) | [OtterHD Paper](https://arxiv.org/abs/2311.04219) | [MIMIC-IT Paper](https://arxiv.org/abs/2306.05425)
 
@@ -41,6 +42,7 @@ For who in the mainland China: [![Open in OpenXLab](https://cdn-static.openxlab.
             split: test
             prompt: Answer with the option's letter from the given choices directly.
             api_key: [Your API Key] # GPT4 or GPT3.5 to evaluate the answers and ground truth.
+            eval_provider: minimax # Optional: use "minimax" or "openai" (default)
             debug: true # put debug=true will save the model response in log file.
         - name: mme
             split: test
@@ -70,6 +72,18 @@ For who in the mainland China: [![Open in OpenXLab](https://cdn-static.openxlab.
 **[2023-08]**
 
 1. Added Support for using Azure, Anthropic, Palm, Cohere models for Self-Instruct with Syphus pipeline, for information on usage modify [this line](https://github.com/Luodian/Otter/blob/16d73b399fac6352ebff7504b1acb1f228fbf3f4/mimic-it/syphus/file_utils.py#L53) with your selected model and set your API keys in the environment. For more information see [LiteLLM](https://github.com/BerriAI/litellm/)
+2. Added [MiniMax](https://www.minimax.io) as a supported LLM provider for both the Syphus data generation pipeline and benchmark evaluation. Configure via environment variables:
+   ```bash
+   # For Syphus data generation (via liteLLM)
+   export MINIMAX_API_KEY="your-minimax-key"
+   export OPENAI_API_ENGINE="openai/MiniMax-M3"
+   export OPENAI_API_BASE="https://api.minimax.io/v1"
+
+   # For benchmark evaluation (MagnifierBench, MathVista, MM-Vet)
+   export EVAL_LLM_PROVIDER="minimax"
+   export MINIMAX_API_KEY="your-minimax-key"
+   ```
+   MiniMax M3 offers a 512K context window, up to 128K max output, and image input support. M2.7 and M2.7-highspeed remain available as alternatives. See `pipeline/benchmarks/utils/eval_llm.py` for details.
 
 **[2023-07]: Anouncing MIMIC-IT dataset for multiple interleaved image-text/video instruction tuning.**
 
diff --git a/mimic-it/syphus/file_utils.py b/mimic-it/syphus/file_utils.py
index ec1870ef..8ac1d351 100755
--- a/mimic-it/syphus/file_utils.py
+++ b/mimic-it/syphus/file_utils.py
@@ -1,5 +1,18 @@
 """
 file utils
+
+Supports multiple LLM providers via liteLLM. Configure via environment variables:
+
+OpenAI (default):
+    export OPENAI_API_KEY="your-openai-key"
+    export OPENAI_API_ENGINE="gpt-4"
+
+MiniMax:
+    export MINIMAX_API_KEY="your-minimax-key"
+    export OPENAI_API_ENGINE="openai/MiniMax-M3"
+    export OPENAI_API_BASE="https://api.minimax.io/v1"
+
+See https://docs.litellm.ai/docs/providers for all supported providers.
 """
 
 import json
@@ -13,11 +26,15 @@
 engine = os.environ.get("OPENAI_API_ENGINE", "davinci")
 
 
-def query_gpt(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str]:
+def query_llm(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str]:
     """
-    Query the GPT API with the given inputs.
+    Query the LLM API with the given inputs.
+
+    Supports multiple providers via liteLLM (OpenAI, MiniMax, Anthropic, etc.).
+    Configure via OPENAI_API_ENGINE and OPENAI_API_BASE environment variables.
+
     Returns:
-        Response (dict[str, str]): the response from GPT API.
+        Response (dict[str, str]): the response from the LLM API.
         Input ID (str): the id that specifics the input.
     """
     if dataset_name == "3d.SceneNavigation":
@@ -47,13 +64,19 @@ def query_gpt(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str
                 "content": inputs["query_input"]["sentences"],
             },
         )
+
+    # Clamp temperature for MiniMax (requires (0.0, 1.0])
+    temperature = 0.7
+    if os.environ.get("MINIMAX_API_KEY"):
+        temperature = max(temperature, 0.01)
+
     succuss = True
     while succuss:
         try:
             response = completion(
-                engine=engine,  # defined by os.environ, default engine="chatgpt0301",
+                engine=engine,  # defined by os.environ, default engine="davinci"
                 messages=messages,
-                temperature=0.7,
+                temperature=temperature,
                 max_tokens=3200,
                 top_p=0.95,
                 frequency_penalty=0,
@@ -73,6 +96,10 @@ def query_gpt(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str
     return response, inputs["query_input"]["id"]
 
 
+# Backward-compatible alias
+query_gpt = query_llm
+
+
 def split_question_and_answer(pair_of_answer: str, file_id: str) -> tuple[bool, dict[str, str]]:
     """
     Split the question and answer from the pair of question and answer.
diff --git a/pipeline/benchmarks/datasets/magnifierbench.py b/pipeline/benchmarks/datasets/magnifierbench.py
index a0c4ed97..7eb99fd4 100644
--- a/pipeline/benchmarks/datasets/magnifierbench.py
+++ b/pipeline/benchmarks/datasets/magnifierbench.py
@@ -16,58 +16,40 @@
 import time
 import requests
 
+from pipeline.benchmarks.utils.eval_llm import get_eval_llm_client
+
 utc_plus_8 = pytz.timezone("Asia/Singapore")  # You can also use 'Asia/Shanghai', 'Asia/Taipei', etc.
 utc_now = pytz.utc.localize(datetime.datetime.utcnow())
 utc_plus_8_time = utc_now.astimezone(utc_plus_8)
 
 
-def get_chat_response(promot, api_key, model="gpt-4-0613", temperature=0, max_tokens=256, n=1, patience=5, sleep_time=5):
-    headers = {
-        "Authorization": f"Bearer {api_key}",
-        "Content-Type": "application/json",
-    }
+def get_chat_response(promot, api_key=None, model="gpt-4-0613", temperature=0, max_tokens=256, n=1, patience=5, sleep_time=5, eval_llm_client=None):
+    if eval_llm_client is None:
+        eval_llm_client = get_eval_llm_client(api_key=api_key, model=model)
 
     messages = [
         {"role": "system", "content": "You are a helpful AI assistant. Your task is to judge whether the model response is correct to answer the given question or not."},
         {"role": "user", "content": promot},
     ]
 
-    payload = {"model": model, "messages": messages}
-
-    while patience > 0:
-        patience -= 1
-        try:
-            response = requests.post(
-                "https://api.openai.com/v1/chat/completions",
-                headers=headers,
-                data=json.dumps(payload),
-                timeout=30,
-            )
-            response.raise_for_status()
-            response_data = response.json()
-
-            prediction = response_data["choices"][0]["message"]["content"].strip()
-            if prediction != "" and prediction is not None:
-                return prediction
-
-        except Exception as e:
-            if "Rate limit" not in str(e):
-                print(e)
-            time.sleep(sleep_time)
-
-    return ""
+    return eval_llm_client.chat_completion(
+        messages=messages,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        patience=patience,
+        sleep_time=sleep_time,
+    )
 
 
-def prepare_query(model_answer_item, api_key):
+def prepare_query(model_answer_item, api_key=None, eval_llm_client=None):
     freeform_question = model_answer_item["freeform_question"]
     freeform_response = model_answer_item["freeform_response"]
     correct_answer = model_answer_item["freeform_answer"]
 
-    # Formulating the prompt for ChatGPT
+    # Formulating the prompt for evaluation LLM
     prompt = f"Question: {freeform_question}\nModel Response: {freeform_response}\nGround Truth: {correct_answer}\nWill the model response be considered correct? You should only answer yes or no."
 
-    # Querying ChatGPT
-    chat_response = get_chat_response(prompt, api_key)
+    chat_response = get_chat_response(prompt, api_key=api_key, eval_llm_client=eval_llm_client)
 
     return chat_response
 
@@ -83,6 +65,8 @@ def __init__(
         debug: bool = False,
         prompt="",
         api_key=None,
+        eval_provider=None,
+        eval_model=None,
     ):
         super().__init__("MagnifierBench", data_path)
 
@@ -95,6 +79,11 @@ def __init__(
         self.debug = debug
         self.prompt = prompt
         self.api_key = api_key
+        self.eval_llm_client = get_eval_llm_client(
+            provider=eval_provider,
+            api_key=api_key,
+            model=eval_model,
+        )
 
     def parse_pred_ans(self, pred_ans, question):
         match = re.search(r"The answer is ([A-D])", pred_ans)
@@ -122,10 +111,6 @@ def parse_pred_ans(self, pred_ans, question):
     def _evaluate(self, model):
         model_score_dict = {}
 
-        # output_path = os.path.join(self.default_output_path, f"{model.name}_{self.cur_datetime}")
-        # if not os.path.exists(output_path):
-        #     os.makedirs(output_path)
-        # model_path: str = "Salesforce/instructblip-vicuna-7b"
         model_version = model.name.split("/")[-1]
         model_answer_path = os.path.join(self.default_output_path, f"{model_version}_{self.cur_datetime}_answer.json")
         result_path = os.path.join(self.default_output_path, f"{model_version}_{self.cur_datetime}_score.json")
@@ -186,16 +171,16 @@ def _evaluate(self, model):
         model_score_dict["total"] = len(self.data)
         model_score_dict["accuracy"] = score / len(self.data)
 
-        print(f"Start query GPT-4 for free-form evaluation...")
-        for data_id in tqdm(model_answer.keys(), desc="Querying GPT-4"):
+        print(f"Start query evaluation LLM for free-form evaluation...")
+        for data_id in tqdm(model_answer.keys(), desc="Querying evaluation LLM"):
             model_answer_item = model_answer[data_id]
-            gpt_response = prepare_query(model_answer_item, self.api_key)
+            gpt_response = prepare_query(model_answer_item, eval_llm_client=self.eval_llm_client)
             if gpt_response.lower() == "yes":
                 ff_score += 1
             elif gpt_response.lower() == "no":
                 ff_score += 0
             else:
-                print(f"Warning: {data_id} has invalid GPT-4 response: {gpt_response}")
+                print(f"Warning: {data_id} has invalid evaluation LLM response: {gpt_response}")
                 print(f"Skipping {data_id}")
                 continue
 
diff --git a/pipeline/benchmarks/datasets/mathvista.py b/pipeline/benchmarks/datasets/mathvista.py
index 939f7bb4..2851cc2b 100644
--- a/pipeline/benchmarks/datasets/mathvista.py
+++ b/pipeline/benchmarks/datasets/mathvista.py
@@ -15,6 +15,8 @@
 import io
 from Levenshtein import distance
 
+from pipeline.benchmarks.utils.eval_llm import get_eval_llm_client
+
 utc_plus_8 = pytz.timezone("Asia/Singapore")  # You can also use 'Asia/Shanghai', 'Asia/Taipei', etc.
 utc_now = pytz.utc.localize(datetime.datetime.utcnow())
 utc_plus_8_time = utc_now.astimezone(utc_plus_8)
@@ -65,41 +67,22 @@
 import ast
 
 
-def get_chat_response(promot, api_key, model="gpt-3.5-turbo", temperature=0, max_tokens=256, n=1, patience=5, sleep_time=5):
-    headers = {
-        "Authorization": f"Bearer {api_key}",
-        "Content-Type": "application/json",
-    }
+def get_chat_response(promot, api_key=None, model="gpt-3.5-turbo", temperature=0, max_tokens=256, n=1, patience=5, sleep_time=5, eval_llm_client=None):
+    if eval_llm_client is None:
+        eval_llm_client = get_eval_llm_client(api_key=api_key, model=model)
 
     messages = [
         {"role": "system", "content": "You are a helpful AI assistant."},
         {"role": "user", "content": promot},
     ]
 
-    payload = {"model": model, "messages": messages}
-
-    while patience > 0:
-        patience -= 1
-        try:
-            response = requests.post(
-                "https://api.openai.com/v1/chat/completions",
-                headers=headers,
-                data=json.dumps(payload),
-                timeout=30,
-            )
-            response.raise_for_status()
-            response_data = response.json()
-
-            prediction = response_data["choices"][0]["message"]["content"].strip()
-            if prediction != "" and prediction is not None:
-                return prediction
-
-        except Exception as e:
-            if "Rate limit" not in str(e):
-                print(e)
-            time.sleep(sleep_time)
-
-    return ""
+    return eval_llm_client.chat_completion(
+        messages=messages,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        patience=patience,
+        sleep_time=sleep_time,
+    )
 
 
 def create_test_prompt(demo_prompt, query, response):
@@ -109,7 +92,7 @@ def create_test_prompt(demo_prompt, query, response):
     return full_prompt
 
 
-def extract_answer(response, problem, quick_extract=False, api_key=None, pid=None, gpt_model="gpt-4-0613"):
+def extract_answer(response, problem, quick_extract=False, api_key=None, pid=None, gpt_model="gpt-4-0613", eval_llm_client=None):
     question_type = problem["question_type"]
     answer_type = problem["answer_type"]
     choices = problem["choices"]
@@ -150,7 +133,7 @@ def extract_answer(response, problem, quick_extract=False, api_key=None, pid=Non
         # general extraction
         try:
             full_prompt = create_test_prompt(demo_prompt, query, response)
-            extraction = get_chat_response(full_prompt, api_key=api_key, model=gpt_model, n=1, patience=5, sleep_time=5)
+            extraction = get_chat_response(full_prompt, api_key=api_key, model=gpt_model, n=1, patience=5, sleep_time=5, eval_llm_client=eval_llm_client)
             return extraction
         except Exception as e:
             print(e)
@@ -271,15 +254,14 @@ def __init__(
         gpt_model="gpt-4-0613",
         debug=False,
         quick_extract=False,
+        eval_provider=None,
+        eval_model=None,
     ):
         super().__init__("MathVistaDataset", data_path)
         name_converter = {"dev": "validation", "test": "test"}
         self.data = load_dataset("Otter-AI/MathVista", split=name_converter[split], cache_dir=cache_dir).to_pandas()
         if debug:
             self.data = self.data.sample(5)
-        # data_path = "/home/luodian/projects/Otter/archived/testmini_image_inside.json"
-        # with open(data_path, "r", encoding="utf-8") as f:
-        #     self.data = json.load(f)
 
         self.debug = debug
         self.quick_extract = quick_extract
@@ -290,6 +272,11 @@ def __init__(
         self.cur_datetime = utc_plus_8_time.strftime("%Y-%m-%d_%H-%M-%S")
         self.api_key = api_key
         self.gpt_model = gpt_model
+        # gpt_model names an OpenAI model, so use it as the fallback only when the
+        # resolved provider is OpenAI; other providers keep their own default model.
+        self.eval_llm_client = get_eval_llm_client(provider=eval_provider, api_key=api_key, model=eval_model)
+        if not eval_model and self.eval_llm_client.provider == "openai":
+            self.eval_llm_client.model = gpt_model or self.eval_llm_client.model
 
     def create_query(self, problem, shot_type):
         ### [2] Test query
@@ -393,6 +380,7 @@ def _evaluate(self, model):
                 api_key=self.api_key,
                 pid=idx_key,
                 gpt_model=self.gpt_model,
+                eval_llm_client=self.eval_llm_client,
             )
             results[idx_key].update({"extraction": extraction})
             answer = results[idx_key]["answer"]
diff --git a/pipeline/benchmarks/datasets/mmvet.py b/pipeline/benchmarks/datasets/mmvet.py
index d27c01d8..f85b8549 100644
--- a/pipeline/benchmarks/datasets/mmvet.py
+++ b/pipeline/benchmarks/datasets/mmvet.py
@@ -15,6 +15,8 @@
 import datetime
 from Levenshtein import distance
 
+from pipeline.benchmarks.utils.eval_llm import get_eval_llm_client
+
 utc_plus_8 = pytz.timezone("Asia/Singapore")  # You can also use 'Asia/Shanghai', 'Asia/Taipei', etc.
 utc_now = pytz.utc.localize(datetime.datetime.utcnow())
 utc_plus_8_time = utc_now.astimezone(utc_plus_8)
@@ -47,6 +49,8 @@ def __init__(
         prompt: str = MM_VET_PROMPT,
         decimail_places: int = 1,  # number of decimal places to round to
         debug: bool = False,
+        eval_provider: str = None,
+        eval_model: str = None,
     ):
         super().__init__("MMVetDataset", data_path)
         self.df = load_dataset(data_path, split=split, cache_dir=cache_dir).to_pandas()
@@ -58,8 +62,12 @@ def __init__(
         self.api_key = api_key
         self.cur_datetime = utc_plus_8_time.strftime("%Y-%m-%d_%H-%M-%S")
         self.debug = debug
+        # gpt_model names an OpenAI model, so use it as the fallback only when the
+        # resolved provider is OpenAI; other providers keep their own default model.
+        self.eval_llm_client = get_eval_llm_client(provider=eval_provider, api_key=api_key, model=eval_model)
+        if not eval_model and self.eval_llm_client.provider == "openai":
+            self.eval_llm_client.model = gpt_model or self.eval_llm_client.model
         self.prepare()
-        self.client = OpenAI(api_key=api_key)
 
     def prepare(self):
         self.counter = Counter()
@@ -183,8 +191,12 @@ def need_more_runs():
 
                     while not grade_sample_run_complete:
                         try:
-                            response = self.client.chat.completions.create(model=self.gpt_model, max_tokens=3, temperature=temperature, messages=messages, timeout=15)
-                            content = response["choices"][0]["message"]["content"]
+                            content, response_data = self.eval_llm_client.chat_completion_raw(
+                                messages=messages,
+                                temperature=temperature,
+                                max_tokens=3,
+                                timeout=15,
+                            )
                             flag = True
                             try_time = 1
                             while flag:
@@ -211,8 +223,12 @@ def need_more_runs():
                                     messages = [
                                         {"role": "user", "content": question},
                                     ]
-                                    response = self.client.chat.completions.create(model=self.gpt_model, max_tokens=3, temperature=temperature, messages=messages, timeout=15)
-                                    content = response["choices"][0]["message"]["content"]
+                                    content, response_data = self.eval_llm_client.chat_completion_raw(
+                                        messages=messages,
+                                        temperature=temperature,
+                                        max_tokens=3,
+                                        timeout=15,
+                                    )
                                     try_time += 1
                                     temperature += 0.5
                                     print(f"{id} try {try_time} times")
@@ -222,17 +238,17 @@ def need_more_runs():
                                         flag = False
                             grade_sample_run_complete = True
                         except Exception as e:
-                            # gpt4 may have token rate limit
+                            # evaluation LLM may have token rate limit
                             print(e)
                             print("sleep 15s")
                             time.sleep(15)
 
                     if len(sample_grade["model"]) >= j + 1:
-                        sample_grade["model"][j] = response["model"]
+                        sample_grade["model"][j] = response_data.get("model", self.eval_llm_client.model)
                         sample_grade["content"][j] = content
                         sample_grade["score"][j] = score
                     else:
-                        sample_grade["model"].append(response["model"])
+                        sample_grade["model"].append(response_data.get("model", self.eval_llm_client.model))
                         sample_grade["content"].append(content)
                         sample_grade["score"].append(score)
                         sample_grade["query"] = line["instruction"]
diff --git a/pipeline/benchmarks/utils/__init__.py b/pipeline/benchmarks/utils/__init__.py
new file mode 100644
index 00000000..0b0af5b4
--- /dev/null
+++ b/pipeline/benchmarks/utils/__init__.py
@@ -0,0 +1 @@
+from .eval_llm import EvalLLMClient, get_eval_llm_client
diff --git a/pipeline/benchmarks/utils/eval_llm.py b/pipeline/benchmarks/utils/eval_llm.py
new file mode 100644
index 00000000..1f0d14ab
--- /dev/null
+++ b/pipeline/benchmarks/utils/eval_llm.py
@@ -0,0 +1,210 @@
+"""
+Configurable LLM client for benchmark evaluation.
+
+Supports multiple LLM providers (OpenAI, MiniMax) for evaluation judging tasks
+such as answer extraction and correctness scoring.
+
+Usage:
+    # Auto-detect provider from environment variables
+    client = get_eval_llm_client()
+
+    # Explicit provider selection
+    client = EvalLLMClient(provider="minimax", api_key="your-key")
+
+    # Chat completion
+    content = client.chat_completion(
+        messages=[{"role": "user", "content": "Hello"}],
+        temperature=0,
+        max_tokens=256,
+    )
+
+Environment variables:
+    EVAL_LLM_PROVIDER: Provider name ("openai" or "minimax")
+    OPENAI_API_KEY: API key for OpenAI
+    MINIMAX_API_KEY: API key for MiniMax
+"""
+
+import json
+import os
+import re
+import time
+from typing import Dict, List, Optional, Tuple
+
+import requests
+
+
+PROVIDER_CONFIGS: Dict[str, Dict[str, str]] = {
+    "openai": {
+        "api_base": "https://api.openai.com/v1",
+        "default_model": "gpt-4-0613",
+        "api_key_env": "OPENAI_API_KEY",
+    },
+    "minimax": {
+        "api_base": "https://api.minimax.io/v1",
+        # Default model: MiniMax-M3 (latest, 512K context, 128K max output, image input support).
+        # Other supported models: MiniMax-M2.7, MiniMax-M2.7-highspeed.
+        "default_model": "MiniMax-M3",
+        "api_key_env": "MINIMAX_API_KEY",
+    },
+}
+
+
+class EvalLLMClient:
+    """Configurable LLM client for evaluation tasks.
+
+    Supports OpenAI and MiniMax providers with automatic handling of
+    provider-specific quirks (temperature clamping, think-tag stripping).
+    """
+
+    def __init__(
+        self,
+        provider: Optional[str] = None,
+        api_key: Optional[str] = None,
+        model: Optional[str] = None,
+        api_base: Optional[str] = None,
+    ):
+        """Resolve provider, endpoint, model and key; explicit arguments win over env vars."""
+        # Normalise case/whitespace for explicit names too, so "MiniMax" from a YAML
+        # config is not silently treated as an unknown provider (i.e. as OpenAI).
+        provider = (provider or os.environ.get("EVAL_LLM_PROVIDER", "")).strip().lower()
+        if not provider:
+            # Nothing requested: prefer MiniMax when its key is exported.
+            provider = "minimax" if os.environ.get("MINIMAX_API_KEY") else "openai"
+
+        self.provider = provider
+        # Unknown names reuse the OpenAI defaults (custom endpoints pass api_base).
+        config = PROVIDER_CONFIGS.get(provider, PROVIDER_CONFIGS["openai"])
+        self.api_base = api_base or config["api_base"]
+        self.model = model or config["default_model"]
+        self.api_key = api_key or os.environ.get(config["api_key_env"], "")
+
+    def _clamp_temperature(self, temperature: float) -> float:
+        """Clamp temperature into MiniMax's accepted range (0.0, 1.0]; others pass through."""
+        if self.provider == "minimax":
+            return min(max(temperature, 0.01), 1.0)
+        return temperature
+
+    def _strip_think_tags(self, content: str) -> str:
+        """Remove <think>...</think> reasoning blocks from MiniMax output."""
+        if self.provider != "minimax" or "<think>" not in content:
+            return content
+        return re.sub(r"<think>.*?</think>\s*", "", content, flags=re.DOTALL).strip()
+
+    def chat_completion(
+        self,
+        messages: List[Dict[str, str]],
+        temperature: float = 0,
+        max_tokens: int = 256,
+        patience: int = 5,
+        sleep_time: int = 5,
+        timeout: int = 30,
+    ) -> str:
+        """POST the conversation to the provider and return the reply text.
+
+        Makes at most ``patience`` attempts, pausing ``sleep_time`` seconds after
+        one that raises; an attempt that yields an empty reply is retried at once.
+
+        Args:
+            messages: Chat messages, each a dict with 'role' and 'content'.
+            temperature: Sampling temperature (clamped for MiniMax).
+            max_tokens: Upper bound on generated tokens.
+            patience: Total number of attempts before giving up.
+            sleep_time: Seconds to pause after an attempt that raised.
+            timeout: Per-request timeout in seconds.
+
+        Returns:
+            The reply with think-tags stripped, or "" if no attempt succeeded.
+        """
+        auth_headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+        }
+        request_body = {
+            "model": self.model,
+            "messages": messages,
+            "temperature": self._clamp_temperature(temperature),
+            "max_tokens": max_tokens,
+        }
+
+        attempts_left = patience
+        while attempts_left > 0:
+            attempts_left -= 1
+            try:
+                reply = requests.post(
+                    f"{self.api_base}/chat/completions",
+                    headers=auth_headers,
+                    data=json.dumps(request_body),
+                    timeout=timeout,
+                )
+                reply.raise_for_status()
+                raw_text = reply.json()["choices"][0]["message"]["content"]
+                answer = self._strip_think_tags(raw_text.strip())
+                if answer:
+                    return answer
+            except Exception as e:
+                if "Rate limit" not in str(e):
+                    print(e)
+                time.sleep(sleep_time)
+
+        return ""
+
+    def chat_completion_raw(
+        self,
+        messages: List[Dict[str, str]],
+        temperature: float = 0,
+        max_tokens: int = 256,
+        timeout: int = 15,
+    ) -> Tuple[str, dict]:
+        """Make one chat request and return the reply text plus the raw JSON.
+
+        For callers that need the whole response object (e.g. MMVet, which
+        records the model name). There is no retry here: HTTP errors and
+        malformed responses propagate to the caller as exceptions.
+
+        Returns:
+            Tuple of (content with think-tags stripped, decoded response dict).
+        """
+        auth_headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+        }
+
+        request_body = {
+            "model": self.model,
+            "messages": messages,
+            "temperature": self._clamp_temperature(temperature),
+            "max_tokens": max_tokens,
+        }
+
+        reply = requests.post(
+            f"{self.api_base}/chat/completions",
+            headers=auth_headers,
+            data=json.dumps(request_body),
+            timeout=timeout,
+        )
+        reply.raise_for_status()
+        raw = reply.json()
+
+        message = raw["choices"][0]["message"]
+        return self._strip_think_tags(message["content"].strip()), raw
+
+
+def get_eval_llm_client(
+    provider: Optional[str] = None,
+    api_key: Optional[str] = None,
+    model: Optional[str] = None,
+    api_base: Optional[str] = None,
+) -> EvalLLMClient:
+    """Build an EvalLLMClient, forwarding every argument unchanged.
+
+    Arguments left as None are resolved by EvalLLMClient: the provider comes
+    from EVAL_LLM_PROVIDER, then MiniMax if MINIMAX_API_KEY is set, then
+    OpenAI; endpoint, model and key fall back to that provider's defaults.
+    """
+    settings = {
+        "provider": provider,
+        "api_key": api_key,
+        "model": model,
+        "api_base": api_base,
+    }
+    return EvalLLMClient(**settings)
diff --git a/unit_tests/test_eval_llm.py b/unit_tests/test_eval_llm.py
new file mode 100644
index 00000000..3937da30
--- /dev/null
+++ b/unit_tests/test_eval_llm.py
@@ -0,0 +1,260 @@
+"""Unit tests for the configurable evaluation LLM client."""
+
+import json
+import os
+import unittest
+from unittest.mock import patch, MagicMock
+
+from pipeline.benchmarks.utils.eval_llm import EvalLLMClient, get_eval_llm_client, PROVIDER_CONFIGS
+
+
+class TestProviderConfigs(unittest.TestCase):
+    """Check the built-in provider table for the expected entries and values."""
+
+    def test_openai_config_exists(self):
+        self.assertIn("openai", PROVIDER_CONFIGS)
+        for field, wanted in (("api_base", "https://api.openai.com/v1"), ("api_key_env", "OPENAI_API_KEY")):
+            self.assertEqual(PROVIDER_CONFIGS["openai"][field], wanted)
+
+    def test_minimax_config_exists(self):
+        self.assertIn("minimax", PROVIDER_CONFIGS)
+        wanted = {"api_base": "https://api.minimax.io/v1", "default_model": "MiniMax-M3", "api_key_env": "MINIMAX_API_KEY"}
+        for field, value in wanted.items():
+            self.assertEqual(PROVIDER_CONFIGS["minimax"][field], value)
+
+
+class TestEvalLLMClientInit(unittest.TestCase):
+    """Test EvalLLMClient construction: explicit arguments, overrides, and env auto-detection."""
+
+    def test_explicit_openai_provider(self):
+        client = EvalLLMClient(provider="openai", api_key="test-key")
+        self.assertEqual(client.provider, "openai")
+        self.assertEqual(client.api_base, "https://api.openai.com/v1")
+        self.assertEqual(client.model, "gpt-4-0613")
+        self.assertEqual(client.api_key, "test-key")
+
+    def test_explicit_minimax_provider(self):
+        client = EvalLLMClient(provider="minimax", api_key="test-key")
+        self.assertEqual(client.provider, "minimax")
+        self.assertEqual(client.api_base, "https://api.minimax.io/v1")
+        self.assertEqual(client.model, "MiniMax-M3")
+        self.assertEqual(client.api_key, "test-key")
+
+    def test_custom_model_override(self):
+        client = EvalLLMClient(provider="minimax", api_key="key", model="MiniMax-M2.7")
+        self.assertEqual(client.model, "MiniMax-M2.7")
+
+    def test_custom_api_base_override(self):
+        client = EvalLLMClient(provider="openai", api_key="key", api_base="https://custom.api.com/v1")
+        self.assertEqual(client.api_base, "https://custom.api.com/v1")
+
+    @patch.dict(os.environ, {"MINIMAX_API_KEY": "env-minimax-key"}, clear=False)
+    def test_auto_detect_minimax_from_env(self):
+        os.environ.pop("EVAL_LLM_PROVIDER", None)  # keep an ambient explicit provider out; patch.dict restores it
+        client = EvalLLMClient()
+        self.assertEqual(client.provider, "minimax")
+        self.assertEqual(client.api_key, "env-minimax-key")
+
+    @patch.dict(os.environ, {"EVAL_LLM_PROVIDER": "minimax", "MINIMAX_API_KEY": "env-key"}, clear=False)
+    def test_explicit_env_provider(self):
+        client = EvalLLMClient()
+        self.assertEqual(client.provider, "minimax")
+
+    @patch.dict(os.environ, {"OPENAI_API_KEY": "env-openai-key"}, clear=False)
+    def test_default_to_openai(self):
+        # Drop the variables that could select another provider; patch.dict restores os.environ on exit.
+        os.environ.pop("MINIMAX_API_KEY", None)
+        os.environ.pop("EVAL_LLM_PROVIDER", None)
+        client = EvalLLMClient()
+        self.assertEqual(client.provider, "openai")
+
+
+class TestTemperatureClamping(unittest.TestCase):
+    """Exercise _clamp_temperature for the MiniMax and OpenAI providers."""
+
+    def test_minimax_clamps_zero_temperature(self):
+        clamped = EvalLLMClient(provider="minimax", api_key="key")._clamp_temperature(0.0)
+        self.assertEqual(clamped, 0.01)
+
+    def test_minimax_preserves_nonzero_temperature(self):
+        clamped = EvalLLMClient(provider="minimax", api_key="key")._clamp_temperature(0.7)
+        self.assertEqual(clamped, 0.7)
+
+    def test_openai_preserves_zero_temperature(self):
+        clamped = EvalLLMClient(provider="openai", api_key="key")._clamp_temperature(0.0)
+        self.assertEqual(clamped, 0.0)
+
+
+class TestThinkTagStripping(unittest.TestCase):
+    """Exercise _strip_think_tags: MiniMax drops <think>...</think>, OpenAI returns text as-is."""
+
+    def test_minimax_strips_think_tags(self):
+        judge = EvalLLMClient(provider="minimax", api_key="key")
+        cleaned = judge._strip_think_tags("<think>Let me think about this...</think>\nThe answer is yes.")
+        self.assertEqual(cleaned, "The answer is yes.")
+
+    def test_minimax_strips_multiline_think_tags(self):
+        judge = EvalLLMClient(provider="minimax", api_key="key")
+        cleaned = judge._strip_think_tags("<think>\nStep 1: analyze\nStep 2: conclude\n</think>\n0.8")
+        self.assertEqual(cleaned, "0.8")
+
+    def test_minimax_preserves_content_without_think_tags(self):
+        judge = EvalLLMClient(provider="minimax", api_key="key")
+        cleaned = judge._strip_think_tags("The answer is yes.")
+        self.assertEqual(cleaned, "The answer is yes.")
+
+    def test_openai_preserves_all_content(self):
+        judge = EvalLLMClient(provider="openai", api_key="key")
+        untouched = "<think>some content</think>\nThe answer is yes."
+        self.assertEqual(judge._strip_think_tags(untouched), untouched)
+
+
+class TestChatCompletion(unittest.TestCase):
+    """Drive chat_completion against a mocked requests.post (no network traffic)."""
+
+    @patch("pipeline.benchmarks.utils.eval_llm.requests.post")
+    def test_successful_openai_completion(self, post_mock):
+        # Arrange: canned HTTP reply carrying a plain "yes" verdict.
+        fake_reply = MagicMock()
+        fake_reply.raise_for_status = MagicMock()
+        fake_reply.status_code = 200
+        fake_reply.json.return_value = {
+            "choices": [{"message": {"content": "yes"}}],
+            "model": "gpt-4-0613",
+        }
+        post_mock.return_value = fake_reply
+
+        # Act: a single judged question at temperature 0.
+        judge = EvalLLMClient(provider="openai", api_key="test-key")
+        question = [{"role": "user", "content": "Is this correct?"}]
+        answer = judge.chat_completion(messages=question, temperature=0, max_tokens=256)
+
+        # Assert: content returned verbatim, exactly one POST, aimed at the OpenAI host.
+        self.assertEqual(answer, "yes")
+        post_mock.assert_called_once()
+        sent = post_mock.call_args
+        self.assertIn("api.openai.com", sent[0][0])
+
+    @patch("pipeline.benchmarks.utils.eval_llm.requests.post")
+    def test_successful_minimax_completion(self, post_mock):
+        # Arrange: MiniMax-style reply whose content starts with a <think> block.
+        fake_reply = MagicMock()
+        fake_reply.raise_for_status = MagicMock()
+        fake_reply.status_code = 200
+        fake_reply.json.return_value = {
+            "choices": [{"message": {"content": "<think>analyzing...</think>\n0.8"}}],
+            "model": "MiniMax-M3",
+        }
+        post_mock.return_value = fake_reply
+
+        # Act: request temperature 0 so the clamp is observable in the payload.
+        judge = EvalLLMClient(provider="minimax", api_key="test-key")
+        question = [{"role": "user", "content": "Score this answer"}]
+        answer = judge.chat_completion(messages=question, temperature=0, max_tokens=3)
+
+        # Assert: think block stripped, temperature sent as 0.01, MiniMax host used.
+        self.assertEqual(answer, "0.8")
+        sent = post_mock.call_args
+        body = json.loads(sent[1]["data"])
+        self.assertEqual(body["temperature"], 0.01)
+        self.assertIn("api.minimax.io", sent[0][0])
+
+    @patch("pipeline.benchmarks.utils.eval_llm.requests.post")
+    @patch("pipeline.benchmarks.utils.eval_llm.time.sleep")
+    def test_retry_on_failure(self, sleep_mock, post_mock):
+        # Arrange: the first reply raises from raise_for_status (time.sleep is patched out).
+        failing_reply = MagicMock()
+        failing_reply.raise_for_status.side_effect = Exception("Rate limit exceeded")
+
+        good_reply = MagicMock()
+        good_reply.raise_for_status = MagicMock()
+        good_reply.json.return_value = {
+            "choices": [{"message": {"content": "yes"}}],
+        }
+
+        post_mock.side_effect = [failing_reply, good_reply]
+
+        # Act: patience=3 leaves room for a second attempt.
+        judge = EvalLLMClient(provider="openai", api_key="test-key")
+        question = [{"role": "user", "content": "test"}]
+        answer = judge.chat_completion(messages=question, patience=3, sleep_time=1)
+
+        # Assert: the second reply's content is returned after exactly two POSTs.
+        self.assertEqual(answer, "yes")
+        self.assertEqual(post_mock.call_count, 2)
+
+    @patch("pipeline.benchmarks.utils.eval_llm.requests.post")
+    @patch("pipeline.benchmarks.utils.eval_llm.time.sleep")
+    def test_returns_empty_on_exhausted_retries(self, sleep_mock, post_mock):
+        # Arrange: every POST yields a reply whose raise_for_status raises.
+        failing_reply = MagicMock()
+        failing_reply.raise_for_status.side_effect = Exception("Server error")
+        post_mock.return_value = failing_reply
+
+        # Act: call with patience=2 and a zero sleep_time.
+        judge = EvalLLMClient(provider="openai", api_key="test-key")
+        question = [{"role": "user", "content": "test"}]
+        answer = judge.chat_completion(messages=question, patience=2, sleep_time=0)
+
+        # Assert: an empty string comes back after two POSTs.
+        self.assertEqual(answer, "")
+        self.assertEqual(post_mock.call_count, 2)
+
+
+class TestChatCompletionRaw(unittest.TestCase):
+    """Drive chat_completion_raw, which returns (content, raw response dict)."""
+
+    @patch("pipeline.benchmarks.utils.eval_llm.requests.post")
+    def test_returns_content_and_response_data(self, post_mock):
+        # Arrange: the decoded JSON body that the mocked POST will hand back.
+        canned = {
+            "choices": [{"message": {"content": "0.7"}}],
+            "model": "gpt-4-0613",
+        }
+        fake_reply = MagicMock()
+        fake_reply.json.return_value = canned
+        fake_reply.raise_for_status = MagicMock()
+        post_mock.return_value = fake_reply
+
+        judge = EvalLLMClient(provider="openai", api_key="test-key")
+        content, raw = judge.chat_completion_raw(messages=[{"role": "user", "content": "test"}])
+
+        # Assert: both the extracted content and the raw dict are exposed.
+        self.assertEqual(content, "0.7")
+        self.assertEqual(raw["model"], "gpt-4-0613")
+
+    @patch("pipeline.benchmarks.utils.eval_llm.requests.post")
+    def test_minimax_strips_think_tags_in_raw(self, post_mock):
+        # Arrange: content prefixed with a <think> block, as in the MiniMax tests above.
+        canned = {
+            "choices": [{"message": {"content": "<think>thinking</think>\n0.9"}}],
+            "model": "MiniMax-M3",
+        }
+        fake_reply = MagicMock()
+        fake_reply.json.return_value = canned
+        fake_reply.raise_for_status = MagicMock()
+        post_mock.return_value = fake_reply
+
+        judge = EvalLLMClient(provider="minimax", api_key="test-key")
+        content, _ = judge.chat_completion_raw(messages=[{"role": "user", "content": "test"}])
+
+        # Assert: only the text after the think block remains in the content.
+        self.assertEqual(content, "0.9")
+
+
+class TestGetEvalLLMClient(unittest.TestCase):
+    """Check that get_eval_llm_client returns a configured EvalLLMClient."""
+
+    def test_creates_client_with_defaults(self):
+        built = get_eval_llm_client(provider="openai", api_key="key")
+        self.assertIsInstance(built, EvalLLMClient)
+        self.assertEqual(built.provider, "openai")
+
+    def test_creates_minimax_client(self):
+        built = get_eval_llm_client(provider="minimax", api_key="key", model="MiniMax-M2.7")
+        self.assertEqual(built.provider, "minimax")
+        self.assertEqual(built.model, "MiniMax-M2.7")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/unit_tests/test_eval_llm_integration.py b/unit_tests/test_eval_llm_integration.py
new file mode 100644
index 00000000..dfa98c21
--- /dev/null
+++ b/unit_tests/test_eval_llm_integration.py
@@ -0,0 +1,79 @@
+"""Integration tests for MiniMax evaluation LLM provider.
+
+These tests make real API calls to the MiniMax API.
+Set MINIMAX_API_KEY environment variable to run.
+
+Usage:
+    MINIMAX_API_KEY=your-key python -m pytest unit_tests/test_eval_llm_integration.py -v
+"""
+
+import os
+import unittest
+
+from pipeline.benchmarks.utils.eval_llm import EvalLLMClient, get_eval_llm_client
+
+
+MINIMAX_API_KEY = os.environ.get("MINIMAX_API_KEY", "")
+
+
+@unittest.skipUnless(MINIMAX_API_KEY, "MINIMAX_API_KEY not set")
+class TestMiniMaxIntegration(unittest.TestCase):
+    """Integration tests against the live MiniMax API (skipped unless MINIMAX_API_KEY is set)."""
+
+    def setUp(self):
+        self.client = EvalLLMClient(
+            provider="minimax",
+            api_key=MINIMAX_API_KEY,
+            model="MiniMax-M3",
+        )
+
+    def test_basic_chat_completion(self):
+        result = self.client.chat_completion(
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant. Answer briefly."},
+                {"role": "user", "content": "What is 2 + 2? Answer with just the number."},
+            ],
+            temperature=0.01,
+            max_tokens=256,
+        )
+        self.assertIn("4", result)
+
+    def test_evaluation_judge_yes_no(self):
+        result = self.client.chat_completion(
+            messages=[
+                {"role": "system", "content": "You are a helpful AI assistant. Your task is to judge whether the model response is correct to answer the given question or not."},
+                {"role": "user", "content": "Question: What color is the sky?\nModel Response: The sky is blue.\nGround Truth: blue\nWill the model response be considered correct? You should only answer yes or no."},
+            ],
+            temperature=0.01,
+            max_tokens=256,
+        )
+        self.assertIn("yes", result.lower())
+
+    def test_scoring_correctness(self):
+        import re  # function-scope import: this module does not import re at the top
+        result = self.client.chat_completion(
+            messages=[{"role": "user", "content": "Compare the ground truth and prediction, give a correctness score from 0.0 to 1.0.\n\nQuestion: What is 2+2?\nGround Truth: 4\nPrediction: 4\n\nJust output the score number."}],
+            temperature=0.01,
+            max_tokens=256,
+        )
+        self.assertTrue(result, "Response should not be empty")
+        # Compare parsed numbers, not substrings: "1" also occurs inside low scores such as "0.1".
+        scores = [float(tok) for tok in re.findall(r"\d*\.?\d+", result)]
+        self.assertTrue(
+            any(0.8 <= score <= 1.0 for score in scores),
+            f"Expected high score in response: {result}",
+        )
+
+
+@unittest.skipUnless(MINIMAX_API_KEY, "MINIMAX_API_KEY not set")
+class TestMiniMaxAutoDetect(unittest.TestCase):
+    """With MINIMAX_API_KEY set, the argument-free factory should pick MiniMax."""
+
+    def test_auto_detect_creates_minimax_client(self):
+        detected = get_eval_llm_client()
+        self.assertEqual(detected.provider, "minimax")
+        self.assertEqual(detected.api_base, "https://api.minimax.io/v1")
+
+
+if __name__ == "__main__":
+    unittest.main()