EvolvingLMMs-Lab · octo-patch · Mar 26, 2026 · Jun 7, 2026
diff --git a/README.md b/README.md
@@ -8,6 +8,7 @@
 ![](https://img.shields.io/github/stars/luodian/otter?style=social)
 [![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2FLuodian%2Fotter&count_bg=%23FFA500&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=visitors&edge_flat=false)](https://hits.seeyoufarm.com)
 [![litellm](https://img.shields.io/badge/%20%F0%9F%9A%85%20liteLLM-OpenAI%7CAzure%7CAnthropic%7CPalm%7CCohere-blue?color=green)](https://github.com/BerriAI/litellm)
+[![MiniMax](https://img.shields.io/badge/MiniMax-M3-blue)](https://www.minimax.io)
 
 [Project Credits](https://github.com/Luodian/Otter/blob/main/docs/credits.md) | [Otter Paper](https://arxiv.org/abs/2305.03726) | [OtterHD Paper](https://arxiv.org/abs/2311.04219) | [MIMIC-IT Paper](https://arxiv.org/abs/2306.05425)
 
@@ -41,6 +42,7 @@ For who in the mainland China: [![Open in OpenXLab](https://cdn-static.openxlab.
             split: test
             prompt: Answer with the option's letter from the given choices directly.
             api_key: [Your API Key] # GPT4 or GPT3.5 to evaluate the answers and ground truth.
+            eval_provider: minimax # Optional: LLM provider used to evaluate the answers, either "openai" (default) or "minimax".
             debug: true # put debug=true will save the model response in log file.
         - name: mme
             split: test
@@ -70,6 +72,18 @@ For who in the mainland China: [![Open in OpenXLab](https://cdn-static.openxlab.
 **[2023-08]**
 
 1. Added Support for using Azure, Anthropic, Palm, Cohere models for Self-Instruct with Syphus pipeline, for information on usage modify [this line](https://github.com/Luodian/Otter/blob/16d73b399fac6352ebff7504b1acb1f228fbf3f4/mimic-it/syphus/file_utils.py#L53) with your selected model and set your API keys in the environment. For more information see [LiteLLM](https://github.com/BerriAI/litellm/)
+2. Added [MiniMax](https://www.minimax.io) as a supported LLM provider for both the Syphus data generation pipeline and benchmark evaluation. Configure via environment variables:
+   ```bash
+   # For Syphus data generation (via liteLLM)
+   export MINIMAX_API_KEY="your-minimax-key"
+   export OPENAI_API_ENGINE="openai/MiniMax-M3"
+   export OPENAI_API_BASE="https://api.minimax.io/v1"
+
+   # For benchmark evaluation (MagnifierBench, MathVista, MM-Vet)
+   export EVAL_LLM_PROVIDER="minimax"
+   export MINIMAX_API_KEY="your-minimax-key"
+   ```
+   MiniMax M3 offers a 512K-token context window, up to 128K output tokens, and image input support; the M2.7 and M2.7-highspeed models remain available as alternatives. See `pipeline/benchmarks/utils/eval_llm.py` for the evaluation client implementation.
 
 **[2023-07]: Anouncing MIMIC-IT dataset for multiple interleaved image-text/video instruction tuning.**
 

diff --git a/mimic-it/syphus/file_utils.py b/mimic-it/syphus/file_utils.py
@@ -1,5 +1,18 @@
 """
 file utils
+
+Supports multiple LLM providers via liteLLM. Configure via environment variables:
+
+OpenAI (default):
+    export OPENAI_API_KEY="your-openai-key"
+    export OPENAI_API_ENGINE="gpt-4"
+
+MiniMax:
+    export MINIMAX_API_KEY="your-minimax-key"
+    export OPENAI_API_ENGINE="openai/MiniMax-M3"
+    export OPENAI_API_BASE="https://api.minimax.io/v1"
+
+See https://docs.litellm.ai/docs/providers for all supported providers.
 """
 
 import json
@@ -13,11 +26,15 @@
 engine = os.environ.get("OPENAI_API_ENGINE", "davinci")
 
 
-def query_gpt(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str]:
+def query_llm(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str]:
     """
-    Query the GPT API with the given inputs.
+    Query the LLM API with the given inputs.
+
+    Supports multiple providers via liteLLM (OpenAI, MiniMax, Anthropic, etc.).
+    Configure via OPENAI_API_ENGINE and OPENAI_API_BASE environment variables.
+
     Returns:
-        Response (dict[str, str]): the response from GPT API.
+        Response (dict[str, str]): the response from the LLM API.
         Input ID (str): the id that specifics the input.
     """
     if dataset_name == "3d.SceneNavigation":
@@ -47,13 +64,19 @@ def query_gpt(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str
                 "content": inputs["query_input"]["sentences"],
             },
         )
+
+    # MiniMax requires temperature in (0.0, 1.0]; enforce a small positive floor when a MiniMax key is set
+    temperature = 0.7
+    if os.environ.get("MINIMAX_API_KEY"):
+        temperature = max(temperature, 0.01)
+
     succuss = True
     while succuss:
         try:
             response = completion(
-                engine=engine,  # defined by os.environ, default engine="chatgpt0301",
+                engine=engine,  # read from the OPENAI_API_ENGINE env var; defaults to "davinci"
                 messages=messages,
-                temperature=0.7,
+                temperature=temperature,
                 max_tokens=3200,
                 top_p=0.95,
                 frequency_penalty=0,
@@ -73,6 +96,10 @@ def query_gpt(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str
     return response, inputs["query_input"]["id"]
 
 
+# Backward-compatible alias
+query_gpt = query_llm
+
+
 def split_question_and_answer(pair_of_answer: str, file_id: str) -> tuple[bool, dict[str, str]]:
     """
     Split the question and answer from the pair of question and answer.

diff --git a/pipeline/benchmarks/datasets/magnifierbench.py b/pipeline/benchmarks/datasets/magnifierbench.py
@@ -16,58 +16,40 @@
 import time
 import requests
 
+from pipeline.benchmarks.utils.eval_llm import get_eval_llm_client
+
 utc_plus_8 = pytz.timezone("Asia/Singapore")  # You can also use 'Asia/Shanghai', 'Asia/Taipei', etc.
 utc_now = pytz.utc.localize(datetime.datetime.utcnow())
 utc_plus_8_time = utc_now.astimezone(utc_plus_8)
 
 
-def get_chat_response(promot, api_key, model="gpt-4-0613", temperature=0, max_tokens=256, n=1, patience=5, sleep_time=5):
-    headers = {
-        "Authorization": f"Bearer {api_key}",
-        "Content-Type": "application/json",
-    }
+def get_chat_response(promot, api_key=None, model="gpt-4-0613", temperature=0, max_tokens=256, n=1, patience=5, sleep_time=5, eval_llm_client=None):
+    if eval_llm_client is None:
+        eval_llm_client = get_eval_llm_client(api_key=api_key, model=model)
 
     messages = [
         {"role": "system", "content": "You are a helpful AI assistant. Your task is to judge whether the model response is correct to answer the given question or not."},
         {"role": "user", "content": promot},
     ]
 
-    payload = {"model": model, "messages": messages}
-
-    while patience > 0:
-        patience -= 1
-        try:
-            response = requests.post(
-                "https://api.openai.com/v1/chat/completions",
-                headers=headers,
-                data=json.dumps(payload),
-                timeout=30,
-            )
-            response.raise_for_status()
-            response_data = response.json()
-
-            prediction = response_data["choices"][0]["message"]["content"].strip()
-            if prediction != "" and prediction is not None:
-                return prediction
-
-        except Exception as e:
-            if "Rate limit" not in str(e):
-                print(e)
-            time.sleep(sleep_time)
-
-    return ""
+    return eval_llm_client.chat_completion(
+        messages=messages,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        patience=patience,
+        sleep_time=sleep_time,
+    )
 
 
-def prepare_query(model_answer_item, api_key):
+def prepare_query(model_answer_item, api_key=None, eval_llm_client=None):
     freeform_question = model_answer_item["freeform_question"]
     freeform_response = model_answer_item["freeform_response"]
     correct_answer = model_answer_item["freeform_answer"]
 
-    # Formulating the prompt for ChatGPT
+    # Build the prompt for the evaluation LLM
     prompt = f"Question: {freeform_question}\nModel Response: {freeform_response}\nGround Truth: {correct_answer}\nWill the model response be considered correct? You should only answer yes or no."
 
-    # Querying ChatGPT
-    chat_response = get_chat_response(prompt, api_key)
+    chat_response = get_chat_response(prompt, api_key=api_key, eval_llm_client=eval_llm_client)
 
     return chat_response
 
@@ -83,6 +65,8 @@ def __init__(
         debug: bool = False,
         prompt="",
         api_key=None,
+        eval_provider=None,
+        eval_model=None,
     ):
         super().__init__("MagnifierBench", data_path)
 
@@ -95,6 +79,11 @@ def __init__(
         self.debug = debug
         self.prompt = prompt
         self.api_key = api_key
+        self.eval_llm_client = get_eval_llm_client(
+            provider=eval_provider,
+            api_key=api_key,
+            model=eval_model,
+        )
 
     def parse_pred_ans(self, pred_ans, question):
         match = re.search(r"The answer is ([A-D])", pred_ans)
@@ -122,10 +111,6 @@ def parse_pred_ans(self, pred_ans, question):
     def _evaluate(self, model):
         model_score_dict = {}
 
-        # output_path = os.path.join(self.default_output_path, f"{model.name}_{self.cur_datetime}")
-        # if not os.path.exists(output_path):
-        #     os.makedirs(output_path)
-        # model_path: str = "Salesforce/instructblip-vicuna-7b"
         model_version = model.name.split("/")[-1]
         model_answer_path = os.path.join(self.default_output_path, f"{model_version}_{self.cur_datetime}_answer.json")
         result_path = os.path.join(self.default_output_path, f"{model_version}_{self.cur_datetime}_score.json")
@@ -186,16 +171,16 @@ def _evaluate(self, model):
         model_score_dict["total"] = len(self.data)
         model_score_dict["accuracy"] = score / len(self.data)
 
-        print(f"Start query GPT-4 for free-form evaluation...")
-        for data_id in tqdm(model_answer.keys(), desc="Querying GPT-4"):
+        print(f"Start query evaluation LLM for free-form evaluation...")
+        for data_id in tqdm(model_answer.keys(), desc="Querying evaluation LLM"):
             model_answer_item = model_answer[data_id]
-            gpt_response = prepare_query(model_answer_item, self.api_key)
+            gpt_response = prepare_query(model_answer_item, eval_llm_client=self.eval_llm_client)
             if gpt_response.lower() == "yes":
                 ff_score += 1
             elif gpt_response.lower() == "no":
                 ff_score += 0
             else:
-                print(f"Warning: {data_id} has invalid GPT-4 response: {gpt_response}")
+                print(f"Warning: {data_id} has invalid evaluation LLM response: {gpt_response}")
                 print(f"Skipping {data_id}")
                 continue
 

diff --git a/pipeline/benchmarks/datasets/mathvista.py b/pipeline/benchmarks/datasets/mathvista.py
@@ -15,6 +15,8 @@
 import io
 from Levenshtein import distance
 
+from pipeline.benchmarks.utils.eval_llm import get_eval_llm_client
+
 utc_plus_8 = pytz.timezone("Asia/Singapore")  # You can also use 'Asia/Shanghai', 'Asia/Taipei', etc.
 utc_now = pytz.utc.localize(datetime.datetime.utcnow())
 utc_plus_8_time = utc_now.astimezone(utc_plus_8)
@@ -65,41 +67,22 @@
 import ast
 
 
-def get_chat_response(promot, api_key, model="gpt-3.5-turbo", temperature=0, max_tokens=256, n=1, patience=5, sleep_time=5):
-    headers = {
-        "Authorization": f"Bearer {api_key}",
-        "Content-Type": "application/json",
-    }
+def get_chat_response(promot, api_key=None, model="gpt-3.5-turbo", temperature=0, max_tokens=256, n=1, patience=5, sleep_time=5, eval_llm_client=None):
+    if eval_llm_client is None:
+        eval_llm_client = get_eval_llm_client(api_key=api_key, model=model)
 
     messages = [
         {"role": "system", "content": "You are a helpful AI assistant."},
         {"role": "user", "content": promot},
     ]
 
-    payload = {"model": model, "messages": messages}
-
-    while patience > 0:
-        patience -= 1
-        try:
-            response = requests.post(
-                "https://api.openai.com/v1/chat/completions",
-                headers=headers,
-                data=json.dumps(payload),
-                timeout=30,
-            )
-            response.raise_for_status()
-            response_data = response.json()
-
-            prediction = response_data["choices"][0]["message"]["content"].strip()
-            if prediction != "" and prediction is not None:
-                return prediction
-
-        except Exception as e:
-            if "Rate limit" not in str(e):
-                print(e)
-            time.sleep(sleep_time)
-
-    return ""
+    return eval_llm_client.chat_completion(
+        messages=messages,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        patience=patience,
+        sleep_time=sleep_time,
+    )
 
 
 def create_test_prompt(demo_prompt, query, response):
@@ -109,7 +92,7 @@ def create_test_prompt(demo_prompt, query, response):
     return full_prompt
 
 
-def extract_answer(response, problem, quick_extract=False, api_key=None, pid=None, gpt_model="gpt-4-0613"):
+def extract_answer(response, problem, quick_extract=False, api_key=None, pid=None, gpt_model="gpt-4-0613", eval_llm_client=None):
     question_type = problem["question_type"]
     answer_type = problem["answer_type"]
     choices = problem["choices"]
@@ -150,7 +133,7 @@ def extract_answer(response, problem, quick_extract=False, api_key=None, pid=Non
         # general extraction
         try:
             full_prompt = create_test_prompt(demo_prompt, query, response)
-            extraction = get_chat_response(full_prompt, api_key=api_key, model=gpt_model, n=1, patience=5, sleep_time=5)
+            extraction = get_chat_response(full_prompt, api_key=api_key, model=gpt_model, n=1, patience=5, sleep_time=5, eval_llm_client=eval_llm_client)
             return extraction
         except Exception as e:
             print(e)
@@ -271,15 +254,14 @@ def __init__(
         gpt_model="gpt-4-0613",
         debug=False,
         quick_extract=False,
+        eval_provider=None,
+        eval_model=None,
     ):
         super().__init__("MathVistaDataset", data_path)
         name_converter = {"dev": "validation", "test": "test"}
         self.data = load_dataset("Otter-AI/MathVista", split=name_converter[split], cache_dir=cache_dir).to_pandas()
         if debug:
             self.data = self.data.sample(5)
-        # data_path = "/home/luodian/projects/Otter/archived/testmini_image_inside.json"
-        # with open(data_path, "r", encoding="utf-8") as f:
-        #     self.data = json.load(f)
 
         self.debug = debug
         self.quick_extract = quick_extract
@@ -290,6 +272,11 @@ def __init__(
         self.cur_datetime = utc_plus_8_time.strftime("%Y-%m-%d_%H-%M-%S")
         self.api_key = api_key
         self.gpt_model = gpt_model
+        self.eval_llm_client = get_eval_llm_client(
+            provider=eval_provider,
+            api_key=api_key,
+            model=eval_model or gpt_model,
+        )
 
     def create_query(self, problem, shot_type):
         ### [2] Test query
@@ -393,6 +380,7 @@ def _evaluate(self, model):
                 api_key=self.api_key,
                 pid=idx_key,
                 gpt_model=self.gpt_model,
+                eval_llm_client=self.eval_llm_client,
             )
             results[idx_key].update({"extraction": extraction})
             answer = results[idx_key]["answer"]