diff --git a/README.md b/README.md index d8578827..24362229 100755 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ ![](https://img.shields.io/github/stars/luodian/otter?style=social) [![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2FLuodian%2Fotter&count_bg=%23FFA500&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=visitors&edge_flat=false)](https://hits.seeyoufarm.com) [![litellm](https://img.shields.io/badge/%20%F0%9F%9A%85%20liteLLM-OpenAI%7CAzure%7CAnthropic%7CPalm%7CCohere-blue?color=green)](https://github.com/BerriAI/litellm) +[![MiniMax](https://img.shields.io/badge/MiniMax-M3-blue)](https://www.minimax.io) [Project Credits](https://github.com/Luodian/Otter/blob/main/docs/credits.md) | [Otter Paper](https://arxiv.org/abs/2305.03726) | [OtterHD Paper](https://arxiv.org/abs/2311.04219) | [MIMIC-IT Paper](https://arxiv.org/abs/2306.05425) @@ -41,6 +42,7 @@ For who in the mainland China: [![Open in OpenXLab](https://cdn-static.openxlab. split: test prompt: Answer with the option's letter from the given choices directly. api_key: [Your API Key] # GPT4 or GPT3.5 to evaluate the answers and ground truth. + eval_provider: minimax # Optional: use "minimax" or "openai" (default) debug: true # put debug=true will save the model response in log file. - name: mme split: test @@ -70,6 +72,18 @@ For who in the mainland China: [![Open in OpenXLab](https://cdn-static.openxlab. **[2023-08]** 1. Added Support for using Azure, Anthropic, Palm, Cohere models for Self-Instruct with Syphus pipeline, for information on usage modify [this line](https://github.com/Luodian/Otter/blob/16d73b399fac6352ebff7504b1acb1f228fbf3f4/mimic-it/syphus/file_utils.py#L53) with your selected model and set your API keys in the environment. For more information see [LiteLLM](https://github.com/BerriAI/litellm/) +2. Added [MiniMax](https://www.minimax.io) as a supported LLM provider for both the Syphus data generation pipeline and benchmark evaluation. 
Configure via environment variables: + ```bash + # For Syphus data generation (via liteLLM) + export MINIMAX_API_KEY="your-minimax-key" + export OPENAI_API_ENGINE="openai/MiniMax-M3" + export OPENAI_API_BASE="https://api.minimax.io/v1" + + # For benchmark evaluation (MagnifierBench, MathVista, MM-Vet) + export EVAL_LLM_PROVIDER="minimax" + export MINIMAX_API_KEY="your-minimax-key" + ``` + MiniMax M3 offers a 512K context window, up to 128K max output, and image input support. M2.7 and M2.7-highspeed remain available as alternatives. See `pipeline/benchmarks/utils/eval_llm.py` for details. **[2023-07]: Anouncing MIMIC-IT dataset for multiple interleaved image-text/video instruction tuning.** diff --git a/mimic-it/syphus/file_utils.py b/mimic-it/syphus/file_utils.py index ec1870ef..8ac1d351 100755 --- a/mimic-it/syphus/file_utils.py +++ b/mimic-it/syphus/file_utils.py @@ -1,5 +1,18 @@ """ file utils + +Supports multiple LLM providers via liteLLM. Configure via environment variables: + +OpenAI (default): + export OPENAI_API_KEY="your-openai-key" + export OPENAI_API_ENGINE="gpt-4" + +MiniMax: + export MINIMAX_API_KEY="your-minimax-key" + export OPENAI_API_ENGINE="openai/MiniMax-M3" + export OPENAI_API_BASE="https://api.minimax.io/v1" + +See https://docs.litellm.ai/docs/providers for all supported providers. """ import json @@ -13,11 +26,15 @@ engine = os.environ.get("OPENAI_API_ENGINE", "davinci") -def query_gpt(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str]: +def query_llm(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str]: """ - Query the GPT API with the given inputs. + Query the LLM API with the given inputs. + + Supports multiple providers via liteLLM (OpenAI, MiniMax, Anthropic, etc.). + Configure via OPENAI_API_ENGINE and OPENAI_API_BASE environment variables. + Returns: - Response (dict[str, str]): the response from GPT API. + Response (dict[str, str]): the response from the LLM API. 
Input ID (str): the id that specifics the input. """ if dataset_name == "3d.SceneNavigation": @@ -47,13 +64,19 @@ def query_gpt(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str "content": inputs["query_input"]["sentences"], }, ) + + # Clamp temperature for MiniMax (requires (0.0, 1.0]) + temperature = 0.7 + if os.environ.get("MINIMAX_API_KEY"): + temperature = max(temperature, 0.01) + succuss = True while succuss: try: response = completion( - engine=engine, # defined by os.environ, default engine="chatgpt0301", + engine=engine, # defined by os.environ, default engine="davinci" messages=messages, - temperature=0.7, + temperature=temperature, max_tokens=3200, top_p=0.95, frequency_penalty=0, @@ -73,6 +96,10 @@ def query_gpt(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str return response, inputs["query_input"]["id"] +# Backward-compatible alias +query_gpt = query_llm + + def split_question_and_answer(pair_of_answer: str, file_id: str) -> tuple[bool, dict[str, str]]: """ Split the question and answer from the pair of question and answer. diff --git a/pipeline/benchmarks/datasets/magnifierbench.py b/pipeline/benchmarks/datasets/magnifierbench.py index a0c4ed97..7eb99fd4 100644 --- a/pipeline/benchmarks/datasets/magnifierbench.py +++ b/pipeline/benchmarks/datasets/magnifierbench.py @@ -16,58 +16,40 @@ import time import requests +from pipeline.benchmarks.utils.eval_llm import get_eval_llm_client + utc_plus_8 = pytz.timezone("Asia/Singapore") # You can also use 'Asia/Shanghai', 'Asia/Taipei', etc. 
utc_now = pytz.utc.localize(datetime.datetime.utcnow()) utc_plus_8_time = utc_now.astimezone(utc_plus_8) -def get_chat_response(promot, api_key, model="gpt-4-0613", temperature=0, max_tokens=256, n=1, patience=5, sleep_time=5): - headers = { - "Authorization": f"Bearer {api_key}", - "Content-Type": "application/json", - } +def get_chat_response(promot, api_key=None, model="gpt-4-0613", temperature=0, max_tokens=256, n=1, patience=5, sleep_time=5, eval_llm_client=None): + if eval_llm_client is None: + eval_llm_client = get_eval_llm_client(api_key=api_key, model=model) messages = [ {"role": "system", "content": "You are a helpful AI assistant. Your task is to judge whether the model response is correct to answer the given question or not."}, {"role": "user", "content": promot}, ] - payload = {"model": model, "messages": messages} - - while patience > 0: - patience -= 1 - try: - response = requests.post( - "https://api.openai.com/v1/chat/completions", - headers=headers, - data=json.dumps(payload), - timeout=30, - ) - response.raise_for_status() - response_data = response.json() - - prediction = response_data["choices"][0]["message"]["content"].strip() - if prediction != "" and prediction is not None: - return prediction - - except Exception as e: - if "Rate limit" not in str(e): - print(e) - time.sleep(sleep_time) - - return "" + return eval_llm_client.chat_completion( + messages=messages, + temperature=temperature, + max_tokens=max_tokens, + patience=patience, + sleep_time=sleep_time, + ) -def prepare_query(model_answer_item, api_key): +def prepare_query(model_answer_item, api_key=None, eval_llm_client=None): freeform_question = model_answer_item["freeform_question"] freeform_response = model_answer_item["freeform_response"] correct_answer = model_answer_item["freeform_answer"] - # Formulating the prompt for ChatGPT + # Formulating the prompt for evaluation LLM prompt = f"Question: {freeform_question}\nModel Response: {freeform_response}\nGround Truth: 
{correct_answer}\nWill the model response be considered correct? You should only answer yes or no." - # Querying ChatGPT - chat_response = get_chat_response(prompt, api_key) + chat_response = get_chat_response(prompt, api_key=api_key, eval_llm_client=eval_llm_client) return chat_response @@ -83,6 +65,8 @@ def __init__( debug: bool = False, prompt="", api_key=None, + eval_provider=None, + eval_model=None, ): super().__init__("MagnifierBench", data_path) @@ -95,6 +79,11 @@ def __init__( self.debug = debug self.prompt = prompt self.api_key = api_key + self.eval_llm_client = get_eval_llm_client( + provider=eval_provider, + api_key=api_key, + model=eval_model, + ) def parse_pred_ans(self, pred_ans, question): match = re.search(r"The answer is ([A-D])", pred_ans) @@ -122,10 +111,6 @@ def parse_pred_ans(self, pred_ans, question): def _evaluate(self, model): model_score_dict = {} - # output_path = os.path.join(self.default_output_path, f"{model.name}_{self.cur_datetime}") - # if not os.path.exists(output_path): - # os.makedirs(output_path) - # model_path: str = "Salesforce/instructblip-vicuna-7b" model_version = model.name.split("/")[-1] model_answer_path = os.path.join(self.default_output_path, f"{model_version}_{self.cur_datetime}_answer.json") result_path = os.path.join(self.default_output_path, f"{model_version}_{self.cur_datetime}_score.json") @@ -186,16 +171,16 @@ def _evaluate(self, model): model_score_dict["total"] = len(self.data) model_score_dict["accuracy"] = score / len(self.data) - print(f"Start query GPT-4 for free-form evaluation...") - for data_id in tqdm(model_answer.keys(), desc="Querying GPT-4"): + print(f"Start query evaluation LLM for free-form evaluation...") + for data_id in tqdm(model_answer.keys(), desc="Querying evaluation LLM"): model_answer_item = model_answer[data_id] - gpt_response = prepare_query(model_answer_item, self.api_key) + gpt_response = prepare_query(model_answer_item, eval_llm_client=self.eval_llm_client) if gpt_response.lower() == 
"yes": ff_score += 1 elif gpt_response.lower() == "no": ff_score += 0 else: - print(f"Warning: {data_id} has invalid GPT-4 response: {gpt_response}") + print(f"Warning: {data_id} has invalid evaluation LLM response: {gpt_response}") print(f"Skipping {data_id}") continue diff --git a/pipeline/benchmarks/datasets/mathvista.py b/pipeline/benchmarks/datasets/mathvista.py index 939f7bb4..2851cc2b 100644 --- a/pipeline/benchmarks/datasets/mathvista.py +++ b/pipeline/benchmarks/datasets/mathvista.py @@ -15,6 +15,8 @@ import io from Levenshtein import distance +from pipeline.benchmarks.utils.eval_llm import get_eval_llm_client + utc_plus_8 = pytz.timezone("Asia/Singapore") # You can also use 'Asia/Shanghai', 'Asia/Taipei', etc. utc_now = pytz.utc.localize(datetime.datetime.utcnow()) utc_plus_8_time = utc_now.astimezone(utc_plus_8) @@ -65,41 +67,22 @@ import ast -def get_chat_response(promot, api_key, model="gpt-3.5-turbo", temperature=0, max_tokens=256, n=1, patience=5, sleep_time=5): - headers = { - "Authorization": f"Bearer {api_key}", - "Content-Type": "application/json", - } +def get_chat_response(promot, api_key=None, model="gpt-3.5-turbo", temperature=0, max_tokens=256, n=1, patience=5, sleep_time=5, eval_llm_client=None): + if eval_llm_client is None: + eval_llm_client = get_eval_llm_client(api_key=api_key, model=model) messages = [ {"role": "system", "content": "You are a helpful AI assistant."}, {"role": "user", "content": promot}, ] - payload = {"model": model, "messages": messages} - - while patience > 0: - patience -= 1 - try: - response = requests.post( - "https://api.openai.com/v1/chat/completions", - headers=headers, - data=json.dumps(payload), - timeout=30, - ) - response.raise_for_status() - response_data = response.json() - - prediction = response_data["choices"][0]["message"]["content"].strip() - if prediction != "" and prediction is not None: - return prediction - - except Exception as e: - if "Rate limit" not in str(e): - print(e) - 
time.sleep(sleep_time) - - return "" + return eval_llm_client.chat_completion( + messages=messages, + temperature=temperature, + max_tokens=max_tokens, + patience=patience, + sleep_time=sleep_time, + ) def create_test_prompt(demo_prompt, query, response): @@ -109,7 +92,7 @@ def create_test_prompt(demo_prompt, query, response): return full_prompt -def extract_answer(response, problem, quick_extract=False, api_key=None, pid=None, gpt_model="gpt-4-0613"): +def extract_answer(response, problem, quick_extract=False, api_key=None, pid=None, gpt_model="gpt-4-0613", eval_llm_client=None): question_type = problem["question_type"] answer_type = problem["answer_type"] choices = problem["choices"] @@ -150,7 +133,7 @@ def extract_answer(response, problem, quick_extract=False, api_key=None, pid=Non # general extraction try: full_prompt = create_test_prompt(demo_prompt, query, response) - extraction = get_chat_response(full_prompt, api_key=api_key, model=gpt_model, n=1, patience=5, sleep_time=5) + extraction = get_chat_response(full_prompt, api_key=api_key, model=gpt_model, n=1, patience=5, sleep_time=5, eval_llm_client=eval_llm_client) return extraction except Exception as e: print(e) @@ -271,15 +254,14 @@ def __init__( gpt_model="gpt-4-0613", debug=False, quick_extract=False, + eval_provider=None, + eval_model=None, ): super().__init__("MathVistaDataset", data_path) name_converter = {"dev": "validation", "test": "test"} self.data = load_dataset("Otter-AI/MathVista", split=name_converter[split], cache_dir=cache_dir).to_pandas() if debug: self.data = self.data.sample(5) - # data_path = "/home/luodian/projects/Otter/archived/testmini_image_inside.json" - # with open(data_path, "r", encoding="utf-8") as f: - # self.data = json.load(f) self.debug = debug self.quick_extract = quick_extract @@ -290,6 +272,11 @@ def __init__( self.cur_datetime = utc_plus_8_time.strftime("%Y-%m-%d_%H-%M-%S") self.api_key = api_key self.gpt_model = gpt_model + self.eval_llm_client = get_eval_llm_client( 
+ provider=eval_provider, + api_key=api_key, + model=eval_model or gpt_model, + ) def create_query(self, problem, shot_type): ### [2] Test query @@ -393,6 +380,7 @@ def _evaluate(self, model): api_key=self.api_key, pid=idx_key, gpt_model=self.gpt_model, + eval_llm_client=self.eval_llm_client, ) results[idx_key].update({"extraction": extraction}) answer = results[idx_key]["answer"] diff --git a/pipeline/benchmarks/datasets/mmvet.py b/pipeline/benchmarks/datasets/mmvet.py index d27c01d8..f85b8549 100644 --- a/pipeline/benchmarks/datasets/mmvet.py +++ b/pipeline/benchmarks/datasets/mmvet.py @@ -15,6 +15,8 @@ import datetime from Levenshtein import distance +from pipeline.benchmarks.utils.eval_llm import get_eval_llm_client + utc_plus_8 = pytz.timezone("Asia/Singapore") # You can also use 'Asia/Shanghai', 'Asia/Taipei', etc. utc_now = pytz.utc.localize(datetime.datetime.utcnow()) utc_plus_8_time = utc_now.astimezone(utc_plus_8) @@ -47,6 +49,8 @@ def __init__( prompt: str = MM_VET_PROMPT, decimail_places: int = 1, # number of decimal places to round to debug: bool = False, + eval_provider: str = None, + eval_model: str = None, ): super().__init__("MMVetDataset", data_path) self.df = load_dataset(data_path, split=split, cache_dir=cache_dir).to_pandas() @@ -58,8 +62,12 @@ def __init__( self.api_key = api_key self.cur_datetime = utc_plus_8_time.strftime("%Y-%m-%d_%H-%M-%S") self.debug = debug + self.eval_llm_client = get_eval_llm_client( + provider=eval_provider, + api_key=api_key, + model=eval_model or gpt_model, + ) self.prepare() - self.client = OpenAI(api_key=api_key) def prepare(self): self.counter = Counter() @@ -183,8 +191,12 @@ def need_more_runs(): while not grade_sample_run_complete: try: - response = self.client.chat.completions.create(model=self.gpt_model, max_tokens=3, temperature=temperature, messages=messages, timeout=15) - content = response["choices"][0]["message"]["content"] + content, response_data = self.eval_llm_client.chat_completion_raw( + 
messages=messages, + temperature=temperature, + max_tokens=3, + timeout=15, + ) flag = True try_time = 1 while flag: @@ -211,8 +223,12 @@ def need_more_runs(): messages = [ {"role": "user", "content": question}, ] - response = self.client.chat.completions.create(model=self.gpt_model, max_tokens=3, temperature=temperature, messages=messages, timeout=15) - content = response["choices"][0]["message"]["content"] + content, response_data = self.eval_llm_client.chat_completion_raw( + messages=messages, + temperature=temperature, + max_tokens=3, + timeout=15, + ) try_time += 1 temperature += 0.5 print(f"{id} try {try_time} times") @@ -222,17 +238,17 @@ def need_more_runs(): flag = False grade_sample_run_complete = True except Exception as e: - # gpt4 may have token rate limit + # evaluation LLM may have token rate limit print(e) print("sleep 15s") time.sleep(15) if len(sample_grade["model"]) >= j + 1: - sample_grade["model"][j] = response["model"] + sample_grade["model"][j] = response_data.get("model", self.eval_llm_client.model) sample_grade["content"][j] = content sample_grade["score"][j] = score else: - sample_grade["model"].append(response["model"]) + sample_grade["model"].append(response_data.get("model", self.eval_llm_client.model)) sample_grade["content"].append(content) sample_grade["score"].append(score) sample_grade["query"] = line["instruction"] diff --git a/pipeline/benchmarks/utils/__init__.py b/pipeline/benchmarks/utils/__init__.py new file mode 100644 index 00000000..0b0af5b4 --- /dev/null +++ b/pipeline/benchmarks/utils/__init__.py @@ -0,0 +1 @@ +from .eval_llm import EvalLLMClient, get_eval_llm_client diff --git a/pipeline/benchmarks/utils/eval_llm.py b/pipeline/benchmarks/utils/eval_llm.py new file mode 100644 index 00000000..1f0d14ab --- /dev/null +++ b/pipeline/benchmarks/utils/eval_llm.py @@ -0,0 +1,210 @@ +""" +Configurable LLM client for benchmark evaluation. 
+ +Supports multiple LLM providers (OpenAI, MiniMax) for evaluation judging tasks +such as answer extraction and correctness scoring. + +Usage: + # Auto-detect provider from environment variables + client = get_eval_llm_client() + + # Explicit provider selection + client = EvalLLMClient(provider="minimax", api_key="your-key") + + # Chat completion + content = client.chat_completion( + messages=[{"role": "user", "content": "Hello"}], + temperature=0, + max_tokens=256, + ) + +Environment variables: + EVAL_LLM_PROVIDER: Provider name ("openai" or "minimax") + OPENAI_API_KEY: API key for OpenAI + MINIMAX_API_KEY: API key for MiniMax +""" + +import json +import os +import re +import time +from typing import Dict, List, Optional, Tuple + +import requests + + +PROVIDER_CONFIGS: Dict[str, Dict[str, str]] = { + "openai": { + "api_base": "https://api.openai.com/v1", + "default_model": "gpt-4-0613", + "api_key_env": "OPENAI_API_KEY", + }, + "minimax": { + "api_base": "https://api.minimax.io/v1", + # Default model: MiniMax-M3 (latest, 512K context, 128K max output, image input support). + # Other supported models: MiniMax-M2.7, MiniMax-M2.7-highspeed. + "default_model": "MiniMax-M3", + "api_key_env": "MINIMAX_API_KEY", + }, +} + + +class EvalLLMClient: + """Configurable LLM client for evaluation tasks. + + Supports OpenAI and MiniMax providers with automatic handling of + provider-specific quirks (temperature clamping, think-tag stripping). 
+ """ + + def __init__( + self, + provider: Optional[str] = None, + api_key: Optional[str] = None, + model: Optional[str] = None, + api_base: Optional[str] = None, + ): + if provider is None: + provider = os.environ.get("EVAL_LLM_PROVIDER", "").lower() + if not provider: + if os.environ.get("MINIMAX_API_KEY"): + provider = "minimax" + else: + provider = "openai" + + self.provider = provider + config = PROVIDER_CONFIGS.get(provider, PROVIDER_CONFIGS["openai"]) + + self.api_base = api_base or config["api_base"] + self.model = model or config["default_model"] + self.api_key = api_key or os.environ.get(config["api_key_env"], "") + + def _clamp_temperature(self, temperature: float) -> float: + """Clamp temperature for MiniMax which requires (0.0, 1.0].""" + if self.provider == "minimax": + return max(temperature, 0.01) + return temperature + + def _strip_think_tags(self, content: str) -> str: + """Strip <think>...</think> tags from MiniMax responses.""" + if self.provider == "minimax" and "<think>" in content: + content = re.sub(r"<think>.*?</think>\s*", "", content, flags=re.DOTALL).strip() + return content + + def chat_completion( + self, + messages: List[Dict[str, str]], + temperature: float = 0, + max_tokens: int = 256, + patience: int = 5, + sleep_time: int = 5, + timeout: int = 30, + ) -> str: + """Send a chat completion request and return the response content. + + Args: + messages: List of message dicts with 'role' and 'content'. + temperature: Sampling temperature. + max_tokens: Maximum tokens in response. + patience: Number of retries on failure. + sleep_time: Seconds to wait between retries. + timeout: Request timeout in seconds. + + Returns: + The response content string, or empty string on failure. 
+ """ + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + + payload = { + "model": self.model, + "messages": messages, + "temperature": self._clamp_temperature(temperature), + "max_tokens": max_tokens, + } + + while patience > 0: + patience -= 1 + try: + response = requests.post( + f"{self.api_base}/chat/completions", + headers=headers, + data=json.dumps(payload), + timeout=timeout, + ) + response.raise_for_status() + response_data = response.json() + + content = response_data["choices"][0]["message"]["content"].strip() + content = self._strip_think_tags(content) + if content: + return content + + except Exception as e: + if "Rate limit" not in str(e): + print(e) + time.sleep(sleep_time) + + return "" + + def chat_completion_raw( + self, + messages: List[Dict[str, str]], + temperature: float = 0, + max_tokens: int = 256, + timeout: int = 15, + ) -> Tuple[str, dict]: + """Send a chat completion request and return both content and raw response. + + Used by evaluation datasets that need the full response object + (e.g., MMVet which tracks the model name). + + Returns: + Tuple of (content_string, raw_response_dict). 
+ """ + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + + payload = { + "model": self.model, + "messages": messages, + "temperature": self._clamp_temperature(temperature), + "max_tokens": max_tokens, + } + + response = requests.post( + f"{self.api_base}/chat/completions", + headers=headers, + data=json.dumps(payload), + timeout=timeout, + ) + response.raise_for_status() + response_data = response.json() + + content = response_data["choices"][0]["message"]["content"].strip() + content = self._strip_think_tags(content) + return content, response_data + + +def get_eval_llm_client( + provider: Optional[str] = None, + api_key: Optional[str] = None, + model: Optional[str] = None, + api_base: Optional[str] = None, +) -> EvalLLMClient: + """Factory function to create an EvalLLMClient. + + Auto-detects provider from environment variables if not specified: + - EVAL_LLM_PROVIDER: Explicit provider name + - MINIMAX_API_KEY: Auto-selects MiniMax if set + - Falls back to OpenAI otherwise + """ + return EvalLLMClient( + provider=provider, + api_key=api_key, + model=model, + api_base=api_base, + ) diff --git a/unit_tests/test_eval_llm.py b/unit_tests/test_eval_llm.py new file mode 100644 index 00000000..3937da30 --- /dev/null +++ b/unit_tests/test_eval_llm.py @@ -0,0 +1,260 @@ +"""Unit tests for the configurable evaluation LLM client.""" + +import json +import os +import unittest +from unittest.mock import patch, MagicMock + +from pipeline.benchmarks.utils.eval_llm import EvalLLMClient, get_eval_llm_client, PROVIDER_CONFIGS + + +class TestProviderConfigs(unittest.TestCase): + """Test provider configuration constants.""" + + def test_openai_config_exists(self): + self.assertIn("openai", PROVIDER_CONFIGS) + self.assertEqual(PROVIDER_CONFIGS["openai"]["api_base"], "https://api.openai.com/v1") + self.assertEqual(PROVIDER_CONFIGS["openai"]["api_key_env"], "OPENAI_API_KEY") + + def test_minimax_config_exists(self): + 
self.assertIn("minimax", PROVIDER_CONFIGS) + self.assertEqual(PROVIDER_CONFIGS["minimax"]["api_base"], "https://api.minimax.io/v1") + self.assertEqual(PROVIDER_CONFIGS["minimax"]["default_model"], "MiniMax-M3") + self.assertEqual(PROVIDER_CONFIGS["minimax"]["api_key_env"], "MINIMAX_API_KEY") + + +class TestEvalLLMClientInit(unittest.TestCase): + """Test EvalLLMClient initialization.""" + + def test_explicit_openai_provider(self): + client = EvalLLMClient(provider="openai", api_key="test-key") + self.assertEqual(client.provider, "openai") + self.assertEqual(client.api_base, "https://api.openai.com/v1") + self.assertEqual(client.model, "gpt-4-0613") + self.assertEqual(client.api_key, "test-key") + + def test_explicit_minimax_provider(self): + client = EvalLLMClient(provider="minimax", api_key="test-key") + self.assertEqual(client.provider, "minimax") + self.assertEqual(client.api_base, "https://api.minimax.io/v1") + self.assertEqual(client.model, "MiniMax-M3") + self.assertEqual(client.api_key, "test-key") + + def test_custom_model_override(self): + client = EvalLLMClient(provider="minimax", api_key="key", model="MiniMax-M2.7") + self.assertEqual(client.model, "MiniMax-M2.7") + + def test_custom_api_base_override(self): + client = EvalLLMClient(provider="openai", api_key="key", api_base="https://custom.api.com/v1") + self.assertEqual(client.api_base, "https://custom.api.com/v1") + + @patch.dict(os.environ, {"MINIMAX_API_KEY": "env-minimax-key"}, clear=False) + def test_auto_detect_minimax_from_env(self): + client = EvalLLMClient() + self.assertEqual(client.provider, "minimax") + self.assertEqual(client.api_key, "env-minimax-key") + + @patch.dict(os.environ, {"EVAL_LLM_PROVIDER": "minimax", "MINIMAX_API_KEY": "env-key"}, clear=False) + def test_explicit_env_provider(self): + client = EvalLLMClient() + self.assertEqual(client.provider, "minimax") + + @patch.dict(os.environ, {"OPENAI_API_KEY": "env-openai-key"}, clear=False) + def test_default_to_openai(self): + env = 
os.environ.copy() + env.pop("MINIMAX_API_KEY", None) + env.pop("EVAL_LLM_PROVIDER", None) + with patch.dict(os.environ, env, clear=True): + client = EvalLLMClient() + self.assertEqual(client.provider, "openai") + + +class TestTemperatureClamping(unittest.TestCase): + """Test temperature clamping for MiniMax.""" + + def test_minimax_clamps_zero_temperature(self): + client = EvalLLMClient(provider="minimax", api_key="key") + self.assertEqual(client._clamp_temperature(0.0), 0.01) + + def test_minimax_preserves_nonzero_temperature(self): + client = EvalLLMClient(provider="minimax", api_key="key") + self.assertEqual(client._clamp_temperature(0.7), 0.7) + + def test_openai_preserves_zero_temperature(self): + client = EvalLLMClient(provider="openai", api_key="key") + self.assertEqual(client._clamp_temperature(0.0), 0.0) + + +class TestThinkTagStripping(unittest.TestCase): + """Test <think>...</think> tag stripping for MiniMax.""" + + def test_minimax_strips_think_tags(self): + client = EvalLLMClient(provider="minimax", api_key="key") + content = "<think>Let me think about this...</think>\nThe answer is yes." + self.assertEqual(client._strip_think_tags(content), "The answer is yes.") + + def test_minimax_strips_multiline_think_tags(self): + client = EvalLLMClient(provider="minimax", api_key="key") + content = "<think>\nStep 1: analyze\nStep 2: conclude\n</think>\n0.8" + self.assertEqual(client._strip_think_tags(content), "0.8") + + def test_minimax_preserves_content_without_think_tags(self): + client = EvalLLMClient(provider="minimax", api_key="key") + content = "The answer is yes." + self.assertEqual(client._strip_think_tags(content), "The answer is yes.") + + def test_openai_preserves_all_content(self): + client = EvalLLMClient(provider="openai", api_key="key") + content = "<think>some content</think>\nThe answer is yes." 
+ self.assertEqual(client._strip_think_tags(content), content) + + +class TestChatCompletion(unittest.TestCase): + """Test chat completion with mocked HTTP responses.""" + + @patch("pipeline.benchmarks.utils.eval_llm.requests.post") + def test_successful_openai_completion(self, mock_post): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.raise_for_status = MagicMock() + mock_response.json.return_value = { + "choices": [{"message": {"content": "yes"}}], + "model": "gpt-4-0613", + } + mock_post.return_value = mock_response + + client = EvalLLMClient(provider="openai", api_key="test-key") + result = client.chat_completion( + messages=[{"role": "user", "content": "Is this correct?"}], + temperature=0, + max_tokens=256, + ) + + self.assertEqual(result, "yes") + mock_post.assert_called_once() + call_args = mock_post.call_args + self.assertIn("api.openai.com", call_args[0][0]) + + @patch("pipeline.benchmarks.utils.eval_llm.requests.post") + def test_successful_minimax_completion(self, mock_post): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.raise_for_status = MagicMock() + mock_response.json.return_value = { + "choices": [{"message": {"content": "<think>analyzing...</think>\n0.8"}}], + "model": "MiniMax-M3", + } + mock_post.return_value = mock_response + + client = EvalLLMClient(provider="minimax", api_key="test-key") + result = client.chat_completion( + messages=[{"role": "user", "content": "Score this answer"}], + temperature=0, + max_tokens=3, + ) + + self.assertEqual(result, "0.8") + call_args = mock_post.call_args + payload = json.loads(call_args[1]["data"]) + self.assertEqual(payload["temperature"], 0.01) # clamped + self.assertIn("api.minimax.io", call_args[0][0]) + + @patch("pipeline.benchmarks.utils.eval_llm.requests.post") + @patch("pipeline.benchmarks.utils.eval_llm.time.sleep") + def test_retry_on_failure(self, mock_sleep, mock_post): + mock_fail = MagicMock() + mock_fail.raise_for_status.side_effect = 
Exception("Rate limit exceeded") + + mock_success = MagicMock() + mock_success.raise_for_status = MagicMock() + mock_success.json.return_value = { + "choices": [{"message": {"content": "yes"}}], + } + + mock_post.side_effect = [mock_fail, mock_success] + + client = EvalLLMClient(provider="openai", api_key="test-key") + result = client.chat_completion( + messages=[{"role": "user", "content": "test"}], + patience=3, + sleep_time=1, + ) + + self.assertEqual(result, "yes") + self.assertEqual(mock_post.call_count, 2) + + @patch("pipeline.benchmarks.utils.eval_llm.requests.post") + @patch("pipeline.benchmarks.utils.eval_llm.time.sleep") + def test_returns_empty_on_exhausted_retries(self, mock_sleep, mock_post): + mock_fail = MagicMock() + mock_fail.raise_for_status.side_effect = Exception("Server error") + mock_post.return_value = mock_fail + + client = EvalLLMClient(provider="openai", api_key="test-key") + result = client.chat_completion( + messages=[{"role": "user", "content": "test"}], + patience=2, + sleep_time=0, + ) + + self.assertEqual(result, "") + self.assertEqual(mock_post.call_count, 2) + + +class TestChatCompletionRaw(unittest.TestCase): + """Test raw chat completion that returns response dict.""" + + @patch("pipeline.benchmarks.utils.eval_llm.requests.post") + def test_returns_content_and_response_data(self, mock_post): + response_data = { + "choices": [{"message": {"content": "0.7"}}], + "model": "gpt-4-0613", + } + mock_response = MagicMock() + mock_response.raise_for_status = MagicMock() + mock_response.json.return_value = response_data + mock_post.return_value = mock_response + + client = EvalLLMClient(provider="openai", api_key="test-key") + content, raw = client.chat_completion_raw( + messages=[{"role": "user", "content": "test"}], + ) + + self.assertEqual(content, "0.7") + self.assertEqual(raw["model"], "gpt-4-0613") + + @patch("pipeline.benchmarks.utils.eval_llm.requests.post") + def test_minimax_strips_think_tags_in_raw(self, mock_post): + 
response_data = { + "choices": [{"message": {"content": "<think>thinking</think>\n0.9"}}], + "model": "MiniMax-M3", + } + mock_response = MagicMock() + mock_response.raise_for_status = MagicMock() + mock_response.json.return_value = response_data + mock_post.return_value = mock_response + + client = EvalLLMClient(provider="minimax", api_key="test-key") + content, raw = client.chat_completion_raw( + messages=[{"role": "user", "content": "test"}], + ) + + self.assertEqual(content, "0.9") + + +class TestGetEvalLLMClient(unittest.TestCase): + """Test factory function.""" + + def test_creates_client_with_defaults(self): + client = get_eval_llm_client(provider="openai", api_key="key") + self.assertIsInstance(client, EvalLLMClient) + self.assertEqual(client.provider, "openai") + + def test_creates_minimax_client(self): + client = get_eval_llm_client(provider="minimax", api_key="key", model="MiniMax-M2.7") + self.assertEqual(client.provider, "minimax") + self.assertEqual(client.model, "MiniMax-M2.7") + + +if __name__ == "__main__": + unittest.main() diff --git a/unit_tests/test_eval_llm_integration.py b/unit_tests/test_eval_llm_integration.py new file mode 100644 index 00000000..dfa98c21 --- /dev/null +++ b/unit_tests/test_eval_llm_integration.py @@ -0,0 +1,79 @@ +"""Integration tests for MiniMax evaluation LLM provider. + +These tests make real API calls to the MiniMax API. +Set MINIMAX_API_KEY environment variable to run. 
+ +Usage: + MINIMAX_API_KEY=your-key python -m pytest unit_tests/test_eval_llm_integration.py -v +""" + +import os +import unittest + +from pipeline.benchmarks.utils.eval_llm import EvalLLMClient, get_eval_llm_client + + +MINIMAX_API_KEY = os.environ.get("MINIMAX_API_KEY", "") + + +@unittest.skipUnless(MINIMAX_API_KEY, "MINIMAX_API_KEY not set") +class TestMiniMaxIntegration(unittest.TestCase): + """Integration tests against the live MiniMax API.""" + + def setUp(self): + self.client = EvalLLMClient( + provider="minimax", + api_key=MINIMAX_API_KEY, + model="MiniMax-M3", + ) + + def test_basic_chat_completion(self): + result = self.client.chat_completion( + messages=[ + {"role": "system", "content": "You are a helpful assistant. Answer briefly."}, + {"role": "user", "content": "What is 2 + 2? Answer with just the number."}, + ], + temperature=0.01, + max_tokens=256, + ) + self.assertIn("4", result) + + def test_evaluation_judge_yes_no(self): + result = self.client.chat_completion( + messages=[ + {"role": "system", "content": "You are a helpful AI assistant. Your task is to judge whether the model response is correct to answer the given question or not."}, + {"role": "user", "content": "Question: What color is the sky?\nModel Response: The sky is blue.\nGround Truth: blue\nWill the model response be considered correct? 
You should only answer yes or no."}, + ], + temperature=0.01, + max_tokens=256, + ) + self.assertIn("yes", result.lower()) + + def test_scoring_correctness(self): + result = self.client.chat_completion( + messages=[ + {"role": "user", "content": "Compare the ground truth and prediction, give a correctness score from 0.0 to 1.0.\n\nQuestion: What is 2+2?\nGround Truth: 4\nPrediction: 4\n\nJust output the score number."}, + ], + temperature=0.01, + max_tokens=256, + ) + self.assertTrue(len(result) > 0, "Response should not be empty") + # Should contain a high score + self.assertTrue( + any(s in result for s in ["1.0", "1", "0.9", "0.8"]), + f"Expected high score in response: {result}", + ) + + +@unittest.skipUnless(MINIMAX_API_KEY, "MINIMAX_API_KEY not set") +class TestMiniMaxAutoDetect(unittest.TestCase): + """Test auto-detection of MiniMax provider.""" + + def test_auto_detect_creates_minimax_client(self): + client = get_eval_llm_client() + self.assertEqual(client.provider, "minimax") + self.assertEqual(client.api_base, "https://api.minimax.io/v1") + + +if __name__ == "__main__": + unittest.main()