Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
![](https://img.shields.io/github/stars/luodian/otter?style=social)
[![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2FLuodian%2Fotter&count_bg=%23FFA500&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=visitors&edge_flat=false)](https://hits.seeyoufarm.com)
[![litellm](https://img.shields.io/badge/%20%F0%9F%9A%85%20liteLLM-OpenAI%7CAzure%7CAnthropic%7CPalm%7CCohere-blue?color=green)](https://github.com/BerriAI/litellm)
[![MiniMax](https://img.shields.io/badge/MiniMax-M3-blue)](https://www.minimax.io)

[Project Credits](https://github.com/Luodian/Otter/blob/main/docs/credits.md) | [Otter Paper](https://arxiv.org/abs/2305.03726) | [OtterHD Paper](https://arxiv.org/abs/2311.04219) | [MIMIC-IT Paper](https://arxiv.org/abs/2306.05425)

Expand Down Expand Up @@ -41,6 +42,7 @@ For who in the mainland China: [![Open in OpenXLab](https://cdn-static.openxlab.
split: test
prompt: Answer with the option's letter from the given choices directly.
api_key: [Your API Key] # GPT4 or GPT3.5 to evaluate the answers and ground truth.
eval_provider: minimax # Optional: use "minimax" or "openai" (default)
debug: true # setting debug=true saves the model response in the log file.
- name: mme
split: test
Expand Down Expand Up @@ -70,6 +72,18 @@ For who in the mainland China: [![Open in OpenXLab](https://cdn-static.openxlab.
**[2023-08]**

1. Added support for using Azure, Anthropic, Palm, and Cohere models for Self-Instruct with the Syphus pipeline. To use one, modify [this line](https://github.com/Luodian/Otter/blob/16d73b399fac6352ebff7504b1acb1f228fbf3f4/mimic-it/syphus/file_utils.py#L53) with your selected model and set your API keys in the environment. For more information, see [LiteLLM](https://github.com/BerriAI/litellm/).
2. Added [MiniMax](https://www.minimax.io) as a supported LLM provider for both the Syphus data generation pipeline and benchmark evaluation. Configure via environment variables:
```bash
# For Syphus data generation (via liteLLM)
export MINIMAX_API_KEY="your-minimax-key"
export OPENAI_API_ENGINE="openai/MiniMax-M3"
export OPENAI_API_BASE="https://api.minimax.io/v1"

# For benchmark evaluation (MagnifierBench, MathVista, MM-Vet)
export EVAL_LLM_PROVIDER="minimax"
export MINIMAX_API_KEY="your-minimax-key"
```
MiniMax M3 offers a 512K context window, up to 128K max output, and image input support. M2.7 and M2.7-highspeed remain available as alternatives. See `pipeline/benchmarks/utils/eval_llm.py` for details.

**[2023-07]: Announcing the MIMIC-IT dataset for multiple interleaved image-text/video instruction tuning.**

Expand Down
37 changes: 32 additions & 5 deletions mimic-it/syphus/file_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,18 @@
"""
file utils

Supports multiple LLM providers via liteLLM. Configure via environment variables:

OpenAI (default):
export OPENAI_API_KEY="your-openai-key"
export OPENAI_API_ENGINE="gpt-4"

MiniMax:
export MINIMAX_API_KEY="your-minimax-key"
export OPENAI_API_ENGINE="openai/MiniMax-M3"
export OPENAI_API_BASE="https://api.minimax.io/v1"

See https://docs.litellm.ai/docs/providers for all supported providers.
"""

import json
Expand All @@ -13,11 +26,15 @@
# Model/engine identifier passed to the completion call; taken from the
# OPENAI_API_ENGINE environment variable, falling back to "davinci" when unset.
engine = os.environ.get("OPENAI_API_ENGINE", "davinci")


def query_gpt(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str]:
def query_llm(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str]:
"""
Query the GPT API with the given inputs.
Query the LLM API with the given inputs.

Supports multiple providers via liteLLM (OpenAI, MiniMax, Anthropic, etc.).
Configure via OPENAI_API_ENGINE and OPENAI_API_BASE environment variables.

Returns:
Response (dict[str, str]): the response from GPT API.
Response (dict[str, str]): the response from the LLM API.
Input ID (str): the id that specifics the input.
"""
if dataset_name == "3d.SceneNavigation":
Expand Down Expand Up @@ -47,13 +64,19 @@ def query_gpt(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str
"content": inputs["query_input"]["sentences"],
},
)

# Clamp temperature for MiniMax (requires (0.0, 1.0])
temperature = 0.7
if os.environ.get("MINIMAX_API_KEY"):
temperature = max(temperature, 0.01)

succuss = True
while succuss:
try:
response = completion(
engine=engine, # defined by os.environ, default engine="chatgpt0301",
engine=engine, # defined by os.environ, default engine="davinci"
messages=messages,
temperature=0.7,
temperature=temperature,
max_tokens=3200,
top_p=0.95,
frequency_penalty=0,
Expand All @@ -73,6 +96,10 @@ def query_gpt(inputs: dict[str], dataset_name: str) -> tuple[dict[str, str], str
return response, inputs["query_input"]["id"]


# Backward-compatible alias: `query_gpt` was renamed to `query_llm`; keep the
# old name bound to the same function so existing callers keep working.
query_gpt = query_llm


def split_question_and_answer(pair_of_answer: str, file_id: str) -> tuple[bool, dict[str, str]]:
"""
Split the question and answer from the pair of question and answer.
Expand Down
67 changes: 26 additions & 41 deletions pipeline/benchmarks/datasets/magnifierbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,58 +16,40 @@
import time
import requests

from pipeline.benchmarks.utils.eval_llm import get_eval_llm_client

utc_plus_8 = pytz.timezone("Asia/Singapore")  # You can also use 'Asia/Shanghai', 'Asia/Taipei', etc.
# Timezone-aware "now" in UTC. datetime.utcnow() is deprecated since Python 3.12;
# datetime.now(tz) yields the same aware UTC instant without the naive intermediate.
utc_now = datetime.datetime.now(pytz.utc)
# The same instant expressed in the UTC+8 zone chosen above.
utc_plus_8_time = utc_now.astimezone(utc_plus_8)


def get_chat_response(promot, api_key=None, model="gpt-4-0613", temperature=0, max_tokens=256, n=1, patience=5, sleep_time=5, eval_llm_client=None):
    """
    Ask the evaluation LLM whether a model response is correct.

    Args:
        promot: text sent as the user message.
        api_key: used only to build a client when ``eval_llm_client`` is None.
        model: used only to build a client when ``eval_llm_client`` is None.
        temperature: forwarded to ``chat_completion``.
        max_tokens: forwarded to ``chat_completion``.
        n: kept for backward compatibility; it is not forwarded to the client.
        patience: forwarded to ``chat_completion`` (presumably the retry budget,
            as in the pre-refactor implementation; confirm against eval_llm.py).
        sleep_time: forwarded to ``chat_completion`` (presumably the delay
            between retries; confirm against eval_llm.py).
        eval_llm_client: object exposing ``chat_completion(...)``. When None, a
            client is created with ``get_eval_llm_client``.

    Returns:
        The client's reply, or "" when the client returns None.
    """
    if eval_llm_client is None:
        eval_llm_client = get_eval_llm_client(api_key=api_key, model=model)

    messages = [
        {"role": "system", "content": "You are a helpful AI assistant. Your task is to judge whether the model response is correct to answer the given question or not."},
        {"role": "user", "content": promot},
    ]

    reply = eval_llm_client.chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        patience=patience,
        sleep_time=sleep_time,
    )
    # The pre-refactor version always returned a string ("" on failure) and
    # callers use str methods such as .lower() on the result, so never leak None.
    return "" if reply is None else reply


def prepare_query(model_answer_item, api_key=None, eval_llm_client=None):
    """
    Build the yes/no judging prompt for one free-form answer and query the evaluation LLM.

    Args:
        model_answer_item: mapping with the keys "freeform_question",
            "freeform_response" and "freeform_answer".
        api_key: forwarded to ``get_chat_response``.
        eval_llm_client: forwarded to ``get_chat_response``.

    Returns:
        Whatever ``get_chat_response`` returns for the prompt.
    """
    freeform_question = model_answer_item["freeform_question"]
    freeform_response = model_answer_item["freeform_response"]
    correct_answer = model_answer_item["freeform_answer"]

    # Formulating the prompt for evaluation LLM
    prompt = f"Question: {freeform_question}\nModel Response: {freeform_response}\nGround Truth: {correct_answer}\nWill the model response be considered correct? You should only answer yes or no."

    # Query exactly once; a second, stale call would double the API usage.
    chat_response = get_chat_response(prompt, api_key=api_key, eval_llm_client=eval_llm_client)

    return chat_response

Expand All @@ -83,6 +65,8 @@ def __init__(
debug: bool = False,
prompt="",
api_key=None,
eval_provider=None,
eval_model=None,
):
super().__init__("MagnifierBench", data_path)

Expand All @@ -95,6 +79,11 @@ def __init__(
self.debug = debug
self.prompt = prompt
self.api_key = api_key
self.eval_llm_client = get_eval_llm_client(
provider=eval_provider,
api_key=api_key,
model=eval_model,
)

def parse_pred_ans(self, pred_ans, question):
match = re.search(r"The answer is ([A-D])", pred_ans)
Expand Down Expand Up @@ -122,10 +111,6 @@ def parse_pred_ans(self, pred_ans, question):
def _evaluate(self, model):
model_score_dict = {}

# output_path = os.path.join(self.default_output_path, f"{model.name}_{self.cur_datetime}")
# if not os.path.exists(output_path):
# os.makedirs(output_path)
# model_path: str = "Salesforce/instructblip-vicuna-7b"
model_version = model.name.split("/")[-1]
model_answer_path = os.path.join(self.default_output_path, f"{model_version}_{self.cur_datetime}_answer.json")
result_path = os.path.join(self.default_output_path, f"{model_version}_{self.cur_datetime}_score.json")
Expand Down Expand Up @@ -186,16 +171,16 @@ def _evaluate(self, model):
model_score_dict["total"] = len(self.data)
model_score_dict["accuracy"] = score / len(self.data)

print(f"Start query GPT-4 for free-form evaluation...")
for data_id in tqdm(model_answer.keys(), desc="Querying GPT-4"):
print(f"Start query evaluation LLM for free-form evaluation...")
for data_id in tqdm(model_answer.keys(), desc="Querying evaluation LLM"):
model_answer_item = model_answer[data_id]
gpt_response = prepare_query(model_answer_item, self.api_key)
gpt_response = prepare_query(model_answer_item, eval_llm_client=self.eval_llm_client)
if gpt_response.lower() == "yes":
ff_score += 1
elif gpt_response.lower() == "no":
ff_score += 0
else:
print(f"Warning: {data_id} has invalid GPT-4 response: {gpt_response}")
print(f"Warning: {data_id} has invalid evaluation LLM response: {gpt_response}")
print(f"Skipping {data_id}")
continue

Expand Down
56 changes: 22 additions & 34 deletions pipeline/benchmarks/datasets/mathvista.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
import io
from Levenshtein import distance

from pipeline.benchmarks.utils.eval_llm import get_eval_llm_client

utc_plus_8 = pytz.timezone("Asia/Singapore")  # You can also use 'Asia/Shanghai', 'Asia/Taipei', etc.
# Timezone-aware "now" in UTC. datetime.utcnow() is deprecated since Python 3.12;
# datetime.now(tz) yields the same aware UTC instant without the naive intermediate.
utc_now = datetime.datetime.now(pytz.utc)
# The same instant expressed in the UTC+8 zone chosen above.
utc_plus_8_time = utc_now.astimezone(utc_plus_8)
Expand Down Expand Up @@ -65,41 +67,22 @@
import ast


def get_chat_response(promot, api_key=None, model="gpt-3.5-turbo", temperature=0, max_tokens=256, n=1, patience=5, sleep_time=5, eval_llm_client=None):
    """
    Send a prompt to the evaluation LLM and return its reply.

    Args:
        promot: text sent as the user message.
        api_key: used only to build a client when ``eval_llm_client`` is None.
        model: used only to build a client when ``eval_llm_client`` is None.
        temperature: forwarded to ``chat_completion``.
        max_tokens: forwarded to ``chat_completion``.
        n: kept for backward compatibility; it is not forwarded to the client.
        patience: forwarded to ``chat_completion`` (presumably the retry budget,
            as in the pre-refactor implementation; confirm against eval_llm.py).
        sleep_time: forwarded to ``chat_completion`` (presumably the delay
            between retries; confirm against eval_llm.py).
        eval_llm_client: object exposing ``chat_completion(...)``. When None, a
            client is created with ``get_eval_llm_client``.

    Returns:
        The client's reply, or "" when the client returns None.
    """
    if eval_llm_client is None:
        eval_llm_client = get_eval_llm_client(api_key=api_key, model=model)

    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": promot},
    ]

    reply = eval_llm_client.chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        patience=patience,
        sleep_time=sleep_time,
    )
    # The pre-refactor version always returned a string ("" on failure);
    # keep that contract so callers never receive None.
    return "" if reply is None else reply


def create_test_prompt(demo_prompt, query, response):
Expand All @@ -109,7 +92,7 @@ def create_test_prompt(demo_prompt, query, response):
return full_prompt


def extract_answer(response, problem, quick_extract=False, api_key=None, pid=None, gpt_model="gpt-4-0613"):
def extract_answer(response, problem, quick_extract=False, api_key=None, pid=None, gpt_model="gpt-4-0613", eval_llm_client=None):
question_type = problem["question_type"]
answer_type = problem["answer_type"]
choices = problem["choices"]
Expand Down Expand Up @@ -150,7 +133,7 @@ def extract_answer(response, problem, quick_extract=False, api_key=None, pid=Non
# general extraction
try:
full_prompt = create_test_prompt(demo_prompt, query, response)
extraction = get_chat_response(full_prompt, api_key=api_key, model=gpt_model, n=1, patience=5, sleep_time=5)
extraction = get_chat_response(full_prompt, api_key=api_key, model=gpt_model, n=1, patience=5, sleep_time=5, eval_llm_client=eval_llm_client)
return extraction
except Exception as e:
print(e)
Expand Down Expand Up @@ -271,15 +254,14 @@ def __init__(
gpt_model="gpt-4-0613",
debug=False,
quick_extract=False,
eval_provider=None,
eval_model=None,
):
super().__init__("MathVistaDataset", data_path)
name_converter = {"dev": "validation", "test": "test"}
self.data = load_dataset("Otter-AI/MathVista", split=name_converter[split], cache_dir=cache_dir).to_pandas()
if debug:
self.data = self.data.sample(5)
# data_path = "/home/luodian/projects/Otter/archived/testmini_image_inside.json"
# with open(data_path, "r", encoding="utf-8") as f:
# self.data = json.load(f)

self.debug = debug
self.quick_extract = quick_extract
Expand All @@ -290,6 +272,11 @@ def __init__(
self.cur_datetime = utc_plus_8_time.strftime("%Y-%m-%d_%H-%M-%S")
self.api_key = api_key
self.gpt_model = gpt_model
self.eval_llm_client = get_eval_llm_client(
provider=eval_provider,
api_key=api_key,
model=eval_model or gpt_model,
)

def create_query(self, problem, shot_type):
### [2] Test query
Expand Down Expand Up @@ -393,6 +380,7 @@ def _evaluate(self, model):
api_key=self.api_key,
pid=idx_key,
gpt_model=self.gpt_model,
eval_llm_client=self.eval_llm_client,
)
results[idx_key].update({"extraction": extraction})
answer = results[idx_key]["answer"]
Expand Down
Loading