"""evaluate.py: Score a research idea with an LLM judge and write the results to JSON."""
import json
from argparse import ArgumentParser
from datetime import datetime

from openai import OpenAI

from literature import main as retrieve_literature_main
from utils.eval_utils import Eval, normalized_score

# The client reads the API key from the OPENAI_API_KEY environment variable.
client = OpenAI()


def eval_idea(configs, idea, idea_year, sim_scores, time, literature):
    with open('utils/prompt_Dean_science.txt') as f:
        prompt = f.read()

    eval_results = {}

    # Flatten the retrieved literature into a numbered list for the prompt.
    if literature == {}:
        lit = "No relevant literature found."
    else:
        lit = ""
        for i, paper in enumerate(literature.keys()):
            lit += f'Paper {i+1} ({literature[paper]["year"]}): {literature[paper]["idea"]}\n'

    completion = client.beta.chat.completions.parse(
        model=configs.llm_model,
        messages=[
            {"role": "system", "content": f"You are tasked with evaluating a research idea proposed by a PhD student in {idea_year}. Evaluate the idea based on your most up-to-date knowledge."},
            {"role": "user", "content": f"Search engine results on relevant literature: \n{lit}.\n{prompt}\nIdea ({idea_year}): {idea}"},
        ],
        response_format=Eval,
        temperature=0.2,
        top_p=0.1,
        logprobs=True,
        top_logprobs=20,
    )

    # Scan the token-level log-probabilities for the token that follows each
    # '<dimension>_score":' key in the structured JSON output; each score is
    # derived from that token's top-20 alternatives rather than from the
    # single sampled digit.
    logprobs = completion.choices[0].logprobs.content
    scores = {"originality": 0, "paradigm_relatedness": 0, "acceptability": 0, "implementability": 0, "applicability": 0, "effectiveness": 0, "implicational_explicitness": 0, "clarity": 0}
    keywords = [kw + '_score":' for kw in list(scores.keys())]
    keyword_pointer = 0
    for i, tk in enumerate(logprobs):
        # Reassemble the previous eight tokens; the current token is the score
        # digit once a dimension's key has just been completed.
        window = ''.join([logprobs[j].token for j in range(max(i - 8, 0), i)])
        if keywords[keyword_pointer] in window:
            # '_score":' is 8 characters, so [:-8] recovers the dimension name.
            if keywords[keyword_pointer][:-8] in ["implicational_explicitness", "clarity"]:
                # These two dimensions are rated on a 3-point scale.
                eval_results[keywords[keyword_pointer][:-8]] = round(normalized_score(tk.top_logprobs, 3), 4)
            else:
                # The other six dimensions are rated on a 4-point scale.
                eval_results[keywords[keyword_pointer][:-8]] = round(normalized_score(tk.top_logprobs, 4), 4)
            keyword_pointer += 1
            if keyword_pointer == 8:
                break

    eval_results["total_score"] = round(sum(eval_results[dimension] for dimension in eval_results), 5)
    eval_results["relevant_paper_similarity"] = sim_scores
    # Weighted aggregate: each dimension is normalized by its scale maximum
    # (4 or 3) and weighted by its importance; the weights sum to 1.0.
    eval_results["weighted_score"] = round(sum([
        float(eval_results["originality"]) * 0.25 / 4,
        float(eval_results["paradigm_relatedness"]) * 0.25 / 4,
        float(eval_results["acceptability"]) * 0.05 / 4,
        float(eval_results["implementability"]) * 0.10 / 4,
        float(eval_results["applicability"]) * 0.10 / 4,
        float(eval_results["effectiveness"]) * 0.15 / 4,
        float(eval_results["implicational_explicitness"]) * 0.05 / 3,
        float(eval_results["clarity"]) * 0.05 / 3,
    ]), 5)
    eval_results["time"] = time
    return eval_results
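
# For reference: `Eval` and `normalized_score` are imported from
# utils/eval_utils.py, which is not shown here. The sketch below is only an
# assumption inferred from how they are used above, not the repo's actual
# implementation; the real `Eval` model may well carry additional fields.
#
#   from math import exp
#   from pydantic import BaseModel
#
#   class Eval(BaseModel):  # hypothetical field layout
#       originality_score: int
#       paradigm_relatedness_score: int
#       acceptability_score: int
#       implementability_score: int
#       applicability_score: int
#       effectiveness_score: int
#       implicational_explicitness_score: int
#       clarity_score: int
#
#   def normalized_score(top_logprobs, max_score):  # hypothetical
#       # Probability mass assigned to each valid score token "1".."max_score".
#       mass = {s: 0.0 for s in range(1, max_score + 1)}
#       for entry in top_logprobs:
#           token = entry.token.strip()
#           if token.isdigit() and 1 <= int(token) <= max_score:
#               mass[int(token)] += exp(entry.logprob)
#       total = sum(mass.values()) or 1.0
#       # Expected score under the renormalized distribution.
#       return sum(s * m for s, m in mass.items()) / total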


def evaluate_idea_file(configs, file_name, is_existing_idea=False):
    print(f"Evaluating idea in file: {file_name}")
    file_name = file_name.replace('.txt', '')
    with open(f'{file_name}.txt', 'r') as file:
        file_content = file.read()
    year = datetime.now().year

    # Retrieve related work for the idea: a dict of papers, their similarity
    # scores against the idea, and a timing value from the retrieval step.
    literature, scores, time = retrieve_literature_main(configs.llm_model, file_content, f"{file_name}_literature/", verbose=True, is_existing_idea=is_existing_idea)
    # Keep only the first 20 retrieved papers for the evaluation prompt.
    literature_p = {k: literature[k] for k in list(literature.keys())[:20]}

    eval_results = eval_idea(configs, file_content, year, scores, time, literature_p)
    print(eval_results)
    with open(f'{file_name}_eval.json', 'w') as file:
        file.write(json.dumps(eval_results, indent=4))


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--llm_model", type=str, help="LLM model to use. Must be indexed by OpenAI.", default='gpt-4o-2024-08-06')
    parser.add_argument('--idea_path', type=str, required=True, help='Name of the file containing the idea to evaluate (without extension)')
    parser.add_argument('--is_existing_idea', action='store_true', help='Flag indicating if the idea comes from an existing paper')
    configs = parser.parse_args()

    if configs.llm_model != "gpt-4o-2024-08-06":
        print("Warning: Using a model other than gpt-4o-2024-08-06 may lead to inconsistent evaluation results due to differences in knowledge cutoff dates and capabilities.")

    evaluate_idea_file(configs, configs.idea_path, is_existing_idea=configs.is_existing_idea)
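
# Example usage (a sketch: assumes OPENAI_API_KEY is set in the environment
# and the idea text is stored in my_idea.txt, a hypothetical file name):
#   python evaluate.py --idea_path my_idea
#   python evaluate.py --idea_path my_idea --is_existing_idea
# The scores are printed and written to my_idea_eval.json.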